This project was a part of a kaggle challenge which was able to get a public score of 15865.38007 and was able to secure a rank in top 11.13% or a percentile  88.86

# IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error

# READING FILES

In [2]:
# Read the data
train=pd.read_csv('train.csv', index_col='Id')
X_original = train
test = pd.read_csv('test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
y=train['SalePrice']
X_original=train.drop(['SalePrice'], axis=1)
obj=X_original.select_dtypes(include=object).columns

# SELECTING COLUMNS

In [3]:
good_label_cols = [col for col in obj if 
                   set(train[col]) == set(test[col])]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(obj)-set(good_label_cols))
        
print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)
obj=good_label_cols

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
hot_cols = [cname for cname in obj if
                    X_original[cname].nunique() < 10 ]

#select categorical columns with high cardinality
ordinary_cols = [cname for cname in obj if
                    X_original[cname].nunique() >= 10 ] 
        

# Select numerical columns
numerical_cols = [cname for cname in X_original if 
                X_original[cname].dtype in ['int64', 'float64']]

my_cols = hot_cols + ordinary_cols + numerical_cols
#X_train,X_val,y_train,y_val=train_test_split(X_original,y,random_state=0)

#selecting only required cooumns
#X_train=X_train[my_cols]
#X_val=X_val[my_cols]
X=X_original[my_cols]
test=test[my_cols]

Categorical columns that will be label encoded: ['Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'RoofStyle', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCond', 'PavedDrive', 'Fence', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['MiscFeature', 'Functional', 'RoofMatl', 'Heating', 'Electrical', 'SaleType', 'HouseStyle', 'MSZoning', 'Exterior1st', 'GarageQual', 'PoolQC', 'KitchenQual', 'Exterior2nd', 'Utilities', 'Condition2']


# DEFINING TRANSFORMERS

In [4]:
num_transformer=SimpleImputer(strategy='constant')
hot_transformer=Pipeline(steps=[('imputer',SimpleImputer(strategy='constant')),
                                ('Encoder',OneHotEncoder(handle_unknown='ignore'))])

ord_transformer=Pipeline(steps=[('imputer',SimpleImputer(strategy='constant')),
                                ('Encoder',OrdinalEncoder(dtype='float64'))])

Preprocessor=ColumnTransformer(transformers=[('numerical',num_transformer,numerical_cols),
                                             ('ord',ord_transformer,ordinary_cols),
                                             ('hot',hot_transformer,hot_cols)])

scale=MinMaxScaler()

# DEFINING PIPELINE

In [5]:
model1=xgb.XGBRegressor(max_depth=2,n_estimators=400,n_jobs=-1,reg_lambda=2,reg_alpha=6)
pipeline1=Pipeline(steps=[('preprocessing',Preprocessor),
                          ('scale',scale),
                         ('model',model1)])

In [6]:
pipeline1.fit(X,y)
#pred=pipeline1.predict(X_val)
#mean_absolute_error(pred,y_val)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'Ov

In [7]:
pred=pipeline1.predict(test)

In [8]:
output = pd.DataFrame({'Id': test.index, 'SalePrice': pred})
output.to_csv('submission.csv', index=False)