In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split


In [2]:
# Read the training file and report how many rows it contains.
train_csv_path = '../input/train.csv'
df_train_full = pd.read_csv(train_csv_path)
print(df_train_full.shape[0])

# Rows with a missing SalePrice (the target) are removed in place; the row
# count is printed again so the effect is visible.
df_train_full.dropna(subset=['SalePrice'], axis=0, inplace=True)
print(df_train_full.shape[0])

# Read the test file.
test_csv_path = '../input/test.csv'
df_test_full = pd.read_csv(test_csv_path)

1460
1460


In [3]:
# Detach the target from the feature frame: `pop` returns the SalePrice column
# and removes it from df_train_full in place.
y = df_train_full.pop('SalePrice')

# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train_full,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [4]:
# Sanity check of the split: training rows, columns left in df_train_full
# after the target was removed, and training targets (one value per line).
n_train_rows = X_train.shape[0]
n_feature_cols = df_train_full.shape[1]
n_train_targets = len(y_train)
print(n_train_rows, n_feature_cols, n_train_targets, sep='\n')


1168
80
1168




**Taking care of null values**

  Columns with more than 75% missing values (measured on the training split) are dropped from the training, validation, and test sets.


In [5]:
# Number of missing entries per column, counted on the training split only.
missing_val_count_by_column = X_train.isnull().sum(axis=0)

# Every column that has at least one missing value.
has_missing = missing_val_count_by_column > 0
cols_with_missing_values = np.array(missing_val_count_by_column[has_missing].index)
print(f'Columns with missing values: {cols_with_missing_values}')

# Columns where more than three quarters of the training rows are missing.
missing_row_threshold = len(X_train.index) * (3 / 4)
mostly_missing = missing_val_count_by_column > missing_row_threshold
cols_with_missing_more_than_75_percent = np.array(missing_val_count_by_column[mostly_missing].index)
print(f'Columns with more than 75 percent missing values: {cols_with_missing_more_than_75_percent}')

# Remove the mostly-missing columns from all three feature sets; the test
# features are derived from the full test frame here.
X_train = X_train.drop(columns=cols_with_missing_more_than_75_percent)
X_valid = X_valid.drop(columns=cols_with_missing_more_than_75_percent)
X_test = df_test_full.drop(columns=cols_with_missing_more_than_75_percent)

# Columns that still contain missing values after the drop (sorted, unique).
remaining_missing_cols = np.setdiff1d(cols_with_missing_values, cols_with_missing_more_than_75_percent)
print(remaining_missing_cols)

Columns with missing values: ['LotFrontage' 'Alley' 'MasVnrType' 'MasVnrArea' 'BsmtQual' 'BsmtCond'
 'BsmtExposure' 'BsmtFinType1' 'BsmtFinType2' 'Electrical' 'FireplaceQu'
 'GarageType' 'GarageYrBlt' 'GarageFinish' 'GarageQual' 'GarageCond'
 'PoolQC' 'Fence' 'MiscFeature']
Columns with more than 75 percent missing values: ['Alley' 'PoolQC' 'Fence' 'MiscFeature']
['BsmtCond' 'BsmtExposure' 'BsmtFinType1' 'BsmtFinType2' 'BsmtQual'
 'Electrical' 'FireplaceQu' 'GarageCond' 'GarageFinish' 'GarageQual'
 'GarageType' 'GarageYrBlt' 'LotFrontage' 'MasVnrArea' 'MasVnrType']


In [6]:
# Remove columns flagged as highly correlated with other features.
highly_correlated_cols = ['GarageArea', 'GarageYrBlt']

X_train = X_train.drop(highly_correlated_cols, axis=1)
X_valid = X_valid.drop(highly_correlated_cols, axis=1)
# Drop from the already-reduced X_test rather than from df_test_full:
# rebuilding it from the full test frame would bring back the columns that were
# removed for having too many missing values, leaving X_test with a different
# column set than X_train / X_valid.
X_test = X_test.drop(highly_correlated_cols, axis=1)

In [7]:

# Split the remaining feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
# NOTE(review): every int64 column qualifies as numerical, so the 'Id' column
# is passed to the model as a feature - confirm that this is intended.
column_dtypes = X_train.dtypes
categorical_cols = [cname for cname, dtype in column_dtypes.items() if dtype == "object"]
numerical_cols = [cname for cname, dtype in column_dtypes.items() if dtype in ['int64', 'float64']]

print(len(categorical_cols), len(numerical_cols), sep='\n')

39
35


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical features: missing entries are replaced with a constant. No
# fill_value is given, so SimpleImputer's default constant is used.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: missing entries take the most frequent category, then
# the column is one-hot encoded. Categories not seen while fitting are ignored
# at transform time instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its transformer. Columns listed in neither
# group are not passed on (ColumnTransformer's default remainder handling).
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [9]:
from xgboost import XGBRegressor

# Gradient-boosted regressor: 1000 boosting rounds at a learning rate of 0.05,
# trained with 4 parallel jobs.
xgb_params = {'n_estimators': 1000, 'learning_rate': 0.05, 'n_jobs': 4}
xgb_model = XGBRegressor(**xgb_params)

# Chain preprocessing and the model so both are fitted together on the
# training split.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', xgb_model),
]
pipeline = Pipeline(steps=pipeline_steps)

# Last expression of the cell: fits the pipeline and displays its repr.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['Id', 'MSSubClass',
                                                   'LotFrontage', 'LotArea',
                                                   'OverallQual', 'Ove

In [10]:
from sklearn.metrics import mean_absolute_error
# Predict on the held-out validation split with the fitted pipeline.
preds = pipeline.predict(X_valid)

# Mean absolute error between the validation targets and the predictions.
score = mean_absolute_error(y_true=y_valid, y_pred=preds)
print('MAE:', score)

MAE: 15997.046099101028


In [11]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [15]:
def fit_and_score(step_name, model):
    """Fit ``model`` behind the shared preprocessor and score it on the validation split.

    Parameters
    ----------
    step_name : str
        Name given to the estimator step inside the pipeline.
    model : estimator
        Unfitted regressor placed after the ``preprocessor`` step.

    Returns
    -------
    tuple
        ``(fitted_pipeline, validation_mae)``. The MAE is computed from the
        pipeline's predictions on ``X_valid`` against ``y_valid``.
    """
    candidate = Pipeline(steps=[('preprocessor', preprocessor),
                                (step_name, model)
                                ])
    candidate.fit(X_train, y_train)
    # Ground truth goes first, matching the (y_true, y_pred) signature.
    return candidate, mean_absolute_error(y_valid, candidate.predict(X_valid))


# NOTE(review): `pipeline` is rebound for every candidate, exactly as before,
# so after this cell it refers to the Lasso pipeline and no longer to the
# XGBoost one. Confirm which model later cells expect.

rf_model = RandomForestRegressor(random_state=3)
pipeline, rf_val_mae = fit_and_score('rf_model', rf_model)

linear_model = LinearRegression()
pipeline, linear_val_mae = fit_and_score('linear_model', linear_model)

# NOTE(review): the saved output contains a warning raised from scikit-learn's
# coordinate-descent solver, presumably for this fit (likely non-convergence) -
# consider scaling the features or raising max_iter.
lasso_model = Lasso(alpha=0.0005, random_state=5)
pipeline, lasso_val_mae = fit_and_score('lasso_model', lasso_model)

# Build the comparison table in one step with an explicit dtype. This avoids
# creating an empty Series without a dtype (which triggers a pandas warning)
# and growing it entry by entry. `score` is the XGBRegressor validation MAE
# computed in an earlier cell.
mae_compare = pd.Series(
    {
        'XGBRegressor': score,
        'RandomForest': rf_val_mae,
        'LinearRegression': linear_val_mae,
        'Lasso': lasso_val_mae,
    },
    dtype='float64',
)
mae_compare.index.name = 'Algorithm'

print('MAE values for different algorithms:')
# Last expression of the cell: best (lowest MAE) first, rounded for display.
mae_compare.sort_values(ascending=True).round()


  mae_compare = pd.Series()


MAE values for different algorithms:


  model = cd_fast.sparse_enet_coordinate_descent(


Algorithm
XGBRegressor        15997.0
RandomForest        17351.0
Lasso               22192.0
LinearRegression    22856.0
dtype: float64