In [5]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from pycaret.regression import *

In [2]:
# Load the training and test splits from CSV files in the working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

In [3]:
# Preview the first five rows of the training frame to sanity-check the load.
train_data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [12]:
# Rank the numeric columns by their correlation with the target, SalePrice.
# Correlation is only defined for numeric data, so the frame is restricted to
# numeric dtypes before the pairwise correlation matrix is computed.
# The SalePrice column of that matrix is then sorted from the strongest
# positive correlation down to the strongest negative one.

corr_matrix = (
    train_data
    .select_dtypes(include=['number'])
    .corr()
)
corr_saleprice = corr_matrix.loc[:, 'SalePrice'].sort_values(ascending=False)
print(corr_saleprice)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

In [6]:
# Initialise the PyCaret regression experiment on the training frame.
# session_id fixes the random seed so the run is repeatable.
# NOTE(review): 'Id' is not excluded here and appears among the numerically
# imputed feature columns of the saved pipeline; it is a row identifier, so
# consider leaving it out of the features -- confirm before changing.
exp = setup(
    data=train_data,
    target='SalePrice',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(1460, 81)"
4,Transformed data shape,"(1460, 279)"
5,Transformed train set shape,"(1021, 279)"
6,Transformed test set shape,"(439, 279)"
7,Numeric features,37
8,Categorical features,43
9,Rows with missing values,100.0%


In [7]:
# Train and cross-validate PyCaret's candidate regressors, display the
# leaderboard, and keep the top-ranked model in `best`.
# NOTE(review): `best` is not referenced by the later cells shown here; the
# next modelling cell selects 'gbr' by name instead.
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,17281.3655,847391120.4738,28314.1589,0.8723,0.1376,0.1,0.191
lightgbm,Light Gradient Boosting Machine,17701.1514,1019922835.2337,31002.5633,0.8484,0.1449,0.102,0.374
xgboost,Extreme Gradient Boosting,19068.137,1120869534.7547,32187.503,0.8345,0.153,0.1102,0.16
rf,Random Forest Regressor,19042.9144,1117550843.2233,32426.1601,0.8345,0.1545,0.1114,0.318
et,Extra Trees Regressor,18873.1767,1163678875.5551,32786.948,0.8327,0.1513,0.1095,0.32
ada,AdaBoost Regressor,25963.6459,1417400709.1888,37051.1993,0.7868,0.2056,0.1679,0.147
llar,Lasso Least Angle Regression,19390.337,1501646360.0166,35912.6764,0.7681,0.1743,0.1146,0.096
ridge,Ridge Regression,20081.3998,1526032661.7945,36561.0633,0.7668,0.2087,0.1213,0.079
en,Elastic Net,21115.3002,1782962549.6151,38958.876,0.7382,0.1739,0.1218,0.124
omp,Orthogonal Matching Pursuit,22617.0267,1808808421.5105,39503.146,0.7344,0.1856,0.1343,0.089


In [14]:
# Fit a Gradient Boosting Regressor ('gbr', the top row of the comparison
# table) and report its per-fold cross-validation metrics.
model = create_model(estimator='gbr')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,20896.1246,2122654023.4617,46072.2696,0.7407,0.1807,0.117
1,17670.7246,828232437.246,28779.0277,0.8708,0.1297,0.093
2,15562.0048,499906555.5035,22358.5902,0.8918,0.1296,0.0965
3,15490.9462,595034884.3796,24393.3369,0.8853,0.1267,0.0915
4,18684.5103,923253676.228,30385.0897,0.9061,0.1707,0.1214
5,15675.2317,809716969.6842,28455.5262,0.8878,0.1273,0.0912
6,18212.9154,703928920.3464,26531.6588,0.8507,0.122,0.0986
7,16206.7383,553931560.3588,23535.7507,0.9187,0.1336,0.0985
8,19545.0859,987700126.5012,31427.6968,0.8526,0.1503,0.1118
9,14869.3734,449552051.028,21202.6425,0.9185,0.1055,0.0802


In [17]:
# Produce the final pipeline from the cross-validated model; this is the
# object used for the test-set predictions further down.
final_model = finalize_model(estimator=model)

In [18]:
# Persist the finalized pipeline (preprocessing steps + estimator) to
# 'final_model_pycaret.pkl' so it can be reloaded later with load_model().
# This saves `final_model`, the same object used for the predictions below.
# The previous argument, `tuned_model`, is not defined anywhere in this
# notebook and raised NameError on a fresh Restart & Run All.
save_model(final_model, 'final_model_pycaret')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Id', 'MSSubClass', 'LotFrontage',
                                              'LotArea', 'OverallQual',
                                              'OverallCond', 'YearBuilt',
                                              'YearRemodAdd', 'MasVnrArea',
                                              'BsmtFinSF1', 'BsmtFinSF2',
                                              'BsmtUnfSF', 'TotalBsmtSF',
                                              '1stFlrSF', '2ndFlrSF',
                                              'LowQualFinSF', 'GrLivArea',
                                              'BsmtFullBath', 'BsmtHalfBath',
                                              'FullBath', 'Hal...
                                                                     'Exterior1st',
                                                                     'Exterior2nd',
             

In [20]:
# Score the unseen test set with the finalized pipeline. The returned frame
# carries the model output in its 'prediction_label' column.
predictions = predict_model(
    estimator=final_model,
    data=test_data,
)

In [23]:
# Assemble the two-column submission table: the test-set Id next to the
# predicted SalePrice for that row.
submission = pd.DataFrame(
    data=dict(
        Id=test_data['Id'],
        SalePrice=predictions['prediction_label'],
    )
)

In [24]:
# Write the submission file; index=False keeps the DataFrame index out of
# the CSV so it contains only the Id and SalePrice columns.
submission.to_csv(path_or_buf='submission.csv', index=False)