In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [2]:
df_train=pd.read_csv("train.csv")
df_test=pd.read_csv("test.csv")
df_all=pd.concat((df_train,df_test),sort=False)


In [3]:
df_all.shape

(2919, 81)

In [4]:
df_all.select_dtypes(include="object").isnull().sum()[(df_all.isnull().sum()>0)]

MSZoning           4
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinType2      80
Electrical         1
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageFinish     159
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
dtype: int64

In [5]:
#categorical na
#fill na that mean 0, as none and fill the na that are truly unavaiable as the mode of the class
for col in ('Alley','Utilities','MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1',
            'BsmtFinType2','Electrical','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond',
           'PoolQC','Fence','MiscFeature'):
    df_all[col]=df_all[col].fillna("None")
for col in ('MSZoning','Exterior1st','Exterior2nd','KitchenQual','SaleType','Functional'):
    df_all[col]=df_all[col].fillna(df_all[col].mode()[0])
    

In [6]:
df_all.isnull().sum()[(df_all.isnull().sum()>0)]

LotFrontage      486
MasVnrArea        23
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
GarageYrBlt      159
GarageCars         1
GarageArea         1
SalePrice       1459
dtype: int64

In [7]:
#numerical na
#fill na that mean 0, as none and fill the na that are truly unavaiable as the average of the class
df_all['LotFrontage']=df_all['LotFrontage'].fillna(df_all['LotFrontage'].mean())

for col in ('MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','GarageYrBlt','GarageCars','GarageArea'):
    df_all[col]=df_all[col].fillna(0)


In [8]:
df_all.isnull().sum()[(df_all.isnull().sum()>0)]

SalePrice    1459
dtype: int64

In [9]:
#transforming "numerical" variables that are categorical
for col in ["MSSubClass","OverallCond",'YrSold','MoSold']:
    df_all[col]=df_all[col].astype(str)

In [10]:
from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')

for col in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(df_all[col].values)) 
    df_all[col]=lbl.transform(list(df_all[col].values))

In [11]:
df_all=pd.get_dummies(df_all)
df_train=df_all[np.invert(df_all["SalePrice"].isna())]
df_test=df_all[df_all["SalePrice"].isna()]

In [21]:
df_train["SalePrice"]

0       208500.0
1       181500.0
2       223500.0
3       140000.0
4       250000.0
5       143000.0
6       307000.0
7       200000.0
8       129900.0
9       118000.0
10      129500.0
11      345000.0
12      144000.0
13      279500.0
14      157000.0
15      132000.0
16      149000.0
17       90000.0
18      159000.0
19      139000.0
20      325300.0
21      139400.0
22      230000.0
23      129900.0
24      154000.0
25      256300.0
26      134800.0
27      306000.0
28      207500.0
29       68500.0
          ...   
1430    192140.0
1431    143750.0
1432     64500.0
1433    186500.0
1434    160000.0
1435    174000.0
1436    120500.0
1437    394617.0
1438    149700.0
1439    197000.0
1440    191000.0
1441    149300.0
1442    310000.0
1443    121000.0
1444    179600.0
1445    129000.0
1446    157900.0
1447    240000.0
1448    112000.0
1449     92000.0
1450    136000.0
1451    287090.0
1452    145000.0
1453     84500.0
1454    185000.0
1455    175000.0
1456    210000.0
1457    266500

In [18]:
df_train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,10,65.0,8450,1,1,3,0,7,4,...,0,0,0,1,0,0,0,0,1,0
1,2,5,80.0,9600,1,1,3,0,6,7,...,0,0,0,1,0,0,0,0,1,0
2,3,10,68.0,11250,1,1,0,0,7,4,...,0,0,0,1,0,0,0,0,1,0
3,4,11,60.0,9550,1,1,0,0,7,4,...,0,0,0,1,1,0,0,0,0,0
4,5,10,84.0,14260,1,1,0,0,8,4,...,0,0,0,1,0,0,0,0,1,0


In [12]:
import lightgbm as lgb
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=31,
                              learning_rate=0.01, n_estimators=10000,
                              max_bin = 55, bagging_fraction = 0.8,
                              feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,
                            )



In [13]:
model_lgb.fit(df_train.drop(['SalePrice',"Id"],axis=1),df_train['SalePrice'])

LGBMRegressor(bagging_fraction=0.8, bagging_seed=9, boosting_type='gbdt',
       class_weight=None, colsample_bytree=1.0, feature_fraction=0.2319,
       feature_fraction_seed=9, importance_type='split',
       learning_rate=0.01, max_bin=55, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_data_in_leaf=6, min_split_gain=0.0,
       min_sum_hessian_in_leaf=11, n_estimators=10000, n_jobs=-1,
       num_leaves=31, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

In [29]:
pred=model_lgb.predict(df_test.drop(['SalePrice',"Id"],axis=1))
pred

array([126154.58062381, 163876.02819577, 188292.15152199, ...,
       174452.6627087 , 115371.66805284, 218637.35259986])

In [31]:
output=pd.DataFrame({'Id':df_test.Id, 'SalePrice':pred})
output.to_csv('submission.csv', index=False)