In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# from csc665 import features, metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score

In [2]:
from pandas.api.types import is_string_dtype

def create_categories(df: pd.DataFrame):
    for name, col in df.items():
        if is_string_dtype(col):
            df[name] = df[name].astype('category').cat.codes

In [3]:
csv_df_train = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv")
csv_df_test = pd.read_csv("./house-prices-advanced-regression-techniques/test.csv")
csv_df_sample = pd.read_csv("./house-prices-advanced-regression-techniques/sample_submission.csv")

In [4]:
csv_df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
csv_df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


csv_df_sample.head()

In [6]:
csv_df_train.shape, csv_df_test.shape, csv_df_sample.shape

((1460, 81), (1459, 80), (1459, 2))

In [7]:
create_categories(csv_df_train)
# X, y = features.preprocess_ver_1(csv_df_train, 'SalePrice')
csv_df_train.shape

(1460, 81)

In [8]:
csv_df_train.fillna(-1, inplace=True)

In [9]:
csv_df_train.shape

(1460, 81)

In [10]:
X = csv_df_train.drop(['SalePrice'], axis=1)

In [11]:
y = csv_df_train['SalePrice'].values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1168, 80), (292, 80), (1168,), (292,))

# Decision Treee Regressor

In [14]:
dt = DecisionTreeRegressor()

In [15]:
%time dt.fit(X_train, y_train)

CPU times: user 30.3 ms, sys: 2.83 ms, total: 33.1 ms
Wall time: 31.7 ms


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [16]:
test_Predict = dt.predict(X_test)

In [17]:
dt.score(X_test, y_test)

0.668389099189486

In [18]:
accuracy_score(y_test, test_Predict)

0.010273972602739725

# Random Forest Regressor

In [19]:
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1)

In [20]:
%time rf.fit(X_train, y_train)

CPU times: user 23.9 s, sys: 343 ms, total: 24.3 s
Wall time: 4.14 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [21]:
rf_test_Predict = rf.predict(X_test)

In [22]:
rf.score(X_test, y_test)

0.7501718144919347

In [23]:
accuracy_score(y_test, test_Predict)

0.010273972602739725

In [24]:
create_categories(csv_df_test)
csv_df_test.fillna(-1, inplace=True)

In [25]:
testSet_predict = rf.predict(csv_df_test)

In [26]:
testSet_predict.shape

(1459,)

In [27]:
csv_df_sample.shape

(1459, 2)

In [28]:
csv_df_sample['SalePrice'] = testSet_predict

In [29]:
csv_df_sample.head()

Unnamed: 0,Id,SalePrice
0,1461,126064.893
1,1462,155411.985
2,1463,185907.374
3,1464,187743.684
4,1465,197967.399


In [30]:
csv_df_sample.to_csv("predict_RF.csv", index=False)