## Import Libraries

In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 8)
pd.set_option('display.max_columns', 100)

## Read the data

In [2]:
train = './data/train.csv'
test = './data/test.csv'

df = pd.read_csv(train)
df_test = pd.read_csv(test)
df.sample(n=5)

test_id = df_test.Id

## Data manipulation

In [3]:
df['Garage Type'] = df['Garage Type'].fillna('None')
df_test['Garage Type'] = df_test['Garage Type'].fillna('None')

df['Garage Cars'] = df['Garage Cars'].fillna(0)
df_test['Garage Cars'] = df_test['Garage Cars'].fillna(0)

df['Total Bsmt SF'] = df['Total Bsmt SF'].fillna(0)
df_test['Total Bsmt SF'] = df_test['Total Bsmt SF'].fillna(0)

In [4]:
missing_data = df.isnull().sum()[df.isnull().sum() > 0]
missing_data = list(missing_data.index)
missing_data

df.drop(missing_data, axis=1, inplace=True)
df_test.drop(missing_data, axis=1, inplace=True)
print(df.shape)
print(df_test.shape)

(2051, 58)
(879, 57)


## Feature engineering

In [5]:
garage_dummies = pd.get_dummies(df['Garage Type'], prefix='Garage')
garage_dummies_test = pd.get_dummies(df_test['Garage Type'], prefix='Garage')

In [6]:
df['MS Zoning'] = df['MS Zoning'].str.replace('R.', 'R', regex=True)
df_test['MS Zoning'] = df_test['MS Zoning'].str.replace('R.', 'R', regex=True)
# All residentials as a single variable

zoning_dummies = pd.get_dummies(df['MS Zoning'], prefix='Zoning')
zoning_dummies_test = pd.get_dummies(df_test['MS Zoning'], prefix='Zoning')

zoning_dummies = zoning_dummies[['Zoning_R']]
zoning_dummies_test = zoning_dummies_test[['Zoning_R']]

In [7]:
h_style_dummies = pd.get_dummies(df['House Style'], prefix='Style')
h_style_dummies_test = pd.get_dummies(df_test['House Style'], prefix='Style')

h_style_dummies = h_style_dummies[['Style_1Story', 'Style_2Story']] # Just the 2 most common
h_style_dummies_test = h_style_dummies_test[['Style_1Story', 'Style_2Story']] # Just the 2 most common

In [8]:
sale_type_dummies = pd.get_dummies(df['Sale Type'], prefix='Sale')
sale_type_dummies_test = pd.get_dummies(df_test['Sale Type'], prefix='Sale')

sale_type_dummies = sale_type_dummies[['Sale_New', 'Sale_WD ']] # Looking for new or WD
sale_type_dummies_test = sale_type_dummies_test[['Sale_New', 'Sale_WD ']] # Looking for new or WD

New in try 5

In [9]:
condition_dummies = pd.get_dummies(df['Condition 1'], prefix='Condition')
condition_dummies_test = pd.get_dummies(df_test['Condition 1'], prefix='Condition')

condition_dummies = condition_dummies[['Condition_Norm', 'Condition_PosA', 'Condition_PosN']]
condition_dummies_test = condition_dummies_test[['Condition_Norm', 'Condition_PosA', 'Condition_PosN']]

In [10]:
foundation_dummies = pd.get_dummies(df['Foundation'], prefix='Foundation')
foundation_dummies_test = pd.get_dummies(df_test['Foundation'], prefix='Foundation')

foundation_dummies = foundation_dummies[['Foundation_PConc']]
foundation_dummies_test = foundation_dummies_test[['Foundation_PConc']]

In [11]:
heating_dummies = pd.get_dummies(df['Heating QC'], prefix='Heating')
heating_dummies_test = pd.get_dummies(df_test['Heating QC'], prefix='Heating')

heating_dummies = heating_dummies[['Heating_Ex']]
heating_dummies_test = heating_dummies_test[['Heating_Ex']]

In [12]:
#df['Functional'] = df['Functional'].str.slice(0,3)
#df_test['Functional'] = df_test['Functional'].str.slice(0,3)

functional_dummies = pd.get_dummies(df['Functional'], prefix='Functional')
functional_dummies_test = pd.get_dummies(df_test['Functional'], prefix='Functional')

functional_dummies = functional_dummies[['Functional_Typ']]
functional_dummies_test = functional_dummies_test[['Functional_Typ']]

In [13]:
dummies = pd.concat([garage_dummies, zoning_dummies, h_style_dummies, sale_type_dummies, condition_dummies, foundation_dummies, heating_dummies, functional_dummies], axis=1)
dummies_test = pd.concat([garage_dummies_test, zoning_dummies_test, h_style_dummies_test, sale_type_dummies_test, condition_dummies_test, foundation_dummies_test, heating_dummies_test, functional_dummies_test], axis=1)

New in try 5

In [14]:
df['AgeBuilt'] = 2019 - df['Year Built']
df_test['AgeBuilt'] = 2019 - df_test['Year Built']

df['AgeRemod'] = 2019 - df['Year Remod/Add']
df_test['AgeRemod'] = 2019 - df_test['Year Remod/Add']

df['BuiltArea'] = df['Total Bsmt SF'] + df['1st Flr SF'] + df['Gr Liv Area']
df_test['BuiltArea'] = df_test['Total Bsmt SF'] + df_test['1st Flr SF'] + df_test['Gr Liv Area']

## Feature and predictor selection

In [15]:
feature_cols = ['Overall Qual', 'Full Bath', 'Garage Cars', 'Lot Area', 'AgeBuilt', 'AgeRemod', 'BuiltArea', 'Open Porch SF', 'Enclosed Porch']
X = df[feature_cols]
X = pd.concat([X, dummies], axis = 1)
print(X.shape)

X_df_test = df_test[feature_cols]
X_df_test = pd.concat([X_df_test, dummies_test], axis = 1)
print(X_df_test.shape)

(2051, 27)
(879, 27)


In [16]:
y = df['SalePrice']

## Standard Scaler

In [32]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
Xs = ss.fit_transform(X)
Xs_test = ss.transform(X_df_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  """


## Train test split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(Xs,y, random_state = 99)

## Model fitting

In [34]:
rfr = RandomForestRegressor(n_estimators=300)
rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

## Model prediction

In [35]:
y_pred = rfr.predict(Xs_test)

## Cross Validation

In [38]:
kf = model_selection.KFold(n_splits=5, shuffle=True)

print(np.mean(cross_val_score(rfr, X, y, cv=kf)))
print(np.sqrt(np.mean(-cross_val_score(rfr, X, y, cv=kf, scoring='neg_mean_squared_error'))))

0.8731369911485029
27970.979547353454


## Predict with test data

In [34]:
y_pred = rfr.predict(X_df_test)

prediction = pd.concat([test_id, pd.Series(y_pred)], axis = 1)
prediction.columns = ['Id', 'SalePrice']

prediction.sample(5)

Unnamed: 0,Id,SalePrice
17,790,131910.0
114,2908,155700.0
851,972,163000.0
174,2509,150947.6
163,1922,135630.0


In [35]:
prediction.to_csv("Emilio_Try6_RandomForest_SS.csv", index=False)