## Import Libraries

In [98]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

%matplotlib inline
# Notebook-wide display defaults: show up to 100 DataFrame columns and draw wide figures.
pd.options.display.max_columns = 100
plt.rc('figure', figsize=(16, 8))

## Read the data

In [109]:
# Locations of the labelled training data and the unlabelled submission data.
train = './data/train.csv'
test = './data/test.csv'

df = pd.read_csv(train)
df_test = pd.read_csv(test)

# Keep the submission ids now; they are needed again when the prediction file is built.
test_id = df_test.Id

# Preview a few random training rows. This must be the final expression of the cell:
# Jupyter only renders the value of the last statement, so a preview placed mid-cell
# is evaluated and silently discarded.
df.sample(n=5)

## Data manipulation

In [110]:
# Replacement values for missing garage information: a literal 'None' category for the
# garage type and 0 for the car capacity. Applied identically to train and test.
garage_fill_values = {'Garage Type': 'None', 'Garage Cars': 0}

for frame in (df, df_test):
    for column, fill_value in garage_fill_values.items():
        frame[column] = frame[column].fillna(fill_value)

In [111]:
# Count missing values once per training column and keep the names of the columns
# that still contain at least one missing value.
null_counts = df.isnull().sum()
missing_data = list(null_counts[null_counts > 0].index)

# Remove those columns from both frames so train and test expose the same features.
# The frames are rebound instead of mutated in place, which keeps the step explicit.
# errors='ignore' on the test frame: a column flagged in train may not exist in test,
# and that alone should not abort the notebook with a KeyError.
# NOTE(review): only the training frame is inspected, so test columns that are kept
# may still contain missing values - confirm before relying on them as features.
df = df.drop(columns=missing_data)
df_test = df_test.drop(columns=missing_data, errors='ignore')

print(df.shape)
print(df_test.shape)

(2051, 57)
(879, 56)


## Feature engineering

In [112]:
# One-hot encode the garage type for the training and the test frame.
garage_dummies = pd.get_dummies(df['Garage Type'], prefix='Garage')
garage_dummies_test = pd.get_dummies(df_test['Garage Type'], prefix='Garage')

# The two frames are encoded independently, so a garage type present in only one of
# them would give the train and test matrices different columns. Force the test
# encoding onto the training columns (same names, same order): categories missing
# from test become all-zero columns, categories unseen in train are dropped because
# the fitted model has no coefficient for them.
garage_dummies_test = garage_dummies_test.reindex(columns=garage_dummies.columns, fill_value=0)

In [113]:
# Treat all residential zonings as a single category: the pattern rewrites an 'R'
# together with the character that follows it to a plain 'R', in both frames.
for frame in (df, df_test):
    frame['MS Zoning'] = frame['MS Zoning'].str.replace('R.', 'R', regex=True)

# Encode the zoning and keep only the residential indicator column.
zoning_keep = ['Zoning_R']
zoning_dummies = pd.get_dummies(df['MS Zoning'], prefix='Zoning')[zoning_keep]
zoning_dummies_test = pd.get_dummies(df_test['MS Zoning'], prefix='Zoning')[zoning_keep]

In [114]:
# Encode the house style and keep indicators for the two most common styles only.
style_keep = ['Style_1Story', 'Style_2Story']
h_style_dummies = pd.get_dummies(df['House Style'], prefix='Style')[style_keep]
h_style_dummies_test = pd.get_dummies(df_test['House Style'], prefix='Style')[style_keep]

In [115]:
# Encode the sale type and keep only the 'New' and 'WD' indicators.
# The second column name ends with a space on purpose - presumably the raw category
# label carries it; verify against the data before "cleaning" it up.
sale_keep = ['Sale_New', 'Sale_WD ']
sale_type_dummies = pd.get_dummies(df['Sale Type'], prefix='Sale')[sale_keep]
sale_type_dummies_test = pd.get_dummies(df_test['Sale Type'], prefix='Sale')[sale_keep]

In [116]:
# Gather every group of indicator columns, listed in the same order for train and test,
# and glue each list together column-wise into a single frame.
train_dummy_parts = [garage_dummies, zoning_dummies, h_style_dummies, sale_type_dummies]
test_dummy_parts = [garage_dummies_test, zoning_dummies_test, h_style_dummies_test, sale_type_dummies_test]

dummies = pd.concat(train_dummy_parts, axis=1)
dummies_test = pd.concat(test_dummy_parts, axis=1)

In [117]:
# Number of years between the original construction and the recorded remodel/addition.
df['Years_Remodeled'] = df['Year Remod/Add'] - df['Year Built']

# Derive the same column on the test frame. Every other transformation in this
# notebook is applied to both frames; without this line the feature could never be
# added to the model, because selecting it from df_test would raise a KeyError.
df_test['Years_Remodeled'] = df_test['Year Remod/Add'] - df_test['Year Built']

# Spot-check a random handful of the derived training values.
df['Years_Remodeled'].sample(10)

585      1
235     92
575      1
339      0
869     52
1602     1
596      1
1218     1
926      0
208      1
Name: Years_Remodeled, dtype: int64

## Feature and target selection

In [118]:
# Numeric columns used directly as model inputs.
feature_cols = [
    'Overall Qual',
    'Year Built',
    'Gr Liv Area',
    'Full Bath',
    'Garage Cars',
    'Lot Area',
]

# Training design matrix: numeric features followed by the indicator columns.
X = pd.concat([df[feature_cols], dummies], axis=1)
print(X.shape)

# Same layout for the submission data.
X_df_test = pd.concat([df_test[feature_cols], dummies_test], axis=1)
print(X_df_test.shape)

(2051, 18)
(879, 18)


In [119]:
# Target variable: the sale price column of the training frame.
y = df.loc[:, 'SalePrice']

## Train test split

In [120]:
# Hold out part of the labelled data for validation (library default proportions);
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=99,
)

## Model fitting

In [121]:
# Linear regression fitted on the training portion of the split.
# The fit call stays the last expression so the cell still shows the estimator.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

## Model prediction

In [122]:
# Predict the held-out validation rows.
y_pred = lr.predict(X_test)

# Score the predictions against the held-out targets. Without this the hold-out split
# is never used for anything: y_pred is overwritten further down by the submission
# predictions before it is ever compared with y_test.
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))  # RMSE, in target units
print(metrics.r2_score(y_test, y_pred))                     # R^2 on the hold-out set

## Cross Validation

In [123]:
# 5-fold cross validation on the full labelled data.
# A fixed random_state is required here: with shuffle=True and no seed, every call
# that consumes the splitter reshuffles the rows, so the two metrics below would be
# computed on different folds and would change on every run.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=99)

# Mean squared error (the scorer returns the negated value, hence the sign flip).
print(np.mean(-cross_val_score(lr, X, y, cv=kf, scoring='neg_mean_squared_error')))
# Mean of the estimator's default score (R^2 for a regressor) over the same folds.
print(np.mean(cross_val_score(lr, X, y, cv=kf)))

1319462364.945849
0.7900581671689385


## Predict with test data

In [141]:
# Predict sale prices for the submission data.
# Note: this rebinds y_pred, replacing the hold-out predictions made earlier.
y_pred = lr.predict(X_df_test)

# Pair each id with its prediction by position. Building the frame from the raw
# values avoids index alignment: an index-based concat would misalign ids and prices,
# and introduce missing values, if the test frame's index were ever not 0..n-1.
prediction = pd.DataFrame(
    {'Id': test_id.values, 'SalePrice': y_pred},
    columns=['Id', 'SalePrice'],
)

prediction.sample(5)

Unnamed: 0,Id,SalePrice
430,2155,293329.934199
383,442,307052.948991
464,1768,471670.466387
634,288,43493.602043
327,673,134657.067098


In [142]:
# Write the submission file; the row index is left out so only Id and SalePrice are saved.
prediction.to_csv(
    path_or_buf="Emilio_Try1_LR.csv",
    index=False,
)