## Import Libraries

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 8)
pd.set_option('display.max_columns', 100)

## Read the data

In [2]:
train = './data/train.csv'
test = './data/test.csv'

df = pd.read_csv(train)
df_test = pd.read_csv(test)
df.sample(n=5)

test_id = df_test.Id

## Data manipulation

In [3]:
df['Garage Type'] = df['Garage Type'].fillna('None')
df_test['Garage Type'] = df_test['Garage Type'].fillna('None')

df['Garage Cars'] = df['Garage Cars'].fillna(0)
df_test['Garage Cars'] = df_test['Garage Cars'].fillna(0)

In [4]:
missing_data = df.isnull().sum()[df.isnull().sum() > 0]
missing_data = list(missing_data.index)
missing_data

df.drop(missing_data, axis=1, inplace=True)
df_test.drop(missing_data, axis=1, inplace=True)
print(df.shape)
print(df_test.shape)

(2051, 57)
(879, 56)


## Feature engineering

In [5]:
garage_dummies = pd.get_dummies(df['Garage Type'], prefix='Garage')
garage_dummies_test = pd.get_dummies(df_test['Garage Type'], prefix='Garage')

In [6]:
df['MS Zoning'] = df['MS Zoning'].str.replace('R.', 'R', regex=True)
df_test['MS Zoning'] = df_test['MS Zoning'].str.replace('R.', 'R', regex=True)
# All residentials as a single variable

zoning_dummies = pd.get_dummies(df['MS Zoning'], prefix='Zoning')
zoning_dummies_test = pd.get_dummies(df_test['MS Zoning'], prefix='Zoning')

zoning_dummies = zoning_dummies[['Zoning_R']]
zoning_dummies_test = zoning_dummies_test[['Zoning_R']]

In [7]:
h_style_dummies = pd.get_dummies(df['House Style'], prefix='Style')
h_style_dummies_test = pd.get_dummies(df_test['House Style'], prefix='Style')

h_style_dummies = h_style_dummies[['Style_1Story', 'Style_2Story']] # Just the 2 most common
h_style_dummies_test = h_style_dummies_test[['Style_1Story', 'Style_2Story']] # Just the 2 most common

In [8]:
sale_type_dummies = pd.get_dummies(df['Sale Type'], prefix='Sale')
sale_type_dummies_test = pd.get_dummies(df_test['Sale Type'], prefix='Sale')

sale_type_dummies = sale_type_dummies[['Sale_New', 'Sale_WD ']] # Looking for new or WD
sale_type_dummies_test = sale_type_dummies_test[['Sale_New', 'Sale_WD ']] # Looking for new or WD

In [9]:
dummies = pd.concat([garage_dummies, zoning_dummies, h_style_dummies, sale_type_dummies], axis=1)
dummies_test = pd.concat([garage_dummies_test, zoning_dummies_test, h_style_dummies_test, sale_type_dummies_test], axis=1)

In [10]:
df['Years_Remodeled'] = df['Year Remod/Add'] - df['Year Built']
df['Years_Remodeled'].sample(10)

1545     0
1151     0
1050    34
887      0
646      0
348      1
150     38
6        1
446      0
826     46
Name: Years_Remodeled, dtype: int64

## Feature and predictor selection

In [11]:
feature_cols = ['Overall Qual', 'Year Built', 'Gr Liv Area', 'Full Bath', 'Garage Cars', 'Lot Area']
X = df[feature_cols]
X = pd.concat([X, dummies], axis = 1)
print(X.shape)

X_df_test = df_test[feature_cols]
X_df_test = pd.concat([X_df_test, dummies_test], axis = 1)
print(X_df_test.shape)

(2051, 18)
(879, 18)


In [12]:
y = df['SalePrice']

## Train test split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 99)

## Model fitting

In [85]:
for i in [10**x for x in range(0,6)]:
    lasso = Lasso(alpha = float(i/10))
    lasso.fit(X_train, y_train)
    kf = model_selection.KFold(n_splits=5, shuffle=True)
    print(i, '-> ', np.mean(cross_val_score(lasso, X, y, cv=kf)))



1 ->  0.7905233407066001
10 ->  0.7849170210512899
100 ->  0.7878527265348623
1000 ->  0.7771710934104503
10000 ->  0.7823831406389053
100000 ->  0.7409228201251572


In [89]:
lasso = Lasso(2)
lasso.fit(X_train, y_train)

Lasso(alpha=2, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

## Model prediction

In [90]:
y_pred = lasso.predict(X_test)

## Cross Validation

In [91]:
kf = model_selection.KFold(n_splits=5, shuffle=True)

print(np.mean(-cross_val_score(lasso, X, y, cv=kf, scoring='neg_mean_squared_error')))
print(np.mean(cross_val_score(lasso, X, y, cv=kf)))

1311553741.517743
0.7901351530587855


## Predict with test data

In [93]:
y_pred = lasso.predict(X_df_test)

prediction = pd.concat([test_id, pd.Series(y_pred)], axis = 1)
prediction.columns = ['Id', 'SalePrice']

prediction.sample(5)

Unnamed: 0,Id,SalePrice
33,1032,152017.47223
37,2071,82462.089003
331,642,179774.100406
304,1347,144639.842419
261,1748,203166.241412


In [94]:
prediction.to_csv("Emilio_Try2_Lasso(2).csv", index=False)