In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from data_preprocessing_final import cleaning

In [37]:
housing = pd.read_csv('Ames_HousePrice.csv')

### Cleaning

In [38]:
housing = cleaning(housing)

In [39]:
# instead of using the dummifying function in the data_preprocessing_final,
# ima dummify the categorical variables first and then
# the scaling will happen AFTER we train test split to prevent data leakage

### Dummifying

In [40]:
housing.head()

Unnamed: 0.1,Unnamed: 0,PID,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,Fence,MoSold,YrSold,SaleCondition,Bsmt_Unfin_Ratio,TotalLivArea
0,1,909176150,126000,30,RL,68.516053,7890,Reg,Lvl,Corner,...,0,0,0,166,No_Fence,3,2010,Normal,0.721963,1712.0
1,2,905476230,139500,120,RL,42.0,4235,Reg,Lvl,Inside,...,105,0,0,0,No_Fence,2,2009,Normal,0.099142,2098.0
2,3,911128020,124900,30,C (all),60.0,6060,Reg,Lvl,Inside,...,0,42,86,0,No_Fence,11,2007,Normal,0.119474,1838.0
3,4,535377150,114000,70,RL,80.0,8146,Reg,Lvl,Corner,...,0,168,0,111,No_Fence,5,2009,Normal,1.0,1444.0
4,5,534177230,227000,60,RL,70.0,8400,Reg,Lvl,Inside,...,45,0,0,0,No_Fence,11,2009,Normal,0.206173,2475.0


In [41]:
housing = housing.drop('Unnamed: 0', axis = 1)

In [45]:
def dummify_func(housing):
    price = housing['SalePrice'] # Create Y Variable
    price = np.log(price)
    category = housing.select_dtypes('object') #Select all 'object' data types  which are all categorical
    housing_num = housing.select_dtypes('int64', 'float64') # Select numeric data types
    ## Numeric Colums to convert
    # MSSubClass, OverallQual, OverallCond, YearBuilt, YearRemodAdd, MoSold, YrSold
    # How to handle MiscVal???
    #Leave YearBuilt and YearRemodAdd as numeric to be scaled
    housing_num = housing_num.drop(['PID', 'SalePrice', 'MSSubClass', 'OverallQual', \
    'OverallCond', 'MoSold', 'YrSold'], axis = 1)
    housing_num2cat = housing[['MSSubClass', 'OverallQual', 'OverallCond', \
    'MoSold', 'YrSold']]
    category = pd.concat([category, housing_num2cat.astype(str)], axis = 1) #Add all categorical features to dataframe to be dummified
    cat_dum = pd.get_dummies(category, drop_first = True)
    full_dum_data = pd.concat([housing_num, cat_dum], axis = 1)
    return full_dum_data, price

In [46]:
final, price = dummify_func(housing)
final

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,...,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010
0,7890,1939,1950,2,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,4235,1984,1984,2,3,0,0,105,0,0,...,0,0,0,0,0,0,0,0,1,0
2,6060,1930,2007,2,3,0,154,0,42,86,...,0,0,0,0,0,0,1,0,0,0
3,8146,1900,2003,2,4,0,0,0,168,0,...,0,1,0,0,0,0,0,0,1,0
4,8400,2001,2001,3,3,0,0,45,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575,8854,1916,1950,2,2,1,0,98,0,0,...,0,1,0,0,0,0,0,0,1,0
2576,13680,1955,1955,4,4,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2577,6270,1949,1950,4,4,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2578,8826,2000,2000,3,4,1,193,96,0,0,...,0,0,0,1,0,0,1,0,0,0


In [47]:
price

0       11.744037
1       11.845820
2       11.735269
3       11.643954
4       12.332705
          ...    
2575    11.703546
2576    11.846536
2577    11.884489
2578    12.289954
2579    12.278393
Name: SalePrice, Length: 2579, dtype: float64

### Train_Test_Split

In [62]:
from sklearn.model_selection import train_test_split
X = final
y = price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 2)

### Scaling

In [65]:
# scaling & fitting the training data only

scaler = MinMaxScaler()
scaler.fit(X_train)
train_scaled = scaler.transform(X_train)
train_scaled = pd.DataFrame(train_scaled, columns = X_train.columns)

# fitting the testing data from the training scale
test_scaled = scaler.transform(X_test)
test_scaled = pd.DataFrame(test_scaled, columns = X_test.columns)

### Lasso

In [80]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [81]:
lasso = Lasso(max_iter = 1000000)
params = {
    "alpha": [i for i in np.linspace(0.0001, 0.001,50)]
}

In [82]:
gs = GridSearchCV(lasso, params, scoring = 'r2')
gs.fit(X_train, y_train)

GridSearchCV(estimator=Lasso(max_iter=1000000),
             param_grid={'alpha': [0.0001, 0.00011836734693877552,
                                   0.00013673469387755102,
                                   0.00015510204081632654,
                                   0.00017346938775510205,
                                   0.00019183673469387756,
                                   0.00021020408163265308,
                                   0.00022857142857142857,
                                   0.0002469387755102041, 0.0002653061224489796,
                                   0.00028367346938775514,
                                   0.0003020408163265306, 0.0003204081632653061,
                                   0.0...
                                   0.0003755102040816327,
                                   0.00039387755102040817,
                                   0.00041224489795918366,
                                   0.0004306122448979592, 0.0004489795918367347,
           

In [83]:
gs.best_params_

{'alpha': 0.00021020408163265308}

In [84]:
gs.best_estimator_.score(X_train, y_train)

0.9084740373883916

In [85]:
gs.best_estimator_.score(X_test, y_test)

0.8841581450698478