In [60]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from data_preprocessing_final import cleaning, initiate_data
from sklearn.model_selection import GridSearchCV

In [61]:
housing = pd.read_csv('Ames_HousePrice.csv', index_col = 0)
housing = housing[housing['SaleCondition'] == 'Normal']

### Cleaning

In [62]:
housing = cleaning(housing)

In [63]:
# instead of using the dummifying function in the data_preprocessing_final,
# ima dummify the categorical variables first and then
# the scaling will happen AFTER we train test split to prevent data leakage

### Dummifying

In [64]:
housing.head()

Unnamed: 0,PID,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,Fence,MoSold,YrSold,SaleCondition,Bsmt_Unfin_Ratio,TotalLivArea
1,909176150,126000,30,RL,68.217524,7890,Reg,Lvl,Corner,Gtl,...,0,0,0,166,No_Fence,3,2010,Normal,0.721963,1712.0
2,905476230,139500,120,RL,42.0,4235,Reg,Lvl,Inside,Gtl,...,105,0,0,0,No_Fence,2,2009,Normal,0.099142,2098.0
3,911128020,124900,30,C (all),60.0,6060,Reg,Lvl,Inside,Gtl,...,0,42,86,0,No_Fence,11,2007,Normal,0.119474,1838.0
4,535377150,114000,70,RL,80.0,8146,Reg,Lvl,Corner,Gtl,...,0,168,0,111,No_Fence,5,2009,Normal,1.0,1444.0
5,534177230,227000,60,RL,70.0,8400,Reg,Lvl,Inside,Gtl,...,45,0,0,0,No_Fence,11,2009,Normal,0.206173,2475.0


In [65]:
def dummify_func(housing):
    price = housing['SalePrice'] # Create Y Variable
    price = np.log(price)
    category = housing.select_dtypes('object') #Select all 'object' data types  which are all categorical
    housing_num = housing.select_dtypes('int64', 'float64') # Select numeric data types
    ## Numeric Colums to convert
    # MSSubClass, OverallQual, OverallCond, YearBuilt, YearRemodAdd, MoSold, YrSold
    # How to handle MiscVal???
    #Leave YearBuilt and YearRemodAdd as numeric to be scaled
    housing_num = housing_num.drop(['PID', 'SalePrice', 'MSSubClass', 'OverallQual', \
    'OverallCond', 'MoSold', 'YrSold'], axis = 1)
    housing_num2cat = housing[['MSSubClass', 'OverallQual', 'OverallCond', \
    'MoSold', 'YrSold']]
    category = pd.concat([category, housing_num2cat.astype(str)], axis = 1) #Add all categorical features to dataframe to be dummified
    cat_dum = pd.get_dummies(category, drop_first = True)
    full_dum_data = pd.concat([housing_num, cat_dum], axis = 1)
    return full_dum_data, price

In [66]:
final, price = dummify_func(housing)
final

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,...,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010
1,7890,1939,1950,2,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,4235,1984,1984,2,3,0,0,105,0,0,...,0,0,0,0,0,0,0,0,1,0
3,6060,1930,2007,2,3,0,154,0,42,86,...,0,0,0,0,0,0,1,0,0,0
4,8146,1900,2003,2,4,0,0,0,168,0,...,0,1,0,0,0,0,0,0,1,0
5,8400,2001,2001,3,3,0,0,45,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
759,8854,1916,1950,2,2,1,0,98,0,0,...,0,1,0,0,0,0,0,0,1,0
760,13680,1955,1955,4,4,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
761,6270,1949,1950,4,4,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
762,8826,2000,2000,3,4,1,193,96,0,0,...,0,0,0,1,0,0,1,0,0,0


In [67]:
price

1      11.744037
2      11.845820
3      11.735269
4      11.643954
5      12.332705
         ...    
759    11.703546
760    11.846536
761    11.884489
762    12.289954
763    12.278393
Name: SalePrice, Length: 2409, dtype: float64

### Train_Test_Split

In [68]:
from sklearn.model_selection import train_test_split
X = final
y = price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [69]:
y_test

131    12.066811
677    11.856515
180    12.077671
255    11.984178
138    11.588960
         ...    
622    11.849398
548    11.805595
206    12.072541
161    11.751942
467    12.111212
Name: SalePrice, Length: 482, dtype: float64

### Scaling

In [32]:
# scaling & fitting the training data only

scaler = MinMaxScaler()
scaler.fit(X_train)
train_scaled = scaler.transform(X_train)
train_scaled = pd.DataFrame(train_scaled, columns = X_train.columns)

# fitting the testing data from the training scale
test_scaled = scaler.transform(X_test)
test_scaled = pd.DataFrame(test_scaled, columns = X_test.columns)

In [33]:
X_train = train_scaled
X_train

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,...,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010
0,0.118973,0.760870,0.850000,0.166667,0.250,0.50,0.348315,0.000000,0.00000,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.070082,0.913043,0.816667,0.500000,0.375,0.25,0.000000,0.129825,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.079100,0.760870,0.700000,0.666667,0.375,0.50,0.225421,0.126316,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.070767,0.920290,0.833333,0.666667,0.375,0.25,0.101124,0.077193,0.00000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.060241,0.202899,0.833333,0.500000,0.375,0.00,0.154494,0.200000,0.20751,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1922,0.051015,0.978261,0.950000,0.500000,0.375,0.25,0.117978,0.052632,0.00000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1923,0.056436,0.492754,0.783333,0.333333,0.125,0.00,0.117978,0.000000,0.00000,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1924,0.081642,0.608696,0.800000,0.500000,0.250,0.25,0.000000,0.000000,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1925,0.065035,0.579710,0.033333,0.333333,0.125,0.00,0.000000,0.000000,0.00000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [40]:
X_test = test_scaled
y_train

95     12.506177
555    12.287653
287    12.345835
300    12.323856
165    11.573550
         ...    
201    12.423198
82     11.589887
920    12.138864
3      11.661345
4      12.117241
Name: SalePrice, Length: 1927, dtype: float64

### Lasso

In [34]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [35]:
lasso = Lasso(max_iter = 1000000)
params = {
    "alpha": np.linspace(0.0001, 0.01, 100)
}

In [36]:
gs = GridSearchCV(lasso, params, scoring = 'r2', cv = 5, n_jobs = -1)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Lasso(max_iter=1000000), n_jobs=-1,
             param_grid={'alpha': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009, 0.001 , 0.0011, 0.0012, 0.0013, 0.0014, 0.0015, 0.0016,
       0.0017, 0.0018, 0.0019, 0.002 , 0.0021, 0.0022, 0.0023, 0.0024,
       0.0025, 0.0026, 0.0027, 0.0028, 0.0029, 0.003 , 0.0031, 0.0032,
       0.0033, 0.0034, 0.0035, 0.0036, 0.0037, 0.003...
       0.0049, 0.005 , 0.0051, 0.0052, 0.0053, 0.0054, 0.0055, 0.0056,
       0.0057, 0.0058, 0.0059, 0.006 , 0.0061, 0.0062, 0.0063, 0.0064,
       0.0065, 0.0066, 0.0067, 0.0068, 0.0069, 0.007 , 0.0071, 0.0072,
       0.0073, 0.0074, 0.0075, 0.0076, 0.0077, 0.0078, 0.0079, 0.008 ,
       0.0081, 0.0082, 0.0083, 0.0084, 0.0085, 0.0086, 0.0087, 0.0088,
       0.0089, 0.009 , 0.0091, 0.0092, 0.0093, 0.0094, 0.0095, 0.0096,
       0.0097, 0.0098, 0.0099, 0.01  ])},
             scoring='r2')

In [37]:
gs.best_params_

{'alpha': 0.0002}

In [38]:
gs.best_estimator_.score(X_train, y_train)

0.911089435584835

# Bruh...

In [41]:
gs.best_estimator_.score(X_test, y_test)

0.9129621809352183

In [None]:
lassopred = gs.predict(X_test)
y_test = y_test.to_frame()

In [48]:
y_test['lassopred'] = lassopred
y_test

Unnamed: 0,SalePrice,lassopred
131,12.066811,11.941215
677,11.856515,11.861399
180,12.077671,12.129690
255,11.984178,12.142971
138,11.588960,11.581110
...,...,...
622,11.849398,11.815700
548,11.805595,11.754368
206,12.072541,12.206088
161,11.751942,11.750633


In [49]:
y_test.to_csv('y_test.csv', index = False)