# NOTEBOOK 04b: KAGGLE - Modeling

In [1]:
import csv
import glob
import pickle
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, SelectFromModel, f_regression, RFECV
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

import warnings
# Suppress FutureWarning and DeprecationWarning output for the rest of the session.
for noisy_category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter('ignore', category=noisy_category)

# Seed NumPy's global random number generator.
np.random.seed(42)

%matplotlib inline

In [2]:
# Read the cleaned Kaggle data; the `Id` column becomes the DataFrame index.
kaggle_path = '../data/kaggle_clean.csv'
df = pd.read_csv(kaggle_path, index_col='Id')

In [3]:
# Suffix embedded in the saved asset file names and in the submission file name.
# NOTE(review): looks like a Unix timestamp of the modeling run — confirm.
now = '1544174228'

In [4]:
def extract_element_name(file_path, now):
    """Return the asset name that precedes ``_<now>`` in ``file_path``.

    The name is the run of non-slash characters between a ``/`` and the
    literal text ``_<now>``; for example ``'../assets/gs_123.pkl'`` with
    ``now='123'`` gives ``'gs'``.

    Parameters
    ----------
    file_path : str
        Path containing a ``/<name>_<now>`` component.
    now : str or int
        Identifier suffix; it is matched literally, never as a regex.

    Returns
    -------
    str
        The ``<name>`` part of the first matching component.

    Raises
    ------
    IndexError
        If ``file_path`` contains no ``/<name>_<now>`` component.
    """
    # re.escape stops characters such as '.', '+' or '(' in `now` from being
    # read as regex syntax, which would mis-match or fail to match.
    pattern = f'/([^/]*)_{re.escape(str(now))}'
    match = re.search(pattern, file_path)
    if match is None:
        # Same exception type the old `findall(...)[0]` raised, now with context.
        raise IndexError(f'no "<name>_{now}" component found in {file_path!r}')
    return match.group(1)

In [5]:
def make_file_dict(now):
    """Map asset names to the paths of files whose name contains ``now``.

    Searches ``../*/`` (relative to the current working directory) for
    entries whose file name contains ``now`` and keys each path by the name
    returned from ``extract_element_name``.

    Parameters
    ----------
    now : str or int
        Identifier embedded in the asset file names.

    Returns
    -------
    dict
        ``{asset_name: path}``; empty when nothing matches. If two paths
        give the same name, the later one (in sorted order) wins.

    Notes
    -----
    Uses ``glob`` rather than shelling out to ``ls``, so ``now`` never reaches
    a shell and a failed listing cannot be mistaken for a path. Unlike ``ls``,
    a matching directory is returned as a single path, not expanded.
    """
    # glob.escape keeps any wildcard characters inside `now` literal.
    pattern = f'../*/*{glob.escape(str(now))}*'
    # Sorted for a deterministic order, as `ls` gave for its arguments.
    return {
        extract_element_name(path, now): path
        for path in sorted(glob.glob(pattern))
    }

In [6]:
# Collect the saved assets for `now`; it is already a str, so no f-string wrapper is needed.
file_dict = make_file_dict(now)

In [7]:
# Show the asset-name -> path mapping found for `now`.
file_dict

{'columns': '../assets/columns_1544174228.pkl',
 'gs': '../assets/gs_1544174228.pkl'}

In [8]:
# Restore the two pickled assets saved under `now`: the feature column
# selection and the GridSearchCV object.
# pickle.load can execute arbitrary code, so only load files you trust.
columns_path = f'../assets/columns_{now}.pkl'
gs_path = f'../assets/gs_{now}.pkl'

with open(columns_path, 'rb') as columns_file:
    columns = pickle.load(columns_file)

with open(gs_path, 'rb') as gs_file:
    gs = pickle.load(gs_file)

In [9]:
# Display the loaded grid search to inspect its pipeline steps and parameter grid.
gs

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('var_thresh', VarianceThreshold(threshold=0)), ('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kbest', SelectKBest(k=37, score_func=<function f_regression at 0x1a1d06ef28>)), ('ridge', RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'var_thresh__threshold': [0, 0.001, 0.01, 0.02, 0.05], 'kbest__k': [5, 7, 13, 17, 23, 37, 53, 79, 'all'], 'ridge__alphas': [array([1.00000e-05, 1.88965e-05, ..., 5.29198e+49, 1.00000e+50])]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=1)

In [10]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0_level_0,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,Bsmt Unf SF,Total Bsmt SF,Central Air,...,Garage Qual_Ex,Heating_OthW,Utilities_NoSeWa,Heating_Wall,Heating QC_Po,Functional_Sev,MS SubClass_1.5_pud,Roof Matl_Membran,Misc Feature_TenC,Functional_Sal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,69.0,9142,6,8,1910,1950,0.0,1020,1020,0,...,0,0,0,0,0,0,0,0,0,0
2718,0.0,9662,5,4,1977,1977,0.0,1967,1967,1,...,0,0,0,0,0,0,0,0,0,0
2414,58.0,17104,7,5,2006,2006,0.0,100,654,1,...,0,0,0,0,0,0,0,0,0,0
1989,60.0,8520,5,6,1923,2006,0.0,968,968,1,...,0,0,0,0,0,0,0,0,0,0
625,0.0,9500,6,5,1963,1963,247.0,785,1394,1,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Check the (rows, columns) shape of the data restricted to the saved columns.
df[columns].shape

(879, 254)

In [12]:
# Predict on the data restricted to the saved column selection.
kaggle_features = df[columns]
preds = gs.predict(kaggle_features)

In [13]:
# One-column frame of predictions, indexed by the same Ids as `df`.
submission = pd.DataFrame(
    data=preds,
    index=df.index,
    columns=['SalePrice'],
)

## Sort index (required for proper submission)

In [14]:
# Order rows by ascending Id; rebinding keeps the cell safe to re-run.
submission = submission.sort_index()

## Save to csv to submit

In [15]:
# Write the predictions, with the Id index, to a CSV named after `now`.
submission_path = f'../data/gs_{now}.csv'
submission.to_csv(submission_path)

## Use command line `head` to check the data is in the correct format

In [16]:
!head ../data/gs_{now}.csv

Id,SalePrice
2,123945.97979798025
4,272192.1500307784
6,199079.79225442107
7,227327.37581071266
17,206908.90467639032
18,367696.2230860186
22,194669.3356706095
27,118280.80951992303
31,99715.35086612903


# Continue to NOTEBOOK 05: PRODUCTION