In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import csv

from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import statsmodels.api as sm

# Render inline figures in 'retina' (high-DPI) format.
%config InlineBackend.figure_format = 'retina'
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply the 'fivethirtyeight' style sheet to all plots.
plt.style.use('fivethirtyeight')
# Seed NumPy's global random number generator.
np.random.seed(42)

#### Set Up Model

In [2]:
# Read the cleaned data file as CSV, using its 'Id' column as the row index.
clean_data_path = '../data/ames_iowa_data_clean'
ames_df = pd.read_csv(clean_data_path, index_col='Id')

In [3]:
# Keep every column that is not a 64-bit int/float; these are the columns that
# get dummy-encoded further down.
ames_df_objects = ames_df.select_dtypes(include=None, exclude=['int64', 'float64'])

In [4]:
# Numeric columns (this frame also carries the 'saleprice' target).
# Selecting with include=['int64', 'float64'] -- the same dtype list that the
# categorical selection excludes -- makes the two frames exact complements.
# The previous exclude='object' let any column of another dtype (bool,
# datetime, ...) appear in BOTH frames and therefore be duplicated when the
# frames are concatenated.
ames_df_numeric = ames_df.select_dtypes(include=['int64', 'float64'])

In [5]:
# Preview the first five rows of the non-numeric columns.
ames_df_objects.head(n=5)

Unnamed: 0_level_0,ms_zoning,street,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,...,heating,heating_qc,central_air,electrical,kitchen_qual,functional,garage_type,garage_qual,paved_drive,sale_type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,RL,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Sawyer,RRAe,Norm,...,GasA,Ex,Y,SBrkr,Gd,Typ,Attchd,TA,Y,WD
544,RL,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,SawyerW,Norm,Norm,...,GasA,Ex,Y,SBrkr,Gd,Typ,Attchd,TA,Y,WD
153,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,GasA,TA,Y,SBrkr,Gd,Typ,Detchd,TA,Y,WD
318,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Timber,Norm,Norm,...,GasA,Gd,Y,SBrkr,TA,Typ,BuiltIn,TA,Y,WD
255,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,SawyerW,Norm,Norm,...,GasA,TA,Y,SBrkr,TA,Typ,Detchd,TA,N,WD


In [6]:
# Check the (rows, columns) size of the dummy-encoded frame before keeping it.
pd.get_dummies(data=ames_df_objects, drop_first=True).shape

(2025, 182)

In [7]:
# One-hot encode the non-numeric columns, dropping the first level of each
# feature (drop_first=True).
# dtype is pinned to uint8 so the indicators are 0/1 integers regardless of the
# pandas version: pandas >= 2.0 defaults to bool, which would write True/False
# into the raw feature CSVs saved later instead of 0/1.
ames_df_object_dummies = pd.get_dummies(
    ames_df_objects,
    drop_first=True,
    dtype=np.uint8,
)

In [8]:
# Preview the first five rows of the dummy-encoded columns.
ames_df_object_dummies.head(n=5)

Unnamed: 0_level_0,ms_zoning_C (all),ms_zoning_FV,ms_zoning_I (all),ms_zoning_RH,ms_zoning_RL,ms_zoning_RM,street_Pave,lot_shape_IR2,lot_shape_IR3,lot_shape_Reg,...,paved_drive_P,paved_drive_Y,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
544,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
153,0,0,0,0,1,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,1
318,0,0,0,0,1,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,1
255,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
# Join the numeric columns and the dummy columns side by side into one frame.
feature_parts = [ames_df_numeric, ames_df_object_dummies]
ames_features = pd.concat(feature_parts, axis='columns')

In [10]:
# Preview the first five rows of the combined feature frame.
ames_features.head(n=5)

Unnamed: 0_level_0,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,...,paved_drive_P,paved_drive_Y,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,60,69.0552,13517,6,8,1976,2005,289.0,533.0,0.0,...,0,1,0,0,0,0,0,0,0,1
544,60,43.0,11492,7,5,1996,1997,132.0,637.0,0.0,...,0,1,0,0,0,0,0,0,0,1
153,20,68.0,7922,5,7,1953,2007,0.0,731.0,0.0,...,0,1,0,0,0,0,0,0,0,1
318,60,73.0,9802,5,5,2006,2007,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,1
255,50,82.0,14235,6,8,1900,1993,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


#### Set Up X and y

In [11]:
# Target is the 'saleprice' column; predictors are every other column.
y = ames_features['saleprice']
X = ames_features.drop(columns=['saleprice'])

#### Create training and validation sets

In [12]:
# Split predictors and target into training and held-out sets
# (train_test_split's default split sizes).
# random_state is passed explicitly so that re-running this cell on its own
# gives the same split, instead of depending on how far the global NumPy RNG
# has advanced since it was seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#### Scale the data

In [13]:
# Standardizing scaler with its default centering and unit-variance scaling
# spelled out explicitly.
ss = StandardScaler(with_mean=True, with_std=True)

In [14]:
# Fit the scaler on the training predictors, then scale those same predictors.
X_train_sc = ss.fit(X_train).transform(X_train)

In [15]:
# Scale the held-out predictors with the already-fitted scaler (no re-fit here).
X_test_sc = ss.transform(X=X_test)

#### Save CSVs and Pickles

In [16]:
# Save the scaled training matrix, one CSV row per row of X_train_sc.
# newline='' is required by the csv module for files passed to csv.writer;
# without it, platforms that translate '\n' (e.g. Windows) end up with a blank
# line after every row. Plain 'w' is enough because the file is never read back.
with open('../pickle/X_train_sc.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_train_sc)

In [17]:
# Save the scaled held-out matrix, one CSV row per row of X_test_sc.
# newline='' is required by the csv module for files passed to csv.writer;
# without it, platforms that translate '\n' (e.g. Windows) end up with a blank
# line after every row. Plain 'w' is enough because the file is never read back.
with open('../pickle/X_test_sc.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_test_sc)

In [18]:
# Save the unscaled training predictors as bare values: no header row, no index column.
X_train.to_csv(path_or_buf='../data/X_train.csv', header=False, index=False)

In [19]:
# Save the unscaled held-out predictors as bare values: no header row, no index column.
X_test.to_csv(path_or_buf='../data/X_test.csv', header=False, index=False)

In [20]:
# Save the training target values as bare values: no header row, no index column.
y_train.to_csv(path_or_buf='../data/y_train.csv', header=False, index=False)

In [21]:
# Save the held-out target values as bare values: no header row, no index column.
y_test.to_csv(path_or_buf='../data/y_test.csv', header=False, index=False)

In [22]:
# Serialize the fitted scaler object to disk with pickle.
with open('../pickle/scaler.pkl', 'wb+') as scaler_file:
    pickle.dump(ss, scaler_file)