### Jake's Kaggle Submission Walkthrough

In [1]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
# Load the labelled training data and the Kaggle test data
# (the test csv has no sale price column - that is what gets predicted)
train_csv = '../datasets/train.csv'
kaggle_csv = '../datasets/test.csv'

df = pd.read_csv(train_csv)
kaggle_sub_data = pd.read_csv(kaggle_csv)

In [3]:
# Peek at the first three rows to see the raw column names
df.head(3)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000


In [4]:
# Normalise the training column names: lowercase, spaces -> underscores
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

In [5]:
# Confirm the column names are now lowercase with underscores
df.head(3)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000


### Model 1

In [6]:
# Model 1 only uses features that needed no EDA/data cleaning (Jake's choice)
features = [
    'lot_area',
    'fireplaces',
    '1st_flr_sf',
    'full_bath',
    'half_bath',
    'totrms_abvgrd',
]

# Feature matrix and target
X = df.loc[:, features]
y = df.loc[:, 'saleprice']

# Hold out a test split; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
# Fit a plain linear regression on the training split
lr_1 = LinearRegression().fit(X_train, y_train)

# R-squared as (train, test)
lr_1_train_r2 = lr_1.score(X_train, y_train)
lr_1_test_r2 = lr_1.score(X_test, y_test)
lr_1_train_r2, lr_1_test_r2

(0.586908421365044, 0.6437361531661306)

In [8]:
# Make predictions from Kaggle test data - ERROR MESSAGE!!!
# The Kaggle frame still has its raw column names at this point, so selecting
# the snake_case `features` fails with a KeyError. The error is the point of
# this cell, so catch and display it rather than letting it abort a
# "Restart & Run All".
try:
    kaggle_preds = lr_1.predict(kaggle_sub_data[features])
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: "None of [Index(['lot_area', 'fireplaces', '1st_flr_sf', 'full_bath', 'half_bath',\n       'totrms_abvgrd'],\n      dtype='object')] are in the [columns]"

In [9]:
# The training and Kaggle frames must share column names, so apply the same
# lowercase/underscore normalisation to the Kaggle data
kaggle_sub_data.columns = kaggle_sub_data.columns.map(
    lambda name: name.lower().replace(' ', '_')
)

In [10]:
# Kaggle predictions (take 2) - column names now match the training frame
kaggle_X = kaggle_sub_data.loc[:, features]
kaggle_preds = lr_1.predict(kaggle_X)

In [12]:
# View kaggle_preds: the array of predicted sale prices returned by lr_1.predict
# NOTE(review): this displays the whole array; a slice such as kaggle_preds[:10]
# would keep the output short.
kaggle_preds

array([159449.71352753, 251304.6685732 , 190992.30040278, 110894.99003693,
       229389.16286688, 112164.72196138, 121331.68211782, 172927.86002174,
       169945.46639743, 222819.65972804, 171524.76594373, 116821.12968717,
       166322.30457315, 373933.34406684, 150056.16555926, 170052.95267679,
       196681.4464668 , 119730.1405442 , 264110.23553493, 175542.81034774,
       143222.02512136, 102210.08561821, 226241.04452358, 152531.22074919,
       172173.15701591, 102124.60612246, 205277.02114214, 131227.27885091,
       121493.1626068 ,  63251.56705786,  98234.59715581, 100727.75844322,
       208888.07861475, 186723.31830825, 219407.89656463, 124466.06060984,
       160507.69421621, 113430.9653693 , 106076.35257058, 183307.10873522,
       154336.12706764, 225189.52253981, 147595.36208882, 190194.1087286 ,
       237562.84195423, 157845.67579555, 208588.39634902, 113997.24698746,
       104651.85991275, 153775.11680658, 109678.19714206, 213358.64277999,
       232959.24870702, 1

In [13]:
# The test csv has no 'saleprice' column, so attach the predictions under that name
kaggle_sub_data = kaggle_sub_data.assign(saleprice=kaggle_preds)

In [14]:
# Preview the two columns that get written to the submission file
kaggle_sub_data[['id','saleprice']]

Unnamed: 0,id,saleprice
0,2658,159449.713528
1,2718,251304.668573
2,2414,190992.300403
3,1989,110894.990037
4,625,229389.162867
...,...,...
873,1662,228869.864642
874,1234,234371.439112
875,1373,152430.717214
876,1672,102242.553569


In [16]:
# Write only the id/saleprice columns to disk, without the DataFrame index
kaggle_sub_data.to_csv(
    '../datasets/jake_sub_1.csv',
    columns=['id', 'saleprice'],
    index=False,
)

### Model 2

In [17]:
# Reload test.csv so Model 2 starts from a clean Kaggle frame
# (without the 'saleprice' column added for Model 1)
kaggle_csv_path = '../datasets/test.csv'
kaggle_sub_data = pd.read_csv(kaggle_csv_path)

In [18]:
# The reloaded frame has raw column names again; normalise them
# (lowercase, spaces -> underscores) to match the training frame
kaggle_sub_data.columns = (
    kaggle_sub_data.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

In [19]:
# Same six predictors and target as Model 1
features = [
    'lot_area', 'fireplaces', '1st_flr_sf',
    'full_bath', 'half_bath', 'totrms_abvgrd',
]
X = df[features]
y = df['saleprice']

# Same seeded train/test split as Model 1
split = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split

In [20]:
# Second-degree polynomial expansion (degree=2 is the default), no bias column
poly = PolynomialFeatures(degree=2, include_bias=False)

# Fit the expansion on the training split only
poly.fit(X_train)

PolynomialFeatures(include_bias=False)

In [21]:
# Apply the fitted expansion to both halves of the split
X_train_pf, X_test_pf = (
    poly.transform(part) for part in (X_train, X_test)
)

In [22]:
# View what happened as a dataframe - now have 27 columns
# (the 6 original features plus their squares and pairwise products).
# Important!! how to view PolyFeatures:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`, so prefer the new method and only fall
# back to the old one on installs that predate it.
if hasattr(poly, 'get_feature_names_out'):
    pf_columns = poly.get_feature_names_out(X.columns)
else:
    pf_columns = poly.get_feature_names(X.columns)

pd.DataFrame(X_train_pf, columns=pf_columns)

Unnamed: 0,lot_area,fireplaces,1st_flr_sf,full_bath,half_bath,totrms_abvgrd,lot_area^2,lot_area fireplaces,lot_area 1st_flr_sf,lot_area full_bath,...,1st_flr_sf^2,1st_flr_sf full_bath,1st_flr_sf half_bath,1st_flr_sf totrms_abvgrd,full_bath^2,full_bath half_bath,full_bath totrms_abvgrd,half_bath^2,half_bath totrms_abvgrd,totrms_abvgrd^2
0,10667.0,1.0,1587.0,2.0,0.0,7.0,113784889.0,10667.0,16928529.0,21334.0,...,2518569.0,3174.0,0.0,11109.0,4.0,0.0,14.0,0.0,0.0,49.0
1,12888.0,2.0,1262.0,1.0,1.0,7.0,166100544.0,25776.0,16264656.0,12888.0,...,1592644.0,1262.0,1262.0,8834.0,1.0,1.0,7.0,1.0,7.0,49.0
2,7200.0,0.0,864.0,1.0,0.0,5.0,51840000.0,0.0,6220800.0,7200.0,...,746496.0,864.0,0.0,4320.0,1.0,0.0,5.0,0.0,0.0,25.0
3,14000.0,0.0,1306.0,2.0,1.0,7.0,196000000.0,0.0,18284000.0,28000.0,...,1705636.0,2612.0,1306.0,9142.0,4.0,2.0,14.0,1.0,7.0,49.0
4,11929.0,1.0,1251.0,2.0,1.0,9.0,142301041.0,11929.0,14923179.0,23858.0,...,1565001.0,2502.0,1251.0,11259.0,4.0,2.0,18.0,1.0,9.0,81.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1533,9709.0,2.0,958.0,2.0,1.0,8.0,94264681.0,19418.0,9301222.0,19418.0,...,917764.0,1916.0,958.0,7664.0,4.0,2.0,16.0,1.0,8.0,64.0
1534,9000.0,1.0,1196.0,1.0,0.0,6.0,81000000.0,9000.0,10764000.0,9000.0,...,1430416.0,1196.0,0.0,7176.0,1.0,0.0,6.0,0.0,0.0,36.0
1535,10140.0,1.0,1309.0,1.0,1.0,5.0,102819600.0,10140.0,13273260.0,10140.0,...,1713481.0,1309.0,1309.0,6545.0,1.0,1.0,5.0,1.0,5.0,25.0
1536,1869.0,0.0,483.0,1.0,1.0,5.0,3493161.0,0.0,902727.0,1869.0,...,233289.0,483.0,483.0,2415.0,1.0,1.0,5.0,1.0,5.0,25.0


In [23]:
# Model 2: linear regression on the polynomial-expanded training features
lr_2 = LinearRegression()
lr_2.fit(X=X_train_pf, y=y_train)

LinearRegression()

In [24]:
# R-squared as (train, test) for the polynomial model
lr_2_train_r2 = lr_2.score(X_train_pf, y_train)
lr_2_test_r2 = lr_2.score(X_test_pf, y_test)
lr_2_train_r2, lr_2_test_r2

(0.7005281270500114, 0.4757434483326699)

In [None]:
# where I got stuck last night
# NOTE: this attempt is left commented out on purpose. `kaggle_test_subset` is
# never defined in this notebook, and lr_2 was fit on poly-transformed features,
# so the Kaggle data has to go through `poly.transform` before `lr_2.predict`.
# kaggle_preds = lr_2.predict(kaggle_test_subset)

In [27]:
# Run the Kaggle features through the already-fitted polynomial expansion
kaggle_poly = poly.transform(kaggle_sub_data.loc[:, features])

# Predict with Model 2 and attach the result as the 'saleprice' column
kaggle_preds = lr_2.predict(kaggle_poly)
kaggle_sub_data = kaggle_sub_data.assign(saleprice=kaggle_preds)

In [28]:
# Write the Model 2 submission (id + saleprice only, no index column)
kaggle_sub_data.to_csv(
    '../datasets/jake_sub_2.csv',
    columns=['id', 'saleprice'],
    index=False,
)