In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression, LassoCV, Lasso, RidgeCV, Ridge
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler


sns.set_style('darkgrid')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline


In [2]:
#import data sets
houses = pd.read_csv('../data/train.csv')
kaggle_test_set = pd.read_csv('../data/test.csv')

In [3]:
houses.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

In [4]:
#initial feature selection (all of them)
features = [col for col in houses.columns if col != 'Id' and col != 'PID']
X = houses[features] #initally used as full dataframe until reinitialized later on
y = houses['SalePrice']


In [5]:
X.shape

(2051, 79)

In [6]:
#rerun this a bunch of times to see what nulls are left to deal with

nulls = pd.DataFrame.from_dict({'Columns':X.columns, 'Nulls':X.isnull().sum()})
display(nulls[nulls['Nulls']>=1])

#I think I want to drop all the cols that have only 1 null, see what the intersection is between the ones with <100 and maybe drop them, and probably ignore the cols with >300. 100rows = 5% of data

Unnamed: 0,Columns,Nulls
Lot Frontage,Lot Frontage,330
Alley,Alley,1911
Mas Vnr Type,Mas Vnr Type,22
Mas Vnr Area,Mas Vnr Area,22
Bsmt Qual,Bsmt Qual,55
Bsmt Cond,Bsmt Cond,55
Bsmt Exposure,Bsmt Exposure,58
BsmtFin Type 1,BsmtFin Type 1,55
BsmtFin SF 1,BsmtFin SF 1,1
BsmtFin Type 2,BsmtFin Type 2,56


['Mas Vnr Type',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Garage Cars',
 'Garage Area']

Why would/do these cols have any null values? What do I do with them?

 - `'Mas Vnr Type'` - Masonry veneer type {None, BrkFace, Stone, BrkCmn, CBlock}. CBlock isn't represented in the unique values of the column. The nulls could correspond to CBlock. Decision: I had wanted to drop them initially but I ended up renaming to 'CBlock'
 
 
 - `'Mas Vnr Area'` - Masonry veneer area in square feet {float}. Matches up with MasVnrType nulls. There are values of 0 so it doesn't explicitly mean it is 0. I think this and `'Mas Vnr Type'` could be missing completely at random but if I am making the other one 'CBlock' I'm going to make this one 0. Decision: Make 0.
 
 
 - `['Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2']` - All correspond to the missing 'NA' value that was stated in the docs. Set to 'NA'.
 
 
 - `'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF']` - Only 1 null value per column. All corresponded with NA across the board regarding basement. Set each to 0.

 
 - `['Bsmt Full Bath', 'Bsmt Half Bath']` - 2 null values in these columns. Both line up to 'NA' across the board for basement. Set each to 0. 
 
 
 - `['Garage Cars', 'Garage Area']` - One null value per column. Has garage type of 'detached' with all other garage features nulled. Decision: Drop
 
 
 - `['Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Qual', 'Garage Cond']` - All have 113 nulls that correspond to each other. Each column has missing 'NA' value that is represented by null values. Set each to 'NA'. 
 
 
 - `'Lot Frontage'` - There are 330 nulls. The number 0 is not in the unique values so I think that null means 0 here. Replace with '0' 
 
 
 - `['Alley', 'Fireplace Qu', 'Pool QC', 'Fence', 'Misc Feature']` - Nulls correspond to 'NA'. Replace with 'NA'

In [7]:
#used for testing each feature, what the nulls lined up with in other relevant columns, what the unique values were, etc
X[X['Misc Feature'].isnull()][['Lot Frontage', 'Lot Area', 'MS Zoning', 'Street','Alley','Lot Shape','Lot Config']]
X['Misc Feature'].unique()
#X[X['Garage Type']=='Detchd'][['Garage Cars','Garage Type','Garage Area','Garage Yr Blt','Garage Finish','Garage Qual', 'Garage Cond']].head(10)

array([nan, 'Shed', 'TenC', 'Gar2', 'Othr', 'Elev'], dtype=object)

In [8]:
#used this cell to go through the analysis of Basement nulls (other data analysis was done w/o record for notebook clarity)
#X['Mas Vnr Area'].value_counts()
curr_col = 'BsmtFin Type 1'
curr_col2 = 'BsmtFin Type 2'
mask1 = X[curr_col].isnull()
mask2 = X[curr_col2].isnull()
mask3 = X['Bsmt Qual'].isnull()
mask4 = X['Bsmt Cond'].isnull()
mask5 = X['Bsmt Exposure'].isnull()

#X[X[curr_col].isnull()]
curr_test = X[mask2&mask1&mask3&mask4&mask5]
#X['Mas Vnr Type'].unique()
display(curr_test[[curr_col, curr_col2, 'Bsmt Unf SF', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure']].head())
#X['Bsmt Cond'].unique()

#these nulls all line up with each other, probably all indicate that they correspond to NA, need to check unique values of each of these cols to see if NA is present or not

print('Type1: ', X['BsmtFin Type 1'].unique()) #Missing value: NA
print('Type2: ', X['BsmtFin Type 2'].unique()) #NA
print('Qual:  ', X['Bsmt Qual'].unique()) #NA
print('Cond:  ', X['Bsmt Cond'].unique()) #NA
print('Expos: ', X['Bsmt Exposure'].unique()) #NA

Unnamed: 0,BsmtFin Type 1,BsmtFin Type 2,Bsmt Unf SF,Bsmt Qual,Bsmt Cond,Bsmt Exposure
12,,,0.0,,,
93,,,0.0,,,
114,,,0.0,,,
146,,,0.0,,,
183,,,0.0,,,


Type1:  ['GLQ' 'Unf' 'ALQ' 'Rec' nan 'BLQ' 'LwQ']
Type2:  ['Unf' 'Rec' nan 'BLQ' 'GLQ' 'LwQ' 'ALQ']
Qual:   ['TA' 'Gd' 'Fa' nan 'Ex' 'Po']
Cond:   ['TA' 'Gd' nan 'Fa' 'Po' 'Ex']
Expos:  ['No' 'Gd' 'Av' nan 'Mn']


In [9]:
X['Mas Vnr Type'].unique()

array(['BrkFace', 'None', nan, 'Stone', 'BrkCmn'], dtype=object)

In [10]:
features_to_NA = ['Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2','Garage Type','Garage Yr Blt','Garage Finish','Garage Qual', 'Garage Cond', 'Alley', 'Fireplace Qu', 'Pool QC', 'Fence', 'Misc Feature']

features_to_0 = ['Bsmt Unf SF', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Total Bsmt SF', 'Bsmt Half Bath', 'Bsmt Full Bath', 'Lot Frontage', 'Mas Vnr Area']

features_to_drop = [
    #'Mas Vnr Type', 
    #'Mas Vnr Area', 
    'Garage Cars', 
    'Garage Area']
#moving 'Mas Vnr Area' to set to 0 because it is also in my training set
#changing 'Mas Vnr Type' to set to 'CBlock'

X = X.dropna(subset=features_to_drop)
X.loc[:,'Mas Vnr Type']=X['Mas Vnr Type'].replace(np.nan, 'CBlock')
X.loc[:,features_to_NA]=X[features_to_NA].replace(np.nan, 'NA')
X.loc[:,features_to_0]=X[features_to_0].replace(np.nan, 0.)

print(X['Mas Vnr Type'].isnull().sum())
print(X[features_to_NA].isnull().sum())
print(X[features_to_0].isnull().sum())


0
Bsmt Qual         0
Bsmt Cond         0
Bsmt Exposure     0
BsmtFin Type 1    0
BsmtFin Type 2    0
Garage Type       0
Garage Yr Blt     0
Garage Finish     0
Garage Qual       0
Garage Cond       0
Alley             0
Fireplace Qu      0
Pool QC           0
Fence             0
Misc Feature      0
dtype: int64
Bsmt Unf SF       0
BsmtFin SF 1      0
BsmtFin SF 2      0
Total Bsmt SF     0
Bsmt Half Bath    0
Bsmt Full Bath    0
Lot Frontage      0
Mas Vnr Area      0
dtype: int64


In [11]:
#Features to create Dummy Variables for
features_to_dummy = ['MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature', 'Mo Sold', 'Yr Sold', 'Sale Type']

# Should I leave Overall Qual and Overall Cond as numeric? I put them in dummies anyway

In [12]:
X.columns #used to create above list

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street',
       'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config',
       'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
       'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Gara

In [13]:
X_dummies = pd.get_dummies(X, columns=features_to_dummy, drop_first=True)
X_dummies.head()

Unnamed: 0,Lot Frontage,Lot Area,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,...,Yr Sold_2009,Yr Sold_2010,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,0.0,13517,289.0,533.0,0.0,192.0,725.0,725,754,0,...,0,1,0,0,0,0,0,0,0,1
1,43.0,11492,132.0,637.0,0.0,276.0,913.0,913,1209,0,...,1,0,0,0,0,0,0,0,0,1
2,68.0,7922,0.0,731.0,0.0,326.0,1057.0,1057,0,0,...,0,1,0,0,0,0,0,0,0,1
3,73.0,9802,0.0,0.0,0.0,384.0,384.0,744,700,0,...,0,1,0,0,0,0,0,0,0,1
4,82.0,14235,0.0,0.0,0.0,676.0,676.0,831,614,0,...,0,1,0,0,0,0,0,0,0,1


In [14]:
X_dummies.isnull().sum().sum()

0

In [15]:
y = X['SalePrice']

features = [col for col in X_dummies.columns if col != 'SalePrice']
X_dummies = X_dummies[features]


In [16]:
X_dummies.head()

Unnamed: 0,Lot Frontage,Lot Area,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,...,Yr Sold_2009,Yr Sold_2010,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,0.0,13517,289.0,533.0,0.0,192.0,725.0,725,754,0,...,0,1,0,0,0,0,0,0,0,1
1,43.0,11492,132.0,637.0,0.0,276.0,913.0,913,1209,0,...,1,0,0,0,0,0,0,0,0,1
2,68.0,7922,0.0,731.0,0.0,326.0,1057.0,1057,0,0,...,0,1,0,0,0,0,0,0,0,1
3,73.0,9802,0.0,0.0,0.0,384.0,384.0,744,700,0,...,0,1,0,0,0,0,0,0,0,1
4,82.0,14235,0.0,0.0,0.0,676.0,676.0,831,614,0,...,0,1,0,0,0,0,0,0,0,1


In [17]:
#split up the cleaned and formatted data
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, test_size=0.25, random_state=42) 

In [18]:
#scale data
ss = StandardScaler()

#fit and scale the train data
X_train_scaled = ss.fit_transform(X_train)

#scale the test the exact same way we scaled the train data
X_test_scaled = ss.transform(X_test)

In [19]:
#instantiate KFold object

kf = KFold(n_splits=5, shuffle=True, random_state=42)


## Lasso Model

In [20]:
#instantiate model
#number of iteractions / small alpha is bad. Need to fix metaparameters

#l_alphas = np.arange(0.001, 0.15, 0.0025)
#lasso_CV = LassoCV(alphas=l_alphas, cv=kf, random_state=42)
lasso_CV = LassoCV(cv=kf, random_state=42)

In [21]:
from IPython.display import HTML
HTML('''<script>
code_show_err=false; 
function code_toggle_err() {
 if (code_show_err){
 $('div.output_stderr').hide();
 } else {
 $('div.output_stderr').show();
 }
 code_show_err = !code_show_err
} 
$( document ).ready(code_toggle_err);
</script>
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')

In [22]:
# Once the errors are all through outputting, click the above toggle button to get rid of them

# The cell below used to give me lots of errors about the alpha but once I let it choose alphas for itself, it stopped. I guess that letting it do its own thing is the way to go.

In [23]:
%%time
cvs_lasso_pre = cross_val_score(lasso_CV, X_train_scaled, y_train).mean();
print('done')



done
CPU times: user 1min, sys: 600 ms, total: 1min 1s
Wall time: 10.3 s


In [24]:
print('Lasso_CV Pre-fit Regression: ', cvs_lasso_pre)

Lasso_CV Pre-fit Regression:  0.7889624789650975


In [25]:
#fit model
lasso_CV = lasso_CV.fit(X_train_scaled, y_train)
print('Lasso_CV score', lasso_CV.score(X_train_scaled, y_train))
print('Lasso_CV score', lasso_CV.score(X_test_scaled, y_test))


Lasso_CV score 0.9019454460266321
Lasso_CV score 0.906863970618406


In [26]:
lasso_CV.n_iter_, lasso_CV.alpha_ 
#well that alpha is nowhere near what I was trying. I wonder why the solution code for the lesson where we learned about lasso and ridge had lasso alphas between 0.001 and 0.15

(78, 1102.0086144275328)

In [27]:
lasso_CV.predict(X_test_scaled)[:5]

array([113847.90506854, 280556.07581978, 101784.19859745, 147098.45946546,
       160122.58322228])

Okay so the Lasso model converged on an alpha, the test scoring is 0.9 which is actually pretty good, better than the cross_val_score predicted. 

Let's see how a Linear Regression model and Ridge model do. I can also look into doing the model we learned in 4.01/4.02. After I choose one of these models I want to submit something to kaggle before continuing to tinker.


##### Other Things I can do
 - loop through multiple values for KFold
 - use Lasso / Ridge with optimized alpha from LassoCV and RidgeCV to see if there is a difference
 - change around features based on coef_ values in Lasso (create a big for loop thing to do this and multiple KFolds?)
 - begin to write my own class that will do some of this stuff for me
 - 
 

## Ridge Model

In [28]:
#Ridge model
ridge_CV = RidgeCV(cv=kf)


In [29]:
%%time
cvs_ridge_pre = cross_val_score(lasso_CV, X_train_scaled, y_train).mean();
print('done')



done
CPU times: user 59.7 s, sys: 494 ms, total: 1min
Wall time: 10.1 s


In [30]:
print('Ridge_CV Pre-fit Regression: ', cvs_ridge_pre)

Ridge_CV Pre-fit Regression:  0.7889624789650975


In [31]:
ridge_CV = ridge_CV.fit(X_train_scaled, y_train)
print('Ridge_CV Train Score', ridge_CV.score(X_train_scaled, y_train))
print('Ridge_CV Test Score', ridge_CV.score(X_test_scaled, y_test))

Ridge_CV Train Score 0.9502867866995209
Ridge_CV Test Score 0.9071748211456014


In [32]:
ridge_CV.alpha_

10.0

In [33]:
ridge_CV.n_iter_ #how do we tell if we've hit max iters? I got a warning but I'm not sure if that is accurate for the current values or not

AttributeError: 'RidgeCV' object has no attribute 'n_iter_'

Ridge didn't do as well as the Lasso with all of the same features. It could be overfit (the test score was .1 below train and train was getting close to 1). I might see if changing the features around makes a better model.

## Linear Regression Model

In [None]:
linreg = LinearRegression()

In [None]:
print('LinReg Pre-Fit Score: ', cross_val_score(linreg, X_train, y_train).mean())


In [None]:
linreg.fit(X_train, y_train)
print('LinReg Train Score: ', linreg.score(X_train, y_train))
print('LinReg Test Score:  ', linreg.score(X_test, y_test))

Well, not using a linear regression model then. 


## Gamma Model
It looks like I would have to make a lot of new columns based on the log10 of all of the numeric feature columns (at least, if not all of the columns including the dummy variables). I might do this if I have time after doing more with Lasso and Ridge.


## Lasso Model Kaggle Submission with all features

In [34]:
#need to clean the kaggle test data the same way that I cleaned the train data

#I'm not sure what to do with the nulls that I have that I dropped before. If I delete those rows, it will throw off my entire thing

kaggle_test_set_wo_nulls = kaggle_test_set.copy()

kaggle_test_set_wo_nulls.loc[:,'Electrical']=kaggle_test_set_wo_nulls['Electrical'].replace(np.nan, 'Mix')

kaggle_test_set_wo_nulls.loc[:,'Mas Vnr Type']=kaggle_test_set_wo_nulls['Mas Vnr Type'].replace(np.nan, 'CBlock')

kaggle_test_set_wo_nulls.loc[:,features_to_NA]=kaggle_test_set[features_to_NA].replace(np.nan, 'NA')

kaggle_test_set_wo_nulls.loc[:,features_to_0]=kaggle_test_set[features_to_0].replace(np.nan, 0.)

In [35]:
kaggle_test_set_wo_nulls['Electrical'].unique()

array(['FuseP', 'SBrkr', 'FuseA', 'FuseF', 'Mix'], dtype=object)

In [36]:
kaggle_test_set_wo_nulls.isnull().sum().sum()

0

In [37]:
nulls_kaggle = pd.DataFrame.from_dict({'Columns':kaggle_test_set_wo_nulls.columns, 'Nulls':kaggle_test_set_wo_nulls.isnull().sum()})
display(nulls_kaggle[nulls_kaggle['Nulls']>=1])

Unnamed: 0,Columns,Nulls


In [39]:
X_dummies.head()

Unnamed: 0,Lot Frontage,Lot Area,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,...,Yr Sold_2009,Yr Sold_2010,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,0.0,13517,289.0,533.0,0.0,192.0,725.0,725,754,0,...,0,1,0,0,0,0,0,0,0,1
1,43.0,11492,132.0,637.0,0.0,276.0,913.0,913,1209,0,...,1,0,0,0,0,0,0,0,0,1
2,68.0,7922,0.0,731.0,0.0,326.0,1057.0,1057,0,0,...,0,1,0,0,0,0,0,0,0,1
3,73.0,9802,0.0,0.0,0.0,384.0,384.0,744,700,0,...,0,1,0,0,0,0,0,0,0,1
4,82.0,14235,0.0,0.0,0.0,676.0,676.0,831,614,0,...,0,1,0,0,0,0,0,0,0,1


In [38]:
X_kaggle_dummies = pd.get_dummies(kaggle_test_set_wo_nulls, columns=features_to_dummy, drop_first=True)

X_kaggle_dummies.head()


Unnamed: 0,Id,PID,Lot Frontage,Lot Area,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,...,Yr Sold_2010,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,2658,902301120,69.0,9142,0.0,0.0,0.0,1020.0,1020.0,908,...,0,0,0,0,0,0,0,0,0,1
1,2718,905108090,0.0,9662,0.0,0.0,0.0,1967.0,1967.0,1967,...,0,0,0,0,0,0,0,0,0,1
2,2414,528218130,58.0,17104,0.0,554.0,0.0,100.0,654.0,664,...,0,0,0,0,0,0,1,0,0,0
3,1989,902207150,60.0,8520,0.0,0.0,0.0,968.0,968.0,968,...,0,0,0,0,0,0,0,0,0,1
4,625,535105100,0.0,9500,247.0,609.0,0.0,785.0,1394.0,1394,...,0,0,0,0,0,0,0,0,0,1


In [44]:
# Get missing columns in the training test
missing_cols = set( X_dummies.columns ) - set( X_kaggle_dummies.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    X_kaggle_dummies[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
X_kaggle_dummies = X_kaggle_dummies[X_dummies.columns]

In [45]:
kaggle_features = [col for col in X_kaggle_dummies if col != 'Id' and col != 'PID']
X_kaggle_dummies = X_kaggle_dummies[X_dummies.columns]

In [46]:
set(X_kaggle_dummies.columns) - set(X_dummies.columns)
set(X_dummies.columns) - set(X_kaggle_dummies.columns)

set()

In [47]:
kaggle_test_set_scaled = ss.transform(X_kaggle_dummies)

In [49]:

submission_predicts = lasso_CV.predict(kaggle_test_set_scaled)
submission_ids = kaggle_test_set['Id']
df_submit = pd.DataFrame()
df_submit['Id'] = submission_ids
df_submit['SalePrice'] = submission_predicts

df_submit.head()

Unnamed: 0,Id,SalePrice
0,2658,120067.550595
1,2718,157233.500005
2,2414,226012.052826
3,1989,113035.464344
4,625,186699.178862


In [50]:
df_submit.to_csv('../data/kaggle_project2_submission_lasso_allFeats.csv', index=False)

Okay, so this submission got me to 38,374. There are a few things to note about the submission process. 

Problems I faced:
 - The dummy variables that are there in the training data are not all there in the kaggle test data.  Along the same lines, the Kaggle test data had some dummy variables that weren't in the training data.
   - I added in the dummy columns that were missing in the Kaggle test data.
   - I ended up taking the new dummy columns in the Kaggle test data out but they are datapoints that I now am not using which makes my model perform worse. There are a few that I think I might be able to use without dummies that had dummies, like years or something on a number scale. We shall see
 - After creating the dummy variables, the order of the columns in the Kaggle test data was not in the same order as the columns of my X that I fit my model on, which created bad results
 
 
Potential Fixes / Things to Try:
 - make sure that all of the dummy variables that can be created from the train data are. Don't have remove_first or whatever as True. The Kaggle test data might be removing different dummy columns than the train data.
 - see if making the columns that have to do with year or a numbered ranking system could be used without expanding them out to dummy variables

In [None]:
#I'm first going to test all the numerical data points without changing any of the categorical features. 

#If that is not satisfactory I will go through and fix what I believe to be the most important ones. 

#If still unsatisfactory, I will go and fix all of them. What I am not sure about is how to handle the categorical features that are encoded in numbers. I think I should still dummy them out.

In [None]:
X_refined['Lot Frontage'].isnull().sum()

In [None]:
X_refined[X_refined['Lot Frontage']==0] #so NaN means 0 or they didn't have the data. 

Past this is the attempt 1

In [None]:
houses['1st Flr SF'].head()

In [None]:
#Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) 


In [None]:
#instantiate model
lr = LinearRegression()

#predict R squared values
scores = cross_val_score(lr, X_train, y_train)

In [None]:
#Step 2: fit model
lr.fit(X_train, y_train)

# Make predictions
predictions = lr.predict(X_train)


# Score the model
score = lr.score(X_test, y_test)


# Plot the model
plt.figure(figsize=(8,8))
plt.scatter(predictions, y_train, s=30, c='r', marker='+', zorder=10)
plt.xlabel("Predicted Values from RM, PTRATIO, DIS - $\hat{y}$")
plt.ylabel("Actual Values PRICE - y")

plt.plot([0, np.max(y_train)], [0, np.max(y_train)], c = 'k')

plt.show()


print("actual score: ", score)
print('50-50 split cross val score:', scores.mean())

In [None]:

submission_predicts = lr.predict(test_set[features_to_dummy])


In [None]:
submission_ids = test_set['Id']
df_submit = pd.DataFrame()
df_submit['Id'] = submission_ids
df_submit['SalePrice'] = submission_predicts

In [None]:
df_submit.head()

In [None]:
df_submit.to_csv('../data/kaggle_project2_submission.csv', index=False)