In [69]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [70]:
# Set up the data
data = "../data/"
train = "datasets/train.csv"
train_clean = 'datasets/clean_train.csv'
test = "datasets/test.csv"

In [71]:
#Read in the data
train_df = pd.read_csv(data+train)
train4_df = pd.read_csv(data+train_clean)
test_df = pd.read_csv(data+test)
cat_select_traindf = pd.read_csv(data+'datasets/cat_select_train_m4.csv')
cat_select_testdf = pd.read_csv(data+'datasets/cat_select_test_m4.csv')

In [72]:
# Check that everything is copacetic
cat_select_traindf.drop(columns='Unnamed: 0',inplace=True)
cat_select_testdf.drop(columns='Unnamed: 0',inplace=True)


**Second Prediction: Linear Regression**

The "second_submission" includes Linear Regreesion Modeling and reflects the results unclean data.
- Features included were randomly chosen to minimize complications in Kaggle submission.
- Features were choen as a first level improvement to the Null "baseline" submission.

In [74]:
# Creating our features list; note that ID is not among them.
# Pandas and Numpy will preserve the order of predictions on its own.
features = ['Year Remod/Add', 'Year Built', '1st Flr SF', 'Gr Liv Area', 'Overall Qual'] #RMSE Kaggle=36,743.34
features2 = ['Year Remod/Add', 'Year Built','1st Flr SF','Total Bsmt SF','Garage Area', 'Gr Liv Area', 'Overall Qual'] #RMSE Kaggle=35,957.49


In [75]:
print(train_df[['Garage Area','Total Bsmt SF']].isnull().sum())
train_df.dropna(subset=['Garage Area','Total Bsmt SF'],inplace=True)

Garage Area      1
Total Bsmt SF    1
dtype: int64


In [76]:
# Setting up our features and our target from the train_df to feed into a linear regression.
X = train_df[features2]
y_actual = train_df['SalePrice']

In [77]:
# Check that we have an equal number of observations in our X and y.
# Verify dimensions, n = No. of rows
print('X:        ', X.shape) # X.shape equals (n,p)
print('y_actual: ', y_actual.shape) # y.shape equals (n, null)

X:         (2049, 7)
y_actual:  (2049,)


In [78]:
# Check that everything is copacetic.
X.head(3)

Unnamed: 0,Year Remod/Add,Year Built,1st Flr SF,Total Bsmt SF,Garage Area,Gr Liv Area,Overall Qual
0,2005,1976,725,725.0,475.0,1479,6
1,1997,1996,913,913.0,559.0,2122,7
2,2007,1953,1057,1057.0,246.0,1057,5


##### Model 4: Submission

In [79]:
#One-hot encode Categorical variables to concat with scaled numerical features
X4k_train_dum = pd.get_dummies(cat_select_traindf,columns=list(cat_select_traindf.columns),drop_first=True)
X4k_test_dum = pd.get_dummies(cat_select_testdf,columns=list(cat_select_testdf.columns),drop_first=True)

In [80]:
# Setup y-target
y = np.log(train4_df['SalePrice'])

In [81]:
y

0       11.779129
1       12.301383
2       11.599103
3       12.066811
4       11.838626
          ...    
2044    12.607366
2045    11.320554
2046    12.083905
2047    11.877569
2048    12.149502
Name: SalePrice, Length: 2049, dtype: float64

In [82]:
#Setup X variables
X4k_train = train4_df[['Year Remod/Add', 'Year Built', '1st Flr SF','Total Bsmt SF','Garage Area', 'Gr Liv Area', 'Overall Qual']]
X4k_test  = test_df[ ['Year Remod/Add', 'Year Built', '1st Flr SF','Total Bsmt SF','Garage Area', 'Gr Liv Area', 'Overall Qual']]

In [83]:
X4k_test

Unnamed: 0,Year Remod/Add,Year Built,1st Flr SF,Total Bsmt SF,Garage Area,Gr Liv Area,Overall Qual
0,1950,1910,908,1020,440,1928,6
1,1977,1977,1967,1967,580,1967,5
2,2006,2006,664,654,426,1496,7
3,2006,1923,968,968,480,968,5
4,1963,1963,1394,1394,514,1394,6
...,...,...,...,...,...,...,...
873,1974,1974,1084,1084,488,1877,6
874,1999,1966,1104,1104,480,1988,6
875,1968,1968,1211,952,322,1211,5
876,1971,1971,864,864,528,864,4


In [84]:
#Scale the numerical-data (convert features into Z-scores = [X-mean(X)] / std(X))
ss4k = StandardScaler()
ss4k.fit(X4k_train)
Z4k_train = ss4k.transform(X4k_train)
Z4k_test = ss4k.transform(X4k_test)

In [85]:
# Convert np series to pd DataFrame for concat with categorical features
Z4k_traindf = pd.DataFrame(Z4k_train)
Z4k_testdf = pd.DataFrame(Z4k_test)

In [86]:
# Rename columns of numerical features back to original ID
Z4k_train = Z4k_traindf.rename(columns={0: "Year Remod/Add",
                            1: "Year Built", 
                            2: "1st Flr SF",
                            3: "Total Bsmt SF",
                            4: "Garage Area",
                            5: "Gr Liv Area",
                            6: "Overall Qual"})
Z4k_test = Z4k_testdf.rename(columns={0: "Year Remod/Add",
                            1: "Year Built", 
                            2: "1st Flr SF",
                            3: "Total Bsmt SF",
                            4: "Garage Area",
                            5: "Gr Liv Area",
                            6: "Overall Qual"})

In [87]:
# Concatenate scaled-numerical and encoded-categorical data for testing
Z4k_train = pd.concat([Z4k_train,X4k_train_dum], axis=1)
Z4k_test = pd.concat([Z4k_test,X4k_test_dum], axis=1)

In [89]:
print(Z4k_train.columns)
print(Z4k_test.shape)

Index(['Year Remod/Add', 'Year Built', '1st Flr SF', 'Total Bsmt SF',
       'Garage Area', 'Gr Liv Area', 'Overall Qual', 'NeighborhoodC_1',
       'NeighborhoodC_2', 'Paved DriveC_1', 'Lot ShapeC_1', 'Land ContourC_1',
       'Lot ConfigC_1', 'Condition 1C_1', 'Condition 1C_2', 'Bldg TypeC_1',
       'Exterior 1stC_1', 'Exter QualC_1', 'Exter QualC_2', 'FoundationC_1',
       'FoundationC_2', 'Heating QCC_1', 'Heating QCC_2', 'Central AirC_1'],
      dtype='object')
(878, 24)


In [90]:
# Instantiate linear regression model
lm = LinearRegression()
lm4 = LinearRegression()

In [91]:
# Fit the linear regression to chosen features.
#lm.fit(X, y_actual)
lm4.fit(Z4_train, y)

LinearRegression()

In [14]:
# The `lm` object contains our model's coefficients
lm.coef_

array([3.13096954e+02, 2.38379913e+02, 1.76578962e+01, 1.79254203e+01,
       5.06272306e+01, 4.38464706e+01, 2.02464621e+04])

In [93]:
# And the y-intercept.
#lm.intercept_
lm4.intercept_

12.024810377556943

In [94]:
# Create predictions using the `lm` object.
y_pred4 = lm4.predict(Z4_test)

In [97]:
test_df['SalePrice'] = np.exp(y_pred4)

In [98]:
test_df[['Id','SalePrice']]

Unnamed: 0,Id,SalePrice
0,2658,144588.629923
1,2718,190723.156767
2,2414,190309.480814
3,1989,123445.071423
4,625,159129.150206
...,...,...
873,1662,175933.961721
874,1234,188010.118541
875,1373,127873.175079
876,1672,109641.585224


In [17]:
# Score it: Evaluate the model locally with training values of Sale Price
# Mean Squared Error (MSE)
MSE = metrics.mean_squared_error(y_actual,y_pred)
print(f'MSE: {MSE:,.2f}')
# Root Mean Squared Error (RMSE)
RMSE = metrics.mean_squared_error(y_actual,y_pred,squared=False)
print(f'RMSE: {RMSE:,.2f}')

MSE: 1,317,520,890.96
RMSE: 36,297.67


### Start Kaggle specific Notebook here

In [7]:
# We make our predictions on the test df, which does not have a SalePrice column.
# The SalePrice columns is what we want to create with our model.
%store -r features3
#X_score = test_df[features2]
X_score4 = test_df[features3]
y_score = lm.predict(X_score)

KeyError: "['NeighborhoodC_1', 'Condition 1C_2', 'FoundationC_2', 'NeighborhoodC_2', 'Heating QCC_2', 'FoundationC_1', 'Lot ConfigC_1', 'Lot ShapeC_1', 'Heating QCC_1', 'Exterior 1stC_1', 'Central AirC_1', 'Exter QualC_2', 'Paved DriveC_1', 'Exter QualC_1', 'Land ContourC_1', 'Condition 1C_1', 'Bldg TypeC_1'] not in index"

In [19]:
# We have a list of predicted house prices.
# numpy and pandas preserve the order of these predictions from the df.
y_score[:10]

array([165670.03704389, 214321.5566123 , 195814.71319686, 126115.77262663,
       177992.67734344,  87503.74993506, 111418.15308434, 145951.56725929,
       217423.0393148 , 173015.40163308])

In [10]:
len(y_score4)

513

In [9]:
len(test_df)

878

In [8]:
# We don't have SalePrice in our test dataframe. That's what we're trying to create.
%store -r y_score4
test_df['SalePrice'] = y_score4

ValueError: Length of values does not match length of index

In [21]:
# Check that everything is copacetic.
test_df.head(3)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,165670.037044
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,214321.556612
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,195814.713197


In [99]:
# Creating a submission dataframe out of the original test_df
submission = test_df[['Id','SalePrice']].copy()

In [100]:
# Write our dataframe to a csv WITHOUT the index column, because that's how Kaggle wants it.
submission.to_csv(data+'submissions/fourth_submission.csv',index=False)

In [101]:
# Triple-Check that everything is copacetic.
submission.head(3)

Unnamed: 0,Id,SalePrice
0,2658,144588.629923
1,2718,190723.156767
2,2414,190309.480814
