In [69]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression #class(LinearRegression) inside module(linear_model) inside library(sklearn)
from sklearn import metrics

In [70]:
# File locations. `data` is the base folder (note the trailing slash) and
# `train` / `test` are paths relative to it; they are joined by concatenation.
data = "../data/"
train, test = "datasets/train.csv", "datasets/test.csv"

In [71]:
# Load the training and test CSV files into DataFrames (same base folder for both).
train_df, test_df = (pd.read_csv(data + rel_path) for rel_path in (train, test))

In [72]:
# Preview the first five rows of the training data to confirm it loaded as expected.
train_df.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [73]:
# Preview the first five rows of the test data to confirm it loaded as expected.
test_df.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD



**Second Prediction: Linear Regression**

The "second_submission" uses a Linear Regression model and reflects results obtained on uncleaned data.
- The features included were chosen randomly to minimize complications in the Kaggle submission.
- The features were chosen as a first-level improvement over the null "baseline" submission.

In [107]:
# Columns used as model inputs. Id is deliberately left out: it identifies a
# row but is not a predictor. Row order is preserved by pandas and numpy, so
# predictions line up with the rows they were computed from.
features = [
    'Year Remod/Add',
    'Year Built',
    '1st Flr SF',
    'Gr Liv Area',
    'Overall Qual',
]

In [108]:
# Build the regression inputs from the training data:
#   X          -> DataFrame holding the selected feature columns
#   y_actualdf -> target kept as a one-column DataFrame
#   y_actual   -> target as a Series (this is what the model is fitted on)
X = train_df.loc[:, features]
y_actualdf = train_df.loc[:, ['SalePrice']]
y_actual = train_df.loc[:, 'SalePrice']

In [109]:
# Sanity-check the modelling inputs: features and target must cover the same rows.
# An assert is used because a bare comparison in the middle of a cell is evaluated
# and then thrown away, so a mismatch would otherwise go unnoticed.
assert X.shape[0] == y_actualdf.shape[0] == y_actual.shape[0], (
    f"row mismatch: X={X.shape[0]}, "
    f"y_actualdf={y_actualdf.shape[0]}, y_actual={y_actual.shape[0]}"
)
# Show the dimensions; n is the number of rows and p the number of features.
print('X:        ', X.shape)         # (n, p)
print('y_actual: ', y_actual.shape)  # (n,) -- a Series has a single dimension

X:         (2051, 5)
y_actual:  (2051,)


In [110]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Year Remod/Add,Year Built,1st Flr SF,Gr Liv Area,Overall Qual
0,2005,1976,725,1479,6
1,1997,1996,913,2122,7
2,2007,1953,1057,1057,5
3,2007,2006,744,1444,5
4,1993,1900,831,1445,6


In [111]:
# Create an (unfitted) ordinary least squares model; the intercept is estimated,
# which is scikit-learn's default and is spelled out here for the reader.
lm = LinearRegression(fit_intercept=True)

In [112]:
# Train the model on the selected features against the observed sale prices.
# fit() returns the estimator itself, which is what the cell displays.
lm.fit(X=X, y=y_actual)

LinearRegression()

In [113]:
# Fitted slope coefficients of the linear model: one value per column of X,
# in the same order as the `features` list.
lm.coef_

array([  318.60667037,   343.81461712,    38.69940317,    46.34025652,
       22987.26988116])

In [114]:
# Fitted y-intercept: the constant term added to the weighted sum of the features.
lm.intercept_

-1383654.6188068874

In [115]:
# Predict sale prices for the same rows the model was trained on
# (in-sample predictions, used for the local error estimate).
y_pred = lm.predict(X=X)

In [116]:
# Local evaluation: root mean squared error between the observed and predicted
# sale prices. This is measured on the training rows, not on held-out data.
train_mse = metrics.mean_squared_error(y_true=y_actual, y_pred=y_pred)
train_mse ** 0.5

37495.67578118966

**NOTE:** RMSE(Kaggle) = 36743.34764

### Start Kaggle specific Notebook here

In [117]:
# Score the test set. test_df has no SalePrice column; producing that column
# is the purpose of the model, so we select the same features and predict.
X_score = test_df.loc[:, features]
y_score = lm.predict(X=X_score)

In [118]:
# Inspect the array of predicted sale prices returned by lm.predict.
# Predictions are positional: numpy and pandas keep them in the row order
# of the DataFrame they were computed from.
y_score

array([156720.99904619, 208161.62656929, 201094.80052052, 113880.61065518,
       173147.27342328,  86322.67389328, 113556.80983445, 152542.1844904 ,
       215107.66010318, 172641.61058374, 176643.04970777, 139217.87188312,
       165569.7283018 , 276450.04316363, 175728.17955893, 157205.06104191,
       143307.24567406, 118225.32376924, 213929.26865217, 196228.87717425,
       135582.88376521, 103764.14133147, 212302.3628291 , 142595.69931407,
       210314.0998484 , 111050.77549382, 137975.41430739, 135238.1904483 ,
       144233.78912869,  23462.73641948,  86890.84636704,  80216.72965353,
       235218.40069262, 147018.03002385, 229640.43887853, 197485.31086568,
       120433.75340481,  83166.47860082, 116076.68220308, 217238.88857547,
       165477.8757627 , 225486.96265981, 155512.74801872, 155186.44548987,
       221131.89779094, 118534.87903411, 221447.0223559 , 115002.8321101 ,
       120593.58388785, 120055.96401188, 109253.44468387, 237778.63193165,
       267062.11462648, 1

In [119]:
# The test data ships without SalePrice; attach the model's predictions as that column.
test_df = test_df.assign(SalePrice=y_score)

In [120]:
# Preview the first five rows again to confirm the SalePrice column is now present.
test_df.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,156720.999046
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,208161.626569
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,201094.800521
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,,,,0,7,2007,WD,113880.610655
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,185,0,,,,0,7,2009,WD,173147.273423


In [121]:
# Build the submission frame: an independent copy holding only the row
# identifier and the predicted price, taken from the scored test_df.
submission = test_df.loc[:, ['Id', 'SalePrice']].copy()

In [124]:
# Write the submission to disk as a CSV file. The `.csv` extension is included so
# the file is recognised as a CSV by upload forms and other tools. index=False
# leaves out the DataFrame index, because Kaggle expects only the data columns.
submission.to_csv(data + 'submissions/second_submission.csv', index=False)

In [123]:
# Final check of the submission frame.
# Only the last expression of a cell is displayed, so the frame is the single
# trailing expression here; every row must also carry a predicted price.
assert submission['SalePrice'].notna().all(), "submission contains missing predictions"
submission

Unnamed: 0,Id,SalePrice
0,2658,156720.999046
1,2718,208161.626569
2,2414,201094.800521
3,1989,113880.610655
4,625,173147.273423
...,...,...
873,1662,190819.436505
874,1234,201951.842865
875,1373,137909.852256
876,1672,87401.084325
