In [69]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression #class(LinearRegression) inside module(linear_model) inside library(sklearn)
from sklearn import metrics

In [70]:
# File locations, kept as plain strings: the base directory is prefixed
# onto each relative path with `+` wherever a file is read or written.
data = "../data/"
train, test = "datasets/train.csv", "datasets/test.csv"

In [71]:
# Load the training and test CSVs (in that order) into DataFrames.
train_df, test_df = (pd.read_csv(data + rel_path) for rel_path in (train, test))

In [72]:
# Sanity check: preview the first rows of the training frame to confirm the
# CSV parsed as expected and that the SalePrice target column is present.
train_df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [73]:
# Same sanity check for the test frame; note it has no SalePrice column.
test_df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD



**Second Prediction: Linear Regression**

The "second_submission" uses linear regression modeling and reflects the results of unclean data.
- Features included were randomly chosen to minimize complications in Kaggle submission.
- Features were chosen as a first-level improvement over the null "baseline" submission.

In [214]:
# Candidate feature sets for the regression. `Id` is deliberately left out:
# predictions come back in the same row order as the input frame, so they
# can be re-attached to the frame afterwards.
features = [  # RMSE Kaggle=36743.34764
    'Year Remod/Add',
    'Year Built',
    '1st Flr SF',
    'Gr Liv Area',
    'Overall Qual',
]
features2 = [  # RMSE Kaggle=35957.49
    'Year Remod/Add',
    'Year Built',
    '1st Flr SF',
    'Total Bsmt SF',
    'Garage Area',
    'Gr Liv Area',
    'Overall Qual',
]

In [215]:
# Report missing values in the two extra features, then drop the affected
# training rows (scikit-learn's LinearRegression rejects NaN inputs).
# The result is reassigned instead of using `inplace=True`, which pandas
# discourages: it gives no performance benefit and hides the mutation.
# NOTE(review): the saved output shows 0 nulls, presumably because this cell
# was re-run after the rows had already been dropped - confirm on a fresh kernel.
print(train_df[['Garage Area','Total Bsmt SF']].isnull().sum())
train_df = train_df.dropna(subset=['Garage Area','Total Bsmt SF'])

Garage Area      0
Total Bsmt SF    0
dtype: int64
0
Garage Area      0
Total Bsmt SF    0
dtype: int64


In [216]:
# Split the training frame into the regression target (`SalePrice`) and the
# model inputs (the `features2` columns).
y_actual = train_df.loc[:, 'SalePrice']
X = train_df.loc[:, features2]

In [217]:
# X and y must have the same number of observations n:
# X.shape is (n, p) for p features and y_actual.shape is (n,).
for label, obj in (('X:        ', X), ('y_actual: ', y_actual)):
    print(label, obj.shape)

X:         (2049, 7)
y_actual:  (2049,)


In [242]:
# Sanity check: preview the first three rows of the feature matrix to
# confirm that only the `features2` columns were selected.
X.head(3)

Unnamed: 0,Year Remod/Add,Year Built,1st Flr SF,Total Bsmt SF,Garage Area,Gr Liv Area,Overall Qual
0,2005,1976,725,725.0,475.0,1479,6
1,1997,1996,913,913.0,559.0,2122,7
2,2007,1953,1057,1057.0,246.0,1057,5


In [219]:
# Instantiate an ordinary least squares linear regression model with
# scikit-learn's default settings (nothing is fitted yet).
lm = LinearRegression()

In [220]:
# Fit the regression on the training features X against the observed
# sale prices y_actual; this populates lm.coef_ and lm.intercept_.
lm.fit(X, y_actual)

LinearRegression()

In [221]:
# Fitted slope coefficients: one per column of X, in the same order as the
# columns X had when it was passed to fit.
lm.coef_

array([3.13096954e+02, 2.38379913e+02, 1.76578962e+01, 1.79254203e+01,
       5.06272306e+01, 4.38464706e+01, 2.02464621e+04])

In [222]:
# Fitted y-intercept: the model's prediction when every feature equals zero.
lm.intercept_

-1162782.7051056402

In [223]:
# In-sample predictions: score the same training rows the model was fit on.
y_pred = lm.predict(X)

In [224]:
# Training RMSE: square root of the mean squared error between the observed
# sale prices and the in-sample predictions. Because it is computed on the
# data used for fitting, it is an optimistic estimate of the error.
train_mse = metrics.mean_squared_error(y_actual, y_pred)
train_mse ** 0.5

36297.670599703146

### Start Kaggle specific Notebook here

In [225]:
# Build the feature matrix for the test set, which has no SalePrice column;
# the SalePrice column is what we want to create with our model.
# Training rows with missing feature values were dropped, but test rows
# cannot be: a prediction is written out for every test row. Any missing
# feature value is therefore filled with that column's training-set median
# before predicting, because LinearRegression.predict raises on NaN input.
# When the test features contain no NaN this fill is a no-op.
X_score = test_df[features2].fillna(X.median())
y_score = lm.predict(X_score)

In [237]:
# y_score is an array of predicted sale prices, one per row of X_score and
# in the same row order. Preview the first ten.
y_score[:10]

array([165670.03704389, 214321.5566123 , 195814.71319686, 126115.77262663,
       177992.67734344,  87503.74993506, 111418.15308434, 145951.56725929,
       217423.0393148 , 173015.40163308])

In [227]:
# The test frame ships without SalePrice; attach the predictions as a new
# column. This mutates test_df in place, and the assignment is positional,
# relying on y_score being in the same row order as test_df.
test_df['SalePrice'] = y_score

In [238]:
# Sanity check: the test frame should now end with the predicted SalePrice column.
test_df.head(3)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,165670.037044
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,214321.556612
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,195814.713197


In [229]:
# Build the submission table: just the Id and predicted SalePrice columns,
# copied so later edits to `submission` cannot touch test_df.
submission_cols = ['Id', 'SalePrice']
submission = test_df.loc[:, submission_cols].copy()

In [232]:
# Write the submission to CSV under the data directory. index=False leaves
# out the DataFrame index so the file has only the Id and SalePrice columns,
# which is the layout Kaggle expects.
# NOTE(review): the section text calls this the "second_submission" but the
# file is named fourth_submission.csv - confirm which name is intended.
submission.to_csv(data+'submissions/fourth_submission.csv',index=False)

In [241]:
# Final sanity check: preview the first three rows of the submission table.
submission.head(3)

Unnamed: 0,Id,SalePrice
0,2658,165670.037044
1,2718,214321.556612
2,2414,195814.713197
