In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression #class(LinearRegression) inside module(linear_model) inside library(sklearn)
from sklearn import metrics

In [2]:
# Locations of the input data: the base folder and the two file names inside it.
# They are concatenated (data + train, data + test) when the files are read.
data, train, test = "./datasets/", "train.csv", "test.csv"

In [3]:
# Load the training and test sets into DataFrames.
# Each path is the base folder followed by the file name.
train_df = pd.read_csv(f"{data}{train}")
test_df = pd.read_csv(f"{data}{test}")

In [4]:
# Eyeball the first five training rows to confirm the file loaded as expected.
train_df.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
# Same check for the test set; note that it has no SalePrice column.
test_df.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD



**Second Prediction: Linear Regression**

The "null model" involves no modeling: it makes the same naive guess for every observation, and it is the baseline that the linear regression below has to beat.
- For regression problems, we generally use the mean of our target variable.
- For classification problems, we use the majority class.

In [8]:
# The columns the model is allowed to see. Id is deliberately left out:
# pandas and numpy preserve row order on their own, so predictions can be
# matched back to their rows without it.
features = [
    'Lot Area',
    'Yr Sold',
    'Overall Qual',
    'Year Remod/Add',
    'Year Built',
]

In [9]:
# Split train_df into the model inputs and the value we want to predict.
#   X_train: the selected feature columns.
#   y_train: SalePrice, selected with a list so it stays a one-column DataFrame
#            rather than a Series.
X_train = train_df.loc[:, features]
y_train = train_df.loc[:, ['SalePrice']]

In [10]:
# Sanity check: X and y must contain the same number of rows (observations).
len(X_train) == len(y_train)

True

In [11]:
# Instantiate an ordinary least squares linear regression with scikit-learn's
# default settings; nothing is fitted yet.
lm = LinearRegression()

In [12]:
# Fit the linear regression to the training features (X_train) and target (y_train).
# `fit` returns the estimator itself, which is why the cell displays `LinearRegression()`.
lm.fit(X_train, y_train)

LinearRegression()

In [13]:
# The fitted `lm` object holds the model's coefficients: one per column of
# X_train, in the same order as the `features` list.
lm.coef_

array([[ 2.56614593e+00, -2.53054743e+02,  3.67013059e+04,
         3.05908993e+02,  3.01170017e+02]])

In [14]:
# And the y-intercept: the constant term added to every prediction.
lm.intercept_

array([-761406.67820179])

In [15]:
# Predictions are made on test_df, which has no SalePrice column;
# that column is exactly what the model is meant to produce.
# Select the same feature columns, in the same order, that the model was fit on.
X_test = test_df.loc[:, features]

In [16]:
# Create our predictions using the fitted `lm` object: one predicted SalePrice
# per row of X_test, in the same row order.
test_preds = lm.predict(X_test)

In [22]:
# `test_preds` now holds the predicted house prices (a numpy array, not a Python list).
# Remember that numpy and pandas preserve the row order of these predictions from the df.
# Uncomment the next line to inspect the raw predictions.
#test_preds

In [None]:
# Score it: root mean squared error (RMSE) of the linear regression on the
# training data. The test set has no SalePrice, so the only score we can compute
# locally is this in-sample one.
# (The previous version read train_df['baseline'], a column this notebook never
# creates, so the cell raised a KeyError on a fresh kernel.)
train_preds = lm.predict(X_train)
metrics.mean_squared_error(y_train, train_preds) ** 0.5

In [18]:
# Use the test_preds array to create a "SalePrice" column in the test df.
# NOTE: this adds the column to test_df in place, so test_df no longer matches
# the frame displayed by the earlier test_df.head() cell.
test_df['SalePrice'] = test_preds

In [19]:
# Confirm the new SalePrice column is present at the end of the test frame.
test_df.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,146390.317086
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,139461.340998
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,249566.502028
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,,,,0,7,2007,WD,128885.927494
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,185,0,,,,0,7,2009,WD,166488.660889


In [20]:
# Build the submission frame: just the house Id and its predicted SalePrice.
# .copy() makes it independent of test_df.
submission = test_df.loc[:, ['Id', 'SalePrice']].copy()

In [21]:
# Write our dataframe to a csv WITHOUT the index column, because that's how Kaggle wants it.
# The file name now carries a ".csv" extension so the file is recognised as a CSV on upload.
# NOTE: to_csv does not create missing folders; `submissions/` must already exist under `data`.
submission.to_csv(data+'submissions/second_submission.csv',index=False)