In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('./datasets/train.csv')

In [3]:
kaggle_test=pd.read_csv('./datasets/test.csv')

In [4]:
df.shape, kaggle_test.shape

((2051, 81), (878, 80))

In [6]:
df.head(3)
#has a sale price column

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000


In [8]:
kaggle_test.head(3)
#no sale price because we are trying to create a model for a true sale price

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New


# Preliminary EDA

In [10]:
df.shape

(2051, 81)

In [14]:
df.corr()[['SalePrice']].sort_values(by='SalePrice',ascending=False).head(3)
#positive correlations have stronger relationships than negative
#Will use Overall Quality and Gr Liv Area

Unnamed: 0,SalePrice
SalePrice,1.0
Overall Qual,0.800207
Gr Liv Area,0.697038


## Create Variables

In [16]:
xvars = ['Overall Qual','Gr Liv Area']

In [17]:
X = df[xvars]
y=df['SalePrice']

#nulls have to be accounted for somehow. will get this error:
#Input contains NaN, infinity or a value too large for dtype('float64').

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [23]:
X_train.shape, y_train.shape

((1538, 2), (1538,))

In [24]:
X_test.shape, y_test.shape

((513, 2), (513,))

In [27]:
lr = LinearRegression()
lr.fit(X,y)
lr.score(X_train,y_train), lr.score(X_test,y_test)

(0.7193561686388611, 0.7534470422118426)

In [32]:
#check the mean square error, kaggle doesn't accept R2 score

## Evaluate model

In [28]:
y_test_preds = lr.predict(X_test)

In [30]:
mean_squared_error(y_test,y_test_preds) **0.5


38908.11301194894

In [31]:
#on testing, my predictions are wrong by 38,908 between
min(y_test),max(y_test)

(12789, 460000)

## Generate submission to upload

In [33]:
kaggle_test.head(3)
#submit ID and sale price

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New


Getting prediction off of `kaggle_test`

In [34]:
xvars

['Overall Qual', 'Gr Liv Area']

In [37]:
kaggle_test[xvars].head(3)

Unnamed: 0,Overall Qual,Gr Liv Area
0,6,1928
1,5,1967
2,7,1496


In [39]:
kaggle_preds=lr.predict(kaggle_test[xvars])
kaggle_preds[:10]

array([202108.23658375, 171155.152948  , 210730.03858826, 114389.37597702,
       171764.96841307,  88266.21330477,  88323.03590434, 131436.15584818,
       221185.39690924, 165230.36946245])

In [42]:
#create a dictionary from preds
submission = pd.DataFrame({
    "Id":kaggle_test['Id'],
    'SalePrice':kaggle_preds
    
})

In [44]:
#create a submissions folder of different tests
#make sure to set index to False
submission.to_csv('submissions/name.csv',index=False)

In [None]:
#upload to kaggle

Want to create multiple models using one variable vs many variables.  
Use random variables
Submit the prediction dictionary from the model you thought was successful,
or submit multiple prediciton dictionaries from models you thought was strong.

Can create a readable variable to change xvars, and pull into models.
But really understand what each variable is.

# Make Kaggle submission off of pipeline

```python
X_train.columns == test.columns

pipe_pred=pipe.predict(test)

submission = pd.DataFrame({
    "Id":X_test['Id'],
    'SalePrice':pipe_preds
})

submission.to_csv('submissions/name.csv',index=False) 

```

You will need to add or subtract columns in test to make it look like columns in train if you are going to submit a pipeline for Kaggle.