**Import Packages**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score

%matplotlib inline

It is your job to predict the sales price for each house. For each Id in the test set, you must predict the value of the SalePrice variable.

**Read in Raw Data**

In [2]:
# Locations of the raw CSV files (relative to the current working directory)
filepath_train_data = "./datasets/train.csv"
filepath_test_data = "./datasets/test.csv"

# Load both splits in one go; df_test is not touched again until a model exists
df_train, df_test = map(pd.read_csv, (filepath_train_data, filepath_test_data))

In [3]:
# check the shape of df_train
df_train.shape

(2051, 81)

In [4]:
# check the shape of df_test
df_test.shape

(878, 80)

In [5]:
df_train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [6]:
df_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [8]:
# Normalise column names:
#   - everything lower case
#   - "/ " (slash followed by a space) becomes "_"
#   - every remaining space becomes "_"
# A slash that is NOT followed by a space is kept as-is, e.g. "year_remod/add".

def clean(df):
    """Rename the columns of ``df`` in place to lower-case, underscore-separated labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (its ``columns`` have been replaced).
    """
    normalized = df.columns.str.lower()
    # Order matters: collapse "/ " first so it yields one underscore, then handle spaces
    for separator in ("/ ", " "):
        normalized = normalized.str.replace(separator, "_")
    df.columns = normalized
    return df

In [9]:
df_train=clean(df_train)

In [10]:
df_test=clean(df_test)

In [11]:
# Quick check: which columns appear in the first frame but are missing from the second?
# (An empty list means every column of the first frame also exists in the second.)
def cols_not_in(test,train):
    """Return the column labels of ``test`` that are absent from ``train``.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame whose columns are checked.
    train : pandas.DataFrame
        Frame the columns are looked up in.

    Returns
    -------
    list
        Labels present in ``test.columns`` but not in ``train.columns``,
        in the order they appear in ``test``.
    """
    # Use the arguments, not the notebook-level df_test / df_train globals
    return [column for column in test.columns if column not in train.columns]

cols_not_in(df_test,df_train)

[]

In [12]:
len(df_test.columns)

80

In [13]:
len(df_train.columns)

81

In [None]:
pd.set_option('display.max_columns', 100)

In [None]:
df_train.head(30)

In [None]:
df_test.head()

# Data Cleaning

In [None]:
# Use the house identifier as the row index.
# clean() lower-cased every column name, so the column is "id" here, not "Id".
df_train = df_train.set_index("id")

In [None]:
df_train.shape

In [None]:
# Number of missing values per column, restricted to columns that have at least one.
# Computed in a single pass instead of re-scanning every column twice.
null_counts = df_train.isnull().sum()
null_counts[null_counts > 0].to_dict()

In [None]:
# What percent of each column are missing?
df_train.isnull().mean()

In [None]:
# Percentage of missing values per column, restricted to columns that have at least one.
# Computed in a single pass instead of re-scanning every column twice.
missing_pct = df_train.isnull().mean() * 100
missing_pct[missing_pct > 0].to_dict()
# Plan: ignore all of these features except total_bsmt_sf, garage_yr_blt, garage_cars and garage_area

In [None]:
df_train.columns

In [None]:
df_train.info()

In [None]:
# Summary statistics of the target; the column is "saleprice" after clean() lower-cased the names
df_train["saleprice"].describe()

In [None]:
df_train.columns

In [None]:
df_test.columns

In [None]:
# Every column containing nulls is marked for removal, except these four,
# which are kept as candidate model features.
columns_to_keep = ("total_bsmt_sf", "garage_yr_blt", "garage_cars", "garage_area")

columns_to_drop = [
    column
    for column in df_train.columns
    if column not in columns_to_keep and df_train[column].isnull().mean() > 0
]

In [None]:
columns_to_drop

In [None]:
df_train.drop(columns=columns_to_drop,inplace=True)

In [None]:
# Drop the rows where garage_yr_blt is missing. Calling dropna(inplace=True) on the
# selected column only affects that temporary Series and leaves df_train unchanged.
df_train = df_train.dropna(subset=["garage_yr_blt"])

In [None]:
# Drop the rows where garage_cars is missing. Calling dropna(inplace=True) on the
# selected column only affects that temporary Series and leaves df_train unchanged.
df_train = df_train.dropna(subset=["garage_cars"])

In [None]:
# Drop the rows where garage_area is missing. Calling dropna(inplace=True) on the
# selected column only affects that temporary Series and leaves df_train unchanged.
df_train = df_train.dropna(subset=["garage_area"])

In [None]:
# Correlation of every numeric column with the target, strongest first.
# Non-numeric columns are excluded explicitly: recent pandas versions raise on them
# in DataFrame.corr() instead of silently skipping them.
saleprice_corr = (
    df_train.select_dtypes(include="number")
    .corr()[["saleprice"]]
    .sort_values(by="saleprice", ascending=False)
)

plt.figure(figsize=(4,10))
sns.heatmap(saleprice_corr, annot=True);

In [None]:
df_train.dropna(inplace=True)

In [None]:
df_train.head(50)

# EDA

Done!

## Preprocessing and Modeling

In [None]:
# Candidate features, picked from their correlation with saleprice (see the heatmap above).
# Approximate correlations noted while exploring:
# year built - .57
# mas vnr area - .51
# total bsmt sf - .63
# 1st flr sf - .62
# gr liv area - .7
# full bath - .5
# totrms abvgrd - .50
# garage yr blt - .53
# garage cars - .65
# garage area - .65
# overall qual - .8
# year remod/add - .55

# First model: eight of the strongly correlated numeric columns listed above.
# Note that "year_remod/add" keeps its slash because clean() only replaces "/ " (slash + space).
features = ["overall_qual","gr_liv_area","garage_area","garage_cars","total_bsmt_sf","year_built","year_remod/add","1st_flr_sf"]
X = df_train[features]
y = df_train["saleprice"]

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=9)

In [None]:
# Instantiate an (unfitted) ordinary least-squares linear regression model
lr=LinearRegression()

In [None]:
# Mean 5-fold cross-validation score of the linear model on the training split
# (uses the estimator's default score, which is R^2 for LinearRegression)
cross_val_score(lr,X_train,y_train,cv=5).mean()

In [None]:
lr.fit(X_train,y_train)

In [None]:
predictions=lr.predict(X_test)

In [None]:
lr.score(X_train,y_train)

In [None]:
lr.score(X_test,y_test)

In [None]:
### Establish a Baseline Model that predicts y_train_mean for EVERYTHING

# Import Package
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

# Instantiate and fit a model that always predicts the mean of the training target.
# Any real model has to beat this to be worth keeping.
base_mean = DummyRegressor(strategy='mean').fit(X_train, y_train)

# Get predictions!
y_hat_base_train = base_mean.predict(X_train)
y_hat_base_test = base_mean.predict(X_test)

# Evaluate: let's use RMSE
print(f'Our train RMSE score for our Baseline Model is: {np.sqrt(mean_squared_error(y_train, y_hat_base_train))}')
print(f'Our test RMSE score for our Baseline Model is: {np.sqrt(mean_squared_error(y_test, y_hat_base_test))}')

**My Actual Model: Linear Regression**

In [None]:
#Add features 

In [None]:
# Instantiate our linear regression object: 


# Fit our model:


In [None]:
# What are columns that are fed as input?


In [None]:
# What is beta_0?


In [None]:
# What are the rest of our betas?


In [None]:
# Make Predictions with the linear regression fitted on X_train above
y_hat_lr_train = lr.predict(X_train)  # familiar data
y_hat_lr_test = lr.predict(X_test)  # unseen/unfamiliar data

# Get RMSE (same metric as the baseline model, so the two can be compared directly)
print(f'Our train RMSE score for our Linear Regression Model is: {np.sqrt(mean_squared_error(y_train, y_hat_lr_train))}')
print(f'Our test RMSE score for our Linear Regression Model is: {np.sqrt(mean_squared_error(y_test, y_hat_lr_test))}')


# Preparing .csv to submit to Kaggle

In [None]:
features

In [None]:
# Create a dataframe of our features from our testing data.
# The rows are indexed by the house identifier (labelled "Id") so that the
# identifier travels with the predictions into the submission file.
#
# The model was trained on the raw numeric columns in `features` only -- no dummy
# variables or other engineered columns -- so nothing else has to be recreated here.
# If feature engineering is added to the training data later, apply the same steps here.
#
# LinearRegression cannot predict on NaN, so any gap in the test features is
# filled with the corresponding median from the training split.
X_kaggle = (
    df_test.set_index("id")
    .rename_axis("Id")[features]
    .fillna(X_train.median())
)

In [None]:
# Make predictions and save those predictions to a new column.
# Only the model's input columns are passed to predict(), so this cell can be re-run
# even after the SalePrice column has been added.
X_kaggle['SalePrice'] = lr.predict(X_kaggle[features])
X_kaggle.head()

In [None]:
# Create a new dataframe of JUST our predictions
output = X_kaggle[['SalePrice']]
output.head()

In [None]:
# Saving our predictions to our datasets folder
output.to_csv("./datasets/my_first_submission.csv")