In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from statsmodels.formula.api import ols
from sklearn import metrics

In [2]:
def re_name(df):
    news = []
    for i in df.columns:
        col = i.lower()
        splits = (col.split())
        joint = '_'.join(splits)
        news.append(joint)
    return news

In [3]:
def fix_nulls(df):
    nulls = []
    for i in df.columns:
        if df[i].isnull().sum() > 0:
            nulls.append(i)
    num_columns = []
    object_columns = []
    for i in nulls:
        if df[i].dtype != 'object':
            num_columns.append(i)
        else:
            object_columns.append(i)
    for i in num_columns:
        df[i].fillna(0, inplace=True)
    for i in object_columns:
        df[i].fillna('None', inplace=True)
    return     

In [4]:
def null_check(df, cols):
    nulls = []
    for col in cols:
        if df[col].isnull().sum() > 0:
            nulls.append((col, df[col].isnull().sum()))
    return nulls


In [5]:
def add_features(df):
    df['total_sf'] = df['1st_flr_sf'] + df['2nd_flr_sf']
    df.drop(columns = ['1st_flr_sf', '2nd_flr_sf'], inplace=True)
    
    
    
    

In [6]:
def dumify(df):
    df = pd.get_dummies(df, drop_first=True)
    
    

In [7]:
train_features = ['lot_frontage', 'overall_qual','total_bsmt_sf', 
            '1st_flr_sf', '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'full_bath', 'fireplaces', 'garage_cars',
           'garage_area', 'totrms_abvgrd', 'saleprice']
test_features = ['lot_frontage', 'overall_qual', 'total_bsmt_sf', 
            '1st_flr_sf', '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'full_bath', 'fireplaces', 'garage_cars',
           'garage_area', 'totrms_abvgrd']



## Read and Clean Test.csv

In [8]:
test = pd.read_csv('datasets/test.csv')
test.columns = re_name(test)
test = test[test_features]
test


Unnamed: 0,lot_frontage,overall_qual,total_bsmt_sf,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,full_bath,fireplaces,garage_cars,garage_area,totrms_abvgrd
0,69.0,6,1020,908,1020,0,1928,2,0,1,440,9
1,,5,1967,1967,0,0,1967,2,0,2,580,10
2,58.0,7,654,664,832,0,1496,2,1,2,426,7
3,60.0,5,968,968,0,0,968,1,0,2,480,5
4,,6,1394,1394,0,0,1394,1,2,2,514,6
...,...,...,...,...,...,...,...,...,...,...,...,...
873,80.0,6,1084,1084,793,0,1877,2,1,2,488,8
874,90.0,6,1104,1104,884,0,1988,2,1,2,480,9
875,55.0,5,952,1211,0,0,1211,1,1,1,322,5
876,60.0,4,864,864,0,0,864,1,0,2,528,5


In [9]:
null_check(test, test.columns)


[('lot_frontage', 160)]

In [10]:
fix_nulls(test)


In [11]:
null_check(test, test.columns)


[]

In [12]:
test.shape

(878, 12)

In [13]:
add_features(test)

In [14]:
#testing and training data are the same shape, can make predictions on the testing data 
test.shape

(878, 11)

In [15]:
test = pd.get_dummies(test)

In [16]:
test.shape

(878, 11)

In [17]:
test.head()

Unnamed: 0,lot_frontage,overall_qual,total_bsmt_sf,low_qual_fin_sf,gr_liv_area,full_bath,fireplaces,garage_cars,garage_area,totrms_abvgrd,total_sf
0,69.0,6,1020,0,1928,2,0,1,440,9,1928
1,0.0,5,1967,0,1967,2,0,2,580,10,1967
2,58.0,7,654,0,1496,2,1,2,426,7,1496
3,60.0,5,968,0,968,1,0,2,480,5,968
4,0.0,6,1394,0,1394,1,2,2,514,6,1394


In [18]:
test.columns

Index(['lot_frontage', 'overall_qual', 'total_bsmt_sf', 'low_qual_fin_sf',
       'gr_liv_area', 'full_bath', 'fireplaces', 'garage_cars', 'garage_area',
       'totrms_abvgrd', 'total_sf'],
      dtype='object')


## Read and Clean Train.csv 

In [19]:
train = pd.read_csv('datasets/base_train.csv')
train.drop(columns='Unnamed: 0', inplace=True)


In [20]:
train = train[train_features]

In [21]:
train.shape

(2051, 13)

In [22]:
add_features(train)

In [23]:
dum = pd.get_dummies(train, drop_first=True)

In [24]:
dum.columns

Index(['lot_frontage', 'overall_qual', 'total_bsmt_sf', 'low_qual_fin_sf',
       'gr_liv_area', 'full_bath', 'fireplaces', 'garage_cars', 'garage_area',
       'totrms_abvgrd', 'saleprice', 'total_sf'],
      dtype='object')

In [25]:
dum.shape

(2051, 12)

In [793]:
dum.isnull().sum()

lot_frontage       0
overall_qual       0
total_bsmt_sf      0
low_qual_fin_sf    0
gr_liv_area        0
full_bath          0
fireplaces         0
garage_cars        0
garage_area        0
totrms_abvgrd      0
saleprice          0
total_sf           0
dtype: int64

## Model A 

In [802]:
lr = LinearRegression()

In [815]:
#features are numeric only
X = dum.drop(columns='saleprice')
y = dum['saleprice']

In [816]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [820]:
lr.fit(X_train, y_train)

LinearRegression()

In [821]:
#a difference of 0.04 implies my model has too much variance,
#the scores are also not very high, implying it has too much bias and more variables need to be included
train_r2 = lr.score(X_train,y_train)
test_r2 = lr.score(X_test,y_test)

print(f'The training R2 is {round(train_r2,5)}.')
print(f'The testing R2 is {round(test_r2,5)}.')

The training R2 is 0.77435.
The testing R2 is 0.81379.


In [823]:
#good starting score
cross_val_score(lr,X_train,y_train,cv=20).mean()

0.7644998563037665

In [826]:
#not sure why totrms_abvgrd has a negative coef, might be due to collinearity with another feature or 
#outliers are throwing it off
coef_dict = {'coefficient':lr.coef_,'feature':X.columns}
coef_df = pd.DataFrame(coef_dict)
coef_df

Unnamed: 0,coefficient,feature
0,71.540808,lot_frontage
1,24793.376615,overall_qual
2,25.993407,total_bsmt_sf
3,-11.700682,low_qual_fin_sf
4,12.915311,gr_liv_area
5,3043.70288,full_bath
6,10425.584015,fireplaces
7,9619.59335,garage_cars
8,31.670838,garage_area
9,-27.073726,totrms_abvgrd


In [830]:
y_preds = lr.predict(test)

In [832]:
y_preds.mean()

179645.98480496177

In [834]:
df = pd.read_csv('datasets/test.csv')

In [837]:
submission = pd.DataFrame(df['Id'])

In [839]:
submission['SalePrice'] = y_preds

In [840]:
submission

Unnamed: 0,Id,SalePrice
0,2658,181955.020528
1,2718,192331.242051
2,2414,199890.271522
3,1989,127087.087315
4,625,196550.444669
...,...,...
873,1662,204083.902279
874,1234,209204.712715
875,1373,131235.593754
876,1672,97207.341082


In [841]:
submission.to_csv('submission_a.csv', index=False)