# Predicting House Sale Prices
---

### Prereqs

In [1]:
#Importing the libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold

%matplotlib inline

In [2]:
# Load the tab-separated Ames housing data and preview the first rows
df = pd.read_csv('AmesHousing.tsv', sep='\t')
df.head(5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [3]:
#Defining some functions

def transform_features(house):
    """Placeholder transformation step: hand the input back untouched."""
    return house

def select_features(house):
    """Keep only the 'Gr Liv Area' feature and the 'SalePrice' target."""
    baseline_columns = ['Gr Liv Area', 'SalePrice']
    return house[baseline_columns]

def train_and_test(house):
    """Fit a linear regression on the first 1460 rows and score it on the rest.

    Parameters
    ----------
    house : pandas.DataFrame
        Must contain a 'SalePrice' column. Every other integer/float column
        is used as a feature.

    Returns
    -------
    float
        Root mean squared error of the predictions on the hold-out rows.
    """
    # Positional hold-out split: rows 0..1459 train, the remainder test.
    train = house.iloc[0:1460]
    test = house.iloc[1460:]

    # Only numeric columns are fed to the regression; the target is excluded.
    numeric_columns = train.select_dtypes(include=['integer', 'float']).columns
    features = numeric_columns.drop('SalePrice')

    lr = LinearRegression()
    lr.fit(train[features], train['SalePrice'])
    predictions = lr.predict(test[features])

    # scikit-learn's documented argument order is (y_true, y_pred).
    return np.sqrt(mean_squared_error(test['SalePrice'], predictions))

# Run the pipeline end to end on the loaded frame:
# transform -> select features -> train/test.
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

# Display the resulting RMSE as the cell output.
rmse

57088.25161263909

### Feature Engineering

- Handle missing values:
    - All columns:
        - Drop any with 5% or more missing values **for now**.
    - Text columns:
        - Drop any with 1 or more missing values **for now**.
    - Numerical columns:
        - For columns with missing values, fill in with the most common value in that column

1: All columns: Drop any with 5% or more missing values **for now**.

In [4]:
# Count the missing values in every column
null_count = df.isnull().sum()

# Columns that are missing in more than 5% of the rows
sparse_cols = null_count[null_count > 0.05 * len(df)]
print(sparse_cols)

# Remove those columns from the dataframe
df.drop(columns=sparse_cols.index, inplace=True)

Lot Frontage      490
Alley            2732
Fireplace Qu     1422
Garage Type       157
Garage Yr Blt     159
Garage Finish     159
Garage Qual       159
Garage Cond       159
Pool QC          2917
Fence            2358
Misc Feature     2824
dtype: int64


2: Text columns: Drop any with 1 or more missing values **for now**.

In [5]:
# Null counts of the text (object) columns, largest first
text_null_counts = df.select_dtypes(include=['object']).isnull().sum()
text_miss = text_null_counts.sort_values(ascending=False)

# Text columns with at least one missing value
incomplete_text = text_miss[text_miss >= 1]
print(incomplete_text)

# Remove those columns from the dataframe
df.drop(columns=incomplete_text.index, inplace=True)

Bsmt Exposure     83
BsmtFin Type 2    81
BsmtFin Type 1    80
Bsmt Qual         80
Bsmt Cond         80
Mas Vnr Type      23
Electrical         1
dtype: int64


3: Numerical columns: For columns with missing values, fill in with the most common value in that column

In [6]:
# Null counts of the numerical columns
num_missing = df.select_dtypes(include=['float', 'integer']).isnull().sum()

# Keep the columns with at least one, but fewer than 5%, missing values
has_nulls = num_missing.gt(0)
below_cutoff = num_missing.lt(0.05 * len(df))
crit = num_missing[has_nulls & below_cutoff]
crit

Mas Vnr Area      23
BsmtFin SF 1       1
BsmtFin SF 2       1
Bsmt Unf SF        1
Total Bsmt SF      1
Bsmt Full Bath     2
Bsmt Half Bath     2
Garage Cars        1
Garage Area        1
dtype: int64

In [7]:
# Most frequent value of each affected column (first row of .mode()),
# stored as a {column: value} mapping
column_modes = df[crit.index].mode()
replacement_values_dict = column_modes.to_dict(orient='records')[0]

# Fill the gaps in place with those values
df.fillna(value=replacement_values_dict, inplace=True)

# The null count of every column should now collapse to a single value, 0
remaining_nulls = df.isnull().sum()
remaining_nulls.unique()

array([0])

Some columns still cannot be used as they are and need to be transformed further before they are useful; the year-based columns are one example.

In [8]:
#Creating new columns for years
df['Years Before Sale'] = df['Yr Sold'] - df['Year Built']
df['Years Since Remod'] = df['Yr Sold'] - df['Year Remod/Add']

#Finding outliers in those columns: a negative age means the house was
#recorded as sold before it was built or remodelled. Each column is compared
#explicitly instead of relying on the sign bit of an integer bitwise OR.
negative_years = (df['Years Before Sale'] < 0) | (df['Years Since Remod'] < 0)

#Dropping the outliers by mask rather than by hard-coded row labels, so the
#cell keeps working if the data or its index changes
df.drop(df.index[negative_years], axis=0, inplace=True)

#Rechecking
((df['Years Before Sale'] < 0) | (df['Years Since Remod'] < 0)).sum()

0

Drop columns that:
- aren't useful for ML
- leak data about the final sale, read more about columns [here](https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt)

In [9]:
## Columns that aren't useful for ML
not_useful_cols = ["PID", "Order"]

## Columns that leak info about the final sale
leaky_cols = ["Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"]

## Drop both groups in one pass
df = df.drop(columns=not_useful_cols + leaky_cols)

Updating the transform function

In [10]:
def transform_features(df):
    """Clean the raw housing frame and engineer the year-based features.

    Steps, in order:
      1. Drop every column with more than 5% missing values.
      2. Drop every text (object) column with at least one missing value.
      3. Fill the remaining numeric gaps with each column's most common value.
      4. Add 'Years Before Sale' and 'Years Since Remod', and drop rows where
         either of them is negative.
      5. Drop identifier columns and columns that leak sale information.

    All thresholds and masks are computed from the frame passed in, so the
    function does not depend on variables left behind by earlier cells, and
    the caller's frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain 'Yr Sold', 'Year Built', 'Year Remod/Add',
        'PID', 'Order', 'Mo Sold', 'Sale Condition' and 'Sale Type'.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    # 1. Columns that are mostly unusable because too much is missing.
    null_count = df.isnull().sum()
    df = df.drop(columns=null_count[null_count > 0.05 * len(df)].index)

    # 2. Text columns cannot be mode-filled meaningfully here; drop if incomplete.
    text_miss = df.select_dtypes(include=['object']).isnull().sum()
    df = df.drop(columns=text_miss[text_miss >= 1].index)

    # 3. Everything still numeric-with-gaps has at most 5% missing after step 1,
    #    so fill all of it (including a column sitting exactly on the cutoff).
    num_missing = df.select_dtypes(include=['float', 'integer']).isnull().sum()
    crit = num_missing[num_missing > 0]
    if len(crit) > 0:
        # .mode() can return several rows on ties; the first row is used.
        replacement_values_dict = df[crit.index].mode().iloc[0].to_dict()
        df = df.fillna(replacement_values_dict)

    # 4. Year-based features, built without touching the input frame.
    df = df.assign(**{
        'Years Before Sale': df['Yr Sold'] - df['Year Built'],
        'Years Since Remod': df['Yr Sold'] - df['Year Remod/Add'],
    })
    # A negative age is a data error; locate such rows by mask, not by label.
    negative_years = (df['Years Before Sale'] < 0) | (df['Years Since Remod'] < 0)
    df = df.loc[~negative_years]

    # 5. Identifiers carry no signal; the sale columns leak the outcome.
    df = df.drop(columns=["PID", "Order"])
    df = df.drop(columns=["Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"])

    return df

# Re-read the raw file so the pipeline starts from an unmodified frame,
# then run it with the updated transform_features.
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

# Display the resulting RMSE as the cell output.
rmse

55275.36731241307

### Feature Selection

In [11]:
# Restrict the transformed frame to its numerical columns
numeric_dtypes = ['int', 'float']
numerical_df = transform_df.select_dtypes(include=numeric_dtypes)
numerical_df.head()

Unnamed: 0,MS SubClass,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,SalePrice,Years Before Sale,Years Since Remod
0,20,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,...,210,62,0,0,0,0,0,215000,50,50
1,20,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,...,140,0,0,0,120,0,0,105000,49,49
2,20,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,...,393,36,0,0,0,0,12500,172000,52,52
3,20,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,...,0,0,0,0,0,0,0,244000,42,42
4,60,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,...,212,34,0,0,0,0,0,189900,13,12


In [12]:
# Correlation of every numerical column with the target ...
price_corr = numerical_df.corr()['SalePrice']

# ... as absolute values, weakest first
abs_corr_coeffs = price_corr.abs().sort_values()
abs_corr_coeffs

BsmtFin SF 2         0.006127
Misc Val             0.019273
3Ssn Porch           0.032268
Bsmt Half Bath       0.035875
Low Qual Fin SF      0.037629
Pool Area            0.068438
MS SubClass          0.085128
Overall Cond         0.101540
Screen Porch         0.112280
Kitchen AbvGr        0.119760
Enclosed Porch       0.128685
Bedroom AbvGr        0.143916
Bsmt Unf SF          0.182751
Lot Area             0.267520
2nd Flr SF           0.269601
Bsmt Full Bath       0.276258
Half Bath            0.284871
Open Porch SF        0.316262
Wood Deck SF         0.328183
BsmtFin SF 1         0.439284
Fireplaces           0.474831
TotRms AbvGrd        0.498574
Mas Vnr Area         0.506983
Year Remod/Add       0.533007
Years Since Remod    0.534985
Full Bath            0.546118
Year Built           0.558490
Years Before Sale    0.558979
1st Flr SF           0.635185
Garage Area          0.641425
Total Bsmt SF        0.644012
Garage Cars          0.648361
Gr Liv Area          0.717596
Overall Qu

In [13]:
# Columns whose absolute correlation with SalePrice is below 0.3
weak_corr_cols = abs_corr_coeffs[abs_corr_coeffs < 0.3].index

# Remove them in place, keeping only the more strongly correlated columns
transform_df.drop(columns=weak_corr_cols, inplace=True)

Which categorical columns should we keep?
- Which columns are currently numerical but need to be encoded as categorical instead (because the numbers don't have any semantic meaning)?
- If a categorical column has hundreds of unique values (or categories), should we keep it? When we dummy code this column, hundreds of columns will need to be added back to the data frame.

In [14]:
# Columns the data documentation describes as nominal (categorical)
nominal_features = [
    "PID",
    "MS SubClass",
    "MS Zoning",
    "Street",
    "Alley",
    "Land Contour",
    "Lot Config",
    "Neighborhood",
    "Condition 1",
    "Condition 2",
    "Bldg Type",
    "House Style",
    "Roof Style",
    "Roof Matl",
    "Exterior 1st",
    "Exterior 2nd",
    "Mas Vnr Type",
    "Foundation",
    "Heating",
    "Central Air",
    "Garage Type",
    "Misc Feature",
    "Sale Type",
    "Sale Condition",
]

In [15]:
# Nominal columns that survived the earlier cleaning steps
transform_cat_cols = [col for col in nominal_features if col in transform_df.columns]

# Number of distinct values per categorical column, smallest first
uniqueness_counts = (
    transform_df[transform_cat_cols]
    .apply(lambda col: len(col.unique()))
    .sort_values()
)
print(uniqueness_counts)

# Arbitrary cutoff: drop columns with more than 8 unique values
drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 8].index
transform_df = transform_df.drop(columns=drop_nonuniq_cols)

Street           2
Central Air      2
Land Contour     4
Lot Config       5
Bldg Type        5
Roof Style       6
Foundation       6
Heating          6
MS Zoning        7
Condition 2      8
House Style      8
Roof Matl        8
Condition 1      9
Exterior 1st    16
Exterior 2nd    17
Neighborhood    28
dtype: int64


In [16]:
# Remaining text columns, each converted to the pandas category dtype
text_cols = transform_df.select_dtypes(include=['object'])
for col in text_cols.columns:
    transform_df[col] = transform_df[col].astype('category')

# One dummy column per category value
dummy_df = pd.get_dummies(transform_df.select_dtypes(include=['category']))

# Append the dummies, then remove the columns they were derived from
transform_df = pd.concat([transform_df, dummy_df], axis=1)
transform_df = transform_df.drop(text_cols.columns, axis=1)

transform_df.head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,...,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sal,Functional_Sev,Functional_Typ,Paved Drive_N,Paved Drive_P,Paved Drive_Y
0,6,1960,1960,112.0,639.0,1080.0,1656,1656,1,7,...,0,0,0,0,0,0,1,0,1,0
1,5,1961,1961,0.0,468.0,882.0,896,896,1,5,...,0,0,0,0,0,0,1,0,0,1
2,6,1958,1958,108.0,923.0,1329.0,1329,1329,1,6,...,0,0,0,0,0,0,1,0,0,1
3,7,1968,1968,0.0,1065.0,2110.0,2110,2110,2,8,...,0,0,0,0,0,0,1,0,0,1
4,5,1997,1998,0.0,791.0,928.0,928,1629,2,6,...,0,0,0,0,0,0,1,0,0,1


Updating the `select_features` function

In [17]:
def select_features(df, coeff_threshold=0.3, uniq_threshold=8, nominal_cols=None):
    """Pick model features by correlation and categorical cardinality.

    Steps, in order:
      1. Drop numeric columns whose absolute correlation with 'SalePrice'
         is below ``coeff_threshold``.
      2. Drop nominal columns with more than ``uniq_threshold`` unique values.
      3. Convert the remaining text columns to the category dtype and replace
         them with 0/1 dummy columns.

    The caller's frame is never modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data containing a 'SalePrice' column.
    coeff_threshold : float, default 0.3
        Minimum absolute correlation with 'SalePrice' for a numeric column
        to be kept.
    uniq_threshold : int, default 8
        Maximum number of unique values a nominal column may have.
    nominal_cols : list of str, optional
        Column names to treat as nominal. Defaults to the notebook-level
        ``nominal_features`` list.

    Returns
    -------
    pandas.DataFrame
        The kept non-text columns followed by the dummy columns.
    """
    if nominal_cols is None:
        nominal_cols = nominal_features

    # 1. Weakly correlated numeric columns. 'integer' matches the dtype
    #    selection used by train_and_test.
    numerical_df = df.select_dtypes(include=['integer', 'float'])
    abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs()
    df = df.drop(columns=abs_corr_coeffs[abs_corr_coeffs < coeff_threshold].index)

    # 2. High-cardinality nominal columns would explode into too many dummies.
    transform_cat_cols = [col for col in nominal_cols if col in df.columns]
    if transform_cat_cols:
        uniqueness_counts = df[transform_cat_cols].nunique(dropna=False)
        drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > uniq_threshold].index
        df = df.drop(columns=drop_nonuniq_cols)

    # 3. Dummy-code whatever is categorical now.
    text_col_names = df.select_dtypes(include=['object']).columns
    df = df.astype({col: 'category' for col in text_col_names})
    categorical_df = df.select_dtypes(include=['category'])
    if categorical_df.shape[1] == 0:
        # Nothing to encode; get_dummies/concat would have nothing to work on.
        return df

    # Pin the dummy dtype to an integer type: newer pandas defaults to bool,
    # which an integer/float dtype filter downstream would silently discard.
    dummies = pd.get_dummies(categorical_df, dtype='uint8')
    return pd.concat([df, dummies], axis=1).drop(columns=text_col_names)

# Re-read the raw file and run the pipeline with the updated select_features.
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

# Display the resulting RMSE as the cell output.
rmse

381506946.6431339

Updating the `train_and_test` function to include K-Fold Cross Validation

In [18]:
def train_and_test(df, k=0, random_state=None):
    """Train a linear regression on ``df`` and return its RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SalePrice' column. Every other integer/float column
        is used as a feature.
    k : int, default 0
        Validation scheme:
          * 0  - hold-out: fit on the first 1460 rows, score on the rest.
          * 1  - shuffle, split at row 1460, fit on each half and score on
                 the other half; the two RMSEs are averaged.
          * >1 - shuffled k-fold cross validation with ``k`` splits.
    random_state : int or None, default None
        Seed for the shuffles in the ``k == 1`` and ``k > 1`` schemes. Leave
        as None for a different shuffle on every call.

    Returns
    -------
    float
        RMSE (hold-out) or the mean of the per-fold RMSEs.
    """
    # Features are used unscaled; the target column is excluded.
    numeric_columns = df.select_dtypes(include=['integer', 'float']).columns
    features = numeric_columns.drop("SalePrice")
    lr = LinearRegression()

    if k == 0:
        train = df.iloc[0:1460]
        test = df.iloc[1460:]

        lr.fit(train[features], train['SalePrice'])
        predictions = lr.predict(test[features])
        return np.sqrt(mean_squared_error(test['SalePrice'], predictions))

    if k == 1:
        # DataFrame.sample returns a new frame (it has no in-place mode), so
        # the shuffled copy is bound to a new name.
        shuffled = df.sample(frac=1, random_state=random_state)
        fold_one = shuffled.iloc[0:1460]
        fold_two = shuffled.iloc[1460:]

        # Each model is scored on the fold it was NOT trained on.
        lr.fit(fold_one[features], fold_one['SalePrice'])
        predictions = lr.predict(fold_two[features])
        rmse1 = np.sqrt(mean_squared_error(fold_two['SalePrice'], predictions))

        lr.fit(fold_two[features], fold_two['SalePrice'])
        predictions = lr.predict(fold_one[features])
        rmse2 = np.sqrt(mean_squared_error(fold_one['SalePrice'], predictions))

        # np.mean takes one array-like; a second positional argument is the axis.
        return np.mean([rmse1, rmse2])

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    # The scorer returns negated MSEs, hence the abs() before the square root.
    mses = cross_val_score(lr, df[features], df['SalePrice'],
                           scoring="neg_mean_squared_error", cv=kf)
    return np.mean(np.sqrt(np.abs(mses)))

# Re-read the raw file and run the whole pipeline again, this time calling
# train_and_test with k=4.
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df, k=4)

# Display the resulting RMSE as the cell output.
rmse

29168.30521758529

### Conclusion

By engineering and selecting informative features of a house, we can roughly predict its sale price using linear regression. Keep in mind that the number of features is positively related to the variance of the model: adding features reduces the bias of the model but increases its variance (the risk of overfitting) too.