In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

In [2]:
data = pd.read_csv('AmesHousing.tsv', delimiter='\t')

In [3]:
## Creating functions as a pipeline for predicting the data and returning the RMSE

## Function for transforming the features based on criteria (initially none)
def transform_features(df):
    """Placeholder transform step: hand the input frame back unchanged."""
    return df

## Function for selecting features for predicting based on criteria (initially none)
def select_features(df):
    """Keep only the single predictor column and the target column."""
    kept_columns = ['Gr Liv Area', 'SalePrice']
    return df[kept_columns]

## Function for splitting a dataframe into train and test and returning the RMSE of the predictions
def train_and_test(df):
    """Fit a linear regression on the first 1460 rows and score it on the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SalePrice' column. Every other integer/float column
        of the training slice is used as a feature.

    Returns
    -------
    float
        Root mean squared error of the predictions for the rows from
        position 1460 onwards.
    """
    ## Positional holdout split
    train = df[:1460]
    test = df[1460:]

    ## Features are the numeric columns of the training slice, minus the target
    features = train.select_dtypes(include=['integer', 'float']).columns.drop('SalePrice')

    ## Fit on the training slice, predict the test slice
    lr = linear_model.LinearRegression()
    lr.fit(train[features], train['SalePrice'])
    predictions = lr.predict(test[features])

    return np.sqrt(mean_squared_error(test['SalePrice'], predictions))

## Test the functions: run the baseline pipeline end to end
## (transform -> select -> train/test) and display the resulting RMSE
transform_data = transform_features(data)
filtered_data = select_features(transform_data)
rmse = train_and_test(filtered_data)

rmse

57088.25161263909

In [4]:
## Count the null values in each column (one entry per column of `data`)
null_values = data.isnull().sum()

In [5]:
## Drop columns with more than 5% missing values
over_5 = null_values[(null_values > len(data) * 0.05)].sort_values()
data = data.drop(over_5.index, axis=1)

In [6]:
## Find the numeric columns that still have missing values
## (these are filled with the mode of each column in the cells below)
numeric_missing = data.select_dtypes(include=['float', 'int']).isnull().sum()
fix_columns = numeric_missing[numeric_missing > 0].sort_values()
fix_columns

BsmtFin SF 1       1
BsmtFin SF 2       1
Bsmt Unf SF        1
Total Bsmt SF      1
Garage Cars        1
Garage Area        1
Bsmt Full Bath     2
Bsmt Half Bath     2
Mas Vnr Area      23
dtype: int64

In [7]:
## Create a dictionary with the mode for each fixable numeric column
## (mode() can return several rows when values tie; only the first row is used)
replace_dict = data[fix_columns.index].mode().to_dict(orient='records')[0]
replace_dict

{'BsmtFin SF 1': 0.0,
 'BsmtFin SF 2': 0.0,
 'Bsmt Unf SF': 0.0,
 'Total Bsmt SF': 0.0,
 'Garage Cars': 2.0,
 'Garage Area': 0.0,
 'Bsmt Full Bath': 0.0,
 'Bsmt Half Bath': 0.0,
 'Mas Vnr Area': 0.0}

In [8]:
## Use the dictionary to fill in the missing values, one fill value per listed column
data = data.fillna(replace_dict)

In [9]:
## Count the null values in each text (object) column
text_missing = data.select_dtypes(include='object').isnull().sum()

## Remove the text columns that have any null values
drop_text = text_missing[text_missing > 0]
data = data.drop(drop_text.index, axis=1)

## Verify that there are no more null values
## (the output should show a single entry: a null count of 0 for every column)
data.isnull().sum().value_counts()

0    64
dtype: int64

In [10]:
## Create new columns to improve the data

## Years between construction and sale: Yr Sold minus Year Built
data['Years Before Sale'] = data['Yr Sold'] - data['Year Built']

## Years between the last remodel/addition and the sale: Yr Sold minus Year Remod/Add
data['Years Since Remod'] = data['Yr Sold'] - data['Year Remod/Add']

In [11]:
## Find rows with negative values for Years Before Sale
## (negative means the recorded sale year is earlier than the build year)
data['Years Before Sale'][data['Years Before Sale'] < 0]

2180   -1
Name: Years Before Sale, dtype: int64

In [12]:
## Find rows with negative values for Years Since Remod
## (negative means the recorded sale year is earlier than the remodel year)
data['Years Since Remod'][data['Years Since Remod'] < 0]

1702   -1
2180   -2
2181   -1
Name: Years Since Remod, dtype: int64

In [13]:
## Drop rows with negative values for the new columns.
## The rows are selected by condition rather than by hard-coded index labels;
## for this dataset these are the rows labelled 1702, 2180 and 2181 found above.
negative_rows = (data['Years Before Sale'] < 0) | (data['Years Since Remod'] < 0)
data = data[~negative_rows]

## Drop the year columns that are no longer necessary
data = data.drop(['Year Built', 'Year Remod/Add'], axis=1)

In [14]:
## Drop the identifier columns, which are not useful for ML
data = data.drop(['PID', 'Order'], axis=1)

## Drop columns that leak data about the sale
data = data.drop(['Mo Sold', 'Sale Condition', 'Sale Type', 'Yr Sold'], axis=1)

In [15]:
## Update the transform_features function with the findings above

## Function for transforming the features based on the criteria found above
def transform_features(df):
    """Clean the raw housing frame and add the two age features.

    Steps, in order:
      1. Drop every column with more than 5% missing values.
      2. Fill the remaining gaps in numeric columns with that column's mode.
      3. Drop every text column that still has a missing value.
      4. Add 'Years Before Sale' and 'Years Since Remod'.
      5. Drop the rows where either new column is negative.
      6. Drop the source year columns, the identifier columns and the
         columns that leak information about the sale.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain 'Yr Sold', 'Year Built', 'Year Remod/Add',
        'PID', 'Order', 'Mo Sold', 'Sale Condition' and 'Sale Type'.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the frame passed in is not modified.
    """
    ## Drop columns with more than 5% missing values
    null_values = df.isnull().sum()
    over_5 = null_values[null_values > len(df) * 0.05]
    df = df.drop(over_5.index, axis=1)

    ## Find numeric columns that still have missing values
    numeric_missing = df.select_dtypes(include=['float', 'int']).isnull().sum()
    fix_columns = numeric_missing[numeric_missing > 0]

    ## Fill them with the mode of each column. Skipped when nothing needs
    ## filling, because mode() of an empty selection has no first row to take.
    if len(fix_columns) > 0:
        replace_dict = df[fix_columns.index].mode().to_dict(orient='records')[0]
        df = df.fillna(replace_dict)

    ## Remove text columns that have any null values
    text_missing = df.select_dtypes(include='object').isnull().sum()
    drop_text = text_missing[text_missing > 0]
    df = df.drop(drop_text.index, axis=1)

    ## Create a column from Year Sold and Year Built
    df['Years Before Sale'] = df['Yr Sold'] - df['Year Built']

    ## Create a column from Year Sold and Year Remodelled
    df['Years Since Remod'] = df['Yr Sold'] - df['Year Remod/Add']

    ## Drop rows with negative values for the new columns, selected by
    ## condition so the function works on any frame, not only on one that
    ## contains specific index labels
    negative_rows = (df['Years Before Sale'] < 0) | (df['Years Since Remod'] < 0)
    df = df[~negative_rows]

    ## Drop the year columns that are no longer necessary
    df = df.drop(['Year Built', 'Year Remod/Add'], axis=1)

    ## Drop columns that are not useful for ML
    df = df.drop(['PID', 'Order'], axis=1)

    ## Drop columns that leak data about the sale
    df = df.drop(['Mo Sold', 'Sale Condition', 'Sale Type', 'Yr Sold'], axis=1)

    return df

## Function for selecting features for predicting based on criteria (initially none)
def select_features(df):
    """Reduce the frame to the living-area predictor and the sale price target."""
    predictor, target = 'Gr Liv Area', 'SalePrice'
    return df[[predictor, target]]

## Function for splitting a dataframe into train and test and returning the RMSE of the predictions
def train_and_test(df):
    """Fit a linear regression on the first 1460 rows and score it on the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SalePrice' column. Every other integer/float column
        of the training slice is used as a feature.

    Returns
    -------
    float
        Root mean squared error of the predictions for the rows from
        position 1460 onwards.
    """
    ## Positional holdout split
    train = df[:1460]
    test = df[1460:]

    ## Features are the numeric columns of the training slice, minus the target
    features = train.select_dtypes(include=['integer', 'float']).columns.drop('SalePrice')

    ## Fit on the training slice, predict the test slice
    lr = linear_model.LinearRegression()
    lr.fit(train[features], train['SalePrice'])
    predictions = lr.predict(test[features])

    return np.sqrt(mean_squared_error(test['SalePrice'], predictions))

## Test the functions on a freshly loaded copy of the raw file,
## so the updated transform_features starts from the unmodified data
df = pd.read_csv('AmesHousing.tsv', delimiter='\t')
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

rmse

55275.367312413066

In [16]:
## Find columns with a high correlation with SalePrice

## Include only numeric columns
numeric_df = transform_df.select_dtypes(include=['float', 'int'])

## Absolute correlation of each numeric column with SalePrice, sorted ascending
## (the sign is discarded, so strong negative correlations rank high as well)
abs_corr = numeric_df.corr()['SalePrice'].abs().sort_values()

abs_corr

BsmtFin SF 2         0.006127
Misc Val             0.019273
3Ssn Porch           0.032268
Bsmt Half Bath       0.035875
Low Qual Fin SF      0.037629
Pool Area            0.068438
MS SubClass          0.085128
Overall Cond         0.101540
Screen Porch         0.112280
Kitchen AbvGr        0.119760
Enclosed Porch       0.128685
Bedroom AbvGr        0.143916
Bsmt Unf SF          0.182751
Lot Area             0.267520
2nd Flr SF           0.269601
Bsmt Full Bath       0.276258
Half Bath            0.284871
Open Porch SF        0.316262
Wood Deck SF         0.328183
BsmtFin SF 1         0.439284
Fireplaces           0.474831
TotRms AbvGrd        0.498574
Mas Vnr Area         0.506983
Years Since Remod    0.534985
Full Bath            0.546118
Years Before Sale    0.558979
1st Flr SF           0.635185
Garage Area          0.641425
Total Bsmt SF        0.644012
Garage Cars          0.648361
Gr Liv Area          0.717596
Overall Qual         0.801206
SalePrice            1.000000
Name: Sale

In [17]:
## Use a correlation cutoff value to only include columns that have a higher correlation coefficient
cor_cutoff = 0.4

## Apply cor_cutoff to drop the columns whose absolute correlation is below it
transform_df = transform_df.drop(abs_corr[abs_corr < cor_cutoff].index, axis=1)

In [18]:
## List of categorical columns from the documentation
## (some of these were already dropped from the dataframe in earlier steps)
cat_columns = ["PID", 
               "MS SubClass", 
               "MS Zoning", 
               "Street", 
               "Alley", 
               "Land Contour", 
               "Lot Config", 
               "Neighborhood", 
               "Condition 1", 
               "Condition 2", 
               "Bldg Type", 
               "House Style", 
               "Roof Style", 
               "Roof Matl", 
               "Exterior 1st", 
               "Exterior 2nd", 
               "Mas Vnr Type", 
               "Foundation", 
               "Heating", 
               "Central Air", 
               "Garage Type", 
               "Misc Feature", 
               "Sale Type", 
               "Sale Condition"
              ]

## Limit the list to only those columns in our dataframe
current_cat_columns = []

for col in cat_columns:
    if col in transform_df.columns:
        current_cat_columns.append(col)

## Count the unique values in each remaining categorical column, sorted ascending
uniq_counts = transform_df[current_cat_columns].apply(lambda col: len(col.value_counts())).sort_values()

uniq_counts

Street           2
Central Air      2
Land Contour     4
Lot Config       5
Bldg Type        5
Roof Style       6
Foundation       6
Heating          6
MS Zoning        7
Condition 2      8
House Style      8
Roof Matl        8
Condition 1      9
Exterior 1st    16
Exterior 2nd    17
Neighborhood    28
dtype: int64

In [19]:
## We will pick 10 as the default cutoff
uniq_cutoff = 10

## Apply the cutoff to the dataframe: drop the categorical columns
## that have more than uniq_cutoff unique values
drop_uniq_cols = uniq_counts[uniq_counts > uniq_cutoff].index
transform_df = transform_df.drop(drop_uniq_cols, axis=1)

In [20]:
## Names of the remaining text (object) columns
text_cols = transform_df.select_dtypes(include=['object']).columns

## Convert text columns to categorical columns
for col in text_cols:
    transform_df[col] = transform_df[col].astype('category')

## Create dummy columns, add them to the dataframe and drop the original text columns.
## The dummy dtype is pinned to uint8: newer pandas versions default to bool dummies,
## which an integer/float column filter would leave out of the model.
transform_df = pd.concat([
    transform_df,
    pd.get_dummies(transform_df.select_dtypes(include=['category']), dtype=np.uint8)
], axis=1).drop(text_cols, axis=1)

In [29]:
## Update the select_features function with the findings above
## Update the train_and_test function with a k value argument

## Function for transforming the features based on criteria detailed below
def transform_features(df):
    """Clean the raw housing frame and add the two age features.

    Steps, in order:
      1. Drop every column with more than 5% missing values.
      2. Fill the remaining gaps in numeric columns with that column's mode.
      3. Drop every text column that still has a missing value.
      4. Add 'Years Before Sale' and 'Years Since Remod'.
      5. Drop the rows where either new column is negative.
      6. Drop the source year columns, the identifier columns and the
         columns that leak information about the sale.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain 'Yr Sold', 'Year Built', 'Year Remod/Add',
        'PID', 'Order', 'Mo Sold', 'Sale Condition' and 'Sale Type'.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the frame passed in is not modified.
    """
    ## Drop columns with more than 5% missing values
    null_counts = df.isnull().sum()
    sparse_columns = null_counts[null_counts > len(df) * 0.05].index
    df = df.drop(sparse_columns, axis=1)

    ## Find numeric columns that still have missing values
    numeric_missing = df.select_dtypes(include=['float', 'int']).isnull().sum()
    fix_columns = numeric_missing[numeric_missing > 0].index

    ## Fill them with the mode of each column. Skipped when nothing needs
    ## filling, because mode() of an empty selection has no first row to take.
    if len(fix_columns) > 0:
        replace_dict = df[fix_columns].mode().to_dict(orient='records')[0]
        df = df.fillna(replace_dict)

    ## Remove text columns that have any null values
    text_missing = df.select_dtypes(include='object').isnull().sum()
    df = df.drop(text_missing[text_missing > 0].index, axis=1)

    ## Create a column from Year Sold and Year Built
    df['Years Before Sale'] = df['Yr Sold'] - df['Year Built']

    ## Create a column from Year Sold and Year Remodelled
    df['Years Since Remod'] = df['Yr Sold'] - df['Year Remod/Add']

    ## Drop rows with negative values for the new columns, selected by
    ## condition so the function works on any frame, not only on one that
    ## contains specific index labels
    negative_rows = (df['Years Before Sale'] < 0) | (df['Years Since Remod'] < 0)
    df = df[~negative_rows]

    ## Drop the year columns that are no longer necessary
    df = df.drop(['Year Built', 'Year Remod/Add'], axis=1)

    ## Drop columns that are not useful for ML
    df = df.drop(['PID', 'Order'], axis=1)

    ## Drop columns that leak data about the sale
    df = df.drop(['Mo Sold', 'Sale Condition', 'Sale Type', 'Yr Sold'], axis=1)

    return df

## Function for selecting features for predicting based on criteria detailed below
def select_features(df, corr_threshold=0.4, uniq_threshold=10):
    """Select model features by correlation and category cardinality.

    Steps, in order:
      1. Drop numeric columns whose absolute correlation with 'SalePrice'
         is below ``corr_threshold``.
      2. Drop the documented categorical columns that have more than
         ``uniq_threshold`` unique values.
      3. Replace every remaining text (object) column with dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data containing a numeric 'SalePrice' column.
    corr_threshold : float, default 0.4
        Minimum absolute correlation with 'SalePrice' for a numeric column
        to be kept.
    uniq_threshold : int, default 10
        Maximum number of unique values for a listed categorical column to
        be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving columns plus uint8 dummy columns;
        the frame passed in is not modified.
    """
    ## Include only numeric columns
    numeric_df = df.select_dtypes(include=['float', 'int'])

    ## Absolute correlation of each numeric column with SalePrice
    abs_corr = numeric_df.corr()['SalePrice'].abs()

    ## Apply corr_threshold to drop the weakly correlated columns
    df = df.drop(abs_corr[abs_corr < corr_threshold].index, axis=1)

    ## List of categorical columns from the documentation
    cat_columns = ["PID",
                   "MS SubClass",
                   "MS Zoning",
                   "Street",
                   "Alley",
                   "Land Contour",
                   "Lot Config",
                   "Neighborhood",
                   "Condition 1",
                   "Condition 2",
                   "Bldg Type",
                   "House Style",
                   "Roof Style",
                   "Roof Matl",
                   "Exterior 1st",
                   "Exterior 2nd",
                   "Mas Vnr Type",
                   "Foundation",
                   "Heating",
                   "Central Air",
                   "Garage Type",
                   "Misc Feature",
                   "Sale Type",
                   "Sale Condition"
                   ]

    ## Limit the list to only those columns in our dataframe
    current_cat_columns = [col for col in cat_columns if col in df.columns]

    ## Count the unique (non-null) values in each of those columns
    uniq_counts = df[current_cat_columns].nunique()

    ## Apply the cutoff: drop the columns with too many unique values
    drop_uniq_cols = uniq_counts[uniq_counts > uniq_threshold].index
    df = df.drop(drop_uniq_cols, axis=1)

    ## Names of the remaining text (object) columns
    text_cols = df.select_dtypes(include=['object']).columns

    ## Convert text columns to categorical columns
    for col in text_cols:
        df[col] = df[col].astype('category')

    ## Create dummy columns. The dtype is pinned to uint8: newer pandas
    ## versions default to bool dummies, which an integer/float column
    ## filter would leave out of the model.
    dummies = pd.get_dummies(df.select_dtypes(include=['category']), dtype=np.uint8)

    ## Add the dummies and drop the text columns they replace
    df = pd.concat([df, dummies], axis=1).drop(text_cols, axis=1)

    return df

## Function for splitting a dataframe into train and test and returning the RMSE of the predictions
## Added k value argument

def _rmse_for_split(model, train, test, features):
    ## Fit `model` on `train` and return the RMSE of its 'SalePrice' predictions on `test`
    model.fit(train[features], train['SalePrice'])
    predictions = model.predict(test[features])
    return np.sqrt(mean_squared_error(test['SalePrice'], predictions))


def train_and_test(df, k=0, random_state=None):
    """Train a linear regression on `df` and return its (average) RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SalePrice' column. Every other integer/float column
        is used as a feature.
    k : int, default 0
        Validation scheme:
        * 0 - holdout: train on the first 1460 rows, test on the rest.
        * 1 - shuffle the rows, split at position 1460, train on each part
              and test on the other, then average the two RMSE values.
        * any other value - KFold cross-validation with ``n_splits=k`` and
              shuffling, averaging the RMSE of every fold.
    random_state : int or None, default None
        Seed passed to the row shuffle (k=1) and to KFold (other k values).
        The default None leaves the shuffles unseeded, so results differ
        between runs; pass an int to make them reproducible.

    Returns
    -------
    float
        The RMSE for k=0, otherwise the mean RMSE over the splits.
    """
    ## All integer/float columns except the target are features
    features = df.select_dtypes(include=['integer', 'float']).columns.drop('SalePrice')

    ## Use Linear Regression
    lr = linear_model.LinearRegression()

    ## For k=0 (default): plain positional holdout
    if k == 0:
        return _rmse_for_split(lr, df[:1460], df[1460:], features)

    ## For k=1: shuffled two-way split, each part used once for testing
    if k == 1:
        shuffled = df.sample(frac=1, random_state=random_state)
        fold_one = shuffled[:1460]
        fold_two = shuffled[1460:]

        rmse_one = _rmse_for_split(lr, fold_one, fold_two, features)
        rmse_two = _rmse_for_split(lr, fold_two, fold_one, features)

        return np.mean([rmse_one, rmse_two])

    ## Any other k: shuffled k-fold cross-validation
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    rmse_values = [
        _rmse_for_split(lr, df.iloc[train_index], df.iloc[test_index], features)
        for train_index, test_index in kf.split(df)
    ]

    return np.mean(rmse_values)

## Test the functions: run the full pipeline on a freshly loaded copy of the raw file,
## scoring with 4-fold cross-validation.
## The folds are shuffled without a fixed seed, so the displayed RMSE changes between runs.
df_final = pd.read_csv('AmesHousing.tsv', delimiter='\t')
transform_df_final = transform_features(df_final)
filtered_df_final = select_features(transform_df_final)
rmse_final = train_and_test(filtered_df_final, k=4)

rmse_final

29316.267922230298