In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Load the tab-separated Ames housing file into a DataFrame, then print a
# per-column summary of it.
data = pd.read_csv("AmesHousing.tsv", sep="\t")
data.info()

In [None]:
def transform_features(data):
    """Clean the housing data and derive the age features.

    Steps, in order:

    1. Drop every column in which more than 5% of the rows are missing.
    2. Drop every remaining text (object) column that has any missing value.
    3. Fill the missing values of the remaining numeric columns with the
       column mode.
    4. Add "Years Before Sale" (Yr Sold - Year Built) and
       "Years Since Remod" (Yr Sold - Year Remod/Add), keep only the rows
       where both are non-negative, then drop the two source year columns.
    5. Drop the "PID" and "Order" columns and the columns that describe the
       sale itself ("Mo Sold", "Sale Condition", "Sale Type").

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Yr Sold", "Year Built", "Year Remod/Add", "PID",
        "Order", "Mo Sold", "Sale Condition" and "Sale Type".

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The input frame is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is absent (including the case where
        it was removed by step 1 or 2).
    """
    # 1. Drop columns with more than 5% missing values
    missing_counts = data.isnull().sum()
    sparse_cols = missing_counts[missing_counts > len(data) / 20].index
    data = data.drop(sparse_cols, axis=1)

    # 2. Drop the text columns that still have missing values
    text_missing = data.select_dtypes(include=['object']).isnull().sum()
    data = data.drop(text_missing[text_missing > 0].index, axis=1)

    # 3. Fill the missing values in numerical columns with the mode
    numeric_missing = data.select_dtypes(include=['int', 'float']).isnull().sum()
    cols_to_fill = numeric_missing[numeric_missing > 0].index
    # Guard: when nothing needs filling, mode() of the empty selection has no
    # first record to read, so the fill step must be skipped entirely.
    if len(cols_to_fill) > 0:
        # mode() returns one row per tied mode; the first row is used.
        fill_values = data[cols_to_fill].mode().to_dict(orient='records')[0]
        data = data.fillna(fill_values)

    # 4. Age of the house / of the last remodel at the time of the sale
    data = data.assign(**{
        "Years Before Sale": data["Yr Sold"] - data["Year Built"],
        "Years Since Remod": data["Yr Sold"] - data["Year Remod/Add"],
    })
    # A negative age means the recorded years are inconsistent; drop those rows
    valid_ages = (data["Years Before Sale"] >= 0) & (data["Years Since Remod"] >= 0)
    data = data[valid_ages]
    data = data.drop(["Year Built", "Year Remod/Add"], axis=1)

    # 5. Remove unrelated features for machine learning
    data = data.drop(["PID", "Order"], axis=1)
    # Remove columns that leak info about the final sale
    data = data.drop(["Mo Sold", "Sale Condition", "Sale Type"], axis=1)
    return data

In [None]:
def select_features(data, corr_thred = 0.3, uniq_thred = 10):
    """Select model features and dummy-encode the remaining text columns.

    Steps, in order:

    1. Drop every int/float column whose absolute correlation with
       "SalePrice" is below ``corr_thred``.
    2. Among the known nominal columns that are present, drop those with
       more than ``uniq_thred`` distinct values.
    3. Replace every remaining text (object) column by its dummy columns,
       stored as ``uint8`` so they still count as numeric features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric "SalePrice" column.
    corr_thred : float, default 0.3
        Minimum absolute correlation with "SalePrice" for a numeric column
        to be kept.
    uniq_thred : int, default 10
        Maximum number of distinct values a nominal column may have.

    Returns
    -------
    pandas.DataFrame
        A new frame: the kept non-text columns in their original order,
        followed by the dummy columns. The input frame is not modified.

    Raises
    ------
    KeyError
        If "SalePrice" is not among the int/float columns.
    """
    # Remove the features with weak correlations with the target
    abs_corr_coef = data.select_dtypes(include=["int", "float"]).corr()['SalePrice'].abs()
    weak_cols = abs_corr_coef[abs_corr_coef < corr_thred].index
    data = data.drop(weak_cols, axis=1)

    # Nominal (categorical) features of the dataset
    cat_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config",
                    "Neighborhood", "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style",
                    "Roof Matl", "Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating",
                    "Central Air", "Garage Type", "Misc Feature", "Sale Type", "Sale Condition"]

    # Only the nominal features that are still in the frame can be checked
    cat_cols = [col for col in cat_features if col in data.columns]

    # Drop nominal features with too many distinct values: each distinct
    # value would become its own dummy column
    uniq_counts = data[cat_cols].nunique()
    data = data.drop(uniq_counts[uniq_counts > uniq_thred].index, axis=1)

    # Dummy-encode the text columns and remove the originals. Skipped when
    # there is nothing to encode.
    txt_cols = data.select_dtypes(include=["object"]).columns
    if len(txt_cols) > 0:
        # An explicit integer dtype keeps the dummies numeric on every pandas
        # version; boolean dummies would be ignored by code that selects
        # integer/float columns only.
        dummies = pd.get_dummies(data[txt_cols], dtype="uint8")
        data = pd.concat([data.drop(txt_cols, axis=1), dummies], axis=1)

    return data

In [None]:
def _fit_and_score(model, train, test, features):
    """Fit `model` on `train` and return the RMSE of its predictions on `test`."""
    model.fit(train[features], train["SalePrice"])
    predictions = model.predict(test[features])
    return np.sqrt(mean_squared_error(test["SalePrice"], predictions))


def train_and_test(data, k, n_train=1460, random_state=None):
    """Evaluate a linear regression of "SalePrice" on the numeric columns.

    Only the integer and float columns of ``data`` are used; every one of
    them except "SalePrice" is a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric "SalePrice" column.
    k : int
        Validation scheme:

        * ``0`` - holdout: train on the first ``n_train`` rows, test on the
          rest.
        * ``1`` - shuffle the rows, split them at ``n_train``, train on each
          part while testing on the other, and average the two RMSEs.
        * any other value - passed to ``KFold`` as ``n_splits`` (with
          shuffling); the fold RMSEs are averaged.
    n_train : int, default 1460
        Position of the split for ``k == 0`` and ``k == 1``. Ignored
        otherwise.
    random_state : int or None, default None
        Seed for the shuffle (``k == 1``) and for ``KFold``. ``None`` keeps
        the shuffles unseeded.

    Returns
    -------
    float
        The (average) root mean squared error on the test rows.

    Raises
    ------
    KeyError
        If "SalePrice" is not among the integer/float columns.
    ValueError
        If ``k`` is 0 or 1 and splitting at ``n_train`` would leave the
        train or the test part empty.
    """
    numerical_df = data.select_dtypes(include=["integer", "float"])
    numerical_features = numerical_df.columns.drop("SalePrice")
    lr = LinearRegression()

    if k in (0, 1) and not 0 < n_train < len(numerical_df):
        # Fail early with a readable message instead of inside the estimator
        raise ValueError(
            "n_train must be between 1 and len(data) - 1 "
            f"(got n_train={n_train} for {len(numerical_df)} rows)"
        )

    if k == 0:
        # Simple holdout validation on the rows in their current order
        train = numerical_df.iloc[:n_train]
        test = numerical_df.iloc[n_train:]
        return _fit_and_score(lr, train, test, numerical_features)

    if k == 1:
        # Shuffle the ordering of all rows, then let each part serve once
        # as the training set and once as the test set
        shuffled_df = numerical_df.sample(frac=1, random_state=random_state)
        fold_one = shuffled_df.iloc[:n_train]
        fold_two = shuffled_df.iloc[n_train:]

        rmse2 = _fit_and_score(lr, fold_one, fold_two, numerical_features)
        rmse1 = _fit_and_score(lr, fold_two, fold_one, numerical_features)
        return (rmse1 + rmse2) / 2

    # Use K-Fold cross-validation
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    rmse_list = [
        _fit_and_score(
            lr,
            numerical_df.iloc[train_index],
            numerical_df.iloc[test_index],
            numerical_features,
        )
        for train_index, test_index in kf.split(numerical_df)
    ]
    return np.mean(rmse_list)

In [None]:
# Run the whole pipeline: clean the raw data, select and encode the
# features, then report the RMSE averaged over a 3-fold cross-validation.
data_transferred = transform_features(data)
data_filtered = select_features(data_transferred)
rmse = train_and_test(data_filtered, k=3)

print(rmse)

In [None]:
# Visualize the correlations between features
%matplotlib inline

corr = data_transferred.select_dtypes(include=["integer", "float"]).corr()

sns.heatmap(corr)