In [None]:
#graphs and utilities
import os
import pandas as pd #pandas stands for panel data
import numpy as np
import math as ma
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

#Analysis
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.manifold import TSNE
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats.stats import pearsonr


#regression models
from scipy.stats import linregress as linRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

#model metrics, deciding which models perform best
from sklearn.metrics import mean_squared_error, r2_score

# Read the training and test tables from the input directory (paths are relative to the working directory).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("../input/train.csv", "../input/test.csv"))

# Both frames are kept in one list so shared cleaning steps can loop over them.
all_dfs = [df_train, df_test]

def makeDict(df,label):
    """Map every value in column ``label`` to the index label of the row holding it.

    When a value occurs in several rows, the last row visited wins.
    Returns an empty dict for an empty frame.
    """
    return {record[label]: position for position, record in df.iterrows()}

def makeOrdinal(df,label,showDict):
    """Replace the categories in column ``label`` with their rank in sorted order, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column ``label`` is overwritten.
    label : str
        Name of the categorical column to encode.
    showDict : bool
        When true, print the category -> rank mapping that was applied.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Missing values are not ranked and come out as ``None``. The column is kept
    as object dtype so that integer codes and ``None`` can coexist.
    """
    # Sorted distinct non-null categories define the ordinal codes 0..k-1.
    categories = sorted(df[label].dropna().unique())
    strcat_dict = {category: rank for rank, category in enumerate(categories)}

    if showDict:
        print(strcat_dict)

    # One vectorised column write instead of a per-row iterrows()/.at loop.
    # dict.get yields None for anything not in the mapping (i.e. missing values).
    df[label] = pd.Series([strcat_dict.get(value) for value in df[label]],
                          index=df.index, dtype=object)

    return df

# Progress marker: reaching this line means the statements above ran without raising.
print("imports complete")

In [None]:
# Quick way to see what data is missing: count nulls per training column,
# keep only the columns that have any, and plot the counts in ascending order.
sns.set_style("whitegrid")
missing = df_train.isnull().sum()
missing = missing[missing > 0].sort_values()
missing.plot.bar()

# Columns with more than 90% of their training rows missing carry too little
# information to be useful, so they are dropped.
# Series.items() replaces iteritems(), which no longer exists in pandas >= 2.0.
drop_cols = [col_name for col_name, n_missing in missing.items()
             if n_missing > (len(df_train)*.90)]

# The same columns are removed from every frame so train and test stay aligned.
for df in all_dfs:
    df.drop(drop_cols,inplace=True, axis=1)

all_dfs = [df_train,df_test]

In [None]:
# Split the training columns by dtype: object columns are treated as
# qualitative, everything else as quantitative.
quantitative = []
qualitative = []
for col_name, col_dtype in df_train.dtypes.items():
    (qualitative if col_dtype == 'object' else quantitative).append(col_name)

# The target and the row identifier are not predictors.
quantitative.remove('SalePrice')
quantitative.remove('Id')

# Ordinal-encode every qualitative column in each frame.
# NOTE(review): each frame is encoded on its own, so the codes only agree
# between train and test when both contain the same set of categories.
for df_single in all_dfs:
    for column in qualitative:
        makeOrdinal(df_single,column,False)

all_dfs = [df_train,df_test]

In [None]:
# distplot = distribution plot: histogram of SalePrice with a fitted density
# drawn on top, one figure per candidate distribution.
y = df_train['SalePrice']
candidate_fits = [('Johnson SU', stats.johnsonsu),
                  ('Normal', stats.norm),
                  ('Log Normal', stats.lognorm)]
for figure_number, (fit_title, fit_distribution) in enumerate(candidate_fits, start=1):
    plt.figure(figure_number)
    plt.title(fit_title)
    sns.distplot(y, kde=False, fit=fit_distribution)

In [None]:
def test_normality(x):
    """Return True when the Shapiro-Wilk test rejects normality of ``x`` at the 5% level.

    Missing values are replaced with 0 before testing.
    """
    return stats.shapiro(x.fillna(0))[1] < 0.05

normal = pd.DataFrame(df_train[quantitative])
# One boolean per quantitative column: True = normality rejected (p < 0.05).
normal = normal.apply(test_normality)
print(not normal.any())
# The printed value is False as soon as at least one quantitative variable
# rejects normality at the 5% significance level; it is True only when no
# column rejects it. Use normal.all() to check that every column rejects it.

In [None]:
def spearman(frame, features):
    """Plot the Spearman correlation of each feature with ``SalePrice``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing the ``features`` columns and a ``SalePrice`` column.
    features : list
        Column names to correlate against ``SalePrice``.

    Prints the feature names ordered by ascending correlation and draws a
    horizontal bar plot of the correlations. Returns nothing.
    """
    target = frame['SalePrice']
    correlations = [frame[name].corr(target, 'spearman') for name in features]
    spr = pd.DataFrame({'feature': features, 'spearman': correlations})
    spr = spr.sort_values('spearman')
    print(spr['feature'])
    # Figure height grows with the number of features so the bars stay readable.
    plt.figure(figsize=(6, 0.25*len(features)))
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')
    
# Rank the quantitative training columns by Spearman correlation with SalePrice.
spearman(df_train,quantitative)

In [None]:
# Same ranking for the columns listed in qualitative.
spearman(df_train,qualitative)

In [None]:
def bucketData(df,column_to_bucket,number_of_buckets=5):
    """Replace a numeric column with its quantile-bucket number, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``column_to_bucket`` is overwritten with codes
        ``0 .. number_of_buckets - 1``.
    column_to_bucket : str
        Name of the numeric column to discretise.
    number_of_buckets : int, default 5
        Number of equal-frequency buckets passed to ``pandas.qcut``.

    Notes
    -----
    * Missing values stay missing.
    * The column keeps its original dtype.
    * No helper column is left behind on ``df``.
    * Bucket edges are computed from ``df`` itself, so two frames bucketed
      separately do not share edges.

    Raises
    ------
    ValueError
        Propagated from ``pandas.qcut`` when the quantile edges are not unique.
    """
    original_dtype = df[column_to_bucket].dtype
    # labels=False makes qcut return the bucket number for every row directly.
    # This avoids comparing already-written codes against later interval edges,
    # which happens when the column is overwritten one bucket at a time.
    bucket_codes = pd.qcut(df[column_to_bucket], number_of_buckets, labels=False)
    df[column_to_bucket] = bucket_codes.astype(original_dtype)

# Bucket one column at a time across every frame.
# NOTE(review): bucketData derives its edges from the frame it is given, so
# train and test are bucketed against their own quantiles.
for bucket_col in ('YearBuilt', 'LotFrontage', 'LotArea'):
    for df in all_dfs:
        bucketData(df,bucket_col)

all_dfs = [df_train,df_test]

# Both histograms are drawn on the same axes to eyeball the bucketed values.
for plotted_col in ('YearBuilt', 'LotFrontage'):
    plt.hist(df_train[plotted_col])
plt.show()

In [None]:
feature_imputer = SimpleImputer(strategy='mean')

# Columns to fill: every float64 training column plus five encoded
# qualitative columns named explicitly.
float_cols = [cname for cname in df_train.columns if df_train[cname].dtype in ['float64']]
float_cols.extend(['KitchenQual','GarageFinish','BsmtQual','ExterQual','Foundation'])

# Fill gaps with the truncated mean of the column, computed per frame.
for column in float_cols:
    for df in all_dfs:
        df[column] = df[column].fillna(int(df[column].mean()))

# Re-plot the remaining missing counts in the training frame.
sns.set_style("whitegrid")
missing = df_train.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()

df_train[float_cols] = df_train[float_cols].astype(int)

# Wrap the imputer output with the real column labels and the training index,
# so the write-back is correct whether pandas assigns by position or by label.
# Setting .columns on df_train[float_cols] afterwards would only rename a
# temporary copy.
hold_columns = df_train[float_cols].columns
df_train[float_cols] = pd.DataFrame(
    feature_imputer.fit_transform(df_train[float_cols].values),
    columns=hold_columns,
    index=df_train.index,
)

In [None]:
# Hand-picked predictors plus the target column.
final_choices = ['YearRemodAdd','1stFlrSF','GarageYrBlt','TotalBsmtSF','FullBath','GarageArea',
                 'LotFrontage','LotArea','YearBuilt','GarageCars','GrLivArea','OverallQual','SalePrice',
                 'KitchenQual','GarageFinish','BsmtQual','ExterQual','Foundation']

best_features = df_train[final_choices]

y = best_features['SalePrice']
sale_predictors = best_features.drop(['SalePrice'], axis=1)

# 80/20 train/validation split with a fixed seed for repeatability.
X = sale_predictors
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,random_state=0)

# Imputation statistics are learned on the training part only and reused
# for the validation part. Column names are restored on the result.
my_imputer = SimpleImputer(strategy='mean')
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid), columns=X_valid.columns)

# The scaler is fitted on the training part only; the validation part is
# transformed with those statistics. Refitting on the validation data would
# standardise it with a different mean and variance than the models were
# trained on.
stdscale = StandardScaler()
scaled_imputed_X_train = stdscale.fit_transform(imputed_X_train)
scaled_imputed_X_valid = stdscale.transform(imputed_X_valid)

# Names used by the modelling cell (no PCA is applied here).
pca_fin = scaled_imputed_X_train
pca_fin_val = scaled_imputed_X_valid

In [None]:
# Fit each candidate regressor on the training part, predict the validation
# part, and record its scores. An r2 of 1 is a perfect prediction.
def _score_row(name, y_true, y_pred):
    """Build one reg_scores row: label, truncated mean squared error, r2 score."""
    return [name, int(mean_squared_error(y_true, y_pred)), r2_score(y_true, y_pred)]

reg_scores = pd.DataFrame(columns=['name','mean-sq-err','variance'])
seed = 0

linear_reg = LinearRegression()
linear_reg.fit(pca_fin,y_train)
lin_reg_pred = linear_reg.predict(pca_fin_val)
reg_scores.loc[0] = _score_row('linear regression', y_valid, lin_reg_pred)

ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=7),n_estimators=300, random_state=seed)
ada_reg.fit(pca_fin,y_train)
ada_reg_pred = ada_reg.predict(pca_fin_val)
reg_scores.loc[1] = _score_row('adaboost regression', y_valid, ada_reg_pred)

rf_reg = RandomForestRegressor(n_estimators=350, random_state=seed)
rf_reg.fit(pca_fin,y_train)
rf_reg_pred = rf_reg.predict(pca_fin_val)
reg_scores.loc[2] = _score_row('random forest regression', y_valid, rf_reg_pred)

knn_reg = KNeighborsRegressor(weights='uniform')
knn_reg.fit(pca_fin,y_train)
knn_reg_pred = knn_reg.predict(pca_fin_val)
reg_scores.loc[3] = _score_row('nearest neighbours regression', y_valid, knn_reg_pred)

best_alpha = 0.0001
regr = Lasso(alpha=best_alpha, max_iter=50000)
regr.fit(pca_fin, y_train)
# Like the other models, lasso is scored on the validation part.
lasso_reg_pred = regr.predict(pca_fin_val)
reg_scores.loc[4] = _score_row('lasso', y_valid, lasso_reg_pred)

# Only works on kaggle for now, since there are too many hoops to jump through to install xgb locally.
xgb_reg = XGBRegressor()
xgb_reg.fit(pca_fin, y_train)
xgb_reg_pred = xgb_reg.predict(pca_fin_val)
reg_scores.loc[5] = _score_row('gradient boost regression', y_valid, xgb_reg_pred)

reg_scores.head(6)

In [None]:
# Hand-picked predictors plus the target column.
final_choices = ['YearRemodAdd','1stFlrSF','GarageYrBlt','TotalBsmtSF','FullBath','GarageArea',
                 'LotFrontage','LotArea','YearBuilt','GarageCars','GrLivArea','OverallQual','SalePrice',
                 'KitchenQual','GarageFinish','BsmtQual','ExterQual','Foundation']

# The test frame has no target, so use the same features minus SalePrice.
# Deriving the list keeps the two from drifting apart.
final_choices_test = [name for name in final_choices if name != 'SalePrice']

X_test = df_test[final_choices_test]
best_features = df_train[final_choices]
y_train = best_features['SalePrice']
sale_predictors = best_features.drop(['SalePrice'], axis=1)
X_train = sale_predictors

# Imputation statistics come from the full training set and are reused on test.
# (The *_valid names are kept from the validation cell; here they hold test data.)
my_imputer = SimpleImputer(strategy='mean')
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train.values), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_test.values), columns=X_test.columns)

# Fit the scaler on the training data only and apply it unchanged to test.
stdscale = StandardScaler()
scaled_imputed_X_train = stdscale.fit_transform(imputed_X_train)
scaled_imputed_X_valid = stdscale.transform(imputed_X_valid)

# Train and predict on the imputed + scaled matrices, i.e. the same
# preprocessing the models were validated with, instead of the raw frames.
xgb_reg = XGBRegressor()
xgb_reg.fit(scaled_imputed_X_train, y_train)
xgb_reg_pred = xgb_reg.predict(scaled_imputed_X_valid)

final_predictions = xgb_reg_pred

In [None]:
# Pair each test Id with its predicted SalePrice and write the result
# without the row index.
Submission = df_test[['Id']].assign(SalePrice=final_predictions)

Submission.to_csv("Submission3.csv", index=False)