## Housing price Kaggle competition
https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [95]:
# Import data and initial data exploration
import pandas as pd

# Load the training set from a path relative to the notebook's working directory.
train = pd.read_csv('./promptCode/train.csv')
# Report (rows, columns), then show the first five rows as the cell's output.
print(train.shape)
train.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [81]:
# Report the number of missing values for every column that has at least one
print('Missing data by column')
missing_data = list(train.isna().sum())
for column_name, n_missing in zip(train.columns, missing_data):
    if n_missing:
        print(column_name, n_missing)

Missing data by column
LotFrontage 259
Alley 1369
MasVnrType 8
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406


In [96]:
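# Clean data
#  - remove unnecessary features (step currently disabled)
#  - impute missing values in numerical columns with the column mean
#    (normalization is not applied yet)
#  - one-hot encode categorical data (step currently disabled; note that
#    pd.get_dummies only adds an NA column when dummy_na=True)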
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import normalize
import numpy as np

def clean_data(data):
    """Return a cleaned copy of ``data``.

    Only the numeric clean-up step is active: every column whose dtype is one
    of the listed numeric dtypes is cast to float64 and its missing values are
    replaced with that column's mean. Dropping ``Id`` and one-hot encoding the
    categorical columns exist as helpers below but are not applied.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        Same columns, in the same order, as ``data``; numeric columns are
        mean-imputed and stored as float64, all other columns are unchanged.
    """
    def remove_columns(_data):
        """Drop the ``Id`` column (helper is not called at the moment)."""
        return _data.drop(['Id'], axis=1)

    def clean_quant_data(_data):
        """Mean-impute the numeric columns of ``_data`` and return a new frame."""
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        numeric_columns = _data.select_dtypes(include=numerics)

        # Work on a copy so the caller's frame is never mutated.
        cleaned = _data.copy()
        if numeric_columns.shape[1] == 0:
            # Nothing to impute (e.g. a frame with only categorical columns).
            return cleaned

        # Column-mean imputation done with pandas. This replaces
        # sklearn.preprocessing.Imputer, which no longer exists in current
        # scikit-learn releases. Casting first keeps the float64 result the
        # array-based implementation produced, including for int columns.
        as_float = numeric_columns.astype('float64')
        imputed = as_float.fillna(as_float.mean())

        # NOTE: an earlier version called sklearn's normalize() here but threw
        # away the array it returns, so no scaling was ever applied. The no-op
        # call is dropped. TODO: pick a scaler deliberately - normalize() works
        # row-wise by default and would also rescale the SalePrice target.

        # Write back by label so values can never be misaligned with columns.
        for label in imputed.columns:
            cleaned[label] = imputed[label]
        return cleaned

    def clean_qual_data(_data):
        """One-hot encode non-numeric columns (helper is not called at the moment).

        pd.get_dummies is used with its defaults, so missing values do not get
        an indicator column of their own.
        """
        _data = pd.get_dummies(_data)
        return _data

    # Disabled steps, kept for reference: remove_columns(data) before the
    # numeric clean-up and clean_qual_data(data) after it.
    data = clean_quant_data(data)
    return data

# Rebind `train` to the cleaned frame; re-running this cell cleans the already-cleaned data again.
train = clean_data(train)


In [97]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Given a dataframe, splits the data 70/30 into train/test groups and fits the sklearn linear regression model.
# Values that cannot be parsed as numbers are coerced to NaN and then filled with 0 before fitting.
# Prints the R2 score on the test split.
def get_acc(df):
    """Fit a linear regression on ``df`` and print its R2 score on a held-out split.

    Every column is passed through ``pd.to_numeric`` with ``errors='coerce'``
    and the resulting NaNs are filled with 0. ``SalePrice`` is the target and
    all remaining columns are the features. 30% of the rows are held out for
    scoring, using a fixed ``random_state`` of 42. Nothing is returned.
    """
    numeric_df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

    target = numeric_df["SalePrice"].values.reshape(-1, 1)
    features = numeric_df.drop(columns=["SalePrice"])

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    model = LinearRegression()
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    print("R2 value: ", score)

get_acc(train)

# original score (R2) - no data manipulation
# R2 value:  0.8279193449880126


R2 value:  0.8216645356897891
