In [138]:
import pandas as pd
import matplotlib
%matplotlib inline

In [139]:
# Load the raw training set; the relative path means train.csv must sit in
# the current working directory.
train_orig = pd.read_csv('train.csv')

In [140]:
# Load the raw test set; the relative path means test.csv must sit in the
# current working directory.
test_orig = pd.read_csv('test.csv')

In [141]:
# Count the missing values in the PoolQC column. Calling isnull().sum() on
# the Series itself yields the scalar directly, with no single-column
# DataFrame wrapper and no positional [0] lookup on a label-indexed result.
train_orig.PoolQC.isnull().sum()

1453

In [142]:
# Remove columns in which more than one third of the values are null
def remove_nullcols(df):
    """Return ``df`` without its mostly-empty columns.

    A column is removed when its number of null values is strictly greater
    than ``int(len(df) / 3)``, i.e. when more than one third of its rows are
    missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving columns in their original order.
    """
    max_nulls = int(len(df) / 3)
    # One vectorised pass gives the null count of every column. A boolean
    # mask then keeps the acceptable columns, which avoids the positional
    # ``[0]`` lookup on a label-indexed Series and the repeated ``drop`` calls.
    null_counts = df.isnull().sum()
    return df.loc[:, (null_counts <= max_nulls).values]

In [143]:
# Number of rows in the raw training frame.
len(train_orig)

1460

In [144]:
# Drop the mostly-null columns from each raw frame. The two frames are
# filtered independently, each against its own null counts.
train_data = remove_nullcols(df=train_orig)
test_data = remove_nullcols(df=test_orig)

In [145]:
# Labels of the numeric columns left in train_data
# (_get_numeric_data is a private pandas helper).
train_data._get_numeric_data().columns.values

array(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'], dtype=object)

In [146]:
# Detect numeric columns
def col_types(df):
    """Partition the column labels of ``df`` into numeric and non-numeric.

    Returns a pair ``(num_cols, cate_cols)``: ``num_cols`` is the array of
    labels pandas reports as numeric, and ``cate_cols`` is a list of every
    remaining label, in the frame's column order.
    """
    # _get_numeric_data is a private pandas helper that keeps only the
    # numeric columns of the frame.
    numeric_labels = df._get_numeric_data().columns.values
    other_labels = []
    for label in df.columns.values:
        if label not in numeric_labels:
            other_labels.append(label)
    return numeric_labels, other_labels

In [147]:
# Classify the feature columns only: the identifier (and, for the training
# frame, the target) is removed before the numeric/categorical split.
train_num_cols, train_cate_cols = col_types(
    train_data.drop(labels=['SalePrice', 'Id'], axis=1)
)
test_num_cols, test_cate_cols = col_types(
    test_data.drop(labels=['Id'], axis=1)
)

In [148]:
# Filling and scaling
from sklearn.preprocessing import StandardScaler
def fill_scale(df, cols, is_num):
    """Fill missing values in ``cols`` and, for numeric columns, standardise them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    cols : list or array of column labels
        Columns to process.
    is_num : bool
        True: fill nulls with each column's mean, then scale the columns with
        a ``StandardScaler`` fitted on these same rows.
        False: fill nulls with the literal string ``'place_holder'``.

    Returns
    -------
    pandas.DataFrame
        The untouched columns of ``df`` followed by the processed columns,
        on the same index as ``df``.
    """
    if is_num:
        filled = df[cols].fillna(df[cols].mean())
        scaler = StandardScaler()
        # fit_transform returns a bare array. Rebuild the frame on the source
        # index: with a default RangeIndex the concat below would misalign
        # rows (and introduce NaNs) whenever ``df`` is not indexed 0..n-1.
        processed = pd.DataFrame(
            scaler.fit_transform(filled),
            columns=filled.columns.values,
            index=filled.index,
        )
    else:
        processed = df[cols].fillna('place_holder')
    # Processed columns replace their originals and are appended at the end.
    untouched = df.drop(list(processed.columns.values), axis=1)
    return pd.concat([untouched, processed], axis=1)

In [149]:
# Two passes over the training frame: numeric columns first (mean-fill and
# standardise), then the categorical columns (placeholder fill) on the result.
train_step_1 = fill_scale(train_data, cols=train_num_cols, is_num=True)
train_cleaned = fill_scale(train_step_1, cols=train_cate_cols, is_num=False)

In [150]:
# Same two-pass cleaning for the test frame.
# NOTE(review): fill_scale computes the fill means and fits a fresh
# StandardScaler on whatever frame it receives, so the test features are
# standardised with test-set statistics, not the ones used for train_data.
test_step_1 = fill_scale(test_data,test_num_cols,True)
test_cleaned = fill_scale(test_step_1,test_cate_cols,False)

In [151]:
# (rows, columns) of the cleaned training frame.
train_cleaned.shape

(1460, 76)

In [152]:
# (rows, columns) of the cleaned test frame.
test_cleaned.shape

(1459, 75)

In [153]:
# Encode categorical features
def encode_cate(df, cate_cols):
    """Swap the columns named in ``cate_cols`` for their dummy encoding.

    The listed columns are passed to ``pd.get_dummies``; the resulting
    indicator columns are appended after the remaining columns of ``df``.
    ``df`` itself is left unchanged and a new frame is returned.
    """
    indicators = pd.get_dummies(df[cate_cols])
    remainder = df.drop(cate_cols, axis=1)
    return pd.concat([remainder, indicators], axis=1)

In [175]:
# Dummy-encode the categorical columns of the cleaned training frame.
train_final = encode_cate(train_cleaned,train_cate_cols)

In [176]:
# Dummy-encode the categorical columns of the cleaned test frame. This is
# done independently of the training frame, so the two sets of dummy columns
# are not guaranteed to match.
test_final = encode_cate(test_cleaned,test_cate_cols)

In [177]:
# (rows, columns) of the encoded training frame.
train_final.shape

(1460, 283)

In [178]:
# (rows, columns) of the encoded test frame, before column alignment.
test_final.shape

(1459, 272)

In [179]:
# Align the test columns with the training columns: columns missing from the
# test frame are created and filled with 0, and columns that train_final does
# not have are discarded. Because train_final still contains 'SalePrice',
# this also adds a 'SalePrice' column of zeros to test_final.
test_final = test_final.reindex(columns = train_final.columns, fill_value=0)

In [183]:
# Remove the placeholder target column that came from aligning with
# train_final. Rebinding with errors='ignore' (instead of inplace=True) keeps
# the cell safe to re-run: a second execution is a no-op, not a KeyError.
test_final = test_final.drop('SalePrice', axis=1, errors='ignore')

In [184]:
# (rows, columns) of the test frame after alignment and target removal.
test_final.shape

(1459, 282)

In [199]:
# Sanity check: True would mean at least one NaN is left in train_final.
train_final.isnull().values.any()

False

In [194]:
# Sanity check: True would mean at least one NaN is left in test_final.
test_final.isnull().values.any()

False

In [203]:
# NOTE(review): sub_test is only created by the train_test_split cell further
# down. This cell was executed out of order (In [203] appears before In [202])
# and raises NameError under Restart & Run All; move it below the split cell.
sub_test.isnull().values.any()

False

In [202]:
# sklearn.cross_validation was removed in scikit-learn 0.20; its replacement
# is sklearn.model_selection. The old module is used only as a fallback for
# installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Keep 70% of the labelled rows for fitting and hold out the rest for a local
# evaluation; the fixed random_state makes the split reproducible.
sub_train, sub_test = train_test_split(train_final, train_size=0.7, random_state=123)

# Separate the features from the target for both parts of the split.
sub_train_X = sub_train.drop('SalePrice', axis=1)
sub_train_y = sub_train.SalePrice
sub_test_X = sub_test.drop('SalePrice', axis=1)
sub_test_y = sub_test.SalePrice

In [209]:
# Fit a linear regression on the training part of the split; the fitted
# estimator is echoed as the cell output.
from sklearn import linear_model
linreg = linear_model.LinearRegression()
linreg.fit(sub_train_X, sub_train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [213]:
# Score the held-out rows. assign() builds a new frame instead of writing
# into the object returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning and leaves sub_test_X / sub_test_y untouched.
held_out_pred = linreg.predict(sub_test_X)
sub_test = sub_test.assign(
    predict_SalePrice=held_out_pred,
    # Absolute relative error: |actual - predicted| / actual.
    error_score=((sub_test.SalePrice - held_out_pred) / sub_test.SalePrice).abs(),
)

In [215]:
# Summary statistics of the absolute relative error on the held-out rows.
sub_test.error_score.describe()

count    439.000000
mean       0.107928
std        0.126839
min        0.000396
25%        0.032974
50%        0.076044
75%        0.137001
max        1.411590
Name: error_score, dtype: float64

In [216]:
# Refit a fresh model on every labelled row before predicting the test set.
# NOTE(review): only 'SalePrice' is removed here, so the 'Id' column is still
# among the regressors; confirm that is intended.
linreg = linear_model.LinearRegression()
linreg.fit(train_final.drop(labels='SalePrice', axis=1), train_final['SalePrice'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [218]:
# Predict from the feature columns only. Excluding any existing 'SalePrice'
# column keeps the cell re-runnable: after the first execution test_final
# already carries the predictions, and feeding that extra column back into
# predict() would no longer match the features the model was fitted on.
test_final['SalePrice'] = linreg.predict(
    test_final.drop('SalePrice', axis=1, errors='ignore')
)

In [227]:
# Round the predictions to three decimals with the vectorised Series method
# rather than a Python-level loop over every value.
test_final['SalePrice'] = test_final['SalePrice'].round(3)

In [229]:
# Write the submission with to_csv's default comma separator, so the contents
# of the .csv file match its extension; a tab separator makes the file parse
# as a single column for any reader that expects commas.
test_final[['Id', 'SalePrice']].to_csv('submission.csv', index=False)