In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read 'train.csv' (path is relative to the working directory) into df_train.
df_train = pd.read_csv('train.csv')

In [None]:
# Read 'test.csv' (path is relative to the working directory) into df_test.
df_test = pd.read_csv('test.csv')

In [None]:
# Display the (rows, columns) tuple of the training frame.
df_train.shape

In [None]:
# Remove the 'Utilities' column from both frames, mutating each in place.
for frame in (df_train, df_test):
    frame.drop(columns='Utilities', inplace=True)

In [None]:
# Column roles used by the rest of the notebook. Every df_train column that is
# not listed explicitly below ends up in `categorical`.
numerical = [
    'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'TotRmsAbvGrd', 'Fireplaces', 'KitchenAbvGr',
    'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
    'BedroomAbvGr', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal',
]
ordinal = [
    'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'Functional',
    'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive', 'PoolQC', 'Fence',
]
binary = ['CentralAir']
cyclical = ['MoSold', 'YrSold']
ids = ['Id']
goal = ['SalePrice']

# `categorical` is a sorted list rather than a set: pandas >= 2.0 refuses a
# set as a column indexer (df[categorical]), and a fixed order keeps the
# one-hot column order identical from run to run.
assigned_columns = set().union(numerical, ordinal, binary, cyclical, ids, goal)
categorical = sorted(col for col in df_train.columns
                     if col not in assigned_columns)


In [None]:
# Set the target aside, then stack the training features on top of the test
# frame and index the combined frame by 'Id'.
y_train = df_train['SalePrice']
train_features = df_train.drop(columns='SalePrice')
df_full = pd.concat([train_features, df_test]).set_index('Id')

In [None]:
################# DEALING WITH NUMERICAL VARIABLES ###########################

In [None]:
# Merge month and year of sale into one fractional-year column 'TimeSold'
# (month m contributes (m - 1) / 12), register it as numerical, and drop the
# two source columns.
month_fraction = (df_full['MoSold'] - 1) / 12
df_full['TimeSold'] = month_fraction + df_full['YrSold']
numerical.append('TimeSold')
df_full.drop(columns=cyclical, inplace=True)

In [None]:
# Peek at the first rows of the numerical columns.
df_full.loc[:, numerical].head()

In [None]:
# Missing-value count per numerical column, showing only columns that have any.
numerical_is_missing = df_full[numerical].isna()
numerical_is_missing.sum()[numerical_is_missing.any()]

In [None]:
# Missing numerical values stand for a part the house does not have (no
# garage, no frontage, ...). Filling with 0 works for area-like columns in a
# regression and gives an easy HAS / HAS-NOT split.
numerical_block = df_full[numerical]
df_full[numerical] = numerical_block.fillna(0)

In [None]:
################# DEALING WITH ORDINAL VARIABLES ###########################

In [None]:
# Display the list of ordinal column names.
ordinal

In [None]:
# Distinct raw labels found in the ExterQual column.
df_full['ExterQual'].unique()

In [None]:
# Label -> integer rank tables for the ordinal columns. Higher means better.
# Where a table has an np.nan key, a missing label is encoded as 0.

# Generic quality scale (Excellent ... Poor).
ordinal_map_1 = {'Ex': 5,
                 'Gd': 4,
                 'TA': 3,
                 'Fa': 2,
                 'Po': 1,
                 np.nan: 0}
# Exposure scale (Good, Average, Minimum, No).
ordinal_map_2 = {'Gd': 4,
                 'Av': 3,
                 'Mn': 2,
                 'No': 1,
                 np.nan: 0}
# Finished-area rating scale.
ordinal_map_3 = {'GLQ': 6,
                 'ALQ': 5,
                 'BLQ': 4,
                 'Rec': 3,
                 'LwQ': 2,
                 'Unf': 1,
                 np.nan: 0}
# Functionality scale. The lowest level is coded 'Sal' (salvage only) in the
# dataset's data dictionary; the previous key 'Sav' could never match.
# No np.nan entry: missing values must be imputed before encoding.
ordinal_map_4 = {'Typ': 8,
                 'Min1': 7,
                 'Min2': 6,
                 'Mod': 5,
                 'Maj1': 4,
                 'Maj2': 3,
                 'Sev': 2,
                 'Sal': 1}
# Finish scale (Finished, Rough finished, Unfinished).
ordinal_map_5 = {'Fin': 3,
                 'RFn': 2,
                 'Unf': 1,
                 np.nan: 0}
# Paved / Partial / Not paved. 'P' now ranks strictly between 'N' and 'Y';
# previously 'Y' and 'P' shared the same rank, which erased the distinction.
ordinal_map_6 = {'Y': 3,
                 'P': 2,
                 'N': 1,
                 np.nan: 0}
# Fence quality scale.
ordinal_map_7 = {'GdPrv': 4,
                 'MnPrv': 3,
                 'GdWo': 2,
                 'MnWw': 1,
                 np.nan: 0}

In [None]:
# Ordinal columns grouped by the ordinal_map_N table that encodes them
# (ordinal_2 -> ordinal_map_2, and so on).
ordinal_2 = ['BsmtExposure']
ordinal_3 = ['BsmtFinType1', 'BsmtFinType2']
ordinal_4 = ['Functional']
ordinal_5 = ['GarageFinish']
ordinal_6 = ['PavedDrive']
ordinal_7 = ['Fence']
# Already numeric; needs no label mapping.
ordinal_num = ['OverallCond', 'OverallQual']

# Every remaining ordinal column falls into ordinal_1. Filtering `ordinal`
# keeps its original order, so the result is the same on every run
# (list(set(...)) was not), and the duplicated ordinal_3 argument is gone.
specially_mapped = set().union(ordinal_2, ordinal_3, ordinal_4, ordinal_5,
                               ordinal_6, ordinal_7, ordinal_num)
ordinal_1 = [col for col in ordinal if col not in specially_mapped]

In [None]:
# Kitchens and apartments must have quality ratings, so missing values in
# KitchenQual and Functional are replaced by the mode of the respective
# column in df_train.
df_full['Functional'] = df_full['Functional'].fillna(
    df_train['Functional'].mode()[0])
df_full['KitchenQual'] = df_full['KitchenQual'].fillna(
    df_train['KitchenQual'].mode()[0])


In [None]:
# Replace the labels of every ordinal column by their integer rank.
# Each entry pairs a group of columns with the table that encodes it.
ordinal_encoders = [
    (ordinal_1, ordinal_map_1),
    (ordinal_2, ordinal_map_2),
    (ordinal_3, ordinal_map_3),
    (ordinal_4, ordinal_map_4),
    (ordinal_5, ordinal_map_5),
    (ordinal_6, ordinal_map_6),
    (ordinal_7, ordinal_map_7),
]
for columns, scale in ordinal_encoders:
    # Separate real labels from the optional code for missing values, so the
    # encoding never relies on looking a NaN up as a dict key (that only
    # works when the NaN happens to be the very same object as the key).
    labels = {key: code for key, code in scale.items() if not pd.isna(key)}
    missing_code = next(
        (code for key, code in scale.items() if pd.isna(key)), None)
    for elem in columns:
        column = df_full[elem]
        unknown = set(column.dropna().unique()) - set(labels)
        has_unmapped_nan = missing_code is None and column.isna().any()
        if unknown or has_unmapped_nan:
            # Same failure mode as a plain dict lookup: refuse to guess.
            raise KeyError(
                f'{elem}: values without an ordinal code: '
                f'{sorted(map(str, unknown)) or "NaN"}')
        encoded = column.map(labels)
        if missing_code is not None:
            encoded = encoded.fillna(missing_code)
        df_full[elem] = encoded.astype(int)

In [None]:
################# DEALING WITH CATEGORICAL VARIABLES ###########################

In [None]:
def get_target_value(elem, df, y, agg='median'):
    """Aggregate the target column per level of a grouping column.

    Parameters
    ----------
    elem : str
        Name of the column whose levels define the groups.
    df : pandas.DataFrame
        Frame containing both ``elem`` and ``y``.
    y : str
        Name of the target column to aggregate.
    agg : str or callable, default 'median'
        Aggregation passed to ``GroupBy.agg`` (e.g. 'median' or 'mean').
        Which one works better can be checked through cross-validation.

    Returns
    -------
    pandas.DataFrame
        One row per level of ``elem`` (as the index) and a single column
        ``y`` holding the aggregated value.
    """
    return df[[elem, y]].groupby(elem).agg(agg)

In [None]:
# Store MSSubClass as text so each code becomes its own label.
df_full['MSSubClass'] = df_full['MSSubClass'].map(str)

In [None]:
# One-hot encode the categorical columns. The names go through sorted()
# because pandas >= 2.0 rejects a set as a column indexer, and a fixed order
# makes the dummy column order reproducible.
df_full_ohe = pd.get_dummies(df_full[sorted(categorical)])

In [None]:
# Append the numerical columns, then the ordinal ones, to the one-hot frame
# (joined on the shared index).
ohe_plus_numerical = df_full_ohe.join(df_full[numerical])
df_full_ohe = ohe_plus_numerical.join(df_full[ordinal])

In [None]:
# Split the encoded frame back into train and test by position: df_full was
# built as train rows followed by test rows, so the first len(y_train) rows
# are the training set. The former label slice .loc[1460:] was inclusive and
# put the last training row (Id 1460) into the test frame as well.
n_train = len(y_train)
df_train_ohe = (df_full_ohe.iloc[:n_train]
                .reset_index(drop=True)
                .join(y_train.reset_index(drop=True)))
df_test_ohe = df_full_ohe.iloc[n_train:]

In [None]:
# Persist both encoded frames as CSV files in the working directory.
for encoded_frame, csv_path in ((df_train_ohe, 'train_ohe.csv'),
                                (df_test_ohe, 'test_ohe.csv')):
    encoded_frame.to_csv(csv_path)

In [None]:
# Fill the missing entries of these columns with the most frequent value of
# the same column in df_train.
for column_name in ('SaleType', 'Exterior1st', 'Electrical',
                    'Exterior2nd', 'MSZoning'):
    missing_rows = df_full[column_name].isna()
    df_full.loc[missing_rows, column_name] = df_train[column_name].mode()[0]


In [None]:
# Any value still missing becomes the explicit label 'None'.
df_full = df_full.fillna('None')

In [None]:
# Rebuild the training frame from the cleaned features and re-attach the
# target by position. df_full is indexed by 'Id' while y_train keeps the
# original row labels of the CSV, so an index-aligned join pairs each house
# with the wrong price; the training rows are the first len(y_train) rows of
# df_full, in the same order as y_train.
n_train = len(y_train)
df_train = df_full.iloc[:n_train].assign(SalePrice=y_train.to_numpy())

In [None]:
# Show the categorical column names as a list.
[column for column in categorical]

In [None]:
# Preview: levels of one example categorical column that occur fewer than 50
# times in df_full. `elem` and `to_other` are assigned here so this cell runs
# on a fresh kernel; before, `to_other` only existed after the encoding loop
# further down had been executed.
elem = sorted(categorical)[0]
level_sizes = df_full.groupby(elem).size()
to_other = level_sizes[level_sizes < 50].index
to_other

In [None]:
# Preview the rare-level collapsing on one example categorical column.
# The cell computes its own inputs, so it no longer depends on `elem` and
# `to_other` left behind by other cells.
elem = sorted(categorical)[0]
level_sizes = df_full.groupby(elem).size()
to_other = level_sizes[level_sizes < 50].index
df_train[elem].where(~df_train[elem].isin(to_other), 'Other')

In [None]:
# Target-encode every categorical column:
#   1. levels seen fewer than 50 times in df_full are merged into 'Other';
#   2. each level is replaced by the aggregated SalePrice of that level in
#      df_train (see get_target_value).
# Used for any level that has no training rows after collapsing.
fallback_price = df_train['SalePrice'].median()
for elem in categorical:
    level_sizes = df_full.groupby(elem).size()
    to_other = level_sizes[level_sizes < 50].index
    # Collapse rare levels in BOTH frames. Collapsing them only in df_train
    # left levels in df_full that the lookup table did not contain, which
    # raised KeyError during the replacement below.
    df_train[elem] = df_train[elem].where(
        ~df_train[elem].isin(to_other), 'Other')
    df_full[elem] = df_full[elem].where(
        ~df_full[elem].isin(to_other), 'Other')
    level_prices = get_target_value(elem, df_train, 'SalePrice')['SalePrice']
    # Vectorised lookup; levels missing from the table map to NaN and then
    # receive the overall training median.
    df_full[elem] = df_full[elem].map(level_prices).fillna(fallback_price)

In [None]:
# Distinct values currently stored in df_full's MSSubClass column.
df_full['MSSubClass'].unique()

In [None]:
# Non-null count of every column, per MSSubClass level of df_train.
subclass_groups = df_train.groupby('MSSubClass')
subclass_groups.count()