In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

In [3]:
train_full = pd.read_csv('train.csv')
test_full = pd.read_csv('test.csv')
train_full.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [62]:
string_features = train_full.select_dtypes(include=['object'])
string_features.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [63]:
print('Unique values:')
{c: len(string_features[c].unique()) for c in string_features.columns}

Unique values:


{'MSZoning': 5,
 'Street': 2,
 'Alley': 3,
 'LotShape': 4,
 'LandContour': 4,
 'Utilities': 2,
 'LotConfig': 5,
 'LandSlope': 3,
 'Neighborhood': 25,
 'Condition1': 9,
 'Condition2': 8,
 'BldgType': 5,
 'HouseStyle': 8,
 'RoofStyle': 6,
 'RoofMatl': 8,
 'Exterior1st': 15,
 'Exterior2nd': 16,
 'MasVnrType': 5,
 'ExterQual': 4,
 'ExterCond': 5,
 'Foundation': 6,
 'BsmtQual': 5,
 'BsmtCond': 5,
 'BsmtExposure': 5,
 'BsmtFinType1': 7,
 'BsmtFinType2': 7,
 'Heating': 6,
 'HeatingQC': 5,
 'CentralAir': 2,
 'Electrical': 6,
 'KitchenQual': 4,
 'Functional': 7,
 'FireplaceQu': 6,
 'GarageType': 7,
 'GarageFinish': 4,
 'GarageQual': 6,
 'GarageCond': 6,
 'PavedDrive': 3,
 'PoolQC': 4,
 'Fence': 5,
 'MiscFeature': 5,
 'SaleType': 9,
 'SaleCondition': 6}

In [64]:
"""
Going to make dummy variables from this column,
but will only bother keeping the main ones to avoid 
issues with variable sample size not reflecting population
"""
print(string_features.groupby(['HouseStyle'])['HouseStyle'].count())
dummies_HouseStyle = pd.get_dummies(string_features.HouseStyle)
dummies_HouseStyle.columns

HouseStyle
1.5Fin    154
1.5Unf     14
1Story    726
2.5Fin      8
2.5Unf     11
2Story    445
SFoyer     37
SLvl       65
Name: HouseStyle, dtype: int64


Index(['1.5Fin', '1.5Unf', '1Story', '2.5Fin', '2.5Unf', '2Story', 'SFoyer',
       'SLvl'],
      dtype='object')

In [65]:
def make_numeric(df):
    '''
    Note: this function creates and returns a copied & updated dataframe.
    Ideally, would re-write to modify the existinf df in place to save time and
    to avoid memory constrints.
    
    Note: I've only done a few example ones here, to show an outline of how I would approach this.
    '''
    df.PavedDrive = df.PavedDrive == 'Y'
    df.CentralAir = df.CentralAir =='Y'
    df['PavedStreet'] = df.Street == 'Pave'
    del df['Street']
    df['UtilitiesAllPub'] = df.Utilities == 'AllPub'  
    del df['Utilities']
    dummies_HouseStyle = pd.get_dummies(df.HouseStyle,prefix='HouseStyle')
    del dummies_HouseStyle['HouseStyle_1.5Unf'],dummies_HouseStyle['HouseStyle_2.5Fin'],dummies_HouseStyle['HouseStyle_2.5Unf']
    df = pd.concat([df,dummies_HouseStyle],axis=1)
    del df['HouseStyle']
    return df
string_features = make_numeric(string_features)
string_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,MSZoning,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,...,MiscFeature,SaleType,SaleCondition,PavedStreet,UtilitiesAllPub,HouseStyle_1.5Fin,HouseStyle_1Story,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,RL,,Reg,Lvl,Inside,Gtl,CollgCr,Norm,Norm,1Fam,...,,WD,Normal,True,True,0,0,1,0,0
1,RL,,Reg,Lvl,FR2,Gtl,Veenker,Feedr,Norm,1Fam,...,,WD,Normal,True,True,0,1,0,0,0
2,RL,,IR1,Lvl,Inside,Gtl,CollgCr,Norm,Norm,1Fam,...,,WD,Normal,True,True,0,0,1,0,0
3,RL,,IR1,Lvl,Corner,Gtl,Crawfor,Norm,Norm,1Fam,...,,WD,Abnorml,True,True,0,0,1,0,0
4,RL,,IR1,Lvl,FR2,Gtl,NoRidge,Norm,Norm,1Fam,...,,WD,Normal,True,True,0,0,1,0,0
