In [1]:
## This is a slight alteration to the dataset obtained in the Preprocessing.ipynb
## We label encode all categorical variables (instead of non-nominal ones only)
## As intended, it will largely be a copy-paste effort.

import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
pd.set_option('display.max_columns', 500)

# Filling values

In this first part, we fill in the missing values and ordinally encode some features. This is taken directly from Preprocessing.ipynb.

In [2]:


# Read the two raw Ames CSV files; in both, the first CSV column becomes the index.

raw_housing, raw_realty = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./data/Ames_Housing_Price_Data.csv',
                     './data/Ames_Real_Estate_Data.csv')
)
raw_housing.shape


  raw_realty = pd.read_csv('./data/Ames_Real_Estate_Data.csv', index_col=0)


(2580, 80)

In [3]:
# Work on a re-indexed frame: the old index is kept as a regular column
# and the rows get a fresh 0..n-1 index.
housing = raw_housing.reset_index(drop=False)

In [4]:
# Let's start the clean-up with the columns that contain missing values.
# Not every missing value is expected to mean "non-existent", so list them
# all first and review them group by group afterwards.

cols = [name for name, has_missing in housing.isna().any().items() if has_missing]
print(cols)



['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


In [5]:
# Split the columns with missing values into themed groups so that each
# group can be investigated on its own.

Basement, Garage, Outdoor, Other = [], [], [], []
outdoor_names = ('Fence', 'PoolQC', 'FireplaceQu', 'MasVnrType', 'MasVnrArea')

for col in cols:
    # The first matching rule wins, in this order: basement, garage, outdoor, other.
    if 'Bsmt' in col:
        group = Basement
    elif 'Garage' in col:
        group = Garage
    elif col in outdoor_names:
        group = Outdoor
    else:
        group = Other
    group.append(col)

for group in (Basement, Garage, Outdoor, Other):
    print(group)

['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']
['MasVnrType', 'MasVnrArea', 'FireplaceQu', 'PoolQC', 'Fence']
['LotFrontage', 'Alley', 'Electrical', 'MiscFeature']


In [6]:
# Ordinal look-up tables: each maps a category label to an integer rank.
# 'NA' (feature not present) is always rank 0.

# Basement exposure, ranked No < Mn (minimum) < Av (average) < Gd (good).
# 'Gd' must rank above 'Av'; the previous table had these two swapped.
Exposure = {'NA': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
# Basement finish type, from unfinished up to good living quarters.
FinType = {'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
# Generic quality/condition scale, poor to excellent.
QualCond = {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

# Garage interior finish, unfinished to finished.
GarFin = {'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
# Fence quality labels, ranked in the order they are listed here.
FenceQual = {'NA': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}



In [7]:
# Let's start with the missing value(s) in total basement SF.
# The rows with a missing TotalBsmtSF also looked to have NaN for the basement
# type, so they are treated as having no basement: set the area to 0.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column, which warns on recent pandas and no longer updates the
# frame once Copy-on-Write is enabled.
housing['TotalBsmtSF'] = housing['TotalBsmtSF'].fillna(0)

# Additionally, mark every other basement column as 'NA' wherever the total SF is 0.
no_basement = housing['TotalBsmtSF'] == 0
for col in Basement:
    if col != 'TotalBsmtSF':
        housing[col] = housing[col].mask(no_basement, 'NA')

# Let's deal with the other missing values.

# BsmtExposure still has 2 missing values, but those rows have good basement quality.
# Fill them with the most common value, which is 'No'.
housing['BsmtExposure'] = housing['BsmtExposure'].fillna('No')

# BsmtFinType2 has 1 missing value, but its basement quality is good.
# Fill it with the most common value, which is 'Unf'.
housing['BsmtFinType2'] = housing['BsmtFinType2'].fillna('Unf')

# Now the ordinal columns can be converted to their integer ranks.
# An unexpected label raises KeyError instead of being encoded silently.
bsmt_ordinal_maps = {
    'BsmtExposure': Exposure,
    'BsmtFinType1': FinType,
    'BsmtFinType2': FinType,
    'BsmtQual': QualCond,
    'BsmtCond': QualCond,
}
for col, ranks in bsmt_ordinal_maps.items():
    housing[col] = housing[col].apply(lambda label, ranks=ranks: ranks[label])

# The numeric basement columns received the 'NA' marker above; turn it back into 0.
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFullBath', 'BsmtHalfBath']:
    housing[col] = housing[col].apply(lambda value: 0 if value == 'NA' else value)

# and take a look at our partial df
housing[Basement]

Unnamed: 0,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath
0,3,3,1,3,238.0,1,0.0,618.0,856.0,1.0,0.0
1,4,3,2,6,552.0,5,393.0,104.0,1049.0,1.0,0.0
2,3,3,1,5,737.0,1,0.0,100.0,837.0,0.0,0.0
3,2,3,1,1,0.0,1,0.0,405.0,405.0,0.0,0.0
4,4,3,1,6,643.0,1,0.0,167.0,810.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2575,3,3,1,1,0.0,1,0.0,952.0,952.0,0.0,0.0
2576,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0
2577,3,3,1,4,284.0,1,0.0,717.0,1001.0,0.0,0.0
2578,4,3,1,6,841.0,1,0.0,144.0,985.0,1.0,0.0


In [8]:
# Let's start by filling the missing GarageArea values with 0.
# (Results are assigned back instead of using fillna(inplace=True) on a
# selected column, which stops updating the frame under pandas Copy-on-Write.)
housing['GarageArea'] = housing['GarageArea'].fillna(0)

# Missing GarageType values become 'NoGarage'.
# The column is nominal; it is label encoded in the Encoding section.
housing['GarageType'] = housing['GarageType'].fillna('NoGarage')

# Now we can replace the NaN values in the ordinal garage columns with 'NA'.
for col in ('GarageQual', 'GarageCond', 'GarageFinish'):
    housing[col] = housing[col].fillna('NA')

# Missing values in rows that DO have a garage:
has_garage = housing['GarageType'] != 'NoGarage'

# A missing GarageYrBlt is replaced by YearBuilt.
# Only rows where the value is actually missing are touched; masking on
# has_garage alone would overwrite every recorded garage year.
missing_year = has_garage & housing['GarageYrBlt'].isna()
housing['GarageYrBlt'] = housing['GarageYrBlt'].mask(missing_year, housing['YearBuilt'])

# A missing GarageCars is replaced by the most common value: 2.
# Again restricted to missing entries, so recorded car counts are preserved.
missing_cars = has_garage & housing['GarageCars'].isna()
housing['GarageCars'] = housing['GarageCars'].mask(missing_cars, 2)

# Finally, convert the ordinal garage columns to their integer ranks.
housing['GarageQual'] = housing['GarageQual'].apply(lambda label: QualCond[label])
housing['GarageCond'] = housing['GarageCond'].apply(lambda label: QualCond[label])
housing['GarageFinish'] = housing['GarageFinish'].apply(lambda label: GarFin[label])

# Any garage year still missing becomes 0 (this works fine with RF based methods).
housing['GarageYrBlt'] = housing['GarageYrBlt'].fillna(0)

In [9]:


# Let's start with MasVnrType and MasVnrArea.

# Fill missing MasVnrArea with 0, then set the type to 'None' wherever the area is 0.
# MasVnrType is nominal; it is label encoded in the Encoding section.
# (Results are assigned back instead of using fillna(inplace=True) on a
# selected column, which stops updating the frame under pandas Copy-on-Write.)
housing['MasVnrArea'] = housing['MasVnrArea'].fillna(0)
housing['MasVnrType'] = housing['MasVnrType'].mask(housing['MasVnrArea'] == 0, 'None')

# Missing fireplace quality and fence values become 'NA'.
housing['FireplaceQu'] = housing['FireplaceQu'].fillna('NA')
housing['Fence'] = housing['Fence'].fillna('NA')

# PoolQC has no 'Po' (poor) level of its own, so 'Po' is used as the
# placeholder for a missing value: QualCond['Po'] - 1 == 0, and the real
# levels are shifted down by one along with it.
housing['PoolQC'] = housing['PoolQC'].fillna('Po')

# Now convert the ordinal columns to their integer ranks.
housing['FireplaceQu'] = housing['FireplaceQu'].apply(lambda label: QualCond[label])
housing['PoolQC'] = housing['PoolQC'].apply(lambda label: QualCond[label] - 1)
housing['Fence'] = housing['Fence'].apply(lambda label: FenceQual[label])

housing[Outdoor]



Unnamed: 0,MasVnrType,MasVnrArea,FireplaceQu,PoolQC,Fence
0,,0.0,4,0,0
1,BrkFace,149.0,0,0,0
2,,0.0,0,0,0
3,,0.0,0,0,0
4,,0.0,0,0,0
...,...,...,...,...,...
2575,,0.0,4,0,0
2576,,0.0,4,0,0
2577,,0.0,0,0,0
2578,BrkFace,144.0,3,0,0


In [10]:


# For LotFrontage and Alley a missing value is treated as "none":
# 0 for the frontage, 'None' for the alley type.
# (Results are assigned back instead of using fillna(inplace=True) on a
# selected column, which stops updating the frame under pandas Copy-on-Write.)
housing['LotFrontage'] = housing['LotFrontage'].fillna(0)
housing['Alley'] = housing['Alley'].fillna('None')

# For Electrical there is only one missing value, so use the most common: SBrkr.
housing['Electrical'] = housing['Electrical'].fillna('SBrkr')

# MiscFeature is nominal and descriptive.
# Fill its missing values here; it is label encoded in the Encoding section.
housing['MiscFeature'] = housing['MiscFeature'].fillna('None')

housing[Other]

Unnamed: 0,LotFrontage,Alley,Electrical,MiscFeature
0,0.0,,SBrkr,
1,42.0,,SBrkr,
2,60.0,,SBrkr,
3,80.0,,SBrkr,
4,70.0,,SBrkr,
...,...,...,...,...
2575,0.0,,FuseF,
2576,0.0,,FuseA,
2577,82.0,,FuseA,
2578,0.0,,SBrkr,


# Encoding

Here, we'll encode every categorical variable with a label encoder.

In [11]:
# Feature frame for encoding: an independent copy of housing without the
# PID and SalePrice columns (they are re-attached before saving).
df = housing.drop(columns=['PID', 'SalePrice']).copy()

In [12]:
## Partition the columns by dtype: object columns are categorical, the rest continuous

catCols, contCols = [], []
for name in df.columns:
    target = catCols if df[name].dtype == "O" else contCols
    target.append(name)

In [13]:
# Display the object-dtype column names collected in catCols.
catCols

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'GarageType',
 'PavedDrive',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [14]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder assigns each distinct label an integer code in 0..n_classes-1.
le = LabelEncoder()

In [15]:
# Label encode every categorical column in place.
# Each column gets its own fitted encoder, stored in label_encoders, so the
# integer codes can later be decoded with
# label_encoders[col].inverse_transform(...) or applied to new data.
# Re-fitting one shared encoder would discard every mapping but the last.
label_encoders = {}
for col in catCols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

In [16]:
# Load the Ames_215features table and list its column names.
otherAmes = pd.read_csv(filepath_or_buffer='data/Ames_215features.csv')

otherAmes.columns

Index(['LotShape', 'LandSlope', 'OverallQual', 'OverallCond', 'ExterQual',
       'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC',
       ...
       'SaleCondition_AdjLand', 'SaleCondition_Alloca', 'SaleCondition_Family',
       'SaleCondition_Partial', 'crime_rate', 'school_quality', 'HouseAge',
       'LastRemod', 'nb_appreciation', 'nb_income'],
      dtype='object', length=212)

In [17]:
# Append six extra columns from otherAmes next to the encoded features.
# NOTE(review): concat aligns on the row index, so this presumes the rows of
# otherAmes are in the same order as the rows of df - confirm.
extra_features = ['crime_rate', 'school_quality', 'HouseAge',
                  'LastRemod', 'nb_appreciation', 'nb_income']
df = pd.concat([df, otherAmes[extra_features]], axis='columns')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580 entries, 0 to 2579
Data columns (total 85 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GrLivArea        2580 non-null   int64  
 1   MSSubClass       2580 non-null   int64  
 2   MSZoning         2580 non-null   int64  
 3   LotFrontage      2580 non-null   float64
 4   LotArea          2580 non-null   int64  
 5   Street           2580 non-null   int64  
 6   Alley            2580 non-null   int64  
 7   LotShape         2580 non-null   int64  
 8   LandContour      2580 non-null   int64  
 9   Utilities        2580 non-null   int64  
 10  LotConfig        2580 non-null   int64  
 11  LandSlope        2580 non-null   int64  
 12  Neighborhood     2580 non-null   int64  
 13  Condition1       2580 non-null   int64  
 14  Condition2       2580 non-null   int64  
 15  BldgType         2580 non-null   int64  
 16  HouseStyle       2580 non-null   int64  
 17  OverallQual   

In [19]:
# Re-attach the PID and SalePrice columns that were set aside before encoding,
# then write the finished table (index included) to CSV.
pid_and_price = housing[['PID', 'SalePrice']]
df = pd.concat([df, pid_and_price], axis='columns')
df.to_csv(path_or_buf='RF_LabEnc_data.csv')

## We are now done. This will be used in a RandomForest notebook