# Data Pipeline
### This script will run all data transformation to go from "raw" data to "modelling" data.

In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [16]:
train = pd.read_csv('src/train.csv')
test = pd.read_csv('src/test.csv')

Drop "Id" column from train dataset

In [17]:
train.drop("Id", axis=1, inplace=True)

Transform train target to ln(1+x)

In [18]:
train["SalePrice"] = np.log1p(train["SalePrice"])

### Convert from Numerical to Categorical variables on both train and test

In [19]:
# Train
train["OverallQual"]=train["OverallQual"].astype('category', ordered=True, categories=[1,2,3,4,5,6,7,8,9,10])
train["MSSubClass"]=train["MSSubClass"].astype('category', ordered=False)
train["OverallCond"]=train["OverallCond"].astype('category', ordered=True, categories=[1,2,3,4,5,6,7,8,9,10])
train["MoSold"]=train["MoSold"].astype('category', ordered=True, categories=[1,2,3,4,5,6,7,8,9,10,11,12])
# Test
test["OverallQual"]=test["OverallQual"].astype('category', ordered=True, categories=[1,2,3,4,5,6,7,8,9,10])
test["MSSubClass"]=test["MSSubClass"].astype('category', ordered=False)
test["OverallCond"]=test["OverallCond"].astype('category', ordered=True, categories=[1,2,3,4,5,6,7,8,9,10])
test["MoSold"]=test["MoSold"].astype('category', ordered=True, categories=[1,2,3,4,5,6,7,8,9,10,11,12])

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


### Null inputation

#### Train

In [20]:
# Alley : NA means No Alley access
train["Alley"].fillna("No Alley", inplace=True)
#MasVnrType: Drop nulls only on train
train.dropna(axis=0,subset=["MasVnrType"], inplace=True)
train.reset_index(drop=True, inplace=True)
# BsmtQual : NA means No basement
train["BsmtQual"].fillna("No Basement", inplace=True)
# BsmtCond : NA means No basement
train["BsmtCond"].fillna("No Basement", inplace=True)
# BsmtExposure : NA means No basement
train["BsmtExposure"].fillna("No Basement", inplace=True)
#  BsmtFinType1 : NA means No basement
train["BsmtFinType1"].fillna("No Basement", inplace=True)
#  BsmtFinType2 : NA means No basement
train["BsmtFinType2"].fillna("No Basement", inplace=True)
# Electrical: Mode fill
train["Electrical"].fillna("SBrkr", inplace=True)
#FireplaceQu : NA means No fireplace
train["FireplaceQu"].fillna("No fireplace", inplace=True)
#GarageType : NA means No garage
train["GarageType"].fillna("No garage", inplace=True)
#GarageFinish : NA means No garage
train["GarageFinish"].fillna("No garage", inplace=True)
#GarageQual : NA means No garage
train["GarageQual"].fillna("No garage", inplace=True)
#GarageCond : NA means No garage
train["GarageCond"].fillna("No garage", inplace=True)
#PoolQC : NA means No pool
train["PoolQC"].fillna("No pool", inplace=True)
#Fence : NA means No fence
train["Fence"].fillna("No fence", inplace=True)
#MiscFeature : NA means None
train["MiscFeature"].fillna("None", inplace=True)
#LotFrontage : Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
train["LotFrontage"] = train.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
#LGarageYrBlt : Based on the above, we will fill nulls with 0
train["GarageYrBlt"].fillna(0, inplace=True)

#### Test

In [21]:
# On test we will not drop any nulls and we will use medians from train when filling missing information
# Alley : NA means No Alley access
test["Alley"].fillna("No Alley", inplace=True)
# BsmtQual : NA means No basement
test["BsmtQual"].fillna("No Basement", inplace=True)
# BsmtCond : NA means No basement
test["BsmtCond"].fillna("No Basement", inplace=True)
# BsmtExposure : NA means No basement
test["BsmtExposure"].fillna("No Basement", inplace=True)
#  BsmtFinType1 : NA means No basement
test["BsmtFinType1"].fillna("No Basement", inplace=True)
#  BsmtFinType2 : NA means No basement
test["BsmtFinType2"].fillna("No Basement", inplace=True)
# Electrical: Mode fill
test["Electrical"].fillna("SBrkr", inplace=True)
#FireplaceQu : NA means No fireplace
test["FireplaceQu"].fillna("No fireplace", inplace=True)
#GarageType : NA means No garage
test["GarageType"].fillna("No garage", inplace=True)
#GarageFinish : NA means No garage
test["GarageFinish"].fillna("No garage", inplace=True)
#GarageQual : NA means No garage
test["GarageQual"].fillna("No garage", inplace=True)
#GarageCond : NA means No garage
test["GarageCond"].fillna("No garage", inplace=True)
#PoolQC : NA means No pool
test["PoolQC"].fillna("No pool", inplace=True)
#Fence : NA means No fence
test["Fence"].fillna("No fence", inplace=True)
#MiscFeature : NA means None
test["MiscFeature"].fillna("None", inplace=True)
#LotFrontage : Group  train by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
test["LotFrontage"] = train.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
#LGarageYrBlt : Based on the above, we will fill nulls with 0
test["GarageYrBlt"].fillna(0, inplace=True)

Remove columns that will not be used on the model

In [22]:
remove_multicolinearity=['GarageArea', 'GarageYrBlt', 'TotRmsAbvGrd', '1stFlrSF']
remove_categorical=['Utilities', 'Street', 'Condition2', 'PoolQC']
remove = remove_multicolinearity + remove_categorical

In [23]:
train=train.loc[:,train.columns.difference(remove)]
test=test.loc[:,test.columns.difference(remove)]

In [37]:
train.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452 entries, 0 to 1451
Columns: 72 entries, 2ndFlrSF to YrSold
dtypes: category(4), float64(3), int64(26), object(39)
memory usage: 779.1+ KB


In [42]:

cols = train.columns.difference(train._get_numeric_data().columns)
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(train[c].values)) 
    train[c] = lbl.transform(list(train[c].values))
    test[c] = lbl.transform(list(test[c].values))

# shape        
print('Shape train: {}'.format(train.shape))
print('Shape test: {}'.format(test.shape))

ValueError: y contains new labels: ['nan']

In [44]:
test.isnull().sum()

2ndFlrSF          0
3SsnPorch         0
Alley             0
BedroomAbvGr      0
BldgType          0
BsmtCond          0
BsmtExposure      0
BsmtFinSF1        1
BsmtFinSF2        1
BsmtFinType1      0
BsmtFinType2      0
BsmtFullBath      2
BsmtHalfBath      2
BsmtQual          0
BsmtUnfSF         1
CentralAir        0
Condition1        0
Electrical        0
EnclosedPorch     0
ExterCond         0
ExterQual         0
Exterior1st       1
Exterior2nd       1
Fence             0
FireplaceQu       0
Fireplaces        0
Foundation        0
FullBath          0
Functional        2
GarageCars        1
                 ..
LandContour       0
LandSlope         0
LotArea           0
LotConfig         0
LotFrontage       7
LotShape          0
LowQualFinSF      0
MSSubClass        0
MSZoning          4
MasVnrArea       15
MasVnrType       16
MiscFeature       0
MiscVal           0
MoSold            0
Neighborhood      0
OpenPorchSF       0
OverallCond       0
OverallQual       0
PavedDrive        0


In [24]:
train.shape

(1452, 72)

In [25]:
test.shape

(1459, 72)

In [29]:
x=pd.get_dummies(train)

In [30]:
x.shape

(1452, 326)

In [31]:
s=pd.get_dummies(test)

In [32]:
s.shape

(1459, 314)