### Import Packages

In [438]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

### Read in data

In [391]:
# Load the training set from the project's data directory into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer='data/train.csv')

In [392]:
#train_df.shape

In [393]:
#train_df.columns

In [394]:
#train_df

### Think about nulls

In [395]:
# Count missing values per column for the first 60 columns.
train_df.iloc[:, :60].isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
Alley           1369
LotShape           0
LandContour        0
Utilities          0
LotConfig          0
LandSlope          0
Neighborhood       0
Condition1         0
Condition2         0
BldgType           0
HouseStyle         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
RoofStyle          0
RoofMatl           0
Exterior1st        0
Exterior2nd        0
MasVnrType         8
MasVnrArea         8
ExterQual          0
ExterCond          0
Foundation         0
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinSF1         0
BsmtFinType2      38
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
Heating            0
HeatingQC          0
CentralAir         0
Electrical         1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath 

In [396]:
# Count missing values per column for the remaining columns (60 onward).
train_df.iloc[:, 60:].isna().sum()

GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
PoolQC           1453
Fence            1179
MiscFeature      1406
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
SaleCondition       0
SalePrice           0
dtype: int64

### Looks like I need to impute some values...
Here is my strategy for imputing values:
* LotFrontage: median? or does a lack of value mean the lot frontage is 0?
* Alley: should be 'NA'
* MasVnrType: should be 'None'
* MasVnrArea: should be 0
* BsmtQual: 'NA'
* BsmtCond: 'NA'
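* BsmtExposure: 'NA' — but note it has one more missing value (38) than BsmtQual/BsmtCond (37), so **one row is missing BsmtExposure even though its other basement fields are filled in**; that row needs a closer look. (BsmtFinType2 also has 38.)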
* BsmtFinType1: 'NA'
* BsmtFinType2: 'NA'
* Electrical: just one missing... mode? Or should I look at the other features of that house and compare to other houses to see if I can determine what value should go there?
* FireplaceQu: should be 'NA'?
* GarageType, GarageFinish, GarageQual, GarageCond: 'NA'
* PoolQC: should be 'NA'?
* Fence: should be 'NA'?
* MiscFeature: should be 'NA'?

In [397]:
# Fill the missing entries column by column. Text columns get a literal
# placeholder ('NA', or 'None' for MasVnrType); MasVnrArea gets 0.
fill_values = {
    'Alley': 'NA',
    'MasVnrType': 'None',
    'MasVnrArea': 0,
    'BsmtQual': 'NA',
    'BsmtCond': 'NA',
    'BsmtExposure': 'NA',
    'BsmtFinType1': 'NA',
    'BsmtFinType2': 'NA',
    'FireplaceQu': 'NA',
    'PoolQC': 'NA',
    'Fence': 'NA',
    'MiscFeature': 'NA',
    'GarageType': 'NA',
    'GarageFinish': 'NA',
    'GarageQual': 'NA',
    'GarageCond': 'NA',
}
for column, placeholder in fill_values.items():
    train_df[column] = train_df[column].fillna(placeholder)


## Start converting codes to numbers

### Coding Ranked Data

#### BsmtFinType1 and 2

In [398]:
# BsmtFinType levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('NA' -> 0, ..., 'GLQ' -> 6).
ordered_BsmtFinType = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
#ordered_BsmtFinType

In [399]:
# Encode BsmtFinType1 as integer codes: a value's code is its position in
# ordered_BsmtFinType, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['BsmtFinType1'] = train_df['BsmtFinType1'].astype(
    pd.CategoricalDtype(categories=ordered_BsmtFinType, ordered=True)
).cat.codes
#train_df.BsmtFinType1

In [400]:
# Encode BsmtFinType2 as integer codes: a value's code is its position in
# ordered_BsmtFinType, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['BsmtFinType2'] = train_df['BsmtFinType2'].astype(
    pd.CategoricalDtype(categories=ordered_BsmtFinType, ordered=True)
).cat.codes
#train_df.BsmtFinType2

#### BsmtExposure

In [401]:
# BsmtExposure levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('NA' -> 0, ..., 'Gd' -> 4).
ordered_BsmtExposure = ['NA', 'No', 'Mn', 'Av', 'Gd']
#ordered_BsmtExposure

In [402]:
# Encode BsmtExposure as integer codes: a value's code is its position in
# ordered_BsmtExposure, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['BsmtExposure'] = train_df['BsmtExposure'].astype(
    pd.CategoricalDtype(categories=ordered_BsmtExposure, ordered=True)
).cat.codes
#train_df.BsmtExposure

#### BsmtQual

In [403]:
# BsmtQual levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('NA' -> 0, ..., 'Ex' -> 5).
ordered_BsmtQual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
#ordered_BsmtQual

In [404]:
# Encode BsmtQual as integer codes: a value's code is its position in
# ordered_BsmtQual, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['BsmtQual'] = train_df['BsmtQual'].astype(
    pd.CategoricalDtype(categories=ordered_BsmtQual, ordered=True)
).cat.codes
#train_df.BsmtQual

#### BsmtCond

In [405]:
# BsmtCond levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('NA' -> 0, ..., 'Ex' -> 5).
ordered_BsmtCond = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
#ordered_BsmtCond

In [406]:
# Encode BsmtCond as integer codes: a value's code is its position in
# ordered_BsmtCond, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['BsmtCond'] = train_df['BsmtCond'].astype(
    pd.CategoricalDtype(categories=ordered_BsmtCond, ordered=True)
).cat.codes
#train_df.BsmtCond

#### ExterQual

In [407]:
# Shared level order for the exterior score columns (ExterQual and ExterCond),
# listed from lowest rank to highest so that a level's position in the list
# is its integer code ('Po' -> 0, ..., 'Ex' -> 4).
ordered_ExterScore = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
#ordered_ExterScore

In [408]:
# Encode ExterQual as integer codes: a value's code is its position in
# ordered_ExterScore, and any value not in that list becomes -1.
# The level list is named ordered_ExterScore (shared with ExterCond); the
# previous reference to `ordered_ExterQual` pointed at a name that is never
# defined and raised NameError on a fresh kernel.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['ExterQual'] = train_df['ExterQual'].astype(
    pd.CategoricalDtype(categories=ordered_ExterScore, ordered=True)
).cat.codes
#train_df.ExterQual

#### ExterCond

In [409]:
# Encode ExterCond as integer codes: a value's code is its position in
# ordered_ExterScore, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['ExterCond'] = train_df['ExterCond'].astype(
    pd.CategoricalDtype(categories=ordered_ExterScore, ordered=True)
).cat.codes
#train_df.ExterCond

#### HeatingQC

In [410]:
# HeatingQC levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('Po' -> 0, ..., 'Ex' -> 4).
ordered_HeatingQC = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
#ordered_HeatingQC

In [411]:
# Encode HeatingQC as integer codes: a value's code is its position in
# ordered_HeatingQC, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['HeatingQC'] = train_df['HeatingQC'].astype(
    pd.CategoricalDtype(categories=ordered_HeatingQC, ordered=True)
).cat.codes
#train_df.HeatingQC

#### KitchenQual

In [412]:
# KitchenQual levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('Po' -> 0, ..., 'Ex' -> 4).
ordered_KitchenQual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
#ordered_KitchenQual

In [413]:
# Encode KitchenQual as integer codes: a value's code is its position in
# ordered_KitchenQual, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['KitchenQual'] = train_df['KitchenQual'].astype(
    pd.CategoricalDtype(categories=ordered_KitchenQual, ordered=True)
).cat.codes
#train_df.KitchenQual

#### Functional

In [414]:
# Functional levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('Sal' -> 0, ..., 'Typ' -> 7).
ordered_Functional = ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ']
#ordered_Functional

In [415]:
# Encode Functional as integer codes: a value's code is its position in
# ordered_Functional, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['Functional'] = train_df['Functional'].astype(
    pd.CategoricalDtype(categories=ordered_Functional, ordered=True)
).cat.codes
#train_df.Functional

#### FireplaceQu

In [416]:
# FireplaceQu levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('NA' -> 0, ..., 'Ex' -> 5).
ordered_FireplaceQu = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
#ordered_FireplaceQu

In [417]:
# Encode FireplaceQu as integer codes: a value's code is its position in
# ordered_FireplaceQu, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['FireplaceQu'] = train_df['FireplaceQu'].astype(
    pd.CategoricalDtype(categories=ordered_FireplaceQu, ordered=True)
).cat.codes
#train_df.FireplaceQu

#### GarageFinish

In [418]:
# GarageFinish levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('NA' -> 0, ..., 'Fin' -> 3).
ordered_GarageFinish = ['NA', 'Unf', 'RFn', 'Fin']
#ordered_GarageFinish

In [419]:
# Encode GarageFinish as integer codes: a value's code is its position in
# ordered_GarageFinish, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['GarageFinish'] = train_df['GarageFinish'].astype(
    pd.CategoricalDtype(categories=ordered_GarageFinish, ordered=True)
).cat.codes
#train_df.GarageFinish

#### GarageQual

In [420]:
# GarageQual levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('NA' -> 0, ..., 'Ex' -> 5).
ordered_GarageQual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
#ordered_GarageQual

In [421]:
# Encode GarageQual as integer codes: a value's code is its position in
# ordered_GarageQual, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['GarageQual'] = train_df['GarageQual'].astype(
    pd.CategoricalDtype(categories=ordered_GarageQual, ordered=True)
).cat.codes
#train_df.GarageQual

#### GarageCond

In [422]:
# GarageCond levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('NA' -> 0, ..., 'Ex' -> 5).
ordered_GarageCond = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
#ordered_GarageCond

In [423]:
# Encode GarageCond as integer codes: a value's code is its position in
# ordered_GarageCond, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['GarageCond'] = train_df['GarageCond'].astype(
    pd.CategoricalDtype(categories=ordered_GarageCond, ordered=True)
).cat.codes
#train_df.GarageCond

#### PavedDrive

In [424]:
# PavedDrive levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('N' -> 0, 'P' -> 1, 'Y' -> 2).
ordered_PavedDrive = ['N', 'P', 'Y']
#ordered_PavedDrive

In [425]:
# Encode PavedDrive as integer codes: a value's code is its position in
# ordered_PavedDrive, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['PavedDrive'] = train_df['PavedDrive'].astype(
    pd.CategoricalDtype(categories=ordered_PavedDrive, ordered=True)
).cat.codes
#train_df.PavedDrive

#### PoolQC

In [426]:
# PoolQC levels listed from lowest rank to highest, so that a level's
# position in the list is its integer code ('NA' -> 0, ..., 'Ex' -> 4).
ordered_PoolQC = ['NA', 'Fa', 'TA', 'Gd', 'Ex']
#ordered_PoolQC

In [427]:
# Encode PoolQC as integer codes: a value's code is its position in
# ordered_PoolQC, and any value not in that list becomes -1.
# Series.astype("category", categories=..., ordered=...) is no longer accepted
# by pandas, so the categories/order are supplied through a CategoricalDtype.
# NOTE: not idempotent -- re-running on the already-encoded column yields -1
# everywhere; reload train_df before re-running.
train_df['PoolQC'] = train_df['PoolQC'].astype(
    pd.CategoricalDtype(categories=ordered_PoolQC, ordered=True)
).cat.codes
#train_df.PoolQC

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
1430    0
1431    0
1432    0
1433    0
1434    0
1435    0
1436    0
1437    0
1438    0
1439    0
1440    0
1441    0
1442    0
1443    0
1444    0
1445    0
1446    0
1447    0
1448    0
1449    0
1450    0
1451    0
1452    0
1453    0
1454    0
1455    0
1456    0
1457    0
1458    0
1459    0
Name: PoolQC, Length: 1460, dtype: int8

## Throw it in a model

In [465]:
#train_df.columns
#train_df[['Id', 'Neighborhood', 'Condition1', 'Condition2', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'ExterQual', 'ExterCond', ]]

In [478]:
#from sklearn import linear_model
#reg = linear_model.LinearRegression()
#reg.fit(train_df[''], )

In [None]:
# IGNORE THIS
#_ = plt.scatter(train_df.OverallQual, train_df.OverallCond, alpha=0.025)
#_ = plt.xlabel('Overall Quality')
#_ = plt.ylabel('Overall Condition')
#plt.show()

In [None]:
#_ = plt.hist(train_df[(train_df.OverallCond == 5) & (train_df.OverallQual == 8)].SalePrice, bins=10)
#plt.show()

In [436]:
#medians_by_hood = train_df.groupby(['Neighborhood', 'MoSold'])['SalePrice'].median()