In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from ds_utils import *
%matplotlib inline

In [2]:
# Load the training data and discard the 'Id' column in a single expression.
DF = pd.read_csv('train.csv').drop(columns=['Id'])

In [3]:
def summary_missing_data(data):
    """Summarize the missing values in each column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (indexed by column name) with two
        columns: ``'# of Missing'`` is the count of null entries and
        ``'Percent'`` is that count divided by the number of rows, i.e. a
        fraction between 0 and 1 (it is not multiplied by 100). Rows are
        sorted by ``'# of Missing'`` in descending order; columns with the
        same count keep the order they have in ``data``.
    """
    # Build the null mask once and derive both statistics from it.
    null_mask = data.isnull()
    total = null_mask.sum()
    percent = total / len(data)

    missing_data = pd.DataFrame({'# of Missing': total, 'Percent': percent})
    # Sort the assembled frame once, with a stable algorithm, so the row order
    # is deterministic instead of depending on how two independently sorted
    # Series happen to be re-aligned.
    return missing_data.sort_values('# of Missing', ascending=False, kind='mergesort')

In [4]:
# Show, for every column of the loaded training frame, how many values are
# missing and what fraction of the rows that represents.
summary_missing_data(DF)

Unnamed: 0,# of Missing,Percent
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageType,81,0.055479
GarageCond,81,0.055479
GarageFinish,81,0.055479
GarageQual,81,0.055479


#### NA's (or NaNs) as value
1. **Alley:** Type of alley access to property
2. **BsmtQual:** Evaluates the height of the basement
3. **BsmtCond:** Evaluates the general condition of the basement
4. **BsmtExposure:** Refers to walkout or garden level walls
5. **BsmtFinType1:** Rating of basement finished area
6. **BsmtFinType2:** Rating of basement finished area (if multiple types)
7. **FireplaceQu:** Fireplace quality
8. **GarageType:** Garage location
9. **GarageFinish:** Interior finish of the garage
10. **GarageQual:** Garage quality
11. **GarageCond:** Garage condition
12. **PoolQC:** Pool quality (has the most missing values of all the columns listed here)
13. **Fence:** Fence quality
14. **MiscFeature:** Miscellaneous feature not covered in other categories

##### The two below are listed as None in description, but NA in dataset
* MasVnrArea: Masonry veneer area in square feet
* MasVnrType: Masonry veneer type

##### Others
* LotFrontage: Maybe set to NA if there is no street connected to property?
* GarageYrBlt: Set to NA if the above Garage attributes are set as NA

* Electrical: Electrical system. Only one property, at row 1381, has a missing value in Electrical.


### Fill NA in Electrical and mark absent masonry veneer as NA
Fill the missing `Electrical` value with `SBrkr`, then replace `None` in `MasVnrType` and `0` in `MasVnrArea` with NaN.

In [5]:
# Assign each cleaned column back to DF. Calling fillna/replace with
# inplace=True on a column selection is not guaranteed to modify DF itself
# (it warns in pandas 2.x and has no effect under Copy-on-Write).

# Fill missing 'Electrical' entries with the hard-coded category 'SBrkr'
# (presumably the most common value — verify against the data).
DF['Electrical'] = DF['Electrical'].fillna('SBrkr')

# Treat the literal string 'None' in 'MasVnrType' as a missing value.
DF['MasVnrType'] = DF['MasVnrType'].replace(to_replace=['None'], value=np.nan)

# Treat an area of 0 as missing. Entries that are already NaN stay NaN, so no
# intermediate fill with 0 is needed.
DF['MasVnrArea'] = DF['MasVnrArea'].replace(to_replace=[0], value=np.nan)

### Get Dummies

In [6]:
# One-hot encode the non-numeric columns; dummy_na=True also adds a
# '<column>_nan' indicator column for each encoded column.
DFdum = pd.get_dummies(DF, dummy_na=True)

# Only the last expression of a cell is displayed, so print this peek at the
# PoolQC missing-value indicator explicitly instead of discarding it.
print(DFdum['PoolQC_nan'].head())

# Missing-value summary of the encoded frame (shown as the cell output).
summary_missing_data(DFdum)

Unnamed: 0,# of Missing,Percent
MasVnrArea,869,0.595205
LotFrontage,259,0.177397
GarageYrBlt,81,0.055479
SaleCondition_nan,0,0.000000
Condition2_RRAe,0,0.000000
Condition1_RRNn,0,0.000000
Condition1_nan,0,0.000000
Condition2_Artery,0,0.000000
Condition2_Feedr,0,0.000000
Condition2_Norm,0,0.000000


Work in progress: the cell below starts preparing the data (splitting off the target and scaling the features) for PCA.

In [7]:
# Split the target off the cleaned frame. NOTE: DF is rebound without
# 'SalePrice', so this cell cannot be re-run without first re-running the
# cells above.
y = DF.loc[:, ['SalePrice']].values
DF = DF.drop(['SalePrice'], axis=1)

# NOTE(review): datatrain / datatest are not defined in any cell shown in this
# notebook — presumably they come from ds_utils or an earlier session; confirm
# they exist on a fresh kernel.
X = datatrain.drop(columns=['SalePrice', 'Id'])
Y = datatrain['SalePrice']
Xt = datatest.drop(columns=['Id'])

# Fit the scaler on the training features only, then apply those same fitted
# statistics to the test features so both sets are on one common scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
Xt_scaled = scaler.transform(Xt)

