# Import appropriate packages and set analysis options.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell

# Display toggle: when True every expression statement in a cell is echoed,
# otherwise only the last expression of the cell is shown.
print_all_output = True
if print_all_output:
    InteractiveShell.ast_node_interactivity = 'all'
else:
    InteractiveShell.ast_node_interactivity = 'last_expr'

# Define convenient variables and functions.

In [2]:
# Relative path of the training CSV; passed to load_data() when the data is loaded.
TRAINING_DATA_PATH = "./Data/train.csv"

# Load the file given by 'path' into a Pandas DataFrame.
def load_data(path):
    """Read the CSV file at `path` into a DataFrame indexed by its 'Id' column.

    The first line of the file is used as the header. The number of rows and
    columns of the loaded frame is printed, and the frame is returned.
    """
    frame = pd.read_csv(path, index_col='Id', header=0)
    n_rows, n_cols = frame.shape
    print("Loaded data dimensions: ", n_rows, "rows, ", n_cols, "columns")
    return frame

# Print all columns which contain NaN values.
def print_if_nan(df, columns, col_type):
    """Print every column of `df` listed in `columns` that contains NaN values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    columns : iterable of column labels
        Columns of `df` to check; they are reported in sorted order.
    col_type : str
        Label used in the summary line (for example "int" or "float").

    Output
    ------
    Prints a blank line, a summary line with the number of affected columns,
    then one "<column> \t <count>" line per column with at least one NaN.
    Nothing is returned.
    """
    nan_report = []
    for col in np.sort(columns):
        # Count on the frame that was passed in, not on a notebook-level
        # global, so the function works for any DataFrame and on a fresh kernel.
        num_nan = int(df[col].isnull().sum())
        if num_nan > 0:
            nan_report.append((col, num_nan))
    print("\n", len(nan_report), " ", col_type, "-type columns with NaN values.", sep='')
    for col, count in nan_report:
        print(col, "\t", count)

# Load and preview the housing training data.

In [3]:
# Read the training CSV into `df`; load_data also prints the loaded dimensions.
df = load_data(TRAINING_DATA_PATH)

Loaded data dimensions:  1460 rows,  80 columns


In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


# Clean and process the training data.

In [6]:
# Partition data into features (X-data) and targets (Y-data).
# The target is selected by name instead of by a hard-coded column position,
# so the split stays correct if columns are added, removed or reordered.
target_col = 'SalePrice'
x_train = df.drop(columns=target_col)
y_train = df[target_col]

In [7]:
# Determine which features have missing values.
int_cols = x_train.select_dtypes(include=['int']).columns
print_if_nan(x_train, int_cols, "int")

float_cols = x_train.select_dtypes(include=['float']).columns
print_if_nan(x_train, float_cols, "float")

# Select the string-valued columns through the 'object' dtype explicitly;
# 'dtype' is not a documented dtype selector for select_dtypes.
string_cols = x_train.select_dtypes(include=['object']).columns
print_if_nan(x_train, string_cols, "string")


0 int-type columns with NaN values.

3 float-type columns with NaN values.
GarageYrBlt 	 81
LotFrontage 	 259
MasVnrArea 	 8

16 string-type columns with NaN values.
Alley 	 1369
BsmtCond 	 37
BsmtExposure 	 38
BsmtFinType1 	 37
BsmtFinType2 	 38
BsmtQual 	 37
Electrical 	 1
Fence 	 1179
FireplaceQu 	 690
GarageCond 	 81
GarageFinish 	 81
GarageQual 	 81
GarageType 	 81
MasVnrType 	 8
MiscFeature 	 1406
PoolQC 	 1453
