## Data Cleaning:

In [54]:
# Import Libraries:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_transformer

In [69]:
# Read in Data:
# Loads the raw CSV into `df_train`; the cleaning cells below reassign
# columns on this same frame.

df_train = pd.read_csv('../datasets/train.csv')

### Address Improperly Cast Numeric Variables:

In [70]:
# Turn numerics (years / months) that should be objects into objects and fill
# missing values with 'none'.
#
# Casting with .astype(str) alone turns a missing value into the literal
# string 'nan', which a later fillna('none') can no longer detect. The null
# mask is therefore captured BEFORE the cast and used to write 'none' into
# the missing positions.

date_columns = ['Garage Yr Blt',
                'Year Built',
                'Year Remod/Add',
                'Mo Sold',
                'Yr Sold']

# True where the original (pre-cast) value was present.
date_is_present = df_train[date_columns].notna()

df_train[date_columns] = (df_train[date_columns]
                          .astype(str)
                          .where(date_is_present, 'none'))

### Address Missing Values:

In [71]:
# Fill NA with 'none' for categorical columns:
# (the list is applied by the fill loop in the next cell)

none_columns = ['Pool QC', 'Misc Feature',  'Alley',
             'Fence', 'Fireplace Qu', 'Garage Cond', 'Year Built',
             'Garage Qual', 'Garage Finish', 'Garage Type', 'Garage Yr Blt',
             'Year Remod/Add','Mo Sold', 'Yr Sold',
             'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2',
             'Bsmt Qual', 'Bsmt Cond', 'Mas Vnr Type']

# 'Bsmt Full Bath' and 'Bsmt Half Bath' used to be in the list above, which
# wrote the string 'none' into them and left the columns with mixed
# number/string values. They are bathroom counts in the Ames data dictionary
# (NOTE(review): confirm against the data documentation), so a missing value
# is filled with 0 here instead.
bsmt_bath_columns = ['Bsmt Full Bath', 'Bsmt Half Bath']
df_train[bsmt_bath_columns] = df_train[bsmt_bath_columns].fillna(0)

In [72]:
# Write the 'none' placeholder into every missing cell of the columns listed
# in none_columns, in a single vectorised call.
df_train[none_columns] = df_train[none_columns].fillna('none')

In [73]:
# Numeric columns whose null values are filled with 0
# (Lot Frontage is excluded and handled separately):

zero_columns = [
    'Mas Vnr Area',
    'BsmtFin SF 1',
    'BsmtFin SF 2',
    'Bsmt Unf SF',
    'Total Bsmt SF',
    'Garage Area',
    'Garage Cars',
]

In [74]:
# Replace every missing cell in the zero_columns columns with 0, in a single
# vectorised call.
df_train[zero_columns] = df_train[zero_columns].fillna(0)

In [75]:
# Explore Lot Frontage (strong correlation with sale price, 300+ missing values):
# median Lot Frontage per neighborhood.

df_train['Lot Frontage'].groupby(df_train['Neighborhood']).median()

Neighborhood
Blmngtn    43.0
Blueste    24.0
BrDale     21.0
BrkSide    51.0
ClearCr    79.5
CollgCr    70.0
Crawfor    70.0
Edwards    67.5
Gilbert    65.0
Greens     38.0
IDOTRR     60.0
MeadowV    21.0
Mitchel    75.0
NAmes      74.5
NPkVill    24.0
NWAmes     80.0
NoRidge    93.5
NridgHt    86.5
OldTown    60.0
SWISU      60.0
Sawyer     75.0
SawyerW    66.0
Somerst    72.0
StoneBr    65.0
Timber     85.0
Veenker    80.0
Name: Lot Frontage, dtype: float64

In [76]:
# Impute missing Lot Frontage with the median of the row's neighborhood.
#
# The original note on this cell reports that GrnHill and Landmark have no
# Lot Frontage values at all; such neighborhoods have no median of their own,
# so their rows fall back to the overall median. Previously every missing
# value received the overall median, ignoring the neighborhood.

lot_frontage = ['Lot Frontage']

for frontage_col in lot_frontage:
    # Per-row median of the row's neighborhood (NaN when the whole group is missing).
    neighborhood_median = (df_train
                           .groupby('Neighborhood')[frontage_col]
                           .transform('median'))
    # Computed before filling so it reflects observed values only.
    overall_median = df_train[frontage_col].median()

    df_train[frontage_col] = (df_train[frontage_col]
                              .fillna(neighborhood_median)
                              .fillna(overall_median))

In [77]:
# FOR TEST DATA ONLY: Explore test data Electrical missing value
# (frequency of each Electrical category; nulls are not counted):

df_train.loc[:, 'Electrical'].value_counts()

SBrkr    813
FuseA     48
FuseF     15
FuseP      1
Name: Electrical, dtype: int64

In [78]:
# FOR TEST DATA ONLY: Impute most common Electrical type (SBrkr) for missing values:

Electrical = ['Electrical']

# Single vectorised fill over the listed column(s).
df_train[Electrical] = df_train[Electrical].fillna('SBrkr')

### Address Outliers:

In [79]:
# Sq Ft Total
# NOTE(review): this cell contains no code, so no outlier handling is applied
# in this section. TODO confirm whether total-square-footage outliers were
# meant to be handled here.

### Cleaning Validation:

In [80]:
# Validate: No null values, data types are correct.
# Raise the row display limit so the per-column null counts are not truncated.

pd.options.display.max_rows = 999
df_train.isna().sum()

Id                 0
PID                0
MS SubClass        0
MS Zoning          0
Lot Frontage       0
Lot Area           0
Street             0
Alley              0
Lot Shape          0
Land Contour       0
Utilities          0
Lot Config         0
Land Slope         0
Neighborhood       0
Condition 1        0
Condition 2        0
Bldg Type          0
House Style        0
Overall Qual       0
Overall Cond       0
Year Built         0
Year Remod/Add     0
Roof Style         0
Roof Matl          0
Exterior 1st       0
Exterior 2nd       0
Mas Vnr Type       0
Mas Vnr Area       0
Exter Qual         0
Exter Cond         0
Foundation         0
Bsmt Qual          0
Bsmt Cond          0
Bsmt Exposure      0
BsmtFin Type 1     0
BsmtFin SF 1       0
BsmtFin Type 2     0
BsmtFin SF 2       0
Bsmt Unf SF        0
Total Bsmt SF      0
Heating            0
Heating QC         0
Central Air        0
Electrical         0
1st Flr SF         0
2nd Flr SF         0
Low Qual Fin SF    0
Gr Liv Area  

In [81]:
# Print each column's non-null count and dtype to check the cleaning steps above.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    int64  
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     878 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            878 non-null    object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

### Export Training Data to train_cleaned.csv:

In [68]:
# Export to CSV for next step >> Feature Engineering:
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed leading column. Confirm the next notebook
# expects that when it reads this file.

df_train.to_csv('../datasets/train_cleaned.csv')

### Repeat Above Process for Test Data:

The above process was repeated on test_data.csv. The cells above are kept in their training-oriented form (including the `df_train` variable name) to avoid duplicating the code.

In [82]:
# Export Test Data to CSV:
# NOTE(review): this writes `df_train`, which the "Read in Data" cell loads
# from '../datasets/train.csv'. Run top to bottom as written, this cell would
# save the cleaned training rows under the test filename. Presumably the read
# path was switched to the test file before this cell was executed; confirm,
# and make that step explicit so a fresh Run All cannot overwrite the test
# export with training data.
    
df_train.to_csv('../datasets/test_cleaned.csv')