In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import csv

from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import warnings
# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides pandas deprecation/future warnings raised by
# the cleaning cells below -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Inline matplotlib figures, rendered with the 'retina' figure format.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
# Seed NumPy's global random number generator.
np.random.seed(42)

# Apply the 'fivethirtyeight' matplotlib style sheet to all plots.
plt.style.use('fivethirtyeight')

## Kaggle Test Data Cleaning

### Step 1: Load in the provided Kaggle Testing Data

- **test.csv** -- this data contains the test data for our model.

In [2]:
# Read the Kaggle test set, using the `Id` column as the row index.
ames_df_test = pd.read_csv(
    '../data/test.csv',
    index_col='Id',
)

#### The Data Vacuum Clean (again)

- In the next several cells I will perform the same data cleaning that was performed on the Train dataset. 
    - An important thing to note is that I will not be dropping any rows from the Test dataset.
    
Overview:

- Dropping specific columns
- Reviewing columns for NaN values
- Updating columns data types to the correct types based on if the data is categorical or numeric (continuous)
- Review numeric columns for empty cells and fill these cells with the average value from the dataset

<details><summary>For an in-depth review of the data cleaning, click here:</summary>
```
Dropping the following columns:

- Alley : done
- Pool QC : done
- Fence : done
- Misc Feature : done
- misc_val : done
- garage_yr_blt : done
- fireplace_qu: done
- pool_area: done (only 8 records with data that is not 0)
- 3ssn_porch : done
- low_qual_fin_sf : done
- garage_finish : done
- garage_cond : done
- bsmt_cond : done
- exter_cond : done

The following need to be reviewed for NaN to determine what needs to be inserted:

- year_built      : change to object
- year_remod/add  : change to object
- ms_subclass     : change to object
- overall_qual    : change to object
- overall_cond    : change to object
- full_bath       : change to object
- half_bath       : change to object
- bedroom_abvgr   : change to object
- kitchen_abvgr   : change to object
- kitchen_abvgr   : change to object
- totrms_abvgrd   : change to object
- fireplaces      : change to object
- mo_sold         : change to object
- yr_sold         : change to object
- Lot Frontage    : mean of the column for all NaN
- Mas Vnr Area    : mean of the column for all NaN
- Bsmt Qual       : fill with NA No Basement       
- Bsmt Cond       : fill with NA No Basement        
- Bsmt Exposure   : fill with NA No Basement     
- BsmtFin Type 1  : fill with NA No Basement
- BsmtFin SF 1    : fill with 0
- BsmtFin Type 2  : fill with NA No Basement
- BsmtFin SF 2    : fill with 0
- Bsmt Unf SF     : fill with 0
- Total Bsmt SF   : fill with 0
- Bsmt Full Bath  : fill with 0 and update type to object
- Bsmt Half Bath  : fill with 0 and update type to object
- Fireplace Qual  : dropping the column
- Garage Type     : fill with NA No Garage
- Garage Finish   : fill with NA No Garage
- Garage Cars     : fill with 0 and update type to object
- Garage Area     : fill with 0
- Garage Qual     : fill with NA No Garage
- Garage Cond     : fill with NA No Garage
- misc_val        : drop column
```
</details>

In [3]:
# Number of rows in the test set right after loading.
ames_df_test.shape[0]

879

In [4]:
# Drop the 'PID' column. The axis is given via the `columns=` keyword because
# passing it positionally (`drop('PID', 1)`) is no longer accepted by
# pandas >= 2.0 and raises a TypeError.
ames_df_test.drop(columns='PID', inplace=True)

In [5]:
# Normalise column labels: lowercase them and turn spaces into underscores.
ames_df_test.columns = (
    ames_df_test.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

In [6]:
# Numeric basement/garage columns where a missing value is replaced by 0.
zero_fill_cols = [
    'bsmtfin_sf_1',
    'bsmtfin_sf_2',
    'bsmt_unf_sf',
    'total_bsmt_sf',
    'bsmt_full_bath',
    'bsmt_half_bath',
    'garage_cars',
    'garage_area',
]

# Assign the filled columns back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: the chained in-place form operates on a
# temporary Series and does not update the frame under pandas Copy-on-Write.
ames_df_test[zero_fill_cols] = ames_df_test[zero_fill_cols].fillna(0)

In [7]:
# Basement/garage descriptor columns where a missing value is replaced by the
# literal string 'NA'.
na_fill_cols = [
    'bsmt_qual',
    'bsmt_cond',
    'bsmtfin_type_2',
    'bsmt_exposure',
    'bsmtfin_type_1',
    'garage_type',
    'garage_finish',
    'garage_qual',
    'garage_cond',
]

# Assign the filled columns back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: the chained in-place form operates on a
# temporary Series and does not update the frame under pandas Copy-on-Write.
ames_df_test[na_fill_cols] = ames_df_test[na_fill_cols].fillna('NA')

In [8]:
# Columns whose missing values are replaced by that column's own mean.
# NOTE(review): the means are computed from this test frame, not from the
# training data -- confirm this matches how the training set was imputed.
mean_fill_cols = ['lot_frontage', 'mas_vnr_area']

# `fillna` with a Series of per-column means fills each column with its own
# mean. The result is assigned back to the frame rather than using the chained
# `df[col].fillna(..., inplace=True)` form, which does not update the frame
# under pandas Copy-on-Write.
ames_df_test[mean_fill_cols] = ames_df_test[mean_fill_cols].fillna(
    ames_df_test[mean_fill_cols].mean()
)

In [9]:
# Remove the first batch of unused columns in a single call.
first_drop_cols = [
    'alley',
    'pool_qc',
    'fence',
    'misc_feature',
    'fireplace_qu',
    'garage_yr_blt',
    'misc_val',
    'pool_area',
]
ames_df_test.drop(columns=first_drop_cols, inplace=True)

In [10]:
# Columns stored as numbers that are re-typed to `object`.
# The list is written once and reused on both sides of the assignment.
object_cols = [
    'year_built',
    'year_remod/add',
    'bsmt_full_bath',
    'bsmt_half_bath',
    'full_bath',
    'half_bath',
    'bedroom_abvgr',
    'kitchen_abvgr',
    'totrms_abvgrd',
    'fireplaces',
    'mo_sold',
    'yr_sold',
    'garage_cars',
    'ms_subclass',
    'overall_qual',
    'overall_cond',
]
ames_df_test[object_cols] = ames_df_test[object_cols].astype(object)

In [11]:
# Remove the second batch of unused columns in a single call.
second_drop_cols = [
    '3ssn_porch',
    'low_qual_fin_sf',
    'garage_finish',
    'garage_cond',
    'bsmt_cond',
    'exter_cond',
]
ames_df_test.drop(columns=second_drop_cols, inplace=True)

In [12]:
# Write the cleaned test frame (including its index) to disk as CSV.
# NOTE(review): the target path has no '.csv' extension; it is left as-is
# because other notebooks may read this exact path.
ames_df_test.to_csv(
    path_or_buf='../data/ames_iowa_data_test_data_clean',
)

In [13]:
# Number of rows after cleaning; none of the cleaning steps drop rows.
ames_df_test.shape[0]

879