# Kaggle Submissions

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pickle
import numpy as np
np.random.seed(44)  # fix NumPy's global random seed so reruns are repeatable

In [2]:
# Restore the two saved artifacts: the feature column list the model expects
# and the ridge model itself (presumably written by the training notebook).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../assets/columns.pkl', 'rb') as columns_file:
    columns = pickle.load(columns_file)

with open('../assets/ridge.pkl', 'rb') as model_file:
    ridge = pickle.load(model_file)

In [3]:
# Read the Kaggle test set, using its `Id` column as the row index.
kaggle = pd.read_csv(
    '../data/test.csv',
    index_col='Id',
)

## Data Cleaning

We apply the same data cleaning steps to the Kaggle test set as we did to the training set.

1\. Change the column naming format

In [4]:
# snake_case the headers: lower-case everything and turn spaces into underscores
kaggle.columns = kaggle.columns.str.lower().str.replace(' ', '_', regex=False)

2\. Drop the same columns

In [5]:
# Columns removed from the feature set; listed once so the cell reads top-down.
columns_to_drop = [
    'pid', 'misc_val', 'fence', 'alley', 'misc_feature', 'pool_qc',
    'garage_yr_blt', 'mas_vnr_area', 'mas_vnr_type', 'lot_frontage',
    'mo_sold', 'yr_sold', 'pool_area', 'utilities', 'low_qual_fin_sf',
    '3ssn_porch', 'neighborhood',
]
kaggle.drop(columns=columns_to_drop, inplace=True)

3\. Use the `fill_na` function to fill missing values in the listed columns with `none`.

In [6]:
def fill_na(df, col_list, value='none'):
    """Replace missing values in the given columns of ``df``, modifying ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. The named columns are overwritten on this object.
    col_list : iterable
        Labels of the columns whose missing entries should be filled.
    value : scalar, default 'none'
        Replacement used for every missing entry.

    Returns
    -------
    None
        The frame is updated through column assignment; nothing is returned.

    Raises
    ------
    KeyError
        If a label in ``col_list`` is not a column of ``df``.
    """
    for col in col_list:
        # Assign the filled column back instead of calling
        # ``df[col].fillna(..., inplace=True)``: that form operates on a
        # temporary Series and does not update ``df`` under pandas
        # Copy-on-Write (it already emits a FutureWarning in pandas 2.x).
        df[col] = df[col].fillna(value)
    return None

In [7]:
# Columns whose missing entries are filled with fill_na's default 'none'.
none_fill_columns = [
    'garage_type', 'garage_finish', 'garage_cond', 'garage_qual',
    'fireplace_qu', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure',
    'bsmtfin_type_1', 'bsmtfin_type_2',
]
fill_na(df=kaggle, col_list=none_fill_columns)

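4\. The Kaggle test set has one null value in `electrical`; the training set had none. We impute `none` so that no missing values remain. The resulting `electrical_none` dummy column has no counterpart in the training columns, so it is discarded when we align the columns below.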

In [8]:
# How many rows are missing a value for `electrical`?
kaggle['electrical'].isnull().sum()

1

In [9]:
# Assign the filled column back rather than using a chained
# `fillna(..., inplace=True)`, which acts on a temporary Series and does not
# update `kaggle` under pandas Copy-on-Write.
kaggle['electrical'] = kaggle['electrical'].fillna('none')

In [10]:
# Total number of missing cells left in the frame (0 means cleaning is done).
kaggle.isnull().sum().sum()

0

## One-Hot Encoding

We create a binary column for each category of every categorical column so the data can be fed into our model.

In [11]:
# One-hot encode the categorical columns of the cleaned frame.
kaggle_dummies = pd.get_dummies(kaggle)

The columns in `kaggle_dummies` need to match the training columns the model was fit on (loaded above as `columns`). We'll find the training columns that are missing from `kaggle_dummies` so we can add them.

In [12]:
# Columns in the saved list that the encoded Kaggle frame lacks.
set(columns) - set(kaggle_dummies.columns)

{'bsmt_cond_Ex',
 'bsmt_cond_Po',
 'condition_2_Artery',
 'condition_2_RRAe',
 'condition_2_RRAn',
 'condition_2_RRNn',
 'electrical_Mix',
 'exterior_1st_CBlock',
 'exterior_1st_ImStucc',
 'exterior_1st_Stone',
 'exterior_2nd_Stone',
 'functional_Sal',
 'functional_Sev',
 'garage_qual_Ex',
 'heating_OthW',
 'heating_Wall',
 'heating_qc_Po',
 'ms_zoning_A (agr)',
 'roof_matl_Membran'}

In [13]:
# Expected columns that are absent from the encoded Kaggle frame, kept in the
# order they appear in `columns` so the result is the same on every run
# (iterating a set of strings gives a hash-dependent order).
present_columns = set(kaggle_dummies.columns)
diff_col = [col_name for col_name in columns if col_name not in present_columns]

In [14]:
# Add every missing dummy column, filled with 0, in a single step.
kaggle_dummies = kaggle_dummies.assign(
    **{missing_col: 0 for missing_col in diff_col}
)

In [15]:
# Keep only the columns in `columns`, in that exact order.
kaggle_dummies = kaggle_dummies.loc[:, columns]

## Predictions

In [16]:
# Run the loaded ridge model on the aligned feature frame.
preds = ridge.predict(kaggle_dummies)



We attach the predictions to the original `Id` index and name the column `SalePrice`.

In [17]:
# One row per test `Id`, with the prediction in a `SalePrice` column.
submission = pd.DataFrame(
    data=preds,
    index=kaggle_dummies.index,
    columns=['SalePrice'],
)

We sort by index so the rows are in the order Kaggle specifies.

In [18]:
# Order the rows by ascending `Id`.
submission = submission.sort_index()

In [19]:
# Preview the first five rows.
submission.head(5)

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2,119084.521577
4,283720.889131
6,201933.96549
7,196601.06731
17,205143.66391


## Save to CSV for Submission

In [20]:
# Write the predictions (index included) to the submission file.
submission.to_csv(
    '../data/ridge_preds.csv',
)

Check that the data is in the correct format before submitting.

In [21]:
!head ../data/ridge_preds.csv

Id,SalePrice
2,119084.52157694497
4,283720.8891310245
6,201933.965490232
7,196601.0673102917
17,205143.6639097171
18,346058.3494124267
22,191887.92552058352
27,119422.25691001877
31,84231.06247627125
