In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import re

In [2]:
# Load the modelling table from the working directory, using the PID column as the row index.
data = pd.read_csv('model_data.csv', index_col='PID')
# Display (n_rows, n_columns) of the loaded frame.
data.shape

(2579, 20)

In [3]:
# Keep an independent copy of the target column, then strip it from the feature frame.
# `data` is modified in place, so re-running this cell requires reloading `data` first.
y = data['SalePrice'].copy()
data.drop(columns='SalePrice', inplace=True)

In [4]:
# One-hot encode the categorical (object/category) columns. drop_first=True omits the
# first level of each categorical, so that level is represented by all of its dummies being 0.
data_dum = pd.get_dummies(data, drop_first=True)

In [5]:
# Hold out 30% of the rows for testing. The split is seeded so the held-out rows --
# and everything computed from them -- are the same on every Restart & Run All.
# NOTE: outputs recorded before the seed was added came from an unseeded split and
# will differ after the next run.
X_train, X_test, y_train, y_test = train_test_split(
    data_dum, y, test_size=0.3, random_state=42
)

In [6]:
# Display (n_rows, n_columns) of the training features after dummy encoding.
X_train.shape

(1805, 72)

In [7]:
# Show the first held-out row as a one-row DataFrame: the row is pulled out as a Series
# and transposed back into a frame so it keeps its column labels.
X_test.iloc[0,:].to_frame().T

Unnamed: 0,BsmtUnfSF,AllBathBsmt,AllBathAbv,YearBuilt,GarageCars,PorchArea,GoodLivArea,HasPool,MSSubClass_1FlPUD,MSSubClass_2Fl,...,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageQual_Fa,GarageQual_Gd,GarageQual_None,GarageQual_Po,GarageQual_TA
528387050,223.0,1.0,2.5,1994.0,2.0,303.0,3127.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# Load the fitted model from the working directory. The file is opened in a `with`
# block so the handle is closed as soon as unpickling finishes.
# SECURITY: pickle.load can execute arbitrary code -- only load pickle files you trust.
with open('APP_model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)
# Predict for the first held-out row and raise 10 to the result
# (presumably the model was fit on log10(SalePrice) -- TODO confirm).
10**pickled_model.predict(X_test.iloc[0,:].to_frame().T)

array([275969.51906563])

In [9]:
# Save the first held-out row (as a one-row frame) to CSV; index=False leaves the PID index out of the file.
X_test.iloc[0,:].to_frame().T.to_csv('pickle_base.csv', index=False)

In [10]:
# Read the saved row back. The file has no index column, so the frame gets a default
# 0-based index and the single row is addressed with label 0 from here on.
baseline = pd.read_csv('pickle_base.csv')

In [11]:
# Switch the pool indicator on for the baseline house (assigns 1 to the whole column).
baseline['HasPool'] = 1
# Display the column to confirm the assignment.
baseline.HasPool

0    1
Name: HasPool, dtype: int64

In [12]:
def num_format(num):
    """Return ``num`` as a whole-number string with comma thousands separators.

    The value is first converted with ``np.int64`` (so floats are truncated toward
    zero), then formatted with the built-in ``','`` format specifier. This groups
    digits in threes for any magnitude and keeps a leading minus sign in front of
    the digits.

    Parameters
    ----------
    num : int or float
        Any value accepted by ``np.int64``.

    Returns
    -------
    str
        For example ``'1,415'`` for ``1415`` and ``'1,234,567'`` for ``1234567``.
    """
    # int(...) turns the numpy scalar into a plain Python int before formatting.
    return format(int(np.int64(num)), ',')

num_format(1415)

'1,415'

In [13]:
# Collect every column whose name starts with 'Neigh' (the neighbourhood dummy columns).
neigh_cols = list(baseline.filter(regex='^Neigh').columns)
# NOTE(review): the value of this lookup is discarded (it is not the cell's last
# expression) -- looks like leftover debugging.
baseline.loc[0,'Neighborhood_Blueste']
# Set all neighbourhood dummies to 0 on row 0 ...
baseline.loc[0,neigh_cols] = 0
# ... and display them to confirm.
baseline.loc[0,neigh_cols]

Neighborhood_Blueste    0.0
Neighborhood_BrDale     0.0
Neighborhood_BrkSide    0.0
Neighborhood_ClearCr    0.0
Neighborhood_CollgCr    0.0
Neighborhood_Crawfor    0.0
Neighborhood_Edwards    0.0
Neighborhood_Gilbert    0.0
Neighborhood_Greens     0.0
Neighborhood_GrnHill    0.0
Neighborhood_IDOTRR     0.0
Neighborhood_Landmrk    0.0
Neighborhood_MeadowV    0.0
Neighborhood_Mitchel    0.0
Neighborhood_NAmes      0.0
Neighborhood_NPkVill    0.0
Neighborhood_NWAmes     0.0
Neighborhood_NoRidge    0.0
Neighborhood_NridgHt    0.0
Neighborhood_OldTown    0.0
Neighborhood_SWISU      0.0
Neighborhood_Sawyer     0.0
Neighborhood_SawyerW    0.0
Neighborhood_Somerst    0.0
Neighborhood_StoneBr    0.0
Neighborhood_Timber     0.0
Neighborhood_Veenker    0.0
Name: 0, dtype: float64

In [14]:
# Remove one name from the list in place.
# NOTE(review): no cell visible here reads neigh_cols afterwards (pkl_encode_nbr builds
# its own list) -- possibly leftover experimentation; confirm before relying on it.
neigh_cols.remove('Neighborhood_Blueste')

In [15]:
def pkl_encode_nbr(base_data, code, row=0):
    """One-hot encode a neighbourhood choice on one row of ``base_data``, in place.

    Every column whose name starts with ``Neigh`` is set to 0 on ``row``. The column
    ``'Neighborhood_' + code`` is then set to 1 if it exists. When no such column
    exists (for example a level that has no dummy column because it was dropped
    during encoding), all neighbourhood dummies stay 0.

    Parameters
    ----------
    base_data : pandas.DataFrame
        Frame holding the neighbourhood dummy columns. It is modified in place.
    code : str
        Neighbourhood code without the ``Neighborhood_`` prefix.
    row : index label, default 0
        Label of the row to encode. The default matches a one-row frame with a
        default 0-based index.

    Returns
    -------
    pandas.DataFrame
        The same ``base_data`` object that was passed in (not a copy).
    """
    target = 'Neighborhood_'+code
    neigh_cols = list(base_data.filter(regex='^Neigh').columns)
    # Reset all neighbourhood dummies first so at most one of them ends up set.
    base_data.loc[row, neigh_cols] = 0
    if target in neigh_cols:
        base_data.loc[row, target] = 1
    return base_data

# 'Blmngtn' has no dummy column in this frame (it is absent from the Neighborhood_* list
# displayed earlier), so this call leaves every neighbourhood dummy at 0.
# The call modifies `baseline` in place and displays it.
pkl_encode_nbr(baseline, 'Blmngtn')

Unnamed: 0,BsmtUnfSF,AllBathBsmt,AllBathAbv,YearBuilt,GarageCars,PorchArea,GoodLivArea,HasPool,MSSubClass_1FlPUD,MSSubClass_2Fl,...,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageQual_Fa,GarageQual_Gd,GarageQual_None,GarageQual_Po,GarageQual_TA
0,223.0,1.0,2.5,1994.0,2.0,303.0,3127.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# Median of every numeric feature per (Neighborhood, MSSubClass) group.
# numeric_only=True states explicitly that non-numeric columns are skipped; without it,
# pandas >= 2.0 raises a TypeError when a group median hits a string column.
t = data.groupby(['Neighborhood','MSSubClass']).median(numeric_only=True)

In [17]:
# Look up the group medians for one (Neighborhood, MSSubClass) pair.
t.loc[('Veenker', 'SPLIT')]

BsmtUnfSF       289.0
AllBathBsmt       1.0
AllBathAbv        2.0
YearBuilt      1976.0
GarageCars        2.0
PorchArea       348.0
GoodLivArea    2503.0
HasPool           0.0
Name: (Veenker, SPLIT), dtype: float64

## CatPickle

In [41]:
# Load the second model; this rebinds `pickled_model`, replacing the model loaded earlier.
# The file is opened in a `with` block so the handle is closed after unpickling.
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs where that
# exact path exists; consider a path relative to a configurable data directory.
# SECURITY: pickle.load can execute arbitrary code -- only load pickle files you trust.
with open('C:\\Users\\dn-83\\Documents\\bootcamp\\ML_proj\\tony\\APP_model_tony.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [42]:
# Load the table that goes with the second model, indexed by PID. This rebinds `data`,
# so the frame loaded at the top of the notebook is no longer reachable under this name.
# NOTE(review): absolute, machine-specific Windows path -- not portable to other machines.
data = pd.read_csv('C:\\Users\\dn-83\\Documents\\bootcamp\\ML_proj\\tony\\00_pickle_df_tony.csv', index_col='PID')

In [43]:
# Features: everything except the target. `data` itself is left unchanged.
X = data.drop(columns='SalePrice')
# Target, kept as a one-column DataFrame rather than a Series.
y = data.loc[:, ['SalePrice']]

In [65]:
# Predict for the house with PID 535454150. The label is wrapped in a list so .loc
# returns a one-row DataFrame instead of a Series.
pickled_model.predict(X.loc[[535454150]])

array([144595.98437578])

In [52]:
# Move PID out of the index and back into a regular column (see the PID entry in the
# medians displayed below). This modifies `data` in place, so the cell is not idempotent:
# re-run it only after reloading `data`.
data.reset_index(inplace=True)

In [64]:
# Column-wise median of every column. This includes PID and SalePrice, which are
# regular columns of `data` at this point.
data.agg('median')

PID             535454150.0
AllBathAbv              2.0
AllBathBsmt             0.0
BsmtCond                2.0
BsmtUnfSF             448.0
CentralAir              1.0
ExterQual               2.0
FireplaceQu             1.0
Foundation              5.0
GarageCars              2.0
GarageQual              2.0
GoodLivArea          1855.0
HasPool                 0.0
HeatingQC               4.0
KitchenQual             2.0
MSSubClass              5.0
Neighborhood           15.0
PavedDrive              1.0
PorchArea             167.0
SalePrice          159900.0
YearBuilt            1972.0
dtype: float64

In [63]:
# Append Median Basehouse

In [54]:
# Add the column-wise medians as one extra row at the bottom of the table.
# DataFrame.append was removed in pandas 2.0, so the medians Series is turned into a
# one-row frame and stacked with pd.concat instead.
# The result is only displayed; `data` itself is not changed.
pd.concat([data, data.agg('median').to_frame().T], ignore_index=True)

Unnamed: 0,PID,AllBathAbv,AllBathBsmt,BsmtCond,BsmtUnfSF,CentralAir,ExterQual,FireplaceQu,Foundation,GarageCars,...,GoodLivArea,HasPool,HeatingQC,KitchenQual,MSSubClass,Neighborhood,PavedDrive,PorchArea,SalePrice,YearBuilt
0,5.263011e+08,1.0,1.0,3.0,441.0,1.0,2.0,3.0,5.0,2.0,...,2295.0,0.0,1.0,2.0,5.0,15.0,1.0,272.0,215000.0,1960.0
1,5.263020e+08,2.0,1.0,2.0,171.0,1.0,2.0,0.0,5.0,2.0,...,2300.0,0.0,3.0,2.0,5.0,15.0,1.0,280.0,149900.0,1954.0
2,5.263020e+08,1.0,1.0,2.0,235.0,1.0,2.0,2.0,5.0,1.0,...,1797.0,0.0,2.0,3.0,5.0,15.0,1.0,280.0,157500.0,1956.0
3,5.263021e+08,1.0,0.0,1.0,318.0,1.0,2.0,0.0,5.0,1.0,...,1446.0,0.0,2.0,2.0,5.0,15.0,1.0,0.0,124500.0,1956.0
4,5.263021e+08,1.5,0.0,2.0,490.0,1.0,2.0,2.0,5.0,3.0,...,2924.0,0.0,1.0,2.0,5.0,15.0,1.0,0.0,169000.0,1957.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575,9.241510e+08,2.5,1.5,3.0,54.0,1.0,3.0,2.0,6.0,2.0,...,3428.0,0.0,4.0,3.0,5.0,14.0,1.0,415.0,285000.0,1994.0
2576,9.241510e+08,2.5,0.0,2.0,238.0,1.0,2.0,2.0,6.0,3.0,...,2758.0,0.0,4.0,2.0,6.0,14.0,1.0,238.0,188000.0,1993.0
2577,9.241520e+08,2.5,1.0,2.0,208.0,1.0,3.0,1.0,6.0,2.0,...,2741.0,0.0,4.0,3.0,6.0,14.0,1.0,474.0,231000.0,1993.0
2578,1.007100e+09,2.0,0.0,2.0,686.0,1.0,2.0,0.0,4.0,1.0,...,1836.0,0.0,4.0,2.0,6.0,11.0,0.0,0.0,103000.0,1900.0


0