In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import re

In [2]:
# Load the modelling table from CSV, using the PID column as the row index.
data = pd.read_csv('model_data.csv', index_col='PID')
# Display (rows, columns) as a quick check of what was loaded.
data.shape

(2579, 20)

In [3]:
# Separate the target from the features. The guard keeps this cell idempotent:
# the original in-place drop failed on a second run because `SalePrice` had
# already been removed from `data`.
if 'SalePrice' in data.columns:
    y = data['SalePrice'].copy()
    # Rebind rather than use `inplace=True`; `data` still ends up without the target.
    data = data.drop(columns='SalePrice')

In [4]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each, so that level is represented by all of its dummy columns being 0.
data_dum = pd.get_dummies(data, drop_first=True)

In [5]:
# Hold out 30% of the rows for testing. `random_state` pins the shuffle so that
# row 0 of X_test, which later cells export to 'pickle_base.csv', is the same
# row on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    data_dum, y, test_size=0.3, random_state=42
)

In [6]:
# Shape of the training split as (rows, columns).
X_train.shape

(1805, 72)

In [7]:
# Show the first test row as a one-row DataFrame (Series -> frame -> transpose).
X_test.iloc[0,:].to_frame().T

Unnamed: 0,BsmtUnfSF,AllBathBsmt,AllBathAbv,YearBuilt,GarageCars,PorchArea,GoodLivArea,HasPool,MSSubClass_1FlPUD,MSSubClass_2Fl,...,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageQual_Fa,GarageQual_Gd,GarageQual_None,GarageQual_Po,GarageQual_TA
923225310,25.0,1.0,1.0,1973.0,1.0,221.0,1489.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Load the fitted model. The context manager closes the file handle, which the
# bare `open(...)` passed straight to `pickle.load` never did.
# NOTE(review): unpickling can execute arbitrary code; only load APP_model.pkl
# from a trusted source.
with open('APP_model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)
# Predict for the first test row and raise 10 to the prediction before display
# (presumably the model was fit on log10 of the sale price; confirm).
10**pickled_model.predict(X_test.iloc[0,:].to_frame().T)

array([93638.37528021])

In [112]:
# Write the first test row to 'pickle_base.csv' as a one-row table, without the index.
# NOTE(review): this cell's execution count (112) is out of sequence with its
# neighbours; confirm the notebook survives Restart & Run All.
X_test.iloc[0,:].to_frame().T.to_csv('pickle_base.csv', index=False)

In [12]:
# Read the saved one-row CSV back in as `baseline` (default integer index).
baseline = pd.read_csv('pickle_base.csv')

In [13]:
# Set the HasPool column of the baseline frame to 1, then display it to confirm.
baseline['HasPool'] = 1
baseline.HasPool

0    1
Name: HasPool, dtype: int64

In [14]:
def num_format(num):
    """Format a number as a string with comma thousands separators.

    The value is first converted with ``np.int64`` (so a float such as
    ``1415.9`` is truncated to ``1415``) and then grouped in threes from the
    right.

    The previous hand-rolled loop inserted a comma before every fourth
    character of the reversed digit string. Only the first separator landed in
    the right place (``1234567`` came out as ``'1234,567'``), and a leading
    minus sign was counted as a digit (``-123`` came out as ``'-,123'``).

    Parameters
    ----------
    num : int or float
        Value to format; any fractional part is discarded.

    Returns
    -------
    str
        The integer part of ``num`` with commas every three digits.
    """
    # int() turns the NumPy scalar into a plain Python int so the ',' format
    # spec applies directly.
    return f'{int(np.int64(num)):,}'
        
# Quick check of the formatter on a four-digit value.
num_format(1415)

'1,415'

In [20]:
# Print each feature name next to the coefficient the pickled model holds for it.
coef_by_feature = zip(X_test.columns, pickled_model.coef_)
for feature, weight in coef_by_feature:
    print(f'{feature} : {weight}')

BsmtUnfSF : 6.337042937167557e-05
AllBathBsmt : 0.006016079653607398
AllBathAbv : 0.00909928465588731
YearBuilt : 0.00047288832480718893
GarageCars : 0.020850567691074053
PorchArea : 4.76831882342999e-05
GoodLivArea : 0.00010289558729031867
HasPool : 0.009031361081837101
MSSubClass_1FlPUD : -0.019353792921089377
MSSubClass_2Fl : 0.017883438621467247
MSSubClass_2FlPUD : -0.04552855055120762
MSSubClass_DUP2FAM : -0.01758100508831451
MSSubClass_SPLIT : 0.022873351494003075
Foundation_CBlock : -0.0
Foundation_PConc : 0.008978441451801626
Foundation_Slab : 0.007039442275788678
Foundation_Stone : 0.05210234942062659
Foundation_Wood : 0.0
PavedDrive_P : 0.016005736373704494
PavedDrive_Y : 0.03207715128043419
HeatingQC_Fa : -0.02577842385304068
HeatingQC_Gd : -0.006330353287868188
HeatingQC_TA : -0.020789698823141613
Neighborhood_Blueste : 0.0
Neighborhood_BrDale : -0.013862409859160534
Neighborhood_BrkSide : -0.0005551559666542575
Neighborhood_ClearCr : 0.03771316256133862
Neighborhood_CollgC

In [160]:
# Collect the columns whose name starts with 'Neigh' and set them all to 0 in
# row 0 of the baseline frame.
# (A stray `baseline.loc[0,'Neighborhood_Blueste']` lookup used to sit between
# these statements; as a non-final expression its value was never displayed,
# so it has been removed.)
neigh_cols = list(baseline.filter(regex='^Neigh').columns)
baseline.loc[0,neigh_cols] = 0
# Display the zeroed columns to confirm the reset.
baseline.loc[0,neigh_cols]

Neighborhood_Blueste    0.0
Neighborhood_BrDale     0.0
Neighborhood_BrkSide    0.0
Neighborhood_ClearCr    0.0
Neighborhood_CollgCr    0.0
Neighborhood_Crawfor    0.0
Neighborhood_Edwards    0.0
Neighborhood_Gilbert    0.0
Neighborhood_Greens     0.0
Neighborhood_GrnHill    0.0
Neighborhood_IDOTRR     0.0
Neighborhood_Landmrk    0.0
Neighborhood_MeadowV    0.0
Neighborhood_Mitchel    0.0
Neighborhood_NAmes      0.0
Neighborhood_NPkVill    0.0
Neighborhood_NWAmes     0.0
Neighborhood_NoRidge    0.0
Neighborhood_NridgHt    0.0
Neighborhood_OldTown    0.0
Neighborhood_SWISU      0.0
Neighborhood_Sawyer     0.0
Neighborhood_SawyerW    0.0
Neighborhood_Somerst    0.0
Neighborhood_StoneBr    0.0
Neighborhood_Timber     0.0
Neighborhood_Veenker    0.0
Name: 0, dtype: float64

In [151]:
# Drop the Blueste dummy from the working list of neighbourhood columns.
# NOTE(review): list.remove raises ValueError when the name is absent, so this
# cell cannot be run twice in a row; nothing visible here reads the shortened
# list afterwards, so confirm it is still needed.
neigh_cols.remove('Neighborhood_Blueste')

In [162]:
def pkl_encode_nbr(base_data, code):
    """Encode the selected neighborhood in row ``0`` of ``base_data``.

    Every column whose name starts with ``Neigh`` is set to 0 in row ``0``. If
    a column named ``'Neighborhood_' + code`` is among them, it is then set to
    1. When no such column exists, all of those columns are left at 0.

    ``base_data`` is modified in place and the same object is returned.

    Parameters
    ----------
    base_data : pandas.DataFrame
        Frame holding the dummy columns; the row labelled ``0`` is edited.
    code : str
        Neighborhood suffix, e.g. ``'Veenker'`` for ``Neighborhood_Veenker``.

    Returns
    -------
    pandas.DataFrame
        ``base_data`` itself, after the update.
    """
    target = 'Neighborhood_' + code
    dummy_cols = list(base_data.filter(regex='^Neigh').columns)
    # Clear the whole group first so at most one dummy is active afterwards.
    base_data.loc[0, dummy_cols] = 0
    # The old `neigh_cols.remove(target)` only edited a local list that was
    # never read again, so it has been dropped.
    if target in dummy_cols:
        base_data.loc[0, target] = 1
    return base_data

# Encode 'Blmngtn' on the baseline row. The displayed frame has no
# Neighborhood_Blmngtn column, so every neighbourhood dummy ends up 0.
pkl_encode_nbr(baseline, 'Blmngtn')

Unnamed: 0,BsmtUnfSF,AllBathBsmt,AllBathAbv,YearBuilt,GarageCars,PorchArea,GoodLivArea,HasPool,MSSubClass_1FlPUD,MSSubClass_2Fl,MSSubClass_2FlPUD,MSSubClass_DUP2FAM,MSSubClass_SPLIT,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,PavedDrive_P,PavedDrive_Y,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_TA,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_Greens,Neighborhood_GrnHill,Neighborhood_IDOTRR,Neighborhood_Landmrk,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,CentralAir_Y,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_Po,KitchenQual_TA,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,BsmtCond_Fa,BsmtCond_Gd,BsmtCond_None,BsmtCond_TA,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageQual_Fa,GarageQual_Gd,GarageQual_None,GarageQual_Po,GarageQual_TA
0,88.0,1.0,2.5,1994.0,3.0,593.0,3989.0,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [33]:
# List the (Neighborhood, MSSubClass) pairs present in the data, taken from the
# index of the per-group medians.
data.groupby(['Neighborhood','MSSubClass']).agg('median').index

MultiIndex([('Blmngtn',     '1Fl'),
            ('Blmngtn',  '1FlPUD'),
            ('Blueste',  '1FlPUD'),
            ('Blueste',  '2FlPUD'),
            ( 'BrDale',  '2FlPUD'),
            ('BrkSide',     '1Fl'),
            ('BrkSide',     '2Fl'),
            ('BrkSide', 'DUP2FAM'),
            ('ClearCr',     '1Fl'),
            ('ClearCr',  '1FlPUD'),
            ('ClearCr',     '2Fl'),
            ('ClearCr', 'DUP2FAM'),
            ('ClearCr',   'SPLIT'),
            ('CollgCr',     '1Fl'),
            ('CollgCr',  '1FlPUD'),
            ('CollgCr',     '2Fl'),
            ('CollgCr', 'DUP2FAM'),
            ('CollgCr',   'SPLIT'),
            ('Crawfor',     '1Fl'),
            ('Crawfor',  '1FlPUD'),
            ('Crawfor',     '2Fl'),
            ('Crawfor', 'DUP2FAM'),
            ('Crawfor',   'SPLIT'),
            ('Edwards',     '1Fl'),
            ('Edwards',  '1FlPUD'),
            ('Edwards',     '2Fl'),
            ('Edwards',  '2FlPUD'),
            ('Edwards', 'DUP

In [34]:
# Per-(Neighborhood, MSSubClass) medians, kept in `t` for lookups by group.
# NOTE(review): this repeats the aggregation from the preceding cell; the name
# `t` says little about its contents.
t = data.groupby(['Neighborhood','MSSubClass']).agg('median')

In [37]:
# Median values for the ('Veenker', 'SPLIT') group.
t.loc[('Veenker', 'SPLIT')]

BsmtUnfSF       289.0
AllBathBsmt       1.0
AllBathAbv        2.0
YearBuilt      1976.0
GarageCars        2.0
PorchArea       348.0
GoodLivArea    2503.0
HasPool           0.0
Name: (Veenker, SPLIT), dtype: float64