In [1]:
import pandas as pd
import numpy as np
import json
import xgboost as xgb
import sklearn
import pickle
import time

In [2]:
df_test = pd.read_csv('test.csv')

In [3]:
df_test.columns.to_series().groupby(df_test.dtypes).groups

{int64: ['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'], float64: ['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea'], object: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'Garag

In [4]:
df_test.drop('Id', axis=1,inplace=True) 

In [5]:
df_test['HasCentralAir'] = np.where(df_test['CentralAir']=='Y',1,0)
df_test.drop('CentralAir',axis=1,inplace=True)

In [6]:
# # Importo el diccionario de encodeo

In [7]:
with open('encoding_dict.txt', 'r') as file:
    encoding_dict = json.load(file)

In [8]:
encoding_dict

{'MSZoning': {'C (all)': 74528.0,
  'FV': 214014.06153846154,
  'RH': 131558.375,
  'RL': 191004.99478714162,
  'RM': 126316.83027522935},
 'Street': {'Grvl': 130190.5, 'Pave': 181130.53851444292},
 'Alley': {'Grvl': 122219.08, 'Pave': 168000.58536585365},
 'LotShape': {'IR1': 206101.6652892562,
  'IR2': 239833.36585365853,
  'IR3': 216036.5,
  'Reg': 164754.81837837838},
 'LandContour': {'Bnk': 143104.07936507938,
  'HLS': 231533.94,
  'Low': 203661.11111111112,
  'Lvl': 180183.74675819985},
 'LotConfig': {'Corner': 181623.4258555133,
  'CulDSac': 223854.6170212766,
  'FR2': 177934.5744680851,
  'FR3': 208475.0,
  'Inside': 176938.0475285171},
 'LandSlope': {'Gtl': 179956.7995658466,
  'Mod': 196734.13846153847,
  'Sev': 204379.23076923078},
 'Neighborhood': {'Blmngtn': 194870.88235294117,
  'Blueste': 137500.0,
  'BrDale': 104493.75,
  'BrkSide': 124834.05172413793,
  'ClearCr': 212565.42857142858,
  'CollgCr': 197965.77333333335,
  'Crawfor': 210624.72549019608,
  'Edwards': 128219.

In [9]:
# Encodeo variables del DataFrame a  scorear

In [10]:
for var_name in df_test.columns:
    var_dict_val = encoding_dict.get(var_name)
    df_test[var_name].replace(var_dict_val, inplace=True)    

In [11]:
df_test.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,HasCentralAir
0,20,131558.375,80.0,11622,181130.538514,,164754.818378,180183.746758,180950.95682,176938.047529,...,0,,148751.089172,,0,6,2010,173401.836622,175202.219533,1
1,20,191004.994787,81.0,14267,181130.538514,,206101.665289,180183.746758,180950.95682,181623.425856,...,0,,,170750.0,12500,6,2010,173401.836622,175202.219533,1
2,60,191004.994787,74.0,13830,181130.538514,,206101.665289,180183.746758,180950.95682,176938.047529,...,0,,148751.089172,,0,3,2010,173401.836622,175202.219533,1
3,60,191004.994787,78.0,9978,181130.538514,,206101.665289,180183.746758,180950.95682,176938.047529,...,0,,,,0,6,2010,173401.836622,175202.219533,1
4,120,191004.994787,43.0,5005,181130.538514,,206101.665289,231533.94,180950.95682,176938.047529,...,0,,,,0,1,2010,173401.836622,175202.219533,1


In [12]:
filename = 'model_pickle.dat'
loaded_model = pickle.load(open(filename, 'rb'))
print(type(loaded_model))

<class 'xgboost.core.Booster'>


In [13]:
loaded_model

<xgboost.core.Booster at 0x125c28090>

cols_when_model_builds = loaded_model.get_booster().feature_names

In [14]:
df_test.describe()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,HasCentralAir
count,1459.0,1455.0,1459.0,1459.0,1459.0,107.0,1459.0,1459.0,1457.0,1459.0,...,1459.0,3.0,290.0,51.0,1459.0,1459.0,1459.0,1458.0,1459.0,1459.0
mean,57.378341,179806.722628,68.706648,9819.161069,180921.052407,138050.067837,180482.928835,181661.243833,180951.0,180613.421121,...,1.744345,393996.666667,153166.186071,150095.689476,58.167923,6.104181,2007.769705,180526.668551,180528.088466,0.930775
std,42.74688,27607.779632,22.309312,4955.517327,3261.078802,21877.370884,21589.391576,13579.69778,2.911382e-11,10993.65674,...,30.491646,166282.651029,13458.566125,12351.599063,630.806978,2.722432,1.30174,29252.071128,28940.570153,0.253924
min,20.0,74528.0,21.0,1470.0,130190.5,122219.08,164754.818378,143104.079365,180951.0,176938.047529,...,0.0,201990.0,134286.363636,94000.0,0.0,1.0,2006.0,119850.0,104125.0,0.0
25%,20.0,191004.994787,59.0,7391.0,181130.538514,122219.08,164754.818378,180183.746758,180951.0,176938.047529,...,0.0,345995.0,148751.089172,151187.612245,0.0,4.0,2007.0,173401.836622,175202.219533,1.0
50%,50.0,191004.994787,68.0,9399.0,181130.538514,122219.08,164754.818378,180183.746758,180951.0,176938.047529,...,0.0,490000.0,148751.089172,151187.612245,0.0,6.0,2008.0,173401.836622,175202.219533,1.0
75%,70.0,191004.994787,80.0,11517.5,181130.538514,168000.585366,206101.665289,180183.746758,180951.0,177934.574468,...,0.0,490000.0,148751.089172,151187.612245,0.0,8.0,2009.0,173401.836622,175202.219533,1.0
max,190.0,214014.061538,200.0,56600.0,181130.538514,168000.585366,239833.365854,231533.94,180951.0,223854.617021,...,800.0,490000.0,178927.457627,170750.0,17000.0,12.0,2010.0,274945.418033,272291.752,1.0


In [15]:
df_test = df_test[['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'HasCentralAir', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 'Utilities', 'PavedDrive']]

In [16]:
dscore = xgb.DMatrix(df_test)

In [17]:
test_preds = loaded_model.predict(dscore)

In [19]:
df_test_preds = pd.DataFrame(data=test_preds)

In [22]:
df_test_preds.index = [1461 + x for x in range(0, len(df_test_preds))]

In [None]:
timestr = time.strftime("%Y_%m_%d-%H_%M_%S")

In [24]:
df_test_preds.to_csv(f'xgboost_preds_tuned_{timestr}.csv')