In [0]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
import os
try:
	os.chdir(os.path.join(os.getcwd(), '../..'))
	print(os.getcwd())
except OSError as err:
	# Best-effort: keep running from the current directory, but say so instead of
	# silently swallowing the failure (a bare `except` would also hide KeyboardInterrupt).
	print(f'Could not change working directory: {err}')


# Example of how to use the fitted objects

In [9]:
from sklearn.externals import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt 
import os
import pandas as pd
import numpy as np


## Loading fitted models

In [10]:
# Enumerate every entry stored under the relative `models` directory.
# os.listdir returns plain names (files and sub-directories alike), in arbitrary order.
models_dir = 'models'
alljoblibs = os.listdir(models_dir)
print(alljoblibs)


['lgb_tuned_datacap_lower_quantile_20190507_02_35.joblib', 'lgb_tuned_timecap_upper_quantile_20190507_10_22.joblib', 'lgb_tuned_datacap_median_quantile_20190507_02_35.joblib', 'lgb_tuned_datacap_upper_quantile_20190507_02_35.joblib', 'lgb_tuned_fulldata_lower_quantile_20190507_07_25.joblib', 'lgb_tuned_timecap_lower_quantile_20190507_10_22.joblib', 'lgb_tuned_fulldata_upper_quantile_20190507_07_25.joblib', 'lgb_tuned_timecap_median_quantile_20190507_10_22.joblib', 'lgb_tuned_fulldata_median_quantile_20190507_07_25.joblib', 'old']


In [11]:
# Nested registry of loaded artefacts: subset[<data subset>][<quantile>] -> loaded object.
# Each loaded object is indexed by string keys below and in later cells
# ('RMSE', 'model', 'X_sample', 'label_encoders').
subset = {'datacap':{}, 'timecap':{},'fulldata':{}}

print('loading files double loop:') 
for sub in subset:
    # Keep only the entries whose file name mentions this data subset.
    joblibs = [file for file in alljoblibs if sub in file]

    for quantile in ['upper', 'median', 'lower']:
        # First entry whose name mentions the quantile; None when nothing matches.
        match = next((x for x in joblibs if quantile in x), None)
        if match is None:
            # Fail with an actionable message instead of a bare IndexError.
            raise FileNotFoundError(
                f'No saved model found in models/ for subset "{sub}" and quantile "{quantile}"'
            )
        path = f'models/{match}'

        # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
        # only load dumps that come from a trusted source.
        loaded = joblib.load(path)
        subset[sub][quantile] = loaded

        print(f'Loaded {sub}-{quantile} --- RMSE: {loaded["RMSE"]:.4f}')

print(subset.keys())
print(subset['datacap'].keys())




loading files double loop:
Loaded datacap-upper --- RMSE: 0.3099
Loaded datacap-median --- RMSE: 0.1525
Loaded datacap-lower --- RMSE: 0.2250
Loaded timecap-upper --- RMSE: 0.5251
Loaded timecap-median --- RMSE: 0.4461
Loaded timecap-lower --- RMSE: 0.5040
Loaded fulldata-upper --- RMSE: 0.5261
Loaded fulldata-median --- RMSE: 0.3807
Loaded fulldata-lower --- RMSE: 0.4690
dict_keys(['datacap', 'timecap', 'fulldata'])
dict_keys(['upper', 'median', 'lower'])


## Let's say these are the rows we want to apply the model to

In [12]:
# Use the first ten rows of the sample stored with the datacap/median artefact as stand-in input rows.
dummy_data = subset['datacap']['median']['X_sample'].head(10)

# Relabel the rows 0..n-1 (the slice keeps whatever index labels the stored sample had).
dummy_data.index = np.arange(len(dummy_data))
dummy_data


Unnamed: 0,airline,flight_duration_hrs,orig_country,seat_count,night_flight,flight_type,price_usd,ife,e_xtv,e_xphone,one_media,dest_country,economy_pct,bus_pass_percent,luxury,datacap_mb
0,5,10.96875,32,310.0,False,1,20.765625,True,False,True,True,86,93.0,20.953125,False,50.0
1,6,4.050781,85,288.0,False,1,34.78125,True,False,False,False,43,93.0,20.953125,False,200.0
2,12,8.0,88,236.0,False,1,39.375,True,False,True,False,91,78.0,33.6875,False,120.0
3,1,7.871094,15,243.0,True,1,1.950195,True,True,False,True,55,87.0,26.046875,False,5.0
4,12,7.320312,88,236.0,False,1,19.1875,True,False,True,False,48,78.0,33.6875,False,50.0
5,1,10.78125,60,243.0,True,1,1.950195,True,True,False,True,59,87.0,26.046875,False,5.0
6,12,5.800781,88,236.0,False,1,9.09375,True,False,True,False,89,78.0,33.6875,False,20.0
7,12,10.203125,16,236.0,False,1,39.375,True,False,True,False,83,78.0,33.6875,False,120.0
8,13,8.320312,91,321.0,False,1,8.992188,True,True,False,False,2,90.0,23.5,False,20.0
9,3,11.828125,48,256.0,False,1,6.0,True,True,True,False,44,86.0,26.90625,False,50.0


In [13]:
# Run every datacap quantile model on the same rows and collect one column per quantile.
quantile_preds = {}
for quantile, fit_objs in subset['datacap'].items():
    model = fit_objs['model']
    quantile_preds[f'y_predicted_{quantile}'] = model.predict(dummy_data)

# One row per input row, one `y_predicted_<quantile>` column per model.
predictions = pd.DataFrame(quantile_preds)
predictions


Unnamed: 0,y_predicted_upper,y_predicted_median,y_predicted_lower
0,0.390835,0.159761,0.076028
1,0.321795,0.131638,0.10475
2,1.253938,0.451237,0.16729
3,0.035497,0.01334,0.008011
4,0.746985,0.335702,0.083869
5,0.078924,0.025644,0.007746
6,0.214614,0.081105,0.040561
7,0.722128,0.178592,0.16616
8,0.209045,0.056623,0.033812
9,0.356372,0.112069,0.019186


In [14]:
# Put the quantile predictions side by side with the input rows they were computed from.
# Column-wise concat aligns on the index, which both frames share (0..n-1).
predicted_df = pd.concat([predictions, dummy_data], axis='columns')
predicted_df


Unnamed: 0,y_predicted_upper,y_predicted_median,y_predicted_lower,airline,flight_duration_hrs,orig_country,seat_count,night_flight,flight_type,price_usd,ife,e_xtv,e_xphone,one_media,dest_country,economy_pct,bus_pass_percent,luxury,datacap_mb
0,0.390835,0.159761,0.076028,5,10.96875,32,310.0,False,1,20.765625,True,False,True,True,86,93.0,20.953125,False,50.0
1,0.321795,0.131638,0.10475,6,4.050781,85,288.0,False,1,34.78125,True,False,False,False,43,93.0,20.953125,False,200.0
2,1.253938,0.451237,0.16729,12,8.0,88,236.0,False,1,39.375,True,False,True,False,91,78.0,33.6875,False,120.0
3,0.035497,0.01334,0.008011,1,7.871094,15,243.0,True,1,1.950195,True,True,False,True,55,87.0,26.046875,False,5.0
4,0.746985,0.335702,0.083869,12,7.320312,88,236.0,False,1,19.1875,True,False,True,False,48,78.0,33.6875,False,50.0
5,0.078924,0.025644,0.007746,1,10.78125,60,243.0,True,1,1.950195,True,True,False,True,59,87.0,26.046875,False,5.0
6,0.214614,0.081105,0.040561,12,5.800781,88,236.0,False,1,9.09375,True,False,True,False,89,78.0,33.6875,False,20.0
7,0.722128,0.178592,0.16616,12,10.203125,16,236.0,False,1,39.375,True,False,True,False,83,78.0,33.6875,False,120.0
8,0.209045,0.056623,0.033812,13,8.320312,91,321.0,False,1,8.992188,True,True,False,False,2,90.0,23.5,False,20.0
9,0.356372,0.112069,0.019186,3,11.828125,48,256.0,False,1,6.0,True,True,True,False,44,86.0,26.90625,False,50.0


## Inverse label transformation
 The dummy data used above has already been label-encoded,
 because it is taken from the actual training data.
 To recover the original categories, we need to use the saved label encoders.

In [15]:
# Mapping of column name -> fitted encoder, stored alongside the datacap/median model.
lab_encoder = subset['datacap']['median']['label_encoders']

# Swap each encoded column for its original category labels.
# NOTE: this overwrites the columns of `predicted_df` in place, so re-running this cell
# without first rebuilding `predicted_df` would try to decode already-decoded values.
for col in lab_encoder:
    le = lab_encoder[col]
    predicted_df[col] = predicted_df[col].transform(le.inverse_transform)

predicted_df



Unnamed: 0,y_predicted_upper,y_predicted_median,y_predicted_lower,airline,flight_duration_hrs,orig_country,seat_count,night_flight,flight_type,price_usd,ife,e_xtv,e_xphone,one_media,dest_country,economy_pct,bus_pass_percent,luxury,datacap_mb
0,0.390835,0.159761,0.076028,EWG,10.96875,Germany,310.0,False,International,20.765625,True,False,True,True,Thailand,93.0,20.953125,False,50.0
1,0.321795,0.131638,0.10475,IBE,4.050781,Spain,288.0,False,International,34.78125,True,False,False,False,Israel,93.0,20.953125,False,200.0
2,1.253938,0.451237,0.16729,SWR,8.0,Switzerland,236.0,False,International,39.375,True,False,True,False,United States,78.0,33.6875,False,120.0
3,0.035497,0.01334,0.008011,AMX,7.871094,Chile,243.0,True,International,1.950195,True,True,False,True,Mexico,87.0,26.046875,False,5.0
4,0.746985,0.335702,0.083869,SWR,7.320312,Switzerland,236.0,False,International,19.1875,True,False,True,False,Kenya,78.0,33.6875,False,50.0
5,0.078924,0.025644,0.007746,AMX,10.78125,Mexico,243.0,True,International,1.950195,True,True,False,True,Netherlands,87.0,26.046875,False,5.0
6,0.214614,0.081105,0.040561,SWR,5.800781,Switzerland,236.0,False,International,9.09375,True,False,True,False,United Arab Emirates,78.0,33.6875,False,20.0
7,0.722128,0.178592,0.16616,SWR,10.203125,China,236.0,False,International,39.375,True,False,True,False,Switzerland,78.0,33.6875,False,120.0
8,0.209045,0.056623,0.033812,THA,8.320312,Thailand,321.0,False,International,8.992188,True,True,False,False,Australia,90.0,23.5,False,20.0
9,0.356372,0.112069,0.019186,AZA,11.828125,Japan,256.0,False,International,6.0,True,True,True,False,Italy,86.0,26.90625,False,50.0


## label transformation
 I'll need to write functions for this, but for now, the following should work

In [17]:
# new_df     # insert your own here
# new_df_encoded = new_df.copy()   # otherwise dataframes are modified by reference
# lab_encoder = subset['datacap']['median']['label_encoders']
# 
# for col, le in lab_encoder.items():
    # transformer = le.transform
    # new_df_encoded[col] = new_df_encoded[col].transform(transformer)