In [56]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np

from tqdm import tqdm
import os

import seaborn as sns
# Use seaborn's "whitegrid" theme for any plots drawn in this notebook.
sns.set_theme(style="whitegrid")
# Load the per-prediction results from a path relative to the working directory.
# Later cells read the columns y_pred, y_true, policy, user_id and timestamp.
predictions = pd.read_csv("Downloads/predictions.csv")

In [57]:
# Signed prediction error, and its magnitude relative to the true value
# (the "percent error" used throughout the rest of the notebook).
predictions['pred_diff'] = predictions['y_pred'].sub(predictions['y_true'])
predictions['per_error'] = predictions['pred_diff'].abs().div(predictions['y_true'])

# Keep only the two policies that are compared below.
predictions = predictions.loc[
    predictions['policy'].isin(['total_error_cold', 'min_past'])
]

In [58]:
# Remove outliers: keep only rows whose per_error z-score has magnitude below 3.
# NOTE: the z-scores are computed on the current contents of `predictions`, so
# running this cell again trims the already-trimmed frame a second time.
from scipy import stats
per_error_zscore = stats.zscore(predictions['per_error'])
predictions = predictions[np.abs(per_error_zscore) < 3]

In [59]:
# Mean percent error per policy over the whole (outlier-trimmed) dataset.
# The column is selected before aggregating so that only per_error is averaged;
# averaging the full frame first is wasted work and, on recent pandas versions,
# raises if any remaining column is non-numeric.
predictions.groupby('policy')['per_error'].mean()

policy
min_past            0.256342
total_error_cold    0.255298
Name: per_error, dtype: float64

In [60]:
# Gap in mean percent error between the two policies (min_past minus
# total_error_cold) over the whole dataset. It is computed from the data rather
# than typed in from the printed output, so it cannot go stale when the input
# file or the filtering steps change.
overall_policy_means = predictions.groupby('policy')['per_error'].mean()
overall_dataset_per_diff = float(
    overall_policy_means['min_past'] - overall_policy_means['total_error_cold']
)

In [61]:
# Display the overall gap in mean percent error between the two policies.
overall_dataset_per_diff

0.0010439999999999894

Next, we split the predictions by whether the user is in the dataset we trained on, and check whether that influences the percent error difference across policies.

In [62]:
# NOTE(review): read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load this file if it comes from a trusted source.
# The object is used below as the set of user_ids to split on; the displayed
# output looks like an int -> int mapping (presumably user_id -> number of past
# updates -- TODO confirm).
past_updates =  pd.read_pickle('Downloads/past_updates-2.pkl')
# Show the loaded object (this prints every entry).
past_updates

{4268: 6,
 4269: 112,
 4270: 24,
 4271: 134,
 4272: 113,
 4273: 3,
 4274: 121,
 4275: 31,
 4276: 82,
 4277: 296,
 4278: 36,
 4279: 490,
 4280: 88,
 4281: 204,
 4282: 29,
 4283: 61,
 4284: 156,
 4285: 516,
 4286: 200,
 4287: 293,
 4288: 27,
 4289: 154,
 4290: 57,
 4291: 211,
 4292: 96,
 4293: 42,
 4294: 29,
 4295: 40,
 4296: 62,
 4297: 47,
 4298: 108,
 4299: 100,
 4300: 40,
 4301: 19,
 4302: 120,
 4303: 374,
 4304: 25,
 4305: 239,
 4306: 91,
 4307: 34,
 4308: 104,
 4309: 13,
 4310: 121,
 4311: 94,
 4312: 237,
 4313: 96,
 4314: 33,
 4315: 137,
 4316: 39,
 4317: 51,
 4319: 51,
 4320: 81,
 4321: 57,
 4322: 337,
 4323: 137,
 4324: 33,
 4325: 24,
 4326: 127,
 4327: 145,
 4328: 237,
 4329: 172,
 4330: 41,
 4331: 36,
 4332: 20,
 4333: 170,
 4334: 85,
 4335: 255,
 4336: 122,
 4337: 132,
 4338: 32,
 4339: 66,
 4340: 25,
 4341: 33,
 4342: 229,
 4343: 214,
 4344: 452,
 4345: 32,
 4346: 69,
 4347: 138,
 4348: 19,
 4349: 27,
 4350: 39,
 4351: 48,
 4352: 382,
 4353: 102,
 4354: 471,
 4355: 51,
 4356:

In [63]:
# Split the predictions by whether the row's user_id is a key of past_updates.
from_trained_user = predictions['user_id'].isin(past_updates)
trained_predictions = predictions[from_trained_user]
untrained_predictions = predictions[~from_trained_user]

In [66]:
# Mean percent error per policy for users found in past_updates.
# Select per_error before aggregating so only that column is averaged and the
# result does not depend on every other column being numeric.
trained_predictions.groupby('policy')['per_error'].mean()

policy
min_past            0.248708
total_error_cold    0.238725
Name: per_error, dtype: float64

In [68]:
# Gap in mean percent error (min_past minus total_error_cold) for users found
# in past_updates, computed from the data instead of hand-copied literals so it
# tracks any change in the inputs.
trained_policy_means = trained_predictions.groupby('policy')['per_error'].mean()
trained_dataset_per_diff = float(
    trained_policy_means['min_past'] - trained_policy_means['total_error_cold']
)

In [69]:
# Display the policy gap for users found in past_updates.
trained_dataset_per_diff

0.00998300000000002

In [67]:
# Mean percent error per policy for users that are not in past_updates.
# Select per_error before aggregating so only that column is averaged and the
# result does not depend on every other column being numeric.
untrained_predictions.groupby('policy')['per_error'].mean()

policy
min_past            0.256430
total_error_cold    0.255489
Name: per_error, dtype: float64

In [70]:
# Gap in mean percent error (min_past minus total_error_cold) for users that
# are not in past_updates, computed from the data instead of hand-copied
# literals so it tracks any change in the inputs.
untrained_policy_means = untrained_predictions.groupby('policy')['per_error'].mean()
untrained_dataset_per_diff = float(
    untrained_policy_means['min_past'] - untrained_policy_means['total_error_cold']
)

In [71]:
# Display the policy gap for users that are not in past_updates.
untrained_dataset_per_diff

0.0009409999999999696

Next, we check whether bursty periods of time influence the percent error difference. A timestamp is considered bursty when it has more than 200 ratings (the `threshold` set in the next cell); change the threshold to whatever value you think is appropriate.

In [72]:
# A timestamp counts as "bursty" when its number of ratings is strictly greater
# than this value (the comparison below uses `>`). Tune as needed.
threshold = 200

In [73]:
# Per-timestamp non-null counts over the min_past rows only, busiest timestamps
# first. Every column holds a count; the user_id column is the one used later.
temp_table = (
    predictions.loc[predictions['policy'] == 'min_past']
    .groupby('timestamp')
    .count()
    .sort_values(by='user_id', ascending=False)
)

In [76]:
# Mapping of timestamp -> number of min_past rows at that timestamp, in the
# same (descending count) order as temp_table. Series.to_dict builds this in
# one call instead of materialising every row with iterrows.
timestamp_num = temp_table['user_id'].to_dict()

In [77]:
# Display the timestamp -> row-count mapping (this prints every entry).
timestamp_num

{2372: 655,
 2369: 549,
 2367: 538,
 5124: 513,
 2371: 457,
 18000: 432,
 2881: 432,
 2370: 405,
 2365: 405,
 16683: 378,
 16442: 378,
 5133: 378,
 2877: 370,
 15290: 369,
 2363: 369,
 15288: 369,
 16448: 369,
 5114: 360,
 3662: 354,
 2874: 351,
 2873: 351,
 15293: 351,
 2373: 349,
 2381: 348,
 3597: 346,
 17927: 342,
 17933: 342,
 2366: 337,
 2375: 337,
 15289: 333,
 2880: 333,
 16646: 333,
 16684: 333,
 488: 333,
 16453: 333,
 855: 333,
 5119: 333,
 2178: 333,
 7848: 333,
 16642: 333,
 3596: 333,
 988: 333,
 5126: 326,
 17798: 324,
 2368: 324,
 2364: 324,
 5121: 324,
 16570: 324,
 5144: 324,
 5650: 315,
 16649: 315,
 485: 315,
 676: 315,
 484: 315,
 2769: 315,
 15295: 315,
 16645: 315,
 480: 315,
 16452: 308,
 2879: 307,
 18001: 306,
 2177: 306,
 16680: 306,
 7849: 306,
 463: 306,
 7851: 306,
 677: 306,
 3601: 306,
 4163: 306,
 15292: 306,
 17796: 300,
 5135: 299,
 5137: 299,
 3664: 299,
 16635: 297,
 2391: 297,
 2354: 297,
 2114: 297,
 5139: 297,
 7852: 297,
 3589: 297,
 936: 297,
 

In [78]:
# Work on an independent (deep) copy so that adding the per-timestamp count
# column does not modify `predictions`.
new_table = predictions.copy()

In [79]:
# Attach to every row the number of ratings recorded at its timestamp.
# timestamp_num is built from min_past rows only, so a timestamp whose min_past
# rows were all dropped (e.g. by the outlier filter) has no entry; a vectorized
# map leaves those as missing, which we count as 0 instead of raising KeyError
# the way a per-row dictionary lookup would.
new_table['num_per_timestamp'] = (
    new_table['timestamp'].map(timestamp_num).fillna(0).astype('int64')
)

In [82]:
# Rows that fall on a bursty timestamp, i.e. one with more than `threshold` ratings.
above_threshold = new_table.loc[new_table['num_per_timestamp'].gt(threshold)]

In [84]:
# Mean percent error per policy restricted to bursty timestamps.
# Select per_error before aggregating so only that column is averaged and the
# result does not depend on every other column being numeric.
above_threshold.groupby('policy')['per_error'].mean()

policy
min_past            0.257234
total_error_cold    0.256289
Name: per_error, dtype: float64

In [85]:
# Gap in mean percent error (min_past minus total_error_cold) on bursty
# timestamps. Computed from above_threshold so that changing `threshold`
# actually changes this number; hand-copied literals would silently keep
# reporting the result for the old threshold.
bursty_policy_means = above_threshold.groupby('policy')['per_error'].mean()
bursty_dataset_per_diff = float(
    bursty_policy_means['min_past'] - bursty_policy_means['total_error_cold']
)

In [86]:
# Display the policy gap on bursty timestamps.
bursty_dataset_per_diff

0.0009450000000000292

In [87]:
# Summary: the min_past minus total_error_cold gap for each slice of the data,
# one line per slice, rounded to four decimal places.
print(
    'Overall Dataset Diff in Percent Errors: {:0.4f}\n'
    'Trained Dataset Diff in Percent Errors: {:0.4f}\n'
    'Untrained Dataset Diff in Percent Errors: {:0.4f}\n'
    'Bursty Dataset Diff in Percent Errors: {:0.4f}\n'.format(
        overall_dataset_per_diff,
        trained_dataset_per_diff,
        untrained_dataset_per_diff,
        bursty_dataset_per_diff,
    )
)

Overall Dataset Diff in Percent Errors: 0.0010
Trained Dataset Diff in Percent Errors: 0.0100
Untrained Dataset Diff in Percent Errors: 0.0009
Bursty Dataset Diff in Percent Errors: 0.0009

