In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np

from tqdm import tqdm
import os

import seaborn as sns
sns.set_theme(style="whitegrid")
predictions = pd.read_csv("~/Downloads/predictions.csv")

In [3]:
predictions['pred_diff'] = predictions['y_pred'] - predictions['y_true']
predictions['per_error'] = abs(predictions['pred_diff']) / predictions['y_true']
predictions = predictions[predictions['policy'].isin(['total_error_cold', 'min_past'])]

In [4]:
# Remove Outliers
from scipy import stats
predictions = predictions[(np.abs(stats.zscore(predictions['per_error'])) < 3)]

In [11]:
min_past_error = predictions[predictions['policy'] == 'min_past'].groupby('updates').mean()['per_error']

In [12]:
error_cold_error = predictions[predictions['policy'] == 'total_error_cold'].groupby('updates').mean()['per_error']

In [13]:
overall_dataset_per_diff = min_past_error - error_cold_error

Now we look at the dataset we've trained on, and whether that influences the percent error difference across policies.

In [14]:
past_updates =  pd.read_pickle('~/Downloads/past_updates-2.pkl')

In [15]:
trained_predictions = predictions[predictions['user_id'].isin(past_updates)]
untrained_predictions = predictions[~predictions['user_id'].isin(past_updates)]

In [17]:
min_past_train_error = trained_predictions[trained_predictions['policy'] == 'min_past'].groupby('updates').mean()['per_error']
cold_train_error = trained_predictions[trained_predictions['policy'] == 'total_error_cold'].groupby('updates').mean()['per_error']

In [18]:
trained_dataset_per_diff = min_past_train_error - cold_train_error

In [19]:
min_past_untrain_error = untrained_predictions[untrained_predictions['policy'] == 'min_past'].groupby('updates').mean()['per_error']
cold_untrain_error = untrained_predictions[untrained_predictions['policy'] == 'total_error_cold'].groupby('updates').mean()['per_error']

In [20]:
untrained_dataset_per_diff = min_past_untrain_error - cold_untrain_error

Now we look for whether bursty periods of time influence the percent error difference. I've defined bursty to be more than 200 ratings/timestamp, but you can change it to be whatever threshold you think is appropriate.

In [21]:
threshold = 200

In [22]:
temp_table = predictions[predictions['policy'] == 'min_past'].groupby('timestamp').count().sort_values('user_id', ascending = False)

In [23]:
timestamp_num = {}
for index, row in temp_table.iterrows():
    timestamp_num[index] = row['user_id']

In [24]:
new_table = predictions.copy(deep=True)

In [25]:
new_table['num_per_timestamp'] = new_table.apply(lambda x: timestamp_num[x['timestamp']], axis=1)

In [26]:
above_threshold = new_table[new_table['num_per_timestamp'] > threshold]

In [27]:
min_past_above_error = above_threshold[above_threshold['policy'] == 'min_past'].groupby('updates').mean()['per_error']
cold_above_error = above_threshold[above_threshold['policy'] == 'total_error_cold'].groupby('updates').mean()['per_error']

In [21]:
above_threshold.groupby('policy').mean()['per_error']

policy
min_past            0.257234
total_error_cold    0.256289
Name: per_error, dtype: float64

In [28]:
bursty_dataset_per_diff = min_past_above_error - cold_above_error

In [39]:
d = {"updates": sorted(list(set(predictions['updates']))), 'overall': overall_dataset_per_diff, 'trained': trained_dataset_per_diff, 'untrained': untrained_dataset_per_diff, 'bursty': bursty_dataset_per_diff}
conclusion_df = pd.DataFrame(data=d).drop('updates', axis=1)

In [40]:
conclusion_df

Unnamed: 0_level_0,overall,trained,untrained,bursty
updates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.002856378,0.03277973,0.002510297,0.001181654
0.25,0.00280137,0.03737641,0.002401165,0.0009736155
0.5,0.001789787,0.004837629,0.001754521,0.002110823
1.0,0.001426616,0.01631905,0.001254433,0.002026089
2.0,0.0004805088,-0.001302397,0.0005006305,0.001566335
3.0,7.513297e-05,0.0003629034,7.179209e-05,0.000920942
4.0,-5.878871e-05,-0.001045495,-4.740474e-05,-8.814543e-05
5.0,-5.487346e-05,0.0001282444,-5.69933e-05,-0.0002098476
8.0,1.475106e-09,-6.623909e-09,1.568598e-09,1.28977e-08


In [24]:
print('Overall Dataset Diff in Percent Errors: {:0.4f}\nTrained Dataset Diff in Percent Errors: {:0.4f}\nUntrained Dataset Diff in Percent Errors: {:0.4f}\nBursty Dataset Diff in Percent Errors: {:0.4f}\n'.format(overall_dataset_per_diff,trained_dataset_per_diff, untrained_dataset_per_diff,bursty_dataset_per_diff))

Overall Dataset Diff in Percent Errors: 0.0010
Trained Dataset Diff in Percent Errors: 0.0100
Untrained Dataset Diff in Percent Errors: 0.0009
Bursty Dataset Diff in Percent Errors: 0.0009

