This code attempts to re-simulate the main-bar reviews that a reviewer would have seen while writing their own review. Please refer to the manuscript for an explanation of how the hypothesized, reverse-engineered algorithm works. 

## Importing packages and reading data

In [1]:
from tqdm import tqdm
import pickle
from datetime import datetime, timedelta
import time
import math
import ast
import numpy as np
import pandas as pd
import types
from joblib import Parallel, delayed

def imports():
    """Yield every module bound in this namespace, with its version when available.

    Yields:
        ``(module_name, version)`` for modules exposing ``__version__``,
        otherwise just ``module_name``.
    """
    # Snapshot the namespace first: iterating the live ``globals()`` dict would
    # fail if a name were bound while the generator is suspended.
    for _, val in list(globals().items()):
        if not isinstance(val, types.ModuleType):
            continue
        # Only the attribute lookup is guarded. Keeping the ``yield`` out of the
        # ``try`` body means closing the generator early can no longer be
        # swallowed (a bare ``except`` around a ``yield`` catches GeneratorExit
        # and then yields again, which raises RuntimeError).
        try:
            version = val.__version__
        except Exception:
            yield val.__name__
        else:
            yield val.__name__, version


list(imports())

['builtins',
 'builtins',
 'pickle',
 'time',
 'math',
 'ast',
 ('numpy', '1.24.3'),
 ('pandas', '2.1.1'),
 'types']

In [2]:
# Load the raw review dump, then coerce every column used below to an explicit
# dtype in one pass (for safe measure).
df = pd.read_csv("../data/raw_data/FPS_reviews.csv.zip", low_memory=False).astype(
    {
        'recommendation_id': str,
        'game_id': str,
        'review': str,
        'score': float,
        'timestamp_created': int,
        'timestamp_updated': int,
        'votes_up': int,
        'voted_up': int,
    }
)

# organizing data per game

In [3]:
# Keep only games with at least 30 reviews and report the counts before/after.
print(f"Removing games with <30 reviews (10 for mainbar, 10 for sidebar, 10 for control).")
game_length = len(set(df["game_id"]))
print(f"Before removing games with fewer than 30 reviews, there were {len(df)} reviews from {game_length} games")

df = df.groupby('game_id').filter(lambda game_reviews: len(game_reviews) >= 30)
# Replace the surviving row labels with a fresh 0..n-1 index named "index".
df = df.assign(index=range(len(df))).set_index('index')

game_length = len(set(df["game_id"]))
print(f"After  removing games with fewer than 30 reviews, there were {len(df)} reviews from {game_length} games")

Removing games with <30 reviews (10 for mainbar, 10 for sidebar, 10 for control).
Before removing games with fewer than 30 reviews, there were 3637833 reviews from 2304 games
After  removing games with fewer than 30 reviews, there were 3629007 reviews from 1093 games


In [4]:
# A review is identified by (recommendation_id, game_id); keep its first occurrence only
# and renumber the rows afterwards.
df = (
    df
    .drop_duplicates(subset=['recommendation_id', 'game_id'], keep='first')
    .reset_index(drop=True)
)
game_length = len(set(df["game_id"]))
print(f"After removing duplicates, there were {len(df)} reviews from {game_length} games")

After removing duplicates, there were 3621927 reviews from 1093 games


In [5]:
# Persist the filtered, de-duplicated reviews as a gzip-compressed CSV (row index not written).
df.to_csv(
    "../data/raw_data/data_filtered.csv.gz",
    index=False,
    compression="gzip",
)

## Implementing our hypothesized main-bar algorithm and performing our re-simulation. 



In [5]:
def sort_and_rearrange(temp_df):
    """Return a copy of ``temp_df`` ordered by descending ``score``.

    The ``score`` column is cast to float before sorting and the result gets a
    fresh 0..n-1 index. The input frame is left untouched.
    """
    with_float_score = temp_df.assign(score=temp_df["score"].astype(float))
    ranked = with_float_score.sort_values("score", ascending=False)
    return ranked.reset_index(drop=True)

def get_x_days_ago(end_int, days):
    """Return the integer timestamp lying ``days`` days before ``end_int``.

    Args:
        end_int: timestamp in seconds, as accepted by ``datetime.fromtimestamp``.
        days: number of days to step back.

    Returns:
        The shifted timestamp, truncated to ``int``.
    """
    # NOTE(review): the value is converted to a naive local datetime and back
    # via time.mktime, so the result presumably depends on the machine's
    # timezone/DST rules -- confirm this is intended.
    window_start = datetime.fromtimestamp(end_int) - timedelta(days=days)
    return int(time.mktime(window_start.timetuple()))

def get_x_days_ago_reviews(days, df, end_int):
    """Select the reviews created in the ``days`` days leading up to ``end_int``.

    The window runs from ``get_x_days_ago(end_int, days)`` up to and including
    ``end_int - 1``, so a review created exactly at ``end_int`` is excluded.
    ``timestamp_created`` is cast to int on a copy; ``df`` itself is not modified.
    """
    window_start = get_x_days_ago(int(end_int), days)
    reviews = df.assign(timestamp_created=df['timestamp_created'].astype(int))
    in_window = reviews['timestamp_created'].between(window_start, end_int - 1)
    return reviews[in_window]

def condition_against(temp1, temp2, difference):
    """Top up the selection ``temp1`` with rows taken from ``temp2``.

    Args:
        temp1: rows already selected.
        temp2: candidate rows; they are inspected in their given order.
        difference: number of leading ``temp2`` rows that may be inspected.

    Returns:
        * ``temp1`` itself when it already holds exactly 10 rows, when ``temp2``
          is empty, or when no candidate qualifies;
        * all of ``temp2`` when ``difference`` is 10;
        * otherwise ``temp1`` followed by each of the first ``difference`` rows
          of ``temp2`` whose score is strictly greater than the score in the
          first row of ``temp1``.
    """
    if len(temp1) == 10:
        return temp1
    if difference == 10:
        return temp2[:len(temp2)]
    if len(temp2) == 0:
        return temp1

    # Never look past the end of the candidate frame.
    n_candidates = min(difference, len(temp2))
    if n_candidates <= 0:
        return temp1

    threshold = float(temp1["score"].iloc[0])
    candidate_scores = list(temp2["score"])
    promoted = [
        temp2[pos:pos + 1]
        for pos in range(n_candidates)
        if float(candidate_scores[pos]) > threshold
    ]
    if not promoted:
        return temp1
    return pd.concat([temp1, *promoted])

def temp_conditional_return_1(time_created, day_range1, day_range_ultimate, df):
    """Select from a recent window, topped up from one older window.

    The recent window covers the ``day_range1`` days before ``time_created``;
    the older window covers the following ``day_range_ultimate - day_range1``
    days before that. Both are ranked by descending score and merged with
    ``condition_against``, allowing ``10 - len(recent window)`` candidates.
    """
    recent = sort_and_rearrange(get_x_days_ago_reviews(day_range1, df, time_created))
    recent_start = get_x_days_ago(time_created, day_range1)
    older = sort_and_rearrange(
        get_x_days_ago_reviews(day_range_ultimate - day_range1, df, recent_start)
    )
    return condition_against(recent, older, 10 - len(recent))

def temp_conditional_return_2(time_created, day_range1, day_range2, day_range_ultimate, df):
    """Select from a recent window, topped up from two successively older windows.

    Windows, counted in days before ``time_created``: ``[0, day_range1)``,
    ``[day_range1, day_range2)`` and ``[day_range2, day_range_ultimate)``.
    Each window is ranked by descending score; after every merge through
    ``condition_against`` the number of candidates allowed from the next
    window is ``10 - len(current selection)``.
    """
    first_start = get_x_days_ago(time_created, day_range1)
    second_start = get_x_days_ago(time_created, day_range2)

    first = sort_and_rearrange(get_x_days_ago_reviews(day_range1, df, time_created))
    second = sort_and_rearrange(
        get_x_days_ago_reviews(day_range2 - day_range1, df, first_start)
    )
    selected = condition_against(first, second, 10 - len(first))

    third = sort_and_rearrange(
        get_x_days_ago_reviews(day_range_ultimate - day_range2, df, second_start)
    )
    return condition_against(selected, third, 10 - len(selected))

def temp_conditional_return_3(time_created, day_range1, day_range2, day_range_3, start_date, df):
    """Select from a recent window, topped up from three successively older windows.

    The first three windows end at ``time_created``, ``day_range1`` and
    ``day_range2`` days before it. The last one starts ``day_range_3`` days back
    and spans the remaining whole days down to ``start_date`` (a timestamp in
    seconds). Every window is ranked by descending score and merged with
    ``condition_against``, allowing ``10 - len(current selection)`` candidates.
    """
    first_start = get_x_days_ago(time_created, day_range1)
    second_start = get_x_days_ago(time_created, day_range2)
    third_start = get_x_days_ago(time_created, day_range_3)
    # Whole days between the earliest timestamp of interest and this review.
    day_range_4 = int((time_created - start_date) / 86400)

    first = sort_and_rearrange(get_x_days_ago_reviews(day_range1, df, time_created))
    second = sort_and_rearrange(
        get_x_days_ago_reviews(day_range2 - day_range1, df, first_start)
    )
    selected = condition_against(first, second, 10 - len(first))

    third = sort_and_rearrange(
        get_x_days_ago_reviews(day_range_3 - day_range2, df, second_start)
    )
    selected = condition_against(selected, third, 10 - len(selected))

    fourth = sort_and_rearrange(
        get_x_days_ago_reviews(day_range_4 - day_range_3, df, third_start)
    )
    return condition_against(selected, fourth, 10 - len(selected))

In [49]:
def get_list_of_main_bar_reviews_old(df):
    """Rebuild the main-bar reviews for every review in ``df`` (list-based version).

    For each row, the candidate pool is every review of the same game whose
    ``votes_up`` is non-zero. The reviews from the 30 days before the row's
    ``timestamp_created`` are used when there are at least 10 of them (or when
    the pool is empty). Otherwise the window is widened to 90 days, then 180
    days, and finally back to the earliest candidate, through the
    ``temp_conditional_return_*`` helpers. The selection is ordered by
    descending ``score`` and cut to at most 10 entries.

    Rows are read by position, so ``df`` may carry any index (the earlier
    label-based lookup ``df["game_id"][i]`` only worked with a 0..n-1 index).

    Args:
        df: reviews with ``game_id``, ``timestamp_created``, ``votes_up``,
            ``score`` and ``recommendation_id`` columns.

    Returns:
        Three lists with one entry per row of ``df``, in row order: the selected
        ``recommendation_id`` values, their scores and their creation timestamps.
    """
    order_of_visibility, order_of_scores, order_of_times = [], [], []
    for i in tqdm(range(len(df))):
        game_id = str(df["game_id"].iloc[i])
        time_created = int(df["timestamp_created"].iloc[i])
        candidates = df[(df['game_id'] == game_id) & (df['votes_up'] != 0)]

        shown = get_x_days_ago_reviews(30, candidates, time_created)
        if len(candidates) != 0 and len(shown) < 10:
            # Fewer than 10 recent reviews: widen the window step by step.
            shown = temp_conditional_return_1(time_created, 30, 90, candidates)
            if len(shown) < 10:
                shown = temp_conditional_return_2(time_created, 30, 90, 180, candidates)
                if len(shown) < 10:
                    start_date = int(candidates['timestamp_created'].min())
                    shown = temp_conditional_return_3(time_created, 30, 90, 180, start_date, candidates)

        # ``assign`` builds a new frame instead of writing into a filtered slice.
        shown = shown.assign(score=pd.to_numeric(shown["score"]))
        shown = shown.sort_values("score", ascending=False).reset_index(drop=True)

        n = min(len(shown), 10)
        order_of_visibility.append(list(shown["recommendation_id"][:n]))
        order_of_scores.append(list(shown["score"][:n]))
        order_of_times.append(list(shown["timestamp_created"][:n]))

    return order_of_visibility, order_of_scores, order_of_times

In [7]:
def get_list_of_main_bar_reviews(df):
    """Rebuild the main-bar reviews for every review in ``df`` as a NaN-padded array.

    NOTE: a later cell defines a function with the same name that returns five
    columns per slot; once that cell has run, it replaces this definition.

    The selection rule matches ``get_list_of_main_bar_reviews_old``: candidates
    are same-game reviews with non-zero ``votes_up``, and the window grows
    30 -> 90 -> 180 days -> earliest candidate until 10 reviews are available.

    Rows are read by position, so ``df`` may carry any index. An empty ``df``
    yields an empty ``(0, 10, 3)`` array.

    Args:
        df: reviews with ``game_id``, ``timestamp_created``, ``votes_up``,
            ``score`` and ``recommendation_id`` columns. ``recommendation_id``
            must be convertible to float because it is stored in a float array.

    Returns:
        Float array of shape ``(len(df), 10, 3)``. Slot ``[i, k]`` holds
        ``(recommendation_id, score, timestamp_created)`` of the k-th selected
        review for row ``i``; unused slots are NaN.
    """
    ordered_array = np.full((len(df), 10, 3), np.nan, dtype=float)
    for i in tqdm(range(len(df)), ncols=100):
        game_id = str(df["game_id"].iloc[i])
        time_created = int(df["timestamp_created"].iloc[i])
        candidates = df[(df['game_id'] == game_id) & (df['votes_up'] != 0)]

        shown = get_x_days_ago_reviews(30, candidates, time_created)
        if len(candidates) != 0 and len(shown) < 10:
            # Fewer than 10 recent reviews: widen the window step by step.
            shown = temp_conditional_return_1(time_created, 30, 90, candidates)
            if len(shown) < 10:
                shown = temp_conditional_return_2(time_created, 30, 90, 180, candidates)
                if len(shown) < 10:
                    start_date = int(candidates['timestamp_created'].min())
                    shown = temp_conditional_return_3(time_created, 30, 90, 180, start_date, candidates)

        # ``assign`` builds a new frame instead of writing into a filtered slice.
        shown = shown.assign(score=pd.to_numeric(shown["score"]))
        shown = shown.sort_values("score", ascending=False)
        n = min(len(shown), 10)
        ordered_array[i, :n] = shown[["recommendation_id", "score", "timestamp_created"]].iloc[:n].to_numpy()
    return ordered_array

In [8]:
def get_list_of_main_bar_reviews(df):
    """Rebuild the main-bar reviews for every review in ``df`` as a NaN-padded array.

    Candidates for a row are the reviews of the same game with non-zero
    ``votes_up``. The 30 days before the row's ``timestamp_created`` are used
    when they hold at least 10 candidates (or when there are no candidates at
    all); otherwise the window grows to 90 days, 180 days and finally back to
    the earliest candidate via the ``temp_conditional_return_*`` helpers.

    Returns:
        Float array of shape ``(len(df), 10, 5)``. For row ``i`` and slot ``k``
        the five values are: recommendation_id, score, timestamp_created (NaN
        when the slot is unused), the 1-based slot number, and the index label
        of row ``i`` in ``df``.
    """
    ordered_array = np.full((len(df), 10, 5), np.nan, dtype=float)
    for i in tqdm(range(len(df.index))):
        row_label = df.index[i]
        game_id = str(df.loc[row_label, "game_id"])
        time_created = int(df.loc[row_label, "timestamp_created"])
        candidates = df[(df['votes_up'] != 0) & (df['game_id'] == game_id)]

        shown = get_x_days_ago_reviews(30, candidates, time_created)
        if len(candidates) != 0 and len(shown) < 10:
            # Not enough recent reviews: widen the window step by step.
            shown = temp_conditional_return_1(time_created, 30, 90, candidates)
            if len(shown) < 10:
                shown = temp_conditional_return_2(time_created, 30, 90, 180, candidates)
                if len(shown) < 10:
                    start_date = int(candidates['timestamp_created'].min())
                    shown = temp_conditional_return_3(time_created, 30, 90, 180, start_date, candidates)

        shown["score"] = pd.to_numeric(shown["score"])
        shown = shown.sort_values("score", ascending=False)
        n = min(len(shown), 10)
        ordered_array[i, :n, :3] = shown[["recommendation_id", "score", "timestamp_created"]][:n]
        ordered_array[i, :, 3] = np.arange(1, 11)
        ordered_array[i, :, 4] = row_label
    return ordered_array

In [9]:
# Reconstruct the main bar game by game (in parallel) and cache the stacked result on disk.
grouped_df = df.groupby(['game_id'])
# One frame per game, in the groupby's key order.
split_df = [game_reviews for _, game_reviews in grouped_df]
try:
    ordered_array = np.load('../data/temp_reconstruction.npy')
except FileNotFoundError:
    # Only a missing cache triggers the expensive recomputation. Any other
    # failure (e.g. an unreadable cache file) is surfaced, not silently redone.
    ordered_array_split = Parallel(n_jobs=20)(
        delayed(get_list_of_main_bar_reviews)(game_reviews) for game_reviews in tqdm(split_df)
    )
    ordered_array = np.vstack(ordered_array_split)
    np.save('../data/temp_reconstruction.npy', ordered_array)

100%|██████████████████████████████████████████████████████████████████████████████| 1093/1093 [17:04<00:00,  1.07it/s]


In [13]:
# Sanity check: among rows whose last (10th) slot is filled, count how many
# distinct recommendation ids each main bar contains and show the distribution.
last_slot_filled = ~np.isnan(ordered_array[:, -1, :3]).any(axis=1)
temp_tester_array = ordered_array[last_slot_filled]
unique_reviews_main_bar = np.array(
    [len(np.unique(main_bar_ids)) for main_bar_ids in temp_tester_array[:, :, 0]]
)
pd.Series(unique_reviews_main_bar).value_counts(normalize=True).round(3)

10    1.0
Name: proportion, dtype: float64

In [52]:
## DOESNT WORK
# The per-game slice keeps the row labels of the full frame (not 0..n-1), and
# the old function looks rows up by label -- see the index displayed below.
is_target_game = df['game_id'] == '1000410'
verification_df = df.loc[is_target_game].copy()
#verification_df["order_of_main_bar_reviews"], verification_df["order_of_scores"], verification_df["order_of_times"] = get_list_of_main_bar_reviews_old(verification_df)
verification_df.index  # index messed up, function dependent on index

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       7091, 7092, 7093, 7094, 7095, 7096, 7097, 7098, 7099, 7100],
      dtype='int64', length=1011)

In [50]:
## Works now
# Run the old routine on a copy with a fresh 0..n-1 index ...
verification_df = df[df['game_id'] == '1000410'].copy()
verification_df=verification_df.reset_index(drop=True)
verification_df["order_of_main_bar_reviews"], verification_df["order_of_scores"], verification_df["order_of_times"] = get_list_of_main_bar_reviews_old(verification_df)
# ... then restore the row labels of ``df``. The checks below look rows up in
# ``ordered_array`` by those labels (same as the '559620' verification cell).
verification_df.index = df[df['game_id'] == '1000410'].index

100%|█████████████████████████████████████████████████████████████████████████████| 1011/1011 [00:02<00:00, 457.34it/s]


In [98]:
# Check that the array-based reconstruction selects the same recommendation ids,
# in the same order, as the old list-based routine.
# The 5-column builder writes the source-row label into every slot of a row, so
# slot 0 is enough to locate a row; computed once instead of per iteration.
row_labels = ordered_array[:, 0, -1]
for xyz in tqdm(verification_df.index, ncols=100):
    new_array = ordered_array[row_labels == xyz][:, :, 0].ravel()
    new_array = new_array[~np.isnan(new_array)].astype(int).astype(str)
    # Explicit str dtype so an empty list compares as str-vs-str, not float-vs-str.
    old_array = np.array(verification_df.loc[xyz, 'order_of_main_bar_reviews'], dtype=str)
    # array_equal also reports a length mismatch as a difference.
    if not np.array_equal(old_array, new_array):
        print('Problem with {}'.format(xyz))

100%|███████████████████████████████████████████████████████████| 1911/1911 [01:41<00:00, 18.77it/s]


In [122]:
# Check that both reconstructions report the same scores (tolerance 0.001).
# The 5-column builder writes the source-row label into every slot of a row, so
# slot 0 is enough to locate a row; computed once instead of per iteration.
row_labels = ordered_array[:, 0, -1]
for xyz in tqdm(verification_df.index, ncols=100):
    new_array = ordered_array[row_labels == xyz][:, :, 1].ravel()
    new_array = new_array[~np.isnan(new_array)]
    old_array = np.array(verification_df.loc[xyz, 'order_of_scores'], dtype=float)
    # A differing number of scores is a mismatch in itself. Without this check
    # the subtraction below would broadcast or raise, and an empty old list
    # would hide a non-empty new one.
    if old_array.shape != new_array.shape:
        print('Problem with {}'.format(xyz))
    elif old_array.size > 0 and np.max(np.abs(new_array - old_array)) > .001:
        print('Problem with {}'.format(xyz))

100%|███████████████████████████████████████████████████████████| 1911/1911 [01:40<00:00, 18.93it/s]


In [128]:
# Check that both reconstructions report the same creation timestamps.
# The 5-column builder writes the source-row label into every slot of a row, so
# slot 0 is enough to locate a row; computed once instead of per iteration.
row_labels = ordered_array[:, 0, -1]
for xyz in tqdm(verification_df.index, ncols=100):
    new_array = ordered_array[row_labels == xyz][:, :, 2].ravel()
    new_array = new_array[~np.isnan(new_array)].astype(int)
    old_array = np.array(verification_df.loc[xyz, 'order_of_times'])
    # array_equal also reports a length mismatch as a difference.
    if not np.array_equal(old_array, new_array):
        print('Problem with {}'.format(xyz))

100%|███████████████████████████████████████████████████████████| 1911/1911 [01:40<00:00, 18.94it/s]


In [152]:
# Second spot check on another game: run the old routine on a 0..n-1 indexed
# copy, then put the row labels of ``df`` back so rows can be matched by label.
verification_df = df[df['game_id'] == '559620'].copy()
verification_df = verification_df.reset_index(drop=True)
(
    verification_df["order_of_main_bar_reviews"],
    verification_df["order_of_scores"],
    verification_df["order_of_times"],
) = get_list_of_main_bar_reviews_old(verification_df)
verification_df.index = df[df['game_id'] == '559620'].index

100%|████████████████████████████████████████████████████████████| 312/312 [00:01<00:00, 245.62it/s]


In [153]:
# Check that the array-based reconstruction selects the same recommendation ids,
# in the same order, as the old list-based routine.
# The 5-column builder writes the source-row label into every slot of a row, so
# slot 0 is enough to locate a row; computed once instead of per iteration.
row_labels = ordered_array[:, 0, -1]
for xyz in tqdm(verification_df.index, ncols=100):
    new_array = ordered_array[row_labels == xyz][:, :, 0].ravel()
    new_array = new_array[~np.isnan(new_array)].astype(int).astype(str)
    # Explicit str dtype so an empty list compares as str-vs-str, not float-vs-str.
    old_array = np.array(verification_df.loc[xyz, 'order_of_main_bar_reviews'], dtype=str)
    # array_equal also reports a length mismatch as a difference.
    if not np.array_equal(old_array, new_array):
        print('Problem with {}'.format(xyz))

100%|█████████████████████████████████████████████████████████████| 312/312 [00:16<00:00, 18.36it/s]


In [154]:
# Check that both reconstructions report the same scores (tolerance 0.001).
# The 5-column builder writes the source-row label into every slot of a row, so
# slot 0 is enough to locate a row; computed once instead of per iteration.
row_labels = ordered_array[:, 0, -1]
for xyz in tqdm(verification_df.index, ncols=100):
    new_array = ordered_array[row_labels == xyz][:, :, 1].ravel()
    new_array = new_array[~np.isnan(new_array)]
    old_array = np.array(verification_df.loc[xyz, 'order_of_scores'], dtype=float)
    # A differing number of scores is a mismatch in itself. Without this check
    # the subtraction below would broadcast or raise, and an empty old list
    # would hide a non-empty new one.
    if old_array.shape != new_array.shape:
        print('Problem with {}'.format(xyz))
    elif old_array.size > 0 and np.max(np.abs(new_array - old_array)) > .001:
        print('Problem with {}'.format(xyz))

100%|█████████████████████████████████████████████████████████████| 312/312 [00:16<00:00, 19.17it/s]


In [155]:
# Check that both reconstructions report the same creation timestamps.
# The 5-column builder writes the source-row label into every slot of a row, so
# slot 0 is enough to locate a row; computed once instead of per iteration.
row_labels = ordered_array[:, 0, -1]
for xyz in tqdm(verification_df.index, ncols=100):
    new_array = ordered_array[row_labels == xyz][:, :, 2].ravel()
    new_array = new_array[~np.isnan(new_array)].astype(int)
    old_array = np.array(verification_df.loc[xyz, 'order_of_times'])
    # array_equal also reports a length mismatch as a difference.
    if not np.array_equal(old_array, new_array):
        print('Problem with {}'.format(xyz))

100%|█████████████████████████████████████████████████████████████| 312/312 [00:16<00:00, 19.16it/s]


In [9]:
#df["order_of_main_bar_reviews"], df["order_of_scores"], df["order_of_times"] = [np.nan]*3

In [10]:
#df.loc[df['game_id'] == x, ["order_of_main_bar_reviews", "order_of_scores", "order_of_times"]]

In [11]:
#df["order_of_main_bar_reviews"], df["order_of_scores"], df["order_of_times"]

## Organizing our data

## Saving our data