This code attempts to re-simulate the default-sorted 'summary' of reviews that a reviewer would have seen while writing their own review. We do so by taking the time at which each review was written and applying our hypothesized default-sorting algorithm to the reviews that already existed at that time.

The pseudo-code for this procedure can be found in the supplementary information (Algorithm S1).

In [None]:
# import the packages and read pickle file
# This cell mounts Google Drive in Colab, changes into the project folder and
# loads the two pickled inputs used by the rest of the notebook.

from google.colab import drive
drive.mount("/content/drive")
# IPython magic (not plain Python): switch to the project directory so the
# relative file names below resolve.
%cd '/content/drive/My Drive/Dissertation'
from tqdm import tqdm
import pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files if they were produced by a trusted source.
# The next cell indexes single_FPS_reviews[i] as the list of review records
# belonging to games_id_list[i], so the two objects must be aligned.
with open("singleplayer_FPS_reviews_list_2", "rb") as fp: 
  single_FPS_reviews = pickle.load(fp)

with open("games_list_2","rb") as fp:
  games_id_list=pickle.load(fp)

# Displayed as the cell output: number of games loaded.
len(games_id_list)

Mounted at /content/drive
/content/drive/My Drive/Dissertation


673

# organizing data per game

In [None]:
# Flatten the per-game review records into one table with one row per review.
# One list per output column; all eight are filled in lockstep.
(game_id, steam_id_list, player_review_list, time_stamp_created_list,
 time_stamp_updated_list, recommendation_id_list, scores,
 votes_up_total) = ([] for _ in range(8))

for i, game in enumerate(tqdm(games_id_list)):
  # single_FPS_reviews[i] holds the raw review records of games_id_list[i].
  for record in single_FPS_reviews[i]:
    game_id.append(game)
    steam_id_list.append(record["author"]["steamid"])
    player_review_list.append(record["review"])
    time_stamp_created_list.append(record["timestamp_created"])
    time_stamp_updated_list.append(record["timestamp_updated"])
    recommendation_id_list.append(record["recommendationid"])
    scores.append(record["weighted_vote_score"])
    votes_up_total.append(record["votes_up"])

import pandas as pd

column_names = ['recommendation_id','game_id','score', 'steam_id','review','timestamp_created','timestamp_updated','votes_up']
rows = list(zip(
    recommendation_id_list,
    game_id,
    scores,
    steam_id_list,
    player_review_list,
    time_stamp_created_list,
    time_stamp_updated_list,
    votes_up_total,
))
df = pd.DataFrame(rows, columns=column_names)


100%|██████████| 673/673 [00:01<00:00, 390.98it/s]


Removing games with $\leq 10$ reviews.

In [None]:
# Keep only the rows whose game has more than 10 reviews in the table.
reviews_per_game = df.groupby('game_id')['game_id'].transform('size')
df = df[reviews_per_game > 10]
# Renumber the surviving rows 0..n-1 under an index named "index".
df = df.assign(index=list(range(len(df)))).set_index("index")
df

Unnamed: 0_level_0,recommendation_id,game_id,score,steam_id,review,timestamp_created,timestamp_updated,votes_up
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,117084240,1533420,0.934666335582733154,76561197990540569,"if you like those crazy ass Japanese movies, M...",1655402060,1655402060,247
1,117083497,1533420,0.897299528121948242,76561198043707357,Purchased a Machine Girl album. got a free gam...,1655401168,1655401168,152
2,117089901,1533420,0.867490172386169434,76561198203123848,a gore-obsessed chick in a bunny mask called m...,1655409089,1655409089,92
3,117083452,1533420,0.812168419361114502,76561198097251074,Stuck with a black screen after the intro. Int...,1655401101,1655401101,68
4,117084445,1533420,0.75773167610168457,76561198157693142,machine girl <3,1655402308,1655402308,40
...,...,...,...,...,...,...,...,...
1056935,113604297,1445120,0,76561198271143660,reviews really werent kidding about the font b...,1649544470,1649544470,0
1056936,111673657,1445120,0,76561198836980654,"Dear Developers,\n\nIve played this game for 1...",1646570504,1646570504,0
1056937,90986327,1445120,0,76561198283885247,"The font used doesn't help a lot, Hs and Ks as...",1619497434,1619497499,0
1056938,90049741,1445120,0,76561198026846672,font is horrible.\nToo fast for most people\nB...,1618038505,1618038505,1


# Implementing our hypothesized default-sorted algorithm and performing our re-simulation. 

Getting the ordered list of recommendation IDs that the reviewer would have seen when writing their review.



In [None]:

from datetime import datetime, timedelta
from tqdm import tqdm
import time

def sort_and_rearrange(temp_df):
  """Return the reviews in ``temp_df`` ranked from highest to lowest score.

  The ``score`` column is converted to numbers before sorting, and the result
  is renumbered 0..n-1 under an index named ``"index"``.

  The work is done on a copy, so the frame passed in is left untouched (it is
  usually a slice of a larger frame, and writing into it would both alter the
  caller's data and trigger pandas' chained-assignment warnings).

  Parameters:
    temp_df: DataFrame with at least a ``score`` column.

  Returns:
    A new DataFrame with the same columns, sorted by ``score`` descending.
  """
  ranked = temp_df.copy()
  ranked["score"] = pd.to_numeric(ranked["score"])
  ranked = ranked.sort_values("score", ascending=False)
  # Position in the ranking becomes the index label.
  ranked["index"] = list(range(len(ranked)))
  return ranked.set_index("index")

def get_x_days_ago(end_int,days):
  """Return the Unix timestamp that lies ``days`` days before ``end_int``.

  The result is computed with plain epoch arithmetic (one day = 86 400 s), so
  it does not depend on the time zone or daylight-saving rules of the machine
  running the notebook. On a UTC host this gives the same value as converting
  to a local datetime, subtracting, and converting back.

  Parameters:
    end_int: Unix timestamp in seconds marking the end of the period.
    days: number of days to step back (may be fractional or negative).

  Returns:
    The start timestamp as an ``int`` number of seconds.
  """
  return int(end_int) - int(timedelta(days=days).total_seconds())

def get_x_days_ago_comments(days, df,end_int):
  """Return the reviews created in the ``days`` days leading up to ``end_int``.

  A review is kept when its ``timestamp_created`` lies between
  ``get_x_days_ago(end_int, days)`` and ``end_int - 1``, both inclusive, so a
  review written at exactly ``end_int`` is excluded.

  The input frame is not modified: the numeric view of ``timestamp_created``
  is computed on the side and only written into the returned copy.

  Parameters:
    days: length of the look-back window in days.
    df: DataFrame with a ``timestamp_created`` column (Unix seconds).
    end_int: Unix timestamp marking the exclusive end of the window.

  Returns:
    A new DataFrame holding the matching rows, with a numeric
    ``timestamp_created`` column.
  """
  start = get_x_days_ago(int(end_int), days)
  created = pd.to_numeric(df["timestamp_created"])
  in_window = created.between(start, (end_int - 1))
  return df[in_window].assign(timestamp_created=created[in_window])


# The 4 functions below implement lines 6, 10 and 14 of the pseudo-code
def condition_against(temp1,temp2,difference):
  """Top up the reviews in ``temp1`` with better-scored reviews from ``temp2``.

  ``temp1`` holds the reviews of a newer block of days and ``temp2`` those of
  an older block. Up to ``difference`` rows are considered, taken from the
  start of ``temp2``; a row is appended to ``temp1`` only when its score is
  strictly greater than the score in the first row of ``temp1``.

  Special cases, checked in this order:
    * ``temp1`` already has exactly 10 rows -> ``temp1`` is returned as is.
    * ``difference == 10`` (``temp1`` contributed nothing) -> all of ``temp2``.
    * ``temp2`` is empty -> ``temp1`` is returned as is.

  Parameters:
    temp1: DataFrame with a ``score`` column; its first row supplies the
      threshold (callers pass it sorted by score, highest first).
    temp2: DataFrame with a ``score`` column (callers pass it sorted too).
    difference: how many rows of ``temp2`` may be examined.

  Returns:
    ``temp1`` itself when nothing is added, otherwise a new DataFrame with the
    selected ``temp2`` rows appended below ``temp1``.
  """
  if len(temp1)==10:
    return temp1
  if difference==10:
    return temp2[0:len(temp2)]
  if len(temp2)==0:
    return temp1

  # Convert the candidate scores once instead of on every loop iteration.
  candidate_scores = list(temp2["score"])[:max(difference, 0)]
  if not candidate_scores:
    return temp1

  threshold = float(list(temp1["score"])[0])
  combined = temp1
  for position, candidate in enumerate(candidate_scores):
    if float(candidate) > threshold:
      combined = pd.concat([combined, temp2[position:position+1]])
  return combined


def temp_conditional_return_1(time_created,day_range1,day_range_ultimate,df):
  """Combine the newest block of reviews with one older block.

  The newest block covers the ``day_range1`` days before ``time_created``;
  the older block covers the days from ``day_range1`` back to
  ``day_range_ultimate`` before ``time_created``. Both blocks are ranked by
  score and merged with ``condition_against``, which may add as many older
  reviews as the newest block falls short of 10.

  Parameters:
    time_created: Unix timestamp at which the review being simulated was written.
    day_range1: length in days of the newest block.
    day_range_ultimate: distance in days from ``time_created`` to the far edge
      of the older block.
    df: DataFrame of candidate reviews.

  Returns:
    The DataFrame produced by ``condition_against``.
  """
  newest = get_x_days_ago_comments(day_range1, df, time_created)

  # The older block ends where the newest block begins.
  newest_start = get_x_days_ago(time_created, day_range1)
  older = get_x_days_ago_comments(day_range_ultimate - day_range1, df, newest_start)

  shortfall = int(10 - len(newest))
  return condition_against(
      sort_and_rearrange(newest),
      sort_and_rearrange(older),
      shortfall,
  )



def temp_conditional_return_2(time_created,day_range1,day_range2,day_range_ultimate,df):
  """Combine the newest block of reviews with two successively older blocks.

  Blocks, counted in days before ``time_created``:
    1. 0 to ``day_range1``
    2. ``day_range1`` to ``day_range2``
    3. ``day_range2`` to ``day_range_ultimate``

  Block 1 is topped up from block 2 with ``condition_against``; the result is
  then topped up from block 3 in the same way. Each time, the number of rows
  that may be examined is 10 minus the number of rows collected so far.

  Parameters:
    time_created: Unix timestamp at which the review being simulated was written.
    day_range1, day_range2, day_range_ultimate: block boundaries in days.
    df: DataFrame of candidate reviews.

  Returns:
    The DataFrame produced by the second ``condition_against`` call.
  """
  # Block 1 and the timestamp at which it begins.
  block1 = get_x_days_ago_comments(day_range1, df, time_created)
  block1_start = get_x_days_ago(time_created, day_range1)

  # Block 2 ends where block 1 begins.
  block2 = get_x_days_ago_comments(day_range2 - day_range1, df, block1_start)
  block2_start = get_x_days_ago(time_created, day_range2)

  collected = condition_against(
      sort_and_rearrange(block1),
      sort_and_rearrange(block2),
      int(10 - (len(block1))),
  )

  # Block 3 ends where block 2 begins.
  block3 = get_x_days_ago_comments(day_range_ultimate - day_range2, df, block2_start)
  return condition_against(collected, sort_and_rearrange(block3), int(10 - len(collected)))



def temp_conditional_return_3(time_created,day_range1,day_range2,day_range_3,start_date,df):
  """Combine the newest block of reviews with three successively older blocks.

  Blocks, counted in days before ``time_created``:
    1. 0 to ``day_range1``
    2. ``day_range1`` to ``day_range2``
    3. ``day_range2`` to ``day_range_3``
    4. ``day_range_3`` back to ``start_date`` (everything older)

  Starting from block 1, each older block in turn is offered to
  ``condition_against``, which may add as many of its reviews as the
  collection so far falls short of 10.

  Parameters:
    time_created: Unix timestamp at which the review being simulated was written.
    day_range1, day_range2, day_range_3: block boundaries in days.
    start_date: Unix timestamp of the oldest review that should be reachable.
    df: DataFrame of candidate reviews.

  Returns:
    The DataFrame produced by the last ``condition_against`` call.
  """
  # Block 1: the most recent day_range1 days.
  block1 = get_x_days_ago_comments(day_range1, df, time_created)
  block1_start = get_x_days_ago(time_created, day_range1)

  # Block 2: ends where block 1 begins.
  block2 = get_x_days_ago_comments(day_range2 - day_range1, df, block1_start)
  block2_start = get_x_days_ago(time_created, day_range2)
  temp = condition_against(
      sort_and_rearrange(block1),
      sort_and_rearrange(block2),
      int(10 - (len(block1))),
  )

  # Block 3: ends where block 2 begins.
  block3 = get_x_days_ago_comments(day_range_3 - day_range2, df, block2_start)
  temp = condition_against(temp, sort_and_rearrange(block3), int(10 - len(temp)))

  # Block 4: everything from start_date up to where block 3 begins.
  # The age of the oldest review is rounded UP to whole days (ceiling
  # division). Truncating it would start this block up to one day after
  # start_date and silently drop the earliest review(s).
  block3_start = get_x_days_ago(time_created, day_range_3)
  days_since_start = int(-((start_date - time_created) // 86400))
  block4 = get_x_days_ago_comments(days_since_start - day_range_3, df, block3_start)
  return condition_against(temp, sort_and_rearrange(block4), int(10 - len(temp)))

In [None]:
import math

# this implements the algorithm using all the functions above.
def get_list_of_visible_comments(df):
  """Re-simulate, for every review, the reviews its author would have seen.

  For each row, the candidate reviews are those of the same game that have at
  least one up-vote. Candidates are collected from progressively older time
  blocks (last 30 days, then 30-90, 90-180 and finally everything older)
  until at least 10 are found; the collection is then ranked by score and the
  first 10 (or fewer, if not enough exist) are recorded.

  The table is split by game once up front rather than re-filtered for every
  row, and rows are walked by position, so the function does not rely on the
  frame having a 0..n-1 index.

  Parameters:
    df: DataFrame with the columns ``game_id``, ``timestamp_created``,
      ``votes_up``, ``score`` and ``recommendation_id``.

  Returns:
    A tuple of three lists with one entry per row of ``df``; each entry is a
    list of, respectively, the recommendation ids, the scores and the
    creation timestamps of the visible reviews, highest score first.
  """
  # needs to have >= 1 vote to be displayed in the list of reviews.
  eligible = df[df["votes_up"] != 0]
  reviews_by_game = {game: rows for game, rows in eligible.groupby("game_id", sort=False)}
  no_reviews = eligible.iloc[0:0].copy()

  order_of_visibility,order_of_scores,order_of_times=[],[],[]
  row_values = zip(df["game_id"], df["timestamp_created"])
  for raw_game_id, raw_time_created in tqdm(row_values, total=len(df)):
    # The lookup key is the string form of the id; a game with no eligible
    # reviews (or no matching key) yields an empty frame.
    game_id = str(raw_game_id)
    time_created = int(raw_time_created)
    temp = reviews_by_game.get(game_id, no_reviews)

    temp_dates = get_x_days_ago_comments(30, temp, time_created)
    if len(temp) == 0 or len(temp_dates) >= 10:
      temp = temp_dates
    else:
      # filling the deficit if <10 reviews within 30 days
      temp_90_dates = temp_conditional_return_1(time_created, 30, 90, temp)
      if len(temp_90_dates) >= 10:
        temp = temp_90_dates
      else:
        # filling the deficit if <10 reviews within 30 days + 30 to 90 days
        temp_180_dates = temp_conditional_return_2(time_created, 30, 90, 180, temp)
        if len(temp_180_dates) >= 10:
          temp = temp_180_dates
        else:
          # filling in the deficit if <10 reviews within
          # 30 days + 30 to 90 days + 90 to 180 days
          start_date = int(temp["timestamp_created"].min())
          temp = temp_conditional_return_3(time_created, 30, 90, 180, start_date, temp)

    # Rank whatever was collected and keep at most the 10 best-scored reviews.
    ranked = temp.copy()
    ranked["score"] = pd.to_numeric(ranked["score"])
    top = ranked.sort_values("score", ascending=False).iloc[:10]

    order_of_visibility.append(list(top["recommendation_id"]))
    order_of_scores.append(list(top["score"]))
    order_of_times.append(list(top["timestamp_created"]))
  return(order_of_visibility,order_of_scores,order_of_times)

In [None]:
### testing ### 
#df1=df[0:25000]
#(df1["order_of_visible_comments"]),(df1["order_of_scores"]),(df1["order_of_times"])=get_list_of_visible_comments(df1)

In [None]:
# Run the re-simulation for every review and attach the three per-row result
# lists to the table as new columns.
visible_ids, visible_scores, visible_times = get_list_of_visible_comments(df)
df["order_of_visible_comments"] = visible_ids
df["order_of_scores"] = visible_scores
df["order_of_times"] = visible_times

# Checkpoint the table to disk in case a later cell fails.
df.to_csv('FPS_reviews.csv')


In [None]:
# rearranging the dataframe
# Keep the review text, keyed by recommendation id, in its own table.
reviews = df[["recommendation_id", "review"]].copy()

# Remove a leftover "index" column if there is one. It exists when df was
# reloaded from FPS_reviews.csv; in a single uninterrupted session "index" is
# the frame's index rather than a column, and there is nothing to drop.
df = df.drop(columns=["index"], errors="ignore")

In [None]:
# removing re-simulations with <10 observed default-sorted reviews to ensure consistency in our analysis.

import ast

# Labels of the rows whose simulated list holds fewer than 10 reviews.
index_of_less_than_10=[]
labelled_lists = zip(df.index, df["order_of_visible_comments"])
for label, visible in tqdm(labelled_lists, total=len(df)):
  # The column holds real lists when computed in this session, but their
  # string representation when df was reloaded from CSV; parse only strings.
  if isinstance(visible, str):
    visible = ast.literal_eval(visible)
  if len(visible) < 10:
    index_of_less_than_10.append(label)
df=df.drop(index_of_less_than_10)

 90%|████████▉ | 949906/1056940 [00:25<00:02, 37687.82it/s]

In [None]:
# saving the dataframe.

# Renumber the remaining rows 0..n-1 under an index named "index", then write
# both tables to disk.
df = df.assign(index=list(range(len(df)))).set_index("index")
df.to_csv('reviews_with_visible_comments.csv')
reviews.to_csv('FPS_reviews_list.csv')