In [1]:
from tqdm import tqdm
import pandas as pd
import ast
import pickle
import types
from joblib import Parallel, delayed
import concurrent.futures
def imports():
    """
    Yield the modules bound in this namespace's globals, with versions where exposed.

    Yields:
    - ``(module_name, version)`` for a module that has a ``__version__`` attribute.
    - ``module_name`` alone for a module without a usable ``__version__``.
    """
    # Iterate over a snapshot so that globals created while the generator is
    # suspended cannot trigger "dictionary changed size during iteration".
    for val in list(globals().values()):
        if not isinstance(val, types.ModuleType):
            continue
        try:
            version = val.__version__
        except Exception:
            # Best effort: any failure to read the version falls back to the name only.
            # `Exception` (not a bare except) keeps GeneratorExit/KeyboardInterrupt
            # out of this handler, and no `yield` sits inside the `try`, so the
            # generator can be closed cleanly at any point.
            yield val.__name__
            continue
        yield val.__name__, version

# Display the modules imported above (with their versions where the module exposes one)
list(imports())

# # reading the appropriate files


['builtins',
 'builtins',
 ('pandas', '2.1.1'),
 'ast',
 'pickle',
 'types',
 'concurrent']

In [2]:
# Folder holding the interim outputs of the data-wrangling step
data_location = '../data/interim_data/03_data_wrangling/'

# Reviews together with their main-bar / sidebar review orderings
df = pd.read_csv(
    f"{data_location}reviews_with_main_and_sidebar.csv.gz",
    low_memory=False,
    compression='gzip',
)

# Filtered raw reviews, used later as the pool of candidate control reviews
reviews_list = pd.read_csv(
    "../data/raw_data/data_filtered.csv.gz",
    low_memory=False,
    compression="gzip",
)

In [3]:
def _as_str_list(value):
    """Return `value` as a list of strings, parsing it first if it is a string literal of a list."""
    # Values that are still text (as expected right after reading the CSV) are parsed
    # with ast.literal_eval; values that are already lists (e.g. when this cell is
    # re-run) are only normalised so that every element is a string.
    items = ast.literal_eval(value) if isinstance(value, str) else value
    return [str(item) for item in items]


# Decide per value instead of inspecting only the first element of the first column,
# so each column is handled correctly regardless of the state of the other one.
for list_col in ("order_of_main_bar_reviews", "sidebar_order_reviews"):
    df[list_col] = [_as_str_list(value) for value in tqdm(df[list_col])]

# Give the key columns the same dtypes in both frames so IDs compare equal across them.
for frame in (df, reviews_list):
    frame['recommendation_id'] = frame['recommendation_id'].astype(str)
    frame['game_id'] = frame['game_id'].astype(str)
    frame['timestamp_created'] = frame['timestamp_created'].astype(int)

100%|█████████████████████████████████████████████████████████████████████| 3388574/3388574 [01:08<00:00, 49422.97it/s]
100%|█████████████████████████████████████████████████████████████████████| 3388574/3388574 [01:07<00:00, 50453.39it/s]


# Extracting control IDs

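How it works: for each review, we take the IDs of reviews of the same game that were not visible in either the recently-sorted (unobservable) reviews or the default-ordered (observable) reviews, and add them to that review's list of control IDs for comparison. Only reviews created in the 30 days up to the review's own timestamp are eligible, and the 10 most recent are kept.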

In [4]:
# # # old code
# def extracting_control_reviews(df,reviews):
#     '''
#     Extracts control review IDs. 
    
#     Args:
#     - df: dataframe with the columns sidebar_reviews_list, order_of_main_bar_reviews, 
#     recommendation_id, timestamp_created and game_id of reviews in which the main bar
#     and sidebar reviews have been sorted. These only includes games that did not meet
#     the 10 threshold in sidebar and mainbar reviews, which have already been previously
#     removed. 
#     - reviews: a dataframe with ALL reviews, their recommendation_id, game_id, timestamp_created
    
#     Returns:
#     - A dataframe with "control_ids" inserted into df, where each row in this col contains a 
#     list of the corresponding control review IDs. 
    
#     '''
#     filtered_recommendation_ids = []

#     for i in tqdm(range(len(df))):
#         current_recommendation_id=str(df["recommendation_id"][i])
#         time_id=int(df["timestamp_created"][i])
#         game_id=str(df["game_id"][i])
#         main_bar_reviews_list=list(df["order_of_main_bar_reviews"])
#         side_bar_reviews_list=list(df["sidebar_order_reviews"])

#         # Get reviews for the selected game ID
#         temp = reviews_list[reviews_list["game_id"] == game_id].reset_index(drop=True)
#         # Remove rows where the recommendation IDs are present in the main or side bar or the current review itself.
#         temp = temp[~temp["recommendation_id"].isin(main_bar_reviews_list[i] + side_bar_reviews_list[i]+[current_recommendation_id])].reset_index(drop=True)
#         temp.sort_values(by=["timestamp_created","recommendation_id"], ascending=False, inplace=True)

#         # Filter rows based on the time range (last 30 days)
#         temp = temp[(temp["timestamp_created"] <= time_id) & (temp["timestamp_created"] > time_id - (86400 * 30))].reset_index(drop=True)

#         # filter to include only 10 
#         control_ids=(temp["recommendation_id"].tolist())[:10]

#         # Append the filtered recommendation IDs to the list
#         filtered_recommendation_ids.append(control_ids)
#     df["control_ids_list"]=filtered_recommendation_ids
#     return(df)


# df2=extracting_control_reviews(df[:2000],reviews_list)

In [5]:
def extracting_control_reviews(df, reviews, window_days=30, n_controls=10):
    """
    Attach a list of control review IDs to every review in `df`.

    For each row, the controls are reviews from `reviews` that
    - belong to the same game,
    - are not the row's own review and do not appear in the row's
      `order_of_main_bar_reviews` or `sidebar_order_reviews` lists,
    - were created no later than the row's `timestamp_created` and less than
      `window_days` days before it.
    Candidates are ordered by `timestamp_created`, then `recommendation_id`,
    both descending, and the first `n_controls` are kept.

    Args:
    - df: dataframe with the columns recommendation_id, game_id, timestamp_created,
      order_of_main_bar_reviews and sidebar_order_reviews (the last two holding
      lists of review IDs).
    - reviews: dataframe of candidate reviews with the columns recommendation_id,
      game_id and timestamp_created.
    - window_days: length of the look-back window in days (default 30).
    - n_controls: maximum number of control IDs kept per review (default 10).

    Returns:
    - A copy of `df` with the extra column "control_ids_list", where each row holds
      the list of control review IDs. The input frame is left untouched.
    """
    # Work on a copy: `df` is usually a slice or group of a larger frame, and writing
    # a column into it would mutate the caller's data / raise SettingWithCopyWarning.
    result = df.copy()
    if result.empty:
        # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
        # assigned as a single column; add an empty column explicitly instead.
        result['control_ids_list'] = pd.Series([], index=result.index, dtype=object)
        return result

    window_seconds = 86400 * window_days

    # Sort each game's candidates once, newest first, so that head() below picks
    # the most recent eligible reviews.
    reviews_filtered_by_game = {
        game_id: reviews[reviews['game_id'] == game_id].sort_values(
            by=["timestamp_created", "recommendation_id"], ascending=False
        )
        for game_id in result['game_id'].unique()
    }

    def filter_control_ids(row):
        current_recommendation_id = str(row['recommendation_id'])
        time_id = int(row['timestamp_created'])
        # Look up with the raw value: the dict keys come straight from the column,
        # so converting to str here would miss whenever the IDs are not strings.
        candidates = reviews_filtered_by_game[row['game_id']]

        excluded_ids = set(
            row['order_of_main_bar_reviews']
            + row['sidebar_order_reviews']
            + [current_recommendation_id]
        )
        not_shown = ~candidates['recommendation_id'].isin(excluded_ids)
        in_window = (
            (candidates['timestamp_created'] <= time_id)
            & (candidates['timestamp_created'] > time_id - window_seconds)
        )
        return candidates.loc[not_shown & in_window, 'recommendation_id'].head(n_controls).tolist()

    # Registers DataFrame.progress_apply (apply with a progress bar).
    tqdm.pandas()
    result['control_ids_list'] = result.progress_apply(filter_control_ids, axis=1)

    return result


# # df1=extracting_control_reviews(df[:2000],reviews_list)
# df1.equals(df2)

In [6]:
# Setup for parallel processing: one task per game, each task receiving only that
# game's rows from both frames.
grouped_df = df.groupby('game_id')
group_review_list = reviews_list.groupby('game_id')
# Only games present in both frames are processed; others are dropped from df1.
common_game_ids = set(grouped_df.groups.keys()) & set(group_review_list.groups.keys())
# Dispatch in sorted order: iteration order of a set of strings changes between
# interpreter runs, which would make the row order of df1 (and of the saved CSV)
# differ from run to run. Parallel returns results in the order tasks were given.
df_chunks = Parallel(n_jobs=20)(
    delayed(extracting_control_reviews)(grouped_df.get_group(game_id), group_review_list.get_group(game_id))
    for game_id in tqdm(sorted(common_game_ids))
)
# ignore_index=True already produces a fresh 0..n-1 index.
df1 = pd.concat(df_chunks, ignore_index=True)

100%|████████████████████████████████████████████████████████████████████████████████| 868/868 [05:21<00:00,  2.70it/s]


In [7]:
# Drop reviews whose 'control_ids_list' holds fewer than 10 IDs,
# printing the number of reviews and games before and after.
game_id_length = len(set(df1["game_id"]))
print(f"Before removing reviews with <10 control reviews, we have {len(df1)} reviews across {game_id_length} games")

has_enough_controls = df1['control_ids_list'].map(len) >= 10
df1 = df1[has_enough_controls].reset_index(drop=True)

game_id_length = len(set(df1["game_id"]))
print(f"After removing reviews with <10 control reviews, we have {len(df1)} reviews across {game_id_length} games")

Before removing reviews with <10 control reviews, we have 3388574 reviews across 868 games
After removing reviews with <10 control reviews, we have 3264943 reviews across 717 games


In [8]:
# Write the reviews with their control IDs to the interim data folder as a gzipped CSV
csv_file_path = f'{data_location}reviews_with_main_sidebar_and_control.csv.gz'
df1.to_csv(path_or_buf=csv_file_path, compression='gzip', index=False)