In [1]:
import pandas as pd
import ast
import pickle
import types
import concurrent.futures
import numpy as np
from collections import defaultdict
import concurrent.futures
from joblib import Parallel, delayed
from tqdm.auto import tqdm

def imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            try:
                yield val.__name__, val.__version__
                pass
            except:
                yield val.__name__
                pass
            pass
        pass
    pass

list(imports())

# Read the files
data_location = '../data/'
main_bar_reviews=np.load(f"{data_location}temp_reconstruction.npy")

In [2]:
reviews_list=pd.read_csv("../data/raw_data/data_filtered.csv.gz", compression="gzip",low_memory=False)

# Convert columns to their supposed data types (for safe measure)
reviews_list['recommendation_id'] = reviews_list['recommendation_id'].astype(str)
reviews_list['game_id'] = reviews_list['game_id'].astype(str)
reviews_list['review'] = reviews_list['review'].astype(str)
reviews_list['score'] = reviews_list['score'].astype(float)
reviews_list['timestamp_created'] = reviews_list['timestamp_created'].astype(int)
reviews_list['timestamp_updated'] = reviews_list['timestamp_updated'].astype(int)
reviews_list['votes_up'] = reviews_list['votes_up'].astype(int)
reviews_list['voted_up'] = reviews_list['voted_up'].astype(int)
reviews_list

Unnamed: 0,recommendation_id,game_id,score,review,timestamp_created,timestamp_updated,votes_up,voted_up
0,142438936,1000410,0.892404,I don't typically put out reviews on a product...,1689814681,1689919141,194,0
1,112290161,1000410,0.870081,"Had great potential, but the game lost all mom...",1647438823,1647438823,92,0
2,97363397,1000410,0.822628,"Hoping the Devs's are going to finish this , i...",1628696941,1628696941,97,1
3,135764158,1000410,0.810227,This game has been in EA for going on two and ...,1680210233,1680210233,68,0
4,121037305,1000410,0.804653,"Game is in early access Limbo, with the devs d...",1661149590,1661149590,85,0
...,...,...,...,...,...,...,...,...
3621922,19379599,377160435870435880435881480630480631490650,0.487133,It has been confirmed that fallout 4 is in the...,1448848523,1451180272,2,1
3621923,19353976,377160435870435880435881480630480631490650,0.487133,best game of the year. Not even super buggy li...,1448772572,1448772572,2,1
3621924,19347200,377160435870435880435881480630480631490650,0.487133,Greatest game,1448748567,1448748567,2,1
3621925,19345490,377160435870435880435881480630480631490650,0.487133,"In a nutshell, it's Skyrim in a Fallout format.",1448744065,1448744065,2,1


In [3]:
# placing main_bar reviews in the dataframe
unique_dict = {}

for subarray in tqdm(main_bar_reviews):
    for row in subarray:
        key = int(row[-1])
        if key not in unique_dict:
            unique_dict[key] = []
        try:
            unique_dict[key].append(str(int(row[0])))
        except:
            unique_dict[key].append(np.nan)
            
reviews_list["order_of_main_bar_reviews"]=(list(unique_dict.values()))

  0%|          | 0/3621927 [00:00<?, ?it/s]

In [4]:
# removing games with <10 main bar reviews
df = (reviews_list[~reviews_list['order_of_main_bar_reviews'].apply(lambda x: np.nan in x)]).reset_index(drop=True)
game_length=len(set(list(df["game_id"])))
print(f"After removing games in which there were <10 main bar reviews, we get {len(df)} across {game_length} games.")

After removing games in which there were <10 main bar reviews, we get 3584746 across 1091 games.


In [5]:
def get_temp_df(df, reviews_list):
    '''
    Generates a dataframe with of column dedicated to sidebar reviews.
    Args:
    df: DataFrame containing the main data.
    reviews_list: DataFrame containing the reviews data.
    
    Returns:
    dataframe with of column dedicated to sidebar reviews.
    '''
    df=df.reset_index(drop=True)
    reviews_list=reviews_list.reset_index(drop=True)
    # Compute start_timestamp for all rows in df
    df['start_timestamp'] = df['timestamp_created'] - (30 * 86400)
    
    # Ensure game_id is of the same type in both DataFrames to avoid type mismatch issues
    df['game_id'] = df['game_id'].astype(str)
    reviews_list['game_id'] = reviews_list['game_id'].astype(str)
    
    # Initialize the ordered array
    ordered_array = np.full((len(df), 10), np.nan, dtype=float)
    unique_game_ids = df['game_id'].unique()
    filtered_reviews_by_game = {
        game_id: reviews_list[(reviews_list['game_id'] == game_id) &
                              (reviews_list['timestamp_created'] > df[df['game_id'] == game_id]['start_timestamp'].min()) &
                              (reviews_list['timestamp_created'] <= df[df['game_id'] == game_id]['timestamp_created'].max())]
        for game_id in tqdm(unique_game_ids)
    }

    for i, row in tqdm(df.iterrows()):
        game_id = row['game_id']
        if game_id in filtered_reviews_by_game:
            relevant_reviews = filtered_reviews_by_game[game_id]
            start_timestamp, time_created = row['start_timestamp'], row['timestamp_created']
            exclude_ids = set(row['order_of_main_bar_reviews'] + [row['recommendation_id']])
            top_reviews = relevant_reviews[(relevant_reviews['timestamp_created'] > start_timestamp) &
                                           (relevant_reviews['timestamp_created'] <= time_created) &
                                           (~relevant_reviews['recommendation_id'].isin(exclude_ids))]
            top_reviews = top_reviews.sort_values('timestamp_created', ascending=False).head(10)['recommendation_id'].values
            
            ordered_array[i, :len(top_reviews)] = top_reviews
    df["sidebar_order_reviews"]=list(ordered_array)
    return df


# Converting data fields to lists
if type(df["order_of_main_bar_reviews"][0])!=list:
    df["order_of_main_bar_reviews"] = [ast.literal_eval(item) for item in df["order_of_main_bar_reviews"]]
    df["order_of_times"] = [ast.literal_eval(item) for item in df["order_of_times"]]
else:
    pass


In [6]:
# running and testing old code here. 

# f1=get_temp_df(df[:1000], reviews_list)
# f1=f1["sidebar_order_reviews"]
# def get_temp_df(df, reviews_list):
#     """
#     Generates a list of unobserved reviews for the past 30 days.

#     Args:
#     df: DataFrame containing the main data.
#     reviews_list: DataFrame containing the reviews data.

#     Returns:
#     A list of unobserved reviews.
#     """
#     side_bar_reviews_list = []
#     for i in tqdm(range(len(df))):
#         current_id=list(df["recommendation_id"])[i]
#         end_int = int(df.timestamp_created[i])
#         start = end_int - (30 * 86400)
#         game_id = str(df.game_id[i])

#         temp_dates = reviews_list[reviews_list['game_id'] == game_id]
#         temp_dates = temp_dates[temp_dates['timestamp_created'].between(start, end_int - 1)]
#         temp_dates = temp_dates.sort_values("timestamp_created", ascending=False).reset_index(drop=True)
#         main_bar_list = df.order_of_main_bar_reviews[i]
#         temp_dates_list = list(temp_dates.recommendation_id)
#         exclude_ids = main_bar_list + [current_id]
#         sidebar_temp_dates_list = temp_dates[~temp_dates['recommendation_id'].isin(exclude_ids)]['recommendation_id'].tolist()
#         if len(temp_dates) >= 10:
            
#             side_bar_reviews_list.append(sidebar_temp_dates_list[:10])
#         else:
#             side_bar_reviews_list.append([])
        
#         # sidebar_temp_dates_list = [x for x in temp_dates_list if x not in main_bar_list+[current_id]]
#         # side_bar_reviews_list.append(sidebar_temp_dates_list[:10])
#     return side_bar_reviews_list

# test_2= get_temp_df(df[:1000], reviews_list)

# # Function to process the array
# def process_array(arr):
#     result = []
#     for subarr in tqdm(arr):
# #         print(subarr)
# #         break
#         if np.isnan(subarr).any():
#             result.append([])
#         else:
#             # Convert each number to string, removing ".0" for whole numbers
#             formatted_subarr = [f"{x:.0f}" if x.is_integer() else str(x) for x in subarr]
#             result.append(formatted_subarr)
#     return result

# # Process the given array
# processed_list = process_array(f1)

# test_2==processed_list

In [7]:

# Setup for parallel processing
grouped_df = df.groupby('game_id')
group_review_list = reviews_list.groupby('game_id')
common_game_ids = set(grouped_df.groups.keys()) & set(group_review_list.groups.keys())
df_chunks = Parallel(n_jobs=20)(
    delayed(get_temp_df)(grouped_df.get_group(game_id), group_review_list.get_group(game_id))
    for game_id in tqdm(common_game_ids)
)


  0%|          | 0/1091 [00:00<?, ?it/s]

In [8]:
df = pd.concat(df_chunks, ignore_index=True).reset_index(drop=True)
df = (df[~df["sidebar_order_reviews"].apply(lambda x: any(pd.isna(y) for y in x))]).reset_index(drop=True)
game_length=len(set(list(df["game_id"])))
print(f"After removing games in which there were <10 side-bar reviews, we get {len(df)} across {game_length} games.")

After removing games in which there were <10 side-bar reviews, we get 3388574 across 868 games.


In [9]:
df["sidebar_order_reviews"] = (df["sidebar_order_reviews"].apply(
    lambda x: [str(int(y)) if y.is_integer() else str(y) for y in x if not pd.isna(y)]
))

In [10]:
df

Unnamed: 0,recommendation_id,game_id,score,review,timestamp_created,timestamp_updated,votes_up,voted_up,order_of_main_bar_reviews,start_timestamp,sidebar_order_reviews
0,32490515,595140,0.912175,Immortal Redneck is a curious beast. Behind a ...,1497661926,1497662640,173,1,"[82674370, 83630452, 82484211, 83375228, 82915...",1495069926,"[32393610, 32351415, 32309405, 32309198, 32287..."
1,31441583,595140,0.816505,"[b]Oh yeah, [i]Immortal Redneck[/i] is great f...",1493477060,1493477132,44,1,"[82674370, 83630452, 82484211, 83375228, 82915...",1490885060,"[31439995, 31436970, 31434654, 31432738, 31432..."
2,35045169,595140,0.786501,If you look at the market for Rogue-like/lite ...,1505741057,1505741057,44,1,"[82674370, 83630452, 82484211, 83375228, 82915...",1503149057,"[34959967, 34915892, 34897627, 34880062, 34852..."
3,31617998,595140,0.741279,Serious Sam and Rogue Legacy have a multiclass...,1494214853,1494214853,24,1,"[82674370, 83630452, 82484211, 83375228, 82915...",1491622853,"[31613746, 31608074, 31598725, 31592037, 31590..."
4,39992542,595140,0.733965,[h1][b] Overview[/b] [/h1]\nImmortal Redneck i...,1518320126,1518321680,32,1,"[82674370, 83630452, 82484211, 83375228, 82915...",1515728126,"[39802970, 39796938, 39761019, 39737678, 39638..."
...,...,...,...,...,...,...,...,...,...,...,...
3388569,50220968,941850,0.501901,Wow. Ignore the negative reviews!\n\nThis game...,1555703996,1555703996,3,1,"[17668201, 18274193, 18750489, 18420751, 18577...",1553111996,"[50202420, 50170991, 50144350, 50114036, 50097..."
3388570,49996188,941850,0.499791,This is WW2 close quarter combat at its best! ...,1554577264,1554577264,3,1,"[18274193, 18750489, 18267163, 18420751, 18577...",1551985264,"[49994898, 49993324, 49989898, 49986941, 49985..."
3388571,50028900,941850,0.498388,"Somebody in the forums asked, ""where's the fre...",1554723309,1554723309,3,1,"[18274193, 18750489, 18267163, 18259036, 18420...",1552131309,"[50022662, 50018723, 50018407, 50015143, 50009..."
3388572,49997552,941850,0.494297,i only got to play in a server with 9 or so o...,1554581256,1554581256,0,1,"[18274193, 18267163, 18259036, 18420751, 18577...",1551989256,"[49996188, 49994898, 49993324, 49989898, 49986..."


In [11]:
csv_file_path = f'{data_location}/interim_data/03_data_wrangling/reviews_with_main_and_sidebar.csv.gz'  
df.to_csv(csv_file_path, index=False, compression='gzip')