In [1]:
import pandas as pd
from tqdm import tqdm
import types 
import ast
import zipfile
def imports():
    """Yield every module bound in this module's global namespace.

    For each global whose value is a module, yields ``(name, version)`` when
    the module exposes ``__version__`` and the bare ``name`` string otherwise.
    """
    missing = object()  # sentinel: distinguishes "no __version__" from a falsy version
    # Iterate over a snapshot: the generator is lazy, so a consumer that binds
    # a new global between two items would otherwise hit
    # "dictionary changed size during iteration".
    for val in list(globals().values()):
        if not isinstance(val, types.ModuleType):
            continue
        # Only the attribute lookup is guarded. Wrapping the ``yield`` itself in
        # a bare ``except`` would swallow the GeneratorExit raised by
        # ``generator.close()`` and then yield again, which Python reports as
        # "RuntimeError: generator ignored GeneratorExit".
        try:
            version = val.__version__
        except Exception:  # best effort: some modules compute attributes lazily
            version = missing
        if version is missing:
            yield val.__name__
        else:
            yield val.__name__, version

# Materialise the generator so the cell displays every imported module (with its version when it exposes one).
list(imports())

['builtins', 'builtins', ('pandas', '1.4.2'), 'types', 'ast', 'zipfile']

In [2]:
# Folder (relative to the notebook) that holds the gzipped CSV loaded below.
data_location = "../data/interim_data/04_text_mining/postprocessed/"
# low_memory=False parses each column in one pass rather than in chunks.
df = pd.read_csv(
    data_location + "data_with_similarities.csv.gz",
    compression="gzip",
    low_memory=False,
)

In [3]:
# Add flag columns describing how each review row relates to package pages vs. stand-alone game pages:
#   appears_both_in_game_and_package       - the recommendation_id occurs on more than one row
#   package_and_game_view_is_identical     - another row has the same recommendation_id AND the same three similarity lists
#   is_a_package                           - game_id holds several comma-separated ids
#   package_review_but_displayed_different - package row with no identical twin row
# NOTE(review): `duplicated` has to hash the similarity columns, so this must run while they are
# still the raw strings read from the CSV (i.e. before they are parsed into Python lists) - confirm.
df['appears_both_in_game_and_package'] = df.duplicated(subset=["recommendation_id"], keep=False)
df['package_and_game_view_is_identical'] = df.duplicated(subset=["recommendation_id","sidebar_list_of_similarities","main_bar_list_of_similarities","control_list_of_similarities"], keep=False)
# Cast to str first: a `',' in x` test raises TypeError for any non-string game_id
# (e.g. a column inferred as integers, or a missing value).
df['is_a_package'] = df['game_id'].astype(str).str.contains(',', regex=False)
df["package_review_but_displayed_different"] = (~df["package_and_game_view_is_identical"]) & (df["is_a_package"])

In [4]:
# Rows whose recommendation_id occurs on more than one row (True) vs. exactly once (False).
df["appears_both_in_game_and_package"].value_counts()

False    2897361
True      367582
Name: appears_both_in_game_and_package, dtype: int64

In [5]:
# Rows that share recommendation_id and all three similarity lists with another row (True) vs. the rest (False).
df["package_and_game_view_is_identical"].value_counts()

False    3053501
True      211442
Name: package_and_game_view_is_identical, dtype: int64

In [6]:
# Package rows that have no identical twin row (True) vs. all other rows (False).
df["package_review_but_displayed_different"].value_counts()

False    3185941
True       79002
Name: package_review_but_displayed_different, dtype: int64

In [7]:
# Rows whose game_id contains a comma, i.e. lists several ids (True), vs. a single id (False).
df["is_a_package"].value_counts()

False    3080220
True      184723
Name: is_a_package, dtype: int64

In [8]:
# Number of package rows whose similarity lists are identical to another row of the same review.
identical_package_rows = df["is_a_package"] & df["package_and_game_view_is_identical"]
int(identical_package_rows.sum())

105721

In [9]:
# Each (recommendation_id, game_id) pair should occur exactly once; True means no pair is repeated.
not df.duplicated(subset=["recommendation_id", "game_id"]).any()

True

In [10]:
# Make sure every column has the dtype the later cells rely on (safety measure).
SIMILARITY_LIST_COLUMNS = [
    "sidebar_list_of_similarities",
    "main_bar_list_of_similarities",
    "control_list_of_similarities",
]


def _to_list_of_str(cell):
    """Return `cell` as a list of strings, parsing it first when it is still text."""
    values = ast.literal_eval(cell) if isinstance(cell, str) else cell
    return [str(value) for value in values]


# Decide per cell whether parsing is needed. Inspecting only the first main-bar
# cell (looked up by label 0) would mis-handle the other columns whenever their
# state differs, and fails on an empty frame or a non-default index.
for column in SIMILARITY_LIST_COLUMNS:
    df[column] = [_to_list_of_str(cell) for cell in tqdm(df[column], desc=column)]

df['recommendation_id'] = df['recommendation_id'].astype(str)
df['game_id'] = df['game_id'].astype(str)
df['timestamp_created'] = df['timestamp_created'].astype(int)
df["votes_up"]= df['votes_up'].astype(int)
# NOTE(review): astype(bool) treats every non-empty string as True; this assumes
# voted_up was already read as booleans - confirm against the CSV.
df["voted_up"]= df['voted_up'].astype(bool)

100%|██████████████████████████████| 3264943/3264943 [01:32<00:00, 35165.53it/s]
100%|██████████████████████████████| 3264943/3264943 [01:37<00:00, 33493.28it/s]
100%|██████████████████████████████| 3264943/3264943 [01:54<00:00, 28478.72it/s]


In [11]:
def checking_length_of_comparison(df, columns_to_check=("main_bar_list_of_similarities",
                                                        "sidebar_list_of_similarities",
                                                        "control_list_of_similarities"),
                                  expected_length=10):
    '''
    Verify that every cell of the given columns holds a list with the expected
    number of similarities, and print the outcome.

    Args:
    df (pandas dataframe): dataframe of our reviews with the relevant columns where each cell contains
    a list of similarities of comparison.
    columns_to_check (sequence of str): the columns whose cells must each contain a list of
    `expected_length` similarities. Defaults to the main bar, sidebar and control columns.
    expected_length (int): required length of every list (default 10).

    Returns:
    None. Prints one warning per offending cell (positional row number and column name),
    or a single confirmation line when every cell passes.
    '''

    # (row position, column) pairs for every cell that fails the check
    mismatched_items = []

    for col in tqdm(columns_to_check):
        # enumerate yields the positional row number, not the dataframe's index label
        for index, item in enumerate(df[col]):
            if not isinstance(item, list) or len(item) != expected_length:
                mismatched_items.append((index, col))

    # Displaying the results
    if mismatched_items:
        for index, col in mismatched_items:
            print(f"Warning! Row {index} in column '{col}' does not have a list of length {expected_length}. Please double check!")
    else:
        print(f"All items in the specified columns are lists with a length of {expected_length}.")

        
# Run the length check on the three similarity columns before melting them into long format.
checking_length_of_comparison(df,columns_to_check = ["main_bar_list_of_similarities", 
                                                     "sidebar_list_of_similarities", 
                                                     "control_list_of_similarities"])

100%|█████████████████████████████████████████████| 3/3 [00:03<00:00,  1.20s/it]

All items in the specified columns are lists with a length of 10.





In [12]:
def obtain_extended_dataframe_of_similarities(df, column_name, similarity_type):
    '''
    Melt one similarity-list column into long format: one output row per
    similarity, with the review-level fields repeated alongside it.

    args:
    df (pandas dataframe): dataframe holding the review-level columns plus `column_name`,
    whose cells each contain a list of similarities
    column_name (str): the column whose lists are flattened
    similarity_type (str): label written to every output row (e.g. "main bar", "sidebar", "control")

    returns:
    pandas dataframe with a fresh 0..n-1 index. `order` is the 1-based position of the
    similarity inside its original list. The source column `timestamp_created` is
    emitted as `time_stampcreated` (name kept as-is for downstream consumers).
    '''
    similarity_lists = list(df[column_name])
    # Take each row's repeat count from its own list instead of assuming 10, so the
    # review-level fields stay aligned with the similarities for lists of any length.
    lengths = [len(similarity_list) for similarity_list in similarity_lists]

    def repeat_per_similarity(source_column):
        # Repeat each row's value once for every similarity that row contributes.
        return [value
                for value, count in zip(df[source_column], lengths)
                for _ in range(count)]

    similarities = [item for sublist in tqdm(similarity_lists) for item in sublist]
    order = [position for count in lengths for position in range(1, count + 1)]

    data = pd.DataFrame({
        "recommendation_id": repeat_per_similarity("recommendation_id"),
        "order": order,
        "similarity_score": similarities,
        # separate local expression: the `similarity_type` parameter is not overwritten
        "similarity_type": [similarity_type] * len(similarities),
        "game_id": repeat_per_similarity("game_id"),
        "score": repeat_per_similarity("score"),
        "votes_up": repeat_per_similarity("votes_up"),
        "voted_up": repeat_per_similarity("voted_up"),
        "time_stampcreated": repeat_per_similarity("timestamp_created"),
        "is_a_package": repeat_per_similarity("is_a_package"),
        "appears_both_in_game_and_package": repeat_per_similarity("appears_both_in_game_and_package"),
        "package_and_game_view_is_identical": repeat_per_similarity("package_and_game_view_is_identical"),
        "package_review_but_displayed_different": repeat_per_similarity("package_review_but_displayed_different"),
    })
    return data

# Melt each similarity column into long format, then stack the three frames under a fresh 0..n-1 index.
df_mainbar = obtain_extended_dataframe_of_similarities(df, "main_bar_list_of_similarities", "main bar")
df_sidebar = obtain_extended_dataframe_of_similarities(df, "sidebar_list_of_similarities", "sidebar")
df_control = obtain_extended_dataframe_of_similarities(df, "control_list_of_similarities", "control")
data = pd.concat([df_mainbar, df_sidebar, df_control], ignore_index=True)

100%|████████████████████████████| 3264943/3264943 [00:03<00:00, 1058269.30it/s]
100%|█████████████████████████████| 3264943/3264943 [00:05<00:00, 595277.52it/s]
100%|█████████████████████████████| 3264943/3264943 [00:05<00:00, 604236.50it/s]


In [13]:
# Flag every fully identical row (all occurrences kept) and display them; an empty frame means no duplicates.
duplicates = data.duplicated(keep=False)
data.loc[duplicates]

Unnamed: 0,recommendation_id,order,similarity_score,similarity_type,game_id,score,votes_up,voted_up,time_stampcreated,is_a_package,appears_both_in_game_and_package,package_and_game_view_is_identical,package_review_but_displayed_different


In [14]:
# The list items were cast to str before melting, so convert the scores to floats for numeric analysis.
data["similarity_score"]=data["similarity_score"].astype(float)

In [15]:
# Make sure votes_up is integer-typed in the long table as well.
data["votes_up"]=data["votes_up"].astype(int)

In [16]:
# Display the combined long-format table (pandas renders a truncated preview of the first and last rows).
data

Unnamed: 0,recommendation_id,order,similarity_score,similarity_type,game_id,score,votes_up,voted_up,time_stampcreated,is_a_package,appears_both_in_game_and_package,package_and_game_view_is_identical,package_review_but_displayed_different
0,137653791,1,0.839532,main bar,1294810,0.964516,8138,False,1682992588,False,False,False,False
1,137653791,2,0.903023,main bar,1294810,0.964516,8138,False,1682992588,False,False,False,False
2,137653791,3,0.639622,main bar,1294810,0.964516,8138,False,1682992588,False,False,False,False
3,137653791,4,0.806373,main bar,1294810,0.964516,8138,False,1682992588,False,False,False,False
4,137653791,5,0.707455,main bar,1294810,0.964516,8138,False,1682992588,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97948285,10425639,6,0.558849,control,256190,0.336881,7,False,1402477870,False,False,False,False
97948286,10425639,7,0.455615,control,256190,0.336881,7,False,1402477870,False,False,False,False
97948287,10425639,8,0.451328,control,256190,0.336881,7,False,1402477870,False,False,False,False
97948288,10425639,9,0.472071,control,256190,0.336881,7,False,1402477870,False,False,False,False


In [17]:
# Persist the long-format table as a gzip-compressed CSV, leaving out the index column.
data.to_csv("../data/processed_data/data.csv.gz",index=False, compression='gzip')

In [18]:
# Spot-check one review: show every row of the wide table carrying recommendation_id 156916661.
df[df["recommendation_id"]=="156916661"]

Unnamed: 0,recommendation_id,game_id,score,review,timestamp_created,timestamp_updated,votes_up,voted_up,order_of_main_bar_reviews,start_timestamp,sidebar_order_reviews,control_ids_list,sidebar_reviews_list,main_bar_list_of_similarities,sidebar_list_of_similarities,control_list_of_similarities,appears_both_in_game_and_package,package_and_game_view_is_identical,is_a_package,package_review_but_displayed_different
1981861,156916661,"973580,1174630,1174631,1174632,1213590,1228720...",0.0,"Very well optimized, run perfectly on the Stea...",1706326281,1706326281,0,True,"['102548374', '102029332', '102255458', '10314...",1703734281,"['156887143', '156875483', '156776590', '15663...","['156331148', '156320516', '156258334', '15625...","['156887143', '156875483', '156776590', '15663...","[0.3272615373134613, 0.32819241285324097, 0.62...","[0.7179370522499084, 0.7513765096664429, 0.431...","[0.7301662564277649, 0.7391008138656616, 0.690...",True,False,True,True
2431000,156916661,973580,0.0,"Very well optimized, run perfectly on the Stea...",1706326281,1706326281,0,True,"['129530648', '130031378', '130315895', '12894...",1703734281,"['156887143', '156875483', '156776590', '15663...","['156331148', '156320516', '156258334', '15625...","['156887143', '156875483', '156776590', '15663...","[0.690584123134613, 0.6715161204338074, 0.7217...","[0.7179370522499084, 0.7513765096664429, 0.431...","[0.7301662564277649, 0.7391008138656616, 0.690...",True,False,False,False
