In [1]:
import pandas as pd
from tqdm import tqdm
import types 
import ast
import zipfile
def imports():
    '''
    Yield every module bound in this namespace, with its version when available.

    Yields:
    (name, version) tuple for a module exposing ``__version__``; otherwise only
    the module's ``__name__`` string.
    '''
    # Iterate over a snapshot so that names bound while the generator is suspended
    # cannot trigger "dictionary changed size during iteration".
    for val in list(globals().values()):
        if not isinstance(val, types.ModuleType):
            continue
        # The version lookup is resolved *before* yielding. Previously the yield sat
        # inside a bare `except:`, which also caught the GeneratorExit raised when
        # the generator was closed early and then yielded again, producing
        # "RuntimeError: generator ignored GeneratorExit".
        try:
            entry = (val.__name__, val.__version__)
        except Exception:
            # Best effort: any failure while reading the version falls back to the name.
            entry = val.__name__
        yield entry

# Record which modules (and versions, where exposed) are loaded in this session.
list(imports())

['builtins', 'builtins', ('pandas', '2.1.1'), 'types', 'ast', 'zipfile']

In [2]:
# Folder holding the post-processed text-mining output.
data_location = "../data/interim_data/04_text_mining/postprocessed/"

# Read the gzip-compressed CSV in a single pass (low_memory=False).
df = pd.read_csv(
    data_location + "data_with_similarities.csv.gz",
    compression="gzip",
    low_memory=False,
)

In [3]:
# Inspect the column names, dtypes and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3264943 entries, 0 to 3264942
Data columns (total 16 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   recommendation_id              int64  
 1   game_id                        object 
 2   score                          float64
 3   review                         object 
 4   timestamp_created              int64  
 5   timestamp_updated              int64  
 6   votes_up                       int64  
 7   voted_up                       int64  
 8   order_of_main_bar_reviews      object 
 9   start_timestamp                int64  
 10  sidebar_order_reviews          object 
 11  control_ids_list               object 
 12  sidebar_reviews_list           object 
 13  main_bar_list_of_similarities  object 
 14  sidebar_list_of_similarities   object 
 15  control_list_of_similarities   object 
dtypes: float64(1), int64(6), object(9)
memory usage: 398.6+ MB


In [4]:
# Making sure the data is in its proper data type (for safe-measure)
# Each similarity column is normalised on its own, cell by cell: a cell that is
# already a list is used as-is, anything else (e.g. the string form read from the
# CSV) is parsed with ast.literal_eval. Every element ends up as a str.
# Checking per cell avoids deciding the branch for all three columns from the
# first main-bar cell alone, and avoids the label lookup `[0]`, which fails on an
# empty frame or an index without the label 0.
for similarity_column in ["sidebar_list_of_similarities",
                          "main_bar_list_of_similarities",
                          "control_list_of_similarities"]:
    df[similarity_column] = [
        [str(item) for item in (lst if isinstance(lst, list) else ast.literal_eval(lst))]
        for lst in tqdm(df[similarity_column])
    ]

# Identifier columns are kept as strings; counts and flags get explicit dtypes.
df['recommendation_id'] = df['recommendation_id'].astype(str)
df['game_id'] = df['game_id'].astype(str)
df['timestamp_created'] = df['timestamp_created'].astype(int)
df["votes_up"]= df['votes_up'].astype(int)
df["voted_up"]= df['voted_up'].astype(bool)

100%|█████████████████████████████████████████████████████████████████████| 3264943/3264943 [01:47<00:00, 30367.87it/s]
100%|█████████████████████████████████████████████████████████████████████| 3264943/3264943 [01:46<00:00, 30634.00it/s]
100%|█████████████████████████████████████████████████████████████████████| 3264943/3264943 [01:49<00:00, 29787.70it/s]


In [5]:
# Show the first review's main-bar similarities. `.iloc[0]` reads the first row by
# position without copying the whole column into a Python list first.
df["main_bar_list_of_similarities"].iloc[0]

['0.8395321369171143',
 '0.9030225872993469',
 '0.6396219730377197',
 '0.8063729405403137',
 '0.7074548602104187',
 '0.6428577303886414',
 '0.803161084651947',
 '0.5400782227516174',
 '0.4019930362701416',
 '0.7775624990463257']

In [6]:
def checking_length_of_comparison(df, columns_to_check=("main_bar_list_of_similarities",
                                                         "sidebar_list_of_similarities",
                                                         "control_list_of_similarities"),
                                  expected_length=10):
    '''
    Verify that every cell of the given columns holds a list of the expected length.

    Args:
    df (pandas dataframe): dataframe of our reviews with the relevant columns where each cell contains
    a list of similarities of comparison.
    columns_to_check (iterable of str): columns in which each cell must contain a list
    with exactly `expected_length` similarities.
    expected_length (int): required number of similarities per list (default 10).

    Returns:
    None. Prints one warning per offending cell (positional row number and column name),
    or a single confirmation line when every cell passes.
    '''

    # Collect (row position, column) for every cell that is not a list of the
    # expected length; `index` is the position from enumerate, not the index label.
    mismatched_items = [
        (index, col)
        for col in tqdm(columns_to_check)
        for index, item in enumerate(df[col])
        if not isinstance(item, list) or len(item) != expected_length
    ]

    # Displaying the results
    if mismatched_items:
        for index, col in mismatched_items:
            print(f"Warning! Row {index} in column '{col}' does not have a list of length {expected_length}. Please double check!")
    else:
        print(f"All items in the specified columns are lists with a length of {expected_length}.")

        
# Validate the three similarity columns before they are reshaped.
columns_to_validate = [
    "main_bar_list_of_similarities",
    "sidebar_list_of_similarities",
    "control_list_of_similarities",
]
checking_length_of_comparison(df, columns_to_check=columns_to_validate)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.47it/s]

All items in the specified columns are lists with a length of 10.





In [7]:
def obtain_extended_dataframe_of_similarities(df, column_name, similarity_type):
    '''
    Melt one list-valued similarity column into long format (one row per similarity).

    args:
    df (pandas dataframe): dataframe containing `column_name` plus the columns
    "recommendation_id", "game_id", "score", "votes_up" and "voted_up".
    column_name (str): the column whose cells each hold a list of similarities.
    similarity_type (str): label written to every output row (e.g. "main bar").

    returns:
    pandas dataframe with the columns recommendation_id, order, similarity_score,
    similarity_type, game_id, score, votes_up and voted_up. `order` is the 1-based
    position of each similarity inside its source list.
    '''
    similarity_lists = list(df[column_name])
    # Repeat counts are taken from the lists themselves instead of a fixed 10, so a
    # shorter or longer list no longer makes the output columns differ in length
    # (which pd.DataFrame rejects). For lists of length 10 the result is unchanged.
    lengths = [len(sublist) for sublist in similarity_lists]

    def obtain_corresponding_feature(org_list):
        # Repeat each row-level value once per similarity in that row's list.
        return [element for element, count in zip(org_list, lengths) for _ in range(count)]

    similarities = [item for sublist in tqdm(similarity_lists) for item in sublist]
    order = [position for count in lengths for position in range(1, count + 1)]
    # A separate name keeps the `similarity_type` argument from being rebound to a list.
    similarity_labels = [similarity_type] * len(similarities)

    data = pd.DataFrame({
        "recommendation_id": obtain_corresponding_feature(df["recommendation_id"]),
        "order": order,
        "similarity_score": similarities,
        "similarity_type": similarity_labels,
        "game_id": obtain_corresponding_feature(df["game_id"]),
        "score": obtain_corresponding_feature(df["score"]),
        "votes_up": obtain_corresponding_feature(df["votes_up"]),
        "voted_up": obtain_corresponding_feature(df["voted_up"]),
    })
    return data

# Melt each similarity column into long format, tagging rows with their source.
df_mainbar = obtain_extended_dataframe_of_similarities(
    df, column_name="main_bar_list_of_similarities", similarity_type="main bar"
)
df_sidebar = obtain_extended_dataframe_of_similarities(
    df, column_name="sidebar_list_of_similarities", similarity_type="sidebar"
)
df_control = obtain_extended_dataframe_of_similarities(
    df, column_name="control_list_of_similarities", similarity_type="control"
)

# Stack the three long frames under one fresh 0..n-1 index.
data = pd.concat([df_mainbar, df_sidebar, df_control], ignore_index=True)

100%|███████████████████████████████████████████████████████████████████| 3264943/3264943 [00:01<00:00, 2127630.10it/s]
100%|███████████████████████████████████████████████████████████████████| 3264943/3264943 [00:01<00:00, 2055133.70it/s]
100%|███████████████████████████████████████████████████████████████████| 3264943/3264943 [00:01<00:00, 2128410.52it/s]


In [8]:
# Flag every row that has an exact copy elsewhere in the frame (all copies are marked),
# then display the flagged rows; an empty result means there are no duplicates.
duplicates = data.duplicated(keep=False)
data.loc[duplicates]

Unnamed: 0,recommendation_id,order,similarity_score,similarity_type,game_id,score,votes_up,voted_up


In [9]:
# The similarity values were converted to strings when the lists were normalised
# earlier; cast them to float so they can be analysed numerically.
data["similarity_score"]=data["similarity_score"].astype(float)

In [10]:
# Re-assert the integer dtype of votes_up on the melted frame (the source column
# was already cast to int before melting).
data["votes_up"]=data["votes_up"].astype(int)

In [11]:
# Preview the melted frame (the notebook display truncates the middle rows).
data

Unnamed: 0,recommendation_id,order,similarity_score,similarity_type,game_id,score,votes_up,voted_up
0,137653791,1,0.839532,main bar,1294810,0.964516,8138,False
1,137653791,2,0.903023,main bar,1294810,0.964516,8138,False
2,137653791,3,0.639622,main bar,1294810,0.964516,8138,False
3,137653791,4,0.806373,main bar,1294810,0.964516,8138,False
4,137653791,5,0.707455,main bar,1294810,0.964516,8138,False
...,...,...,...,...,...,...,...,...
97948285,10425639,6,0.558849,control,256190,0.336881,7,False
97948286,10425639,7,0.455615,control,256190,0.336881,7,False
97948287,10425639,8,0.451328,control,256190,0.336881,7,False
97948288,10425639,9,0.472071,control,256190,0.336881,7,False


In [12]:
# Persist the long-format frame as a gzip-compressed CSV, without the index column.
data.to_csv("../data/processed_data/data.csv.gz",index=False, compression='gzip')