## Preparing the files and data for the fastText embeddings

In [1]:
# # Uncomment these lines if fastText needs to be installed or cloned
# !pip install fasttext-wheel

import pandas as pd
import numpy as np
import fasttext
from scipy.spatial.distance import cosine
from tqdm import tqdm
import ast
import types
import json
from concurrent.futures import ProcessPoolExecutor

# The pretrained English fastText model (cc.en.300.bin) must already be present at model_path;
# it can be downloaded from https://fasttext.cc/docs/en/crawl-vectors.html
model_path = '../data/supplemental_data/cc.en.300.bin'  
# Module-level model object; get_vector() below reads it as a global.
model = fasttext.load_model(model_path)

def imports():
    """
    Yields every module bound in the global namespace, with its version when available.

    Returns:
    A generator yielding (module name, version) tuples for modules that expose
    __version__, and the bare module name for modules that do not.
    """
    # Snapshot the namespace: the generator is lazy, so the globals could otherwise
    # change size while it is still being consumed.
    for val in list(globals().values()):
        if not isinstance(val, types.ModuleType):
            continue
        # Resolve the entry before yielding. A bare `except:` wrapped around a `yield`
        # also catches GeneratorExit, which makes closing the generator raise
        # "RuntimeError: generator ignored GeneratorExit".
        try:
            entry = (val.__name__, val.__version__)
        except Exception:
            # Best effort: many modules simply have no __version__ attribute.
            entry = val.__name__
        yield entry

# Show the imported modules (with versions where available) as the cell output.
list(imports())


['builtins',
 'builtins',
 ('pandas', '2.1.1'),
 ('numpy', '1.24.3'),
 'fasttext',
 'ast',
 'types',
 ('json', '2.0.9')]

In [2]:
# Directories of the interim data sets read by this notebook
data_location_wrangling = '../data/interim_data/03_data_wrangling/'
data_location_text_preprocessed = '../data/interim_data/04_text_mining/preprocessed/'

# Review texts (file name suggests they are already lemmatized)
reviews = pd.read_csv(
    f"{data_location_text_preprocessed}reviews_lemmatized.csv.gz",
    compression='gzip',
    low_memory=False,
)

# Reviews together with their main-bar, sidebar and control review lists
df = pd.read_csv(
    f'{data_location_wrangling}reviews_with_main_sidebar_and_control.csv.gz',
    compression='gzip',
    low_memory=False,
)

In [3]:
# Making sure the data is in its proper data type (for safe-measure)
def _as_string_list(value):
    """
    Converts one cell of a review-ID column to a list of strings.

    Args:
    value: Either a list, or the string representation of a list (as read back from CSV).

    Returns:
    A list with every element converted to a string.
    """
    # Anything that is not already a list is parsed with ast.literal_eval, which only
    # evaluates Python literals. Deciding per value (instead of from the first row of a
    # single column) keeps the cell correct for empty frames and safe to re-run.
    items = value if isinstance(value, list) else ast.literal_eval(value)
    return [str(item) for item in items]


# (column to write, column it is built from)
for target_column, source_column in (
    ("order_of_main_bar_reviews", "order_of_main_bar_reviews"),
    ("sidebar_reviews_list", "sidebar_order_reviews"),
    ("control_ids_list", "control_ids_list"),
):
    df[target_column] = [_as_string_list(value) for value in tqdm(df[source_column])]

df['recommendation_id'] = df['recommendation_id'].astype(str)
df['game_id'] = df['game_id'].astype(str)
df['timestamp_created'] = df['timestamp_created'].astype(int)

reviews['recommendation_id'] = reviews['recommendation_id'].astype(str)
# Missing reviews become empty strings so every review can be embedded.
reviews['review'] = reviews['review'].fillna('')
reviews['review'] = reviews['review'].astype(str)

100%|█████████████████████████████████████████████████████████████████████| 3264943/3264943 [01:04<00:00, 50664.34it/s]
100%|█████████████████████████████████████████████████████████████████████| 3264943/3264943 [01:06<00:00, 49095.57it/s]
100%|█████████████████████████████████████████████████████████████████████| 3264943/3264943 [01:03<00:00, 51421.29it/s]


In [4]:
def convert_to_string(list_of_lists):
    """
    Builds a copy of a list of lists in which every element is a string.

    Args:
    list_of_lists: The list of lists whose elements should be stringified.

    Returns:
    A new list of lists; the input is left untouched.
    """
    # tqdm wraps the outer iteration only, so progress is reported per sublist.
    return [[str(element) for element in inner] for inner in tqdm(list_of_lists)]

# String-typed copies of the main-bar and sidebar review-ID lists taken from df
main_bar_review_order_list = convert_to_string(df["order_of_main_bar_reviews"].tolist())
sidebar_review_order_list = convert_to_string(df["sidebar_reviews_list"].tolist())


100%|████████████████████████████████████████████████████████████████████| 3264943/3264943 [00:08<00:00, 382738.63it/s]
100%|████████████████████████████████████████████████████████████████████| 3264943/3264943 [00:06<00:00, 508959.10it/s]


## Computing the fastText embeddings and cosine similarities


In [10]:
def get_vector(sentence):
    """
    Returns the fastText vector for a given sentence.

    Args:
    sentence: The sentence to embed; non-string values are converted with str().

    Returns:
    The vector returned by the global model's get_sentence_vector for the sentence.
    """
    # fastText's get_sentence_vector raises ValueError when the text contains '\n'
    # (it processes one line at a time), so line breaks are treated as spaces.
    text = str(sentence).replace("\n", " ")
    return model.get_sentence_vector(text)



def get_vector_dict(reviews):
    """
    Creates a dictionary of fastText vectors for each review.

    Args:
    reviews: DataFrame containing 'recommendation_id' and 'review' columns.

    Returns:
    A dictionary with review IDs (as strings) as keys and the vectors returned by
    get_vector as values. If an ID occurs more than once, its last row wins.
    """
    # Zipping the two columns avoids DataFrame.iterrows, which builds a Series per row
    # (slow on millions of reviews) and does not preserve dtypes across the row.
    return {
        str(review_id): get_vector(review)
        for review_id, review in zip(reviews["recommendation_id"], reviews["review"])
    }


def cosine_similarity(embedding_1, embedding_2):
    """
    Calculates the cosine similarity between two embeddings.

    Args:
    embedding_1: The first embedding vector.
    embedding_2: The second embedding vector.

    Returns:
    The cosine similarity score, or NaN when either embedding is all zeros
    (the similarity is undefined for a zero-length vector).
    """
    # An all-zero embedding would make scipy divide 0 by 0, which yields NaN together
    # with a RuntimeWarning on every call. Return the same NaN explicitly instead.
    if not np.any(embedding_1) or not np.any(embedding_2):
        return np.nan
    return 1 - cosine(embedding_1, embedding_2)


def similarity_matrix(review_id, list_of_compared_reviews, comments_vector):
    """
    Scores one review against each review in a list.

    Args:
    review_id: Key of the reference review in comments_vector.
    list_of_compared_reviews: IDs of the reviews to compare against; each ID is
        converted to a string before it is looked up.
    comments_vector: A dictionary of review vectors keyed by review ID.

    Returns:
    A list of similarity scores, in the order of list_of_compared_reviews.
    """
    reference = comments_vector[review_id]
    scores = []
    for other_id in list_of_compared_reviews:
        other = comments_vector[str(other_id)]
        scores.append(cosine_similarity(reference, other))
    return scores



def list_of_matrix(df, review_type, comments_vector):
    """
    Compiles the similarity lists for all reviews in the DataFrame.

    Args:
    df: DataFrame with a 'recommendation_id' column and the column named by review_type.
    review_type: Name of the column holding, per row, the list of review IDs to compare against.
    comments_vector: A dictionary of review vectors keyed by review ID.

    Returns:
    A list with one list of similarity scores per row of df, in row order.
    """
    # Zipping the two columns is far cheaper than DataFrame.iterrows on millions of
    # rows; passing total lets tqdm show a percentage and ETA instead of a bare counter.
    id_and_compared = zip(df["recommendation_id"], df[review_type])
    return [
        similarity_matrix(str(review_id), compared_reviews, comments_vector)
        for review_id, compared_reviews in tqdm(id_and_compared, total=len(df))
    ]



# Embed every review once up front; the similarity functions above then only do dictionary lookups.
vector_dict = get_vector_dict(reviews)

In [None]:
# (similarity column to create, column holding the review IDs to compare against)
for similarity_column, id_list_column in (
    ("main_bar_list_of_similarities", "order_of_main_bar_reviews"),
    ("sidebar_list_of_similarities", "sidebar_reviews_list"),
    ("control_list_of_similarities", "control_ids_list"),
):
    df[similarity_column] = list_of_matrix(df, id_list_column, vector_dict)

3264943it [14:54, 3651.45it/s]
  dist = 1.0 - uv / np.sqrt(uu * vv)
3264943it [14:27, 3763.77it/s]
  dist = 1.0 - uv / np.sqrt(uu * vv)
3241733it [14:15, 3508.96it/s]

In [None]:
# def get_vector_dict(reviews):
#     """
#     Creates a dictionary of fastText vectors for each review.

#     Args:
#     reviews: DataFrame containing 'recommendation_id' and 'review'.

#     Returns:
#     A dictionary with review IDs as keys and fastText vectors as values.
#     """
#     vector = {}
#     for i in tqdm(range(len(reviews))):
#         review_id = str(reviews["recommendation_id"][i])
#         if review_id not in list(vector.keys()):
#             fasttext_vector = get_vector(reviews["review"][i])
#             vector[review_id] = fasttext_vector
#         else:
#             continue
#     return vector


# def similarity_matrix(review_id,list_of_compared_reviews, comments_vector):
#     """
#     Generates a similarity matrix for a list of reviews.

#     Args:
#     list_of_compared_reviews: A list of review IDs.
#     comments_vector: A dictionary of review vectors.

#     Returns:
#     A list of similarity scores.
#     """
#     review_vector = comments_vector[review_id]
#     matrix = []
#     for order_id in list_of_compared_reviews:
#         order_vector = comments_vector[str(order_id)]
#         similarity = cosine_similarity(review_vector, order_vector)
#         matrix.append(similarity)
#     return matrix


# def list_of_matrix(df, review_type ,comments_vector):
#     """
#     Compiles similarity matrices for all reviews in the list.

#     Args:
#     review_order_list: A list of review order lists.
#     comments_vector: A dictionary of review vectors.

#     Returns:
#     A list of similarity matrices.
#     """
#     matrix_list=[]
#     list_of_review_ids=list(df["recommendation_id"])
#     for i in tqdm(range(len(list_of_review_ids))):
#         review_id=str(list_of_review_ids[i])
#         list_of_compared_reviews=list(list(df[review_type])[i])
#         matrices = similarity_matrix(review_id,list_of_compared_reviews, comments_vector)
#         matrix_list.append(matrices)
#     return matrix_list

## Saving the data

In [None]:
data_location = "../data/interim_data/04_text_mining/postprocessed/"
# JSON cannot serialize numpy arrays, so every vector is converted to a plain list.
# np.asarray(...).tolist() accepts arrays and lists alike, so re-running this cell after
# vector_dict has already been converted no longer fails with
# "'list' object has no attribute 'tolist'".
vector_dict = {k: np.asarray(v).tolist() for k, v in vector_dict.items()}
# saving the fastText embeddings for each review in case code needs to be re-analyzed.
# NOTE: the file content is JSON even though the file name ends in .txt.
with open(f'{data_location}review_embeddings.txt', 'w') as file:
    json.dump(vector_dict, file)
csv_file_path = f'{data_location}data_with_similarities.csv.gz'
df.to_csv(csv_file_path, index=False, compression='gzip')