## Preparing the files to implement the fastText embeddings

In [None]:
# Mount Google Drive and switch into the project folder so that the relative
# file names used below resolve inside the Dissertation directory.
from google.colab import drive
from tqdm import tqdm
drive.mount("/content/drive")
%cd '/content/drive/My Drive/Dissertation'
# NOTE(review): requests, BeautifulSoup, ast and mean are not referenced by any
# live code in the visible cells -- confirm before removing.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import ast
from numpy import mean
import pickle
# Review table read from FPS_reviews_lemmatized.csv.
# NOTE(review): `reviews` is not used again in this cell; the next cell
# re-reads the same CSV.
reviews=pd.read_csv("FPS_reviews_lemmatized.csv")

# reading the files
# The code below treats the first two pickles as lists of per-review lists and
# the third as a parallel list of recommendation ids (same position = same
# review).
with open("observable_comments_list", "rb") as fp1: 
  visible_review_order_list = pickle.load(fp1) 

with open("unobservable_comments_list", "rb") as fp2: 
  non_observable_review_order_list = pickle.load(fp2) 

with open("recommendation_id", "rb") as fp3: 
  recommendation_id_list = pickle.load(fp3) 

#df["order_of_visible_comments"]=[(ast.literal_eval(df["order_of_visible_comments"][i])) for i in tqdm(range(len(df)))] # saving this as a list
#df["unobservable_comments_list"]=[(ast.literal_eval(df["unobservable_comments_list"][i])) for i in tqdm(range(len(df)))] # saving this as a list


# In the wrangling and re-simulation steps we forgot to limit the most recent reviews to 10, which is what Steam actually shows. We apply that limit here.
def reduce_to_ten(non_visible_matrix):
  """Cap every inner list at its first ten entries.

  Inner lists holding fewer than ten entries are replaced by the
  placeholder ``[""]``. A new outer list is returned; the input is left
  untouched.
  """
  capped=[]
  for position in tqdm(range(len(non_visible_matrix))):
    entry=non_visible_matrix[position]
    # Keep the first ten entries, or mark the row as too short.
    capped.append(entry[0:10] if len(entry)>=10 else [""])
  return capped
def elements_less_ten(non_visible_matrix):
  """Return the positions of the inner lists that hold fewer than ten entries."""
  return [
    int(position)
    for position in tqdm(range(len(non_visible_matrix)))
    if len(non_visible_matrix[position])<10
  ]
def remove_elements_less_ten(indexes,matrix):
  """Remove the given positions from ``matrix`` in place and return it.

  Args:
    indexes: Positions to drop. Repeated positions are dropped once.
      Negative positions count from the end of the list as it is on entry.
    matrix: The list to shrink; it is mutated, so other references to the
      same list see the change.

  Returns:
    The same ``matrix`` object, without the requested positions.

  Raises:
    IndexError: If a position lies outside the list.
  """
  size=len(matrix)
  drop=set()
  for index in indexes:
    if not -size<=index<size:
      raise IndexError("list assignment index out of range")
    drop.add(index%size)
  if drop:
    # One filtering pass instead of one `del` per index: each `del` shifts
    # the whole tail of the list, which is quadratic for many removals.
    # Slice assignment keeps the mutation in place.
    matrix[:]=[row for position,row in enumerate(matrix) if position not in drop]
  return(matrix)
# Cap the non-observable lists at ten entries (shorter ones become the [""]
# placeholder), record which positions were too short, then drop those same
# positions from all three parallel lists so they stay aligned.
non_observable_review_order_list=reduce_to_ten(non_observable_review_order_list)
indexes=elements_less_ten(non_observable_review_order_list)
non_observable_review_order_list=remove_elements_less_ten(indexes,non_observable_review_order_list)
visible_review_order_list=remove_elements_less_ten(indexes,visible_review_order_list)
recommendation_id_list=remove_elements_less_ten(indexes,recommendation_id_list)

# inserting the actual review into the list of visible reviews
# Each review's own recommendation id is prepended (as a string) to both its
# visible and its non-observable list, so position 0 of every inner list holds
# the review itself. The inner lists are modified in place.
for i in tqdm(range(len(visible_review_order_list))):
  (visible_review_order_list[i]).insert(0,str(recommendation_id_list[i]))
for i in tqdm(range(len(non_observable_review_order_list))):
  (non_observable_review_order_list[i]).insert(0,str(recommendation_id_list[i]))


# Saving the files for good measure in case Colab crashes.
# The three *_updated pickles hold the filtered lists built above.
with open("observable_comments_list_updated", "wb") as fp1:
  pickle.dump(visible_review_order_list, fp1)

with open("unobservable_comments_list_updated", "wb") as fp2:
  pickle.dump(non_observable_review_order_list, fp2)

with open("recommendation_id_updated", "wb") as fp3:
  pickle.dump(recommendation_id_list, fp3)


In [None]:
# Same setup as the first cell, repeated so this cell can run on its own:
# mount Google Drive and switch into the project folder.
from google.colab import drive
from tqdm import tqdm
drive.mount("/content/drive")
%cd '/content/drive/My Drive/Dissertation'
# NOTE(review): requests, BeautifulSoup, ast and mean are not referenced by any
# live code in the visible cells -- confirm before removing.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import ast
from numpy import mean
import pickle

# re-reading the files above
# Note the naming: the non-observable lists are bound to
# `unobservable_review_order_list` here, not `non_observable_review_order_list`
# as in the first cell.
with open("unobservable_comments_list_updated", "rb") as fp1:
  unobservable_review_order_list = pickle.load(fp1)
with open("observable_comments_list_updated", "rb") as fp2:
  visible_review_order_list = pickle.load(fp2)
with open("recommendation_id_updated", "rb") as fp3:
  recommendation_id_list = pickle.load(fp3)
# Review table; its "recommendation_id" and "review" columns are used below.
reviews=pd.read_csv("FPS_reviews_lemmatized.csv")

# make recommendation_id a string for all
# Series.map(str) converts every value (including missing ones, which become
# "nan") in one pass. It avoids a label lookup per row and does not depend on
# the frame having a default 0..n-1 index.
reviews["recommendation_id"]=reviews["recommendation_id"].map(str)
reviews["review"]=reviews["review"].map(str)
# String copy of the id list; recommendation_id_list itself is left unchanged.
recommendation_id=[str(i) for i in recommendation_id_list]
def convert_to_string(list_of_list):
  """Return a new list of lists in which every inner element is passed through ``str()``.

  The input lists are not modified.
  """
  return [
    [str(item) for item in list_of_list[position]]
    for position in tqdm(range(len(list_of_list)))
  ]

# Make every id inside the visible / unobservable lists a string so they can be
# used as string keys later on.
visible_review_order_list=convert_to_string(visible_review_order_list)
unobservable_review_order_list=convert_to_string(unobservable_review_order_list)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Dissertation


100%|██████████| 920536/920536 [00:03<00:00, 245360.00it/s]
100%|██████████| 920536/920536 [00:04<00:00, 216017.34it/s]


## Running the fastText word embeddings and cosine-similarity matrix


In [None]:
# NOTE(review): numpy is imported twice below, and `spatial` is not referenced
# by the visible code (only `cosine` is) -- confirm before removing.
from scipy import spatial
import numpy as np
from scipy.spatial.distance import cosine
import numpy as np
# NOTE(review): each `!` line runs in its own shell, so `!cd fastText` does not
# change the directory for the `pip install` that follows; the recorded output
# shows pip looking the package up in the package indexes. The clone therefore
# appears unused, and it fails on re-runs once the directory exists (see the
# "fatal: destination path" line in the output).
!git clone https://github.com/facebookresearch/fastText.git
!cd fastText
!pip install fastText
import fasttext
import fasttext.util
# download an english model of fasttext
# if_exists='ignore' presumably skips the download when the model file is
# already present -- TODO confirm against the fasttext.util documentation.
fasttext.util.download_model('en', if_exists='ignore')  # English
# `model` is a module-level global read by get_vector below.
model = fasttext.load_model('cc.en.300.bin')

# getting the fastText sentence vector for a single review
def get_vector(s):
  """Return the fastText sentence vector for one review.

  Args:
    s: Review text; any value is accepted and converted with ``str()``.

  Returns:
    Whatever ``model.get_sentence_vector`` returns for the cleaned text.
  """
  # fastText's get_sentence_vector raises ValueError when the text contains
  # "\n" (it processes one line at a time), so line breaks become spaces.
  text=str(s).replace("\n"," ")
  return model.get_sentence_vector(text)

# building a dictionary of fastText vectors for all reviews, keyed by review id
def get_vector_dict(reviews):
  """Map each recommendation id (as a string) to the vector of its review.

  Reads the "recommendation_id" and "review" columns of ``reviews`` and
  embeds every review with ``get_vector``. If an id occurs more than once,
  the vector of its last occurrence is kept.
  """
  ids=list(reviews["recommendation_id"])
  texts=list(reviews["review"])
  embeddings={}
  for position in tqdm(range(len(texts))):
    embeddings[str(ids[position])]=get_vector(texts[position])
  return embeddings

# calculating the cosine-similarity score
def cosine_similarity(embedding_1, embedding_2):
  """Return the cosine similarity (1 - cosine distance) of two embeddings.

  Args:
    embedding_1: First vector (any 1-D array-like).
    embedding_2: Second vector, same length as the first.

  Returns:
    The similarity as a numpy float. NaN is returned when either vector is
    all zeros, because the similarity is undefined for a vector without
    direction.
  """
  # Guard the zero-vector case explicitly: passing it to scipy's cosine()
  # triggers a 0/0 division and a RuntimeWarning for every such pair.
  if not np.any(embedding_1) or not np.any(embedding_2):
    return np.float64("nan")
  return 1 - cosine(embedding_1, embedding_2)

# scoring the reviews in one list against the review at the head of that list
def similiarity_matrix(list_of_visible_reviews,comments_vector):
  """Score every listed review against the first one.

  Element 0 of ``list_of_visible_reviews`` is the reference id. Every later
  id is looked up in ``comments_vector`` (keys are the ids as strings) and
  compared with the reference through ``cosine_similarity``.

  Returns:
    A flat list of scores, in the order of elements 1..n of the input.
  """
  reference=comments_vector[str(list_of_visible_reviews[0])]
  return [
    cosine_similarity(reference,comments_vector[str(other_id)])
    for other_id in list_of_visible_reviews[1:]
  ]

# collecting the similarity scores of every review list
def list_of_matrix(review_order_list,comments_vector):
  """Apply ``similiarity_matrix`` to each inner list and collect the results.

  Returns one list of scores per entry of ``review_order_list``, in the same
  order.
  """
  return [
    similiarity_matrix(review_order_list[position],comments_vector)
    for position in tqdm(range(len(review_order_list)))
  ]

# Embed every review once, then score each review's visible and unobservable
# lists against the review itself (position 0 of each inner list).
vector_dict=get_vector_dict(reviews)
visible_matrix=list_of_matrix(visible_review_order_list,vector_dict)
non_visible_matrix=list_of_matrix(unobservable_review_order_list,vector_dict)

# saving the files
with open("observable_similarity_list", "wb") as fp1:
  pickle.dump(visible_matrix, fp1)

# NOTE(review): "recommendation_id" is also the input file read by the first
# cell. recommendation_id_list here is the filtered list loaded from
# "recommendation_id_updated", so this write replaces the unfiltered original;
# re-running the first cell afterwards would pair the filtered ids with the
# unfiltered comment lists. Consider writing to a different file name.
with open("recommendation_id", "wb") as fp2:
  pickle.dump(recommendation_id_list, fp2)

with open("unobservable_similarity_list", "wb") as fp3:
  pickle.dump(non_visible_matrix, fp3)

fatal: destination path 'fastText' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


100%|██████████| 1056940/1056940 [02:30<00:00, 7014.92it/s]
  dist = 1.0 - uv / np.sqrt(uu * vv)
100%|██████████| 920536/920536 [09:42<00:00, 1580.26it/s]
100%|██████████| 920536/920536 [09:14<00:00, 1660.90it/s]
