In [None]:
# Mount Google Drive; every data path used below lives under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

import pandas as pd
import numpy as np
import math
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import tensorflow as tf


# Load Data

In [None]:
# Category table; the 'Category' and 'Nature' columns are used for the look-ups below.
category_csv_path = "/content/drive/MyDrive/Code + Data/category_nature.csv"
cat_data = pd.read_csv(category_csv_path)

# Pre-computed category embeddings, pickled as an object that exposes `.numpy()`.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
category_embeddings_path = '/content/drive/MyDrive/Code + Data/cat_bert_embeddings.pkl'
with open(category_embeddings_path, 'rb') as f:
    cat_bert_embeddings = pickle.load(f)
loaded_cat_data = cat_bert_embeddings.numpy()
cat_data['embeddings'] = loaded_cat_data.tolist()

# Look-up tables keyed by category name:
#   cat_data_dict: category -> embedding (list)
#   cat_bn_dict:   category -> nature label
category_names = cat_data['Category']
cat_data_dict = dict(zip(category_names, cat_data['embeddings']))
cat_bn_dict = dict(zip(category_names, cat_data['Nature']))

In [None]:
# Load train BERT embeddings and train data

# with open('/content/drive/MyDrive/Code + Data/train_embeddings.pkl', 'rb') as f:
#     summary_embeddings_train = pickle.load(f)

# train = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_data_final_user_cum_helpful.parquet') Full Train

# train['bert_embeddings'] = [embedding.tolist() for embedding in summary_embeddings_train]

# The training set is stored as four parquet shards (train_1 ... train_4).
train_1, train_2, train_3, train_4 = [
    pd.read_parquet(f'/content/drive/MyDrive/Code + Data/train_{shard_number}.parquet')
    for shard_number in (1, 2, 3, 4)
]
print("Train Files loaded successfully!")


Train Files loaded successfully!


In [None]:
# train_parts = np.array_split(train, 4)
# train_1 = train_parts[0]
# train_2 = train_parts[1]
# train_3 = train_parts[2]
# train_4 = train_parts[3]
# train_1.to_parquet('/content/drive/MyDrive/Code + Data/train_1.parquet') # Save to parquet file as checkpoint
# print('train_1 done')

# train_2.to_parquet('/content/drive/MyDrive/Code + Data/train_2.parquet') # Save to parquet file as checkpoint
# print('train_2 done')

# train_3.to_parquet('/content/drive/MyDrive/Code + Data/train_3.parquet') # Save to parquet file as checkpoint
# print('train_3 done')

# train_4.to_parquet('/content/drive/MyDrive/Code + Data/train_4.parquet') # Save to parquet file as checkpoint
# print('train_4 done')


In [None]:
# Load the validation frame and attach its pre-computed embeddings.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
val_embeddings_path = '/content/drive/MyDrive/Code + Data/val_embeddings.pkl'
with open(val_embeddings_path, 'rb') as f:
    summary_embeddings_val = pickle.load(f)

val_data_path = '/content/drive/MyDrive/Code + Data/bn_val_data_final_user_cum_helpful.parquet'
val = pd.read_parquet(val_data_path)

# Store each embedding as a plain Python list in the new column.
val['bert_embeddings'] = [vector.tolist() for vector in summary_embeddings_val]
print("Val Files loaded successfully!")


In [None]:
# Load the test frame and attach its pre-computed embeddings.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
test_embeddings_path = '/content/drive/MyDrive/Code + Data/test_embeddings.pkl'
with open(test_embeddings_path, 'rb') as f:
    summary_embeddings_test = pickle.load(f)

test_data_path = '/content/drive/MyDrive/Code + Data/bn_test_data_final_user_cum_helpful.parquet'
test = pd.read_parquet(test_data_path)

# Store each embedding as a plain Python list in the new column.
test['bert_embeddings'] = [vector.tolist() for vector in summary_embeddings_test]
print("Test Files loaded successfully!")


# Calculate Similarity with BERT Embeddings

In [None]:
def calculate_similarity(review_embeddings, categories):
    """Cosine similarity between one review embedding and each of its categories.

    Args:
        review_embeddings: Embedding of a single review; it is converted to a
            NumPy array and flattened into one row.
        categories: Iterable of category names to score the review against.
            ``None`` is treated as "no categories".

    Returns:
        dict mapping each category that has an entry in ``cat_data_dict`` to
        its cosine similarity with the review. Categories without a stored
        embedding are skipped instead of crashing inside ``cosine_similarity``.
    """
    similarity_dict = {}
    if categories is None:
        return similarity_dict

    # Loop-invariant: reshape the review embedding once, not once per category.
    review_vector = np.array(review_embeddings).reshape(1, -1)

    for category in categories:
        category_embeddings = cat_data_dict.get(category)
        if category_embeddings is None:
            # No embedding is known for this category, so there is nothing to compare.
            continue
        category_vector = np.array(category_embeddings).reshape(1, -1)
        # cosine_similarity returns a 1x1 matrix for two single-row inputs.
        similarity_dict[category] = cosine_similarity(review_vector, category_vector)[0][0]
    return similarity_dict

In [None]:
# Score every review in shard 1 against its own categories, then checkpoint the shard.
train_1['bert_similarity_results'] = train_1.apply(
    lambda review: calculate_similarity(review['bert_embeddings'], review['categories']),
    axis=1,
)
train_1_similarity_path = '/content/drive/MyDrive/Code + Data/bn_train_1_data_bert_sim.parquet'
train_1.to_parquet(train_1_similarity_path)


In [None]:
# Score every review in shard 2 against its own categories, then checkpoint the shard.
train_2['bert_similarity_results'] = train_2.apply(
    lambda review: calculate_similarity(review['bert_embeddings'], review['categories']),
    axis=1,
)
train_2_similarity_path = '/content/drive/MyDrive/Code + Data/bn_train_2_data_bert_sim.parquet'
train_2.to_parquet(train_2_similarity_path)


In [None]:
# Score every review in shard 3 against its own categories, then checkpoint the shard.
train_3['bert_similarity_results'] = train_3.apply(
    lambda review: calculate_similarity(review['bert_embeddings'], review['categories']),
    axis=1,
)
train_3_similarity_path = '/content/drive/MyDrive/Code + Data/bn_train_3_data_bert_sim.parquet'
train_3.to_parquet(train_3_similarity_path)


In [None]:
# Score every review in shard 4 against its own categories, then checkpoint the shard.
train_4['bert_similarity_results'] = train_4.apply(
    lambda review: calculate_similarity(review['bert_embeddings'], review['categories']),
    axis=1,
)
train_4_similarity_path = '/content/drive/MyDrive/Code + Data/bn_train_4_data_bert_sim.parquet'
train_4.to_parquet(train_4_similarity_path)


In [None]:
# Score every validation review against its own categories.
# Uses the 'bert_embeddings' column built from the embeddings pickle, exactly as the
# train cells do. NOTE(review): this cell previously passed row['extractive_summary'],
# which looks like a copy-paste slip — confirm that column is not itself an embedding.
val['bert_similarity_results'] = val.apply(
    lambda row: calculate_similarity(row['bert_embeddings'], row['categories']),
    axis=1,
)


In [None]:
# Checkpoint the validation frame to Drive.
val_similarity_path = '/content/drive/MyDrive/Code + Data/bn_val_data_bert_sim.parquet'
val.to_parquet(val_similarity_path)


In [None]:
# Score every test review against its own categories.
# Uses the 'bert_embeddings' column built from the embeddings pickle, exactly as the
# train cells do. NOTE(review): this cell previously passed row['extractive_summary'],
# which looks like a copy-paste slip — confirm that column is not itself an embedding.
test['bert_similarity_results'] = test.apply(
    lambda row: calculate_similarity(row['bert_embeddings'], row['categories']),
    axis=1,
)

In [None]:
# Checkpoint the test frame to Drive.
test_similarity_path = '/content/drive/MyDrive/Code + Data/bn_test_data_bert_sim.parquet'
test.to_parquet(test_similarity_path)


# Calculate Weights by Category

In [None]:
def calculate_weights_by_category(similarity_results):
    """Average the similarity scores separately for each nature label.

    Every key of ``similarity_results`` is looked up in ``cat_bn_dict`` and its
    score is grouped under the string form of the label found there.

    Args:
        similarity_results: dict mapping category name -> similarity score.

    Returns:
        dict with the keys "0" and "1", each holding the mean of the scores
        grouped under that label, or 0.0 when no score fell into the group.

    Raises:
        KeyError: if a label's string form is neither "0" nor "1" (this
            includes categories that are absent from ``cat_bn_dict``).
    """
    grouped_scores = {"0": [], "1": []}
    for category_name, score in similarity_results.items():
        nature_label = str(cat_bn_dict.get(category_name))
        grouped_scores[nature_label].append(score)

    # Plain arithmetic mean per label; an empty group falls back to 0.0.
    return {
        nature_label: (sum(scores) / len(scores) if scores else 0.0)
        for nature_label, scores in grouped_scores.items()
    }


In [None]:
def filter_none_values(dictionary):
    """Return a copy of ``dictionary`` without the entries whose value is None.

    Only ``None`` is dropped; other falsy values such as 0 or '' are kept.
    """
    kept_entries = {}
    for name, entry in dictionary.items():
        if entry is None:
            continue
        kept_entries[name] = entry
    return kept_entries

In [None]:
# Reload the shard-1 similarity checkpoint, strip None entries, then average per nature.
train_1 = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_1_data_bert_sim.parquet')
print("Train File loaded successfully!")

train_1_similarity = train_1['bert_similarity_results'].apply(filter_none_values)
train_1['bert_similarity_results'] = train_1_similarity
print('filter_none_success!')

train_1['bert_weights'] = train_1_similarity.apply(calculate_weights_by_category)
print('calculate_weights_by_category success!')

train_1.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_1_pre-final.parquet')

Train File loaded successfully!
filter_none_success!
calculate_weights_by_category success!


In [None]:
# Reload the shard-2 similarity checkpoint, strip None entries, then average per nature.
train_2 = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_2_data_bert_sim.parquet')
print("Train File loaded successfully!")

train_2_similarity = train_2['bert_similarity_results'].apply(filter_none_values)
train_2['bert_similarity_results'] = train_2_similarity
print('filter_none_success!')

train_2['bert_weights'] = train_2_similarity.apply(calculate_weights_by_category)
print('calculate_weights_by_category success!')

train_2.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_2_pre-final.parquet')

Train File loaded successfully!
filter_none_success!
calculate_weights_by_category success!


In [None]:
# Reload the shard-3 similarity checkpoint, strip None entries, then average per nature.
train_3 = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_3_data_bert_sim.parquet')
print("Train File loaded successfully!")

train_3_similarity = train_3['bert_similarity_results'].apply(filter_none_values)
train_3['bert_similarity_results'] = train_3_similarity
print('filter_none_success!')

train_3['bert_weights'] = train_3_similarity.apply(calculate_weights_by_category)
print('calculate_weights_by_category success!')

train_3.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_3_pre-final.parquet')

Train File loaded successfully!
filter_none_success!
calculate_weights_by_category success!


In [None]:
# Reload the shard-4 similarity checkpoint, strip None entries, then average per nature.
train_4 = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_4_data_bert_sim.parquet')
print("Train File loaded successfully!")

train_4_similarity = train_4['bert_similarity_results'].apply(filter_none_values)
train_4['bert_similarity_results'] = train_4_similarity
print('filter_none_success!')

train_4['bert_weights'] = train_4_similarity.apply(calculate_weights_by_category)
print('calculate_weights_by_category success!')

train_4.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_4_pre-final.parquet')

Train File loaded successfully!
filter_none_success!
calculate_weights_by_category success!


In [None]:
# Reload the validation similarity checkpoint, strip None entries, then average per nature.
val = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_val_data_bert_sim.parquet')
# The status message names the split actually loaded (it used to say "Train").
print("Val File loaded successfully!")

val['bert_similarity_results'] = val['bert_similarity_results'].apply(filter_none_values)
print('filter_none_success!')

val['bert_weights'] = val['bert_similarity_results'].apply(calculate_weights_by_category)
print('calculate_weights_by_category success!')

val.to_parquet('/content/drive/MyDrive/Code + Data/bn_val_pre-final.parquet')

Train File loaded successfully!
filter_none_success!
calculate_weights_by_category success!


In [None]:
# Reload the test similarity checkpoint, strip None entries, then average per nature.
test = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_test_data_bert_sim.parquet')
# The status message names the split actually loaded (it used to say "Train").
print("Test File loaded successfully!")

test['bert_similarity_results'] = test['bert_similarity_results'].apply(filter_none_values)
print('filter_none_success!')

test['bert_weights'] = test['bert_similarity_results'].apply(calculate_weights_by_category)
print('calculate_weights_by_category success!')

test.to_parquet('/content/drive/MyDrive/Code + Data/bn_test_pre-final.parquet')

Train File loaded successfully!
filter_none_success!
calculate_weights_by_category success!


# Final DataFrame processing for train/val/test

In [None]:
# # Load processed data
# train_1_pre_final = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_1_pre-final.parquet')
# train_2_pre_final = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_2_pre-final.parquet')
# train_weights_p1= pd.concat([train_1_pre_final, train_2_pre_final], ignore_index=True)
# train_weights_p1.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final1.parquet')

In [None]:
# train_3_pre_final = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_3_pre-final.parquet')
# train_4_pre_final = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_4_pre-final.parquet')
# train_weights_p2 =  pd.concat([train_3_pre_final, train_4_pre_final], ignore_index=True)
# train_weights_p2.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final2.parquet')

In [None]:
# train_weights_p1 = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final1.parquet')
# train_weights_p2 = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final2.parquet')
# train_weights_processed =  pd.concat([train_weights_p1, train_weights_p2], ignore_index=True)
# train_weights_processed.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final.parquet')

In [None]:
# train_weights_p1 = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final1.parquet')

In [None]:
# print(train_weights_p1.shape)
# train_weights_p1.drop('bert_embeddings', axis = 1, inplace=True)
# print(train_weights_p1.shape)


(433592, 26)
(433592, 25)


In [None]:
# train_weights_p1.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final1_dropped.parquet')

In [None]:
# train_weights_p2 = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final2.parquet')


In [None]:
# print(train_weights_p2.shape)
# train_weights_p2.drop('bert_embeddings', axis = 1, inplace=True)
# print(train_weights_p2.shape)

(433590, 26)
(433590, 25)


In [None]:
# train_weights_p2.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final2_dropped.parquet')

In [None]:
# train_weights_p1 = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final1_dropped.parquet')
# train_weights_p2 = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final2_dropped.parquet')
# train_weights_processed =  pd.concat([train_weights_p1, train_weights_p2], ignore_index=True)
# train_weights_processed.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_pre-final.parquet')

In [None]:
train_final_input_path = '/content/drive/MyDrive/Code + Data/bn_train_pre-final.parquet'
train_weights_processed = pd.read_parquet(train_final_input_path)

# Flatten the per-nature averages into two columns:
# key "0" -> search similarity, key "1" -> experience similarity.
train_bert_weights = train_weights_processed['bert_weights']
train_weights_processed['bert_search_similarity'] = train_bert_weights.apply(lambda weights: weights["0"])
train_weights_processed['bert_experience_similarity'] = train_bert_weights.apply(lambda weights: weights["1"])

In [None]:
# Write the final training frame to Drive.
train_final_output_path = '/content/drive/MyDrive/Code + Data/bn_train_data_final_bert.parquet'
train_weights_processed.to_parquet(train_final_output_path)


In [None]:
val_final_input_path = '/content/drive/MyDrive/Code + Data/bn_val_pre-final.parquet'
val_weights_processed = pd.read_parquet(val_final_input_path)
# The raw embedding column is dropped before the final frame is built.
val_weights_processed.drop(columns=['bert_embeddings'], inplace=True)

# Flatten the per-nature averages into two columns:
# key "0" -> search similarity, key "1" -> experience similarity.
val_bert_weights = val_weights_processed['bert_weights']
val_weights_processed['bert_search_similarity'] = val_bert_weights.apply(lambda weights: weights["0"])
val_weights_processed['bert_experience_similarity'] = val_bert_weights.apply(lambda weights: weights["1"])


In [None]:
# Write the final validation frame to Drive.
val_final_output_path = '/content/drive/MyDrive/Code + Data/bn_val_data_final_bert.parquet'
val_weights_processed.to_parquet(val_final_output_path)

In [None]:
# Load the processed TEST data.
# This cell used to read 'bn_val_pre-final.parquet', so the file written as the
# final test set was a copy of the validation data.
test_weights_processed = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_test_pre-final.parquet')
# The raw embedding column is dropped before the final frame is built.
test_weights_processed.drop('bert_embeddings', axis = 1, inplace=True)
# Extract weights for search (key "0") and experience (key "1") nature
test_weights_processed['bert_search_similarity'] = test_weights_processed['bert_weights'].apply(lambda x: x["0"])
test_weights_processed['bert_experience_similarity'] = test_weights_processed['bert_weights'].apply(lambda x: x["1"])

In [None]:
# Write the final test frame to Drive.
test_final_output_path = '/content/drive/MyDrive/Code + Data/bn_test_data_final_bert.parquet'
test_weights_processed.to_parquet(test_final_output_path)