In [None]:
# Mount Google Drive at /content/drive (google.colab is only available inside Colab)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import os
os.chdir('/content/drive/MyDrive/Code + Data')
import tobit
from tobit import TobitModel

from statsmodels.regression import quantile_regression
import statsmodels.api as sm
from statsmodels.formula.api import ols

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.model_selection import train_test_split



from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import StandardScaler

import gc


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Load the Yelp review data from the parquet file on Drive and print its (rows, columns) shape
yelp_data = pd.read_parquet('/content/drive/MyDrive/Code + Data/yelp_data.parquet')
print(yelp_data.shape)

(1870042, 13)


In [None]:
# Balance the dataset: keep every review with helpful > 0 and an equally sized
# random sample of the helpful == 0 reviews.
majority_class = yelp_data[yelp_data['helpful'] == 0]
minority_class = yelp_data[yelp_data['helpful'] > 0]
minority_count = len(minority_class)

print(f"Original dataset size: {len(yelp_data)}")
print("0 helpful reviews:", len(majority_class))
print(f"helpful > 0 reviews: {minority_count}")

# 1:1 ratio -- sample exactly as many majority rows as there are minority rows
target_majority_size = int(minority_count)
downsampled_majority = majority_class.sample(n=target_majority_size, random_state=42)

# Recombine the two classes, then shuffle so they are interleaved
balanced_data = (
    pd.concat([downsampled_majority, minority_class])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

yelp_data = balanced_data

# Report the class counts after balancing
print(f"New dataset size: {len(yelp_data)}")
print("0 helpful reviews", len(yelp_data[yelp_data['helpful'] == 0]))
minority_count = len(yelp_data[yelp_data['helpful'] > 0])
print(f"helpful > 0 reviews: {minority_count}")


Original dataset size: 1870042
0 helpful reviews: 1147389
helpful > 0 reviews: 722652
New dataset size: 1445304
0 helpful reviews 722652
helpful > 0 reviews: 722652


In [None]:
# Add Word Count
def word_count(line):
  """Return the number of whitespace-separated tokens in `line`."""
  tokens = line.split()
  return len(tokens)
# Store each review's token count in a new 'num_words' column
yelp_data['num_words'] = yelp_data['text'].apply(word_count)

In [None]:
# Import category natures: a CSV with one row per business category ('Category')
# and its labeled nature ('Nature'; get_majority_nature treats 0 as search and 1 as experience)
cat_classes = pd.read_csv("/content/drive/MyDrive/Code + Data/category_nature.csv")
# Convert to dict: {category name -> nature}; a repeated category keeps its last row's value
cat_lookup = dict(zip(cat_classes['Category'], cat_classes['Nature']))

In [None]:
# Rescale helpfulness from a 0-1 fraction to a 0-100 percentage, as opposed to 0-1.
# Note: re-running this cell multiplies the column by 100 again.
yelp_data['helpful'] = yelp_data['helpful'] * 100

In [None]:
# Convert categories col from a ', '-separated string to a list of category names
# NOTE(review): not idempotent -- on a second run the column already holds lists,
# which have no .split(), so this line raises AttributeError.
yelp_data['categories'] = yelp_data['categories'].apply(lambda x: x.split(', '))

In [None]:
# Print every column name on its own line as a quick schema check
cols = yelp_data.columns.tolist()
for c in cols:
  print(c)

review_id
user_id
business_id
stars_reviewer
useful
text
name
postal_code
stars_business
categories
total_reviews_for_business
helpful
num_sentences
num_words


In [None]:
# Save the prepared dataset to Drive as a checkpoint
yelp_data.to_parquet('/content/drive/MyDrive/Code + Data/business_nature_yelp_data.parquet')

In [None]:
# Create train, val, test with 60/20/20 split
# Step 1: hold out 20% of all rows as the test set
train_val, test = train_test_split(yelp_data, test_size=0.2, random_state=42)
# Step 2: take 25% of the remaining 80% (= 20% of the total) as validation, leaving 60% for training
train, val = train_test_split(train_val, test_size=0.25, random_state=42)

# Hand labeling + taking majority class as the business nature

In [None]:
# Initial code for hand labeling categories

# all_categories = yelp_data['categories'].str.split(', ').explode()
# unique_categories = all_categories.unique()
# print(len(unique_categories))
# # Save to df for export to an Excel file for hand labeling
# nature_df = pd.DataFrame({'Category': unique_categories, 'Nature': ''})

# # Save to Excel file
# excel_file_path = '/content/drive/MyDrive/Code + Data/category_nature.xlsx'
# nature_df.to_excel(excel_file_path, index=False)

In [None]:
# Add a col with list of each category's nature (0 or 1)
def map_categories_to_nature(categories):
  """Translate each category name into its nature via the global `cat_lookup`.

  Returns a list with one nature value per entry of `categories`, in the same
  order. Raises KeyError if a category is missing from `cat_lookup`.
  """
  return list(map(cat_lookup.__getitem__, categories))

# Attach to every review the list of natures of its categories.
# NOTE(review): within this notebook, train_weights_processed / val_weights_processed /
# test_weights_processed are only assigned in the later "Load processed data" cell, so a
# fresh top-to-bottom run raises NameError here. That later pd.read_parquet re-assignment
# would also discard the 'categories_nature' column added by these lines -- this cell
# presumably has to run after that load; confirm the intended order.
train_weights_processed['categories_nature'] = train_weights_processed['categories'].apply(map_categories_to_nature)
val_weights_processed['categories_nature'] = val_weights_processed['categories'].apply(map_categories_to_nature)
test_weights_processed['categories_nature'] = test_weights_processed['categories'].apply(map_categories_to_nature)

In [None]:
# Get majority nature based on 'categories_nature' col - yielded better results than trying to utilize word embeddings
def get_majority_nature(nature_list):
  """Return the majority nature label found in `nature_list`.

  Entries equal to 0 count as search and entries equal to 1 as experience;
  any other value is ignored. Returns 0 only when search entries strictly
  outnumber experience entries; ties (including an empty list) return 1.
  """
  votes = [0, 0]  # votes[0] -> search, votes[1] -> experience
  for label in nature_list:
    for candidate in (0, 1):
      if label == candidate:
        votes[candidate] += 1
        break

  # Ties fall to experience (1) because of the skew of businesses
  return 0 if votes[0] > votes[1] else 1

# Reduce each review's 'categories_nature' list to a single majority label in 'nature'.
# NOTE(review): relies on the *_weights_processed frames, which this notebook only
# assigns in the later "Load processed data" cell -- fails on a fresh top-to-bottom run.
train_weights_processed['nature'] = train_weights_processed['categories_nature'].apply(get_majority_nature)
val_weights_processed['nature'] = val_weights_processed['categories_nature'].apply(get_majority_nature)
test_weights_processed['nature'] = test_weights_processed['categories_nature'].apply(get_majority_nature)

# Calculating Subjective and Objective Weights based on Similarity between Extractive Summary (TF-IDF) and Business Categories


In [None]:
# Stop-word sets already loaded from NLTK, keyed by language name
_STOP_WORDS_BY_LANGUAGE = {}

def _load_stop_words(language='english'):
  """Return the NLTK stop-word set for `language`, reading the corpus only once."""
  cached = _STOP_WORDS_BY_LANGUAGE.get(language)
  if cached is None:
    cached = frozenset(stopwords.words(language))
    _STOP_WORDS_BY_LANGUAGE[language] = cached
  return cached

def pre_process_review(review_text):
  """Lowercase `review_text` and drop English stop words.

  The text is split on whitespace, so runs of spaces and newlines collapse to
  single spaces in the result. Punctuation is left attached to its word.

  Args:
    review_text: Raw review text (str).

  Returns:
    The lowercased text with stop words removed, tokens joined by one space.
  """
  # The stop-word set is cached: rebuilding it on every call would re-read
  # the corpus once per review when this is applied over a whole column.
  stop_words = _load_stop_words()
  lowered = review_text.lower()
  return ' '.join(word for word in lowered.split() if word not in stop_words)

In [None]:
# Add a lowercased, stop-word-free copy of the review text ('pp_text') to every split
for split_df in (train, val, test):
  split_df['pp_text'] = split_df['text'].apply(pre_process_review)

In [None]:
# Checkpoint the three splits now that they carry the 'pp_text' column
train.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_data_pp.parquet') # Save to parquet file as checkpoint
val.to_parquet('/content/drive/MyDrive/Code + Data/bn_val_data_pp.parquet') # Save to parquet file as checkpoint
test.to_parquet('/content/drive/MyDrive/Code + Data/bn_test_data_pp.parquet') # Save to parquet file as checkpoint

In [None]:
# Extractive summarization w/ TF-IDF
def preprocess_and_vectorize(text):
    """Split `text` into sentences and fit a fresh TF-IDF model on them.

    Returns:
        (sentences, tfidf_matrix, vectorizer): the sentence list, the matrix
        returned by fitting the vectorizer on those sentences, and the fitted
        vectorizer itself.
    """
    sentence_list = sent_tokenize(text)
    tfidf = TfidfVectorizer()
    return sentence_list, tfidf.fit_transform(sentence_list), tfidf
def score_sentences(tfidf_matrix):
    """Score every row of `tfidf_matrix` by the sum of its entries.

    Returns a 1-D NumPy array with one score per row.
    """
    row_totals = tfidf_matrix.sum(axis=1)
    # The row sums may come back 2-D (e.g. as a column matrix); flatten to 1-D
    return np.array(row_totals).reshape(-1)
def generate_summary(text, num_sentences=3):
    """Build an extractive summary from the highest-scoring sentences of `text`.

    Sentences are scored with `score_sentences` on the TF-IDF matrix from
    `preprocess_and_vectorize`. The top `num_sentences` are kept and emitted
    in their original order, joined by single spaces. Score ties are broken
    in favour of the later sentence.

    Args:
        text: Text to summarize.
        num_sentences: Maximum number of sentences to keep (default 3).

    Returns:
        The summary string. If the text yields no TF-IDF vocabulary (e.g. it
        is empty), `text` is returned unchanged.
    """
    try:
        sentences, tfidf_matrix, _ = preprocess_and_vectorize(text)
    except ValueError:
        # TfidfVectorizer raises ValueError ("empty vocabulary") when there is
        # nothing to index; one such review must not abort a whole column apply.
        return text

    scores = score_sentences(tfidf_matrix)
    # Rank by (score, position) descending, keep the best, then restore order
    ranked = sorted(range(len(sentences)), key=lambda i: (scores[i], i), reverse=True)
    kept = sorted(ranked[:num_sentences])
    return " ".join(sentences[i] for i in kept)

In [None]:
# Summarize the preprocessed text of every review in each split
for split_df in (train, val, test):
  split_df['extractive_summary'] = split_df['pp_text'].apply(generate_summary)

In [None]:
# Checkpoint the three splits now that they carry the 'extractive_summary' column
train.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_data_pp_summarized.parquet')
val.to_parquet('/content/drive/MyDrive/Code + Data/bn_val_data_pp_summarized.parquet')
test.to_parquet('/content/drive/MyDrive/Code + Data/bn_test_data_pp_summarized.parquet')

In [None]:
def calculate_similarity(processed_text, keywords):
    """Score how similar `processed_text` is to each keyword (bag-of-words cosine similarity).

    Args:
        processed_text: The text to compare (str).
        keywords: Keyword strings to compare against. Any iterable is
            accepted, e.g. a list or a NumPy array.

    Returns:
        dict mapping each keyword to its cosine similarity with
        `processed_text`. Duplicate keywords collapse to one entry; an empty
        `keywords` gives an empty dict.
    """
    # Materialize as a list: `[text] + keywords` is only concatenation for real
    # lists. With a NumPy array (what a parquet reload typically yields for
    # list-valued columns) `+` would broadcast instead of concatenating.
    keywords = list(keywords)
    if not keywords:
        # Nothing to compare against; cosine_similarity rejects empty input
        return {}

    # Fit one vocabulary on the text and keywords together so both are
    # encoded in the same vector space
    vectorizer = CountVectorizer().fit([processed_text] + keywords)

    # Convert text and keywords to count vectors
    text_vector = vectorizer.transform([processed_text])
    keyword_vectors = vectorizer.transform(keywords)

    # One row (the text) against one column per keyword
    similarities = cosine_similarity(text_vector, keyword_vectors)

    # Pair each keyword with its similarity score
    return dict(zip(keywords, similarities[0]))

# Test usage
# text = subset_rev['summary'].iloc[0]
# keywords = subset_rev['categories'].iloc[0]

# similarity_results = calculate_similarity(text, keywords)
# print(similarity_results)

In [None]:
# Compare every review's extractive summary against its business categories
def _summary_vs_categories(row):
  return calculate_similarity(row['extractive_summary'], row['categories'])

for split_df in (train, val, test):
  split_df['similarity_results'] = split_df.apply(_summary_vs_categories, axis=1)

In [None]:
# Checkpoint the three splits now that they carry the 'similarity_results' column
train.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_data_pp_summarized_similarity.parquet')
val.to_parquet('/content/drive/MyDrive/Code + Data/bn_val_data_pp_summarized_similarity.parquet')
test.to_parquet('/content/drive/MyDrive/Code + Data/bn_test_data_pp_summarized_similarity.parquet')

In [None]:
def calculate_weights_by_category(similarity_results, lookup=None):
    """Average a review's category similarities separately per business nature.

    Args:
        similarity_results: dict mapping category name -> similarity score.
        lookup: Optional dict mapping category name -> nature (0 or 1).
            Defaults to the notebook-level `cat_lookup`.

    Returns:
        dict with keys "0" and "1", each holding the mean similarity of the
        categories of that nature, or 0.0 when no category of that nature is
        present.
    """
    if lookup is None:
        lookup = cat_lookup

    category_values = {"0": [], "1": []}
    for key, value in similarity_results.items():
        category = lookup.get(key)
        if category is None:
            # Category absent from the lookup: there is no nature to file it
            # under, so skip it rather than fail on category_values["None"].
            continue
        category_values[str(category)].append(value)

    # Mean per nature; natures with no categories default to 0.0
    return {
        category: (sum(values) / len(values) if values else 0.0)
        for category, values in category_values.items()
    }


In [None]:
# Collapse each review's 'similarity_results' dict into its per-nature average weights
for split_df in (train, val, test):
  split_df['weights'] = split_df['similarity_results'].apply(calculate_weights_by_category)

In [None]:
# Checkpoint the three splits now that they carry the 'weights' column
train.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_data_pp_summarized_similarity_weights.parquet')
val.to_parquet('/content/drive/MyDrive/Code + Data/bn_val_data_pp_summarized_similarity_weights.parquet')
test.to_parquet('/content/drive/MyDrive/Code + Data/bn_test_data_pp_summarized_similarity_weights.parquet')

# Final df processing for train/val/test

In [None]:
# Load processed data: read back the weights checkpoints into new *_weights_processed frames
# NOTE(review): this (re)assigns the three *_weights_processed names, so any columns
# added to same-named frames before this cell runs are not carried over.
train_weights_processed = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_train_data_pp_summarized_similarity_weights.parquet')
val_weights_processed = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_val_data_pp_summarized_similarity_weights.parquet')
test_weights_processed = pd.read_parquet('/content/drive/MyDrive/Code + Data/bn_test_data_pp_summarized_similarity_weights.parquet')

In [None]:
# Split the 'weights' dict into two scalar columns:
# key "0" -> search similarity, key "1" -> experience similarity
for split_df in (train_weights_processed, val_weights_processed, test_weights_processed):
  split_weights = split_df['weights']
  split_df['search_similarity'] = split_weights.apply(lambda w: w["0"])
  split_df['experience_similarity'] = split_weights.apply(lambda w: w["1"])

In [None]:
# Write the final train/val/test frames to Drive
train_weights_processed.to_parquet('/content/drive/MyDrive/Code + Data/bn_train_data_final.parquet')
val_weights_processed.to_parquet('/content/drive/MyDrive/Code + Data/bn_val_data_final.parquet')
test_weights_processed.to_parquet('/content/drive/MyDrive/Code + Data/bn_test_data_final.parquet')