## **Import Libraries**

In [1]:
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import datetime
from bertopic import BERTopic
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file


In [None]:
# Load the raw review dataset; later cells read its 'dt', 'app_name', 'rating' and 'review' columns.
df = pd.read_csv('../data/Mid_DS_NLP.csv')

In [None]:
def transform_date_to_monthly_date(date):
    """Snap an ISO date string to the first day of its month.

    Args:
        date: Date string in ``YYYY-MM-DD`` format.

    Returns:
        A ``YYYY-MM-DD`` string with the same year and month and the day
        set to ``01``.

    Raises:
        ValueError: If ``date`` does not match the ``%Y-%m-%d`` format.
    """
    iso_format = "%Y-%m-%d"
    month_start = datetime.datetime.strptime(date, iso_format).replace(day=1)
    return month_start.strftime(iso_format)

In [None]:
# Month bucket (first day of the review's month) derived from the 'dt' date string.
df['month'] = df['dt'].apply(transform_date_to_monthly_date)

In [None]:
# Distinct app names in the dataset; used below to split the reviews per game.
available_games = df['app_name'].unique()

In [None]:
# Ratings 1-2 form the negative subset and 4-5 the positive one; rating 3 is in neither.
negative_df = df.loc[df['rating'].isin([1, 2])]
positive_df = df.loc[df['rating'].isin([4, 5])]

## **Dynamic Topic Modeling**

In [None]:
# Low-rated (1-2) reviews for World of Tanks Blitz only.
wot_df_timestamp = df.loc[
    df['app_name'].eq('World of Tanks Blitz') & df['rating'].isin([1, 2])
]

In [None]:
# embeddings over time 
def get_embeddings(temp_df):
    """Encode the ``review`` column with the ``all-MiniLM-L6-v2`` sentence transformer.

    Args:
        temp_df: DataFrame with a ``review`` column holding the texts to embed.

    Returns:
        Tuple ``(embedding_model, embeddings)``: the ``SentenceTransformer``
        instance and the result of its ``encode`` call on the reviews.
    """
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    review_texts = temp_df['review'].values
    review_vectors = encoder.encode(review_texts, show_progress_bar=True)
    return encoder, review_vectors

def get_umap(n_neighbors=15,n_components=5, min_dist=0.0, metric='cosine', random_state=42):
    """Build an unfitted ``UMAP`` reducer from the given settings.

    All arguments are forwarded unchanged to the ``UMAP`` constructor.

    Returns:
        The configured ``UMAP`` instance.
    """
    return UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=min_dist,
        metric=metric,
        random_state=random_state,
    )

def get_hdbscan_model(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True):
    """Build an unfitted ``HDBSCAN`` clusterer from the given settings.

    All arguments are forwarded unchanged to the ``HDBSCAN`` constructor.

    Returns:
        The configured ``HDBSCAN`` instance.
    """
    return HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric=metric,
        cluster_selection_method=cluster_selection_method,
        prediction_data=prediction_data,
    )

def get_vectorizer(ngram_range=(1, 2)):
    """Build a ``CountVectorizer`` with English stop words removed.

    Args:
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.

    Returns:
        The configured ``CountVectorizer`` instance.
    """
    return CountVectorizer(stop_words="english", ngram_range=ngram_range)

def get_representation_model():
    """Build the representation models that label topics.

    Side effect: sets ``openai.api_key`` from the ``openai_key`` environment
    variable.

    Returns:
        Dict with a single ``"OpenAI"`` entry: a chat-based ``gpt-3.5-turbo``
        representation configured with the labelling prompt below.

    Raises:
        KeyError: If the ``openai_key`` environment variable is not set.
    """
    openai.api_key = os.environ['openai_key']

    # [DOCUMENTS] and [KEYWORDS] are placeholder tags inside the prompt text.
    label_prompt = """
    I have a topic that contains the following documents:
    [DOCUMENTS]
    The topic is described by the following keywords: [KEYWORDS]

    Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
    topic: <topic label>
    """

    # Only the OpenAI representation is enabled; downstream code reads it back
    # under the "OpenAI" aspect key.
    return {
        "OpenAI": OpenAI(model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=label_prompt),
    }

In [None]:
def create_topic_model_embeddings(temp_df):
    """Embed the reviews and assemble an unfitted ``BERTopic`` pipeline.

    Args:
        temp_df: DataFrame with a ``review`` column.

    Returns:
        Dict with:
            ``'topic_model'``: the configured (not yet fitted) ``BERTopic`` instance.
            ``'embeddings'``: dict holding the ``'embedding_model'`` and the
            precomputed ``'embeddings'`` for the reviews.
    """
    encoder, review_vectors = get_embeddings(temp_df)

    model = BERTopic(
        # Pipeline components, each built with its helper's defaults
        embedding_model=encoder,
        umap_model=get_umap(),
        hdbscan_model=get_hdbscan_model(),
        vectorizer_model=get_vectorizer(),
        representation_model=get_representation_model(),
        # Hyperparameters
        top_n_words=10,
        verbose=True,
    )

    return {
        'topic_model': model,
        'embeddings': {'embedding_model': encoder, 'embeddings': review_vectors},
    }

In [None]:
def topic_modeling_labeling(temp_df,colname,topic_model,embeddings):
    """Fit the topic model, apply the OpenAI labels and compute topics over time.

    Args:
        temp_df: DataFrame holding the documents and a ``dt`` timestamp column.
        colname: Name of the column in ``temp_df`` that contains the documents.
        topic_model: Model exposing ``fit_transform``, ``topic_aspects_``,
            ``set_topic_labels`` and ``topics_over_time``.
        embeddings: Precomputed document embeddings passed to ``fit_transform``.

    Returns:
        Tuple ``(topics, probs, topics_over_time)``.
    """
    documents = temp_df[colname].values
    topics, probs = topic_model.fit_transform(documents, embeddings)

    # Build one custom label per topic from the first element of every pair
    # in the "OpenAI" aspect; topic -1 gets a fixed outlier label.
    custom_labels = {}
    for topic_id, label_pairs in topic_model.topic_aspects_["OpenAI"].items():
        custom_labels[topic_id] = " | ".join(list(zip(*label_pairs))[0])
    custom_labels[-1] = "Outlier Topic"
    topic_model.set_topic_labels(custom_labels)

    # Dynamic topic modelling over the 'dt' timestamps, grouped into 20 bins.
    topics_over_time = topic_model.topics_over_time(documents, temp_df['dt'].values, nr_bins=20)

    return topics, probs, topics_over_time


In [None]:
def prepare_model_attributes(temp_df,appnames):
    """Run the full topic-modelling pipeline for one app and bundle the outputs.

    Args:
        temp_df: DataFrame of reviews for a single app; must contain the
            ``review`` and ``dt`` columns used by the pipeline helpers.
        appnames: App name, used only in the progress messages.

    Returns:
        Dict with:
            ``'saved_results'``: dict holding ``'topics_over_time'``,
            ``'reduced_embeddings'`` (2D UMAP projection of the document
            embeddings), ``'data'`` (a copy of ``temp_df`` with ``Topic`` and
            ``Probability`` columns added) and ``'summarised_topics'``.
            ``'saved_models'``: the fitted topic model.
    """
    print(f"Start topic modeling for {appnames} \n Shape: {temp_df.shape}")
    print("----------------------------------------------------------------")

    # Work on a private copy: the caller may pass a filtered slice of a larger
    # frame, and adding columns to it would mutate the caller's data and
    # trigger pandas' SettingWithCopyWarning.
    temp_df = temp_df.copy()

    result = create_topic_model_embeddings(temp_df)
    print('Create topic model embeddings - Done')

    topic_model = result['topic_model']
    embeddings = result['embeddings']['embeddings']

    # 2D projection of the document embeddings. No random_state is passed
    # here, so this projection can differ between runs.
    reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
    print('Create reduced embeddings - Done')

    topics, probs, topics_over_time = topic_modeling_labeling(temp_df, 'review', topic_model, embeddings)
    print('Create topic labeling - Done')

    temp_df['Topic'] = topics
    temp_df['Probability'] = probs

    summarised_topics = topic_model.get_topic_info()[['Topic','CustomName','Representation']]
    print('Create summarised topics - Done')

    # The previous version re-projected topic_model.topic_embeddings_ here using
    # an undefined name (`indices`), which raised NameError; its result was
    # never used, so the step is removed. create_topic_2d_embeddings performs
    # that projection separately.
    saved_results = {
        'topics_over_time': topics_over_time,
        'reduced_embeddings': reduced_embeddings,
        'data': temp_df,
        'summarised_topics': summarised_topics,
    }

    print("Topics: ", summarised_topics)

    print(f"Finished topic modeling for {appnames}")
    print("----------------------------------------------------------------\n\n")
    return {
        'saved_results': saved_results,
        'saved_models': topic_model,
    }

## **Negative themes per Game**

In [None]:
# One frame of negative reviews per game, keyed by app name.
negative_appname_df = {
    game: negative_df.loc[negative_df['app_name'] == game]
    for game in available_games
}

In [None]:
# (rows, columns) of the negative reviews for every game.
{
    game: negative_df.loc[negative_df['app_name'] == game].shape
    for game in available_games
}

In [None]:
# Cap each game's negative reviews at 13000 rows (fixed seed); smaller frames pass through unchanged.
negative_appname_df_sample = {
    appnames: (
        reviews.sample(n=13000, random_state=42) if len(reviews) > 13000 else reviews
    )
    for appnames, reviews in negative_appname_df.items()
}

In [None]:
# Per-game modelling output keyed by app name; the modelling loop skips games already stored here.
negative_themes_results = {}

In [None]:
# Model every game that has no stored result yet, so re-running this cell
# resumes instead of recomputing finished games.
for appnames in list(negative_appname_df_sample):
    if appnames in negative_themes_results:
        continue
    negative_themes_results[appnames] = prepare_model_attributes(
        negative_appname_df_sample[appnames], appnames
    )

In [None]:
# Split each game's output into its results bundle and its topic model, both keyed by game.
neg_save_results = {games: output['saved_results'] for games, output in negative_themes_results.items()}
neg_save_models = {games: output['saved_models'] for games, output in negative_themes_results.items()}

## **Positive themes per Game**

In [None]:
# One frame of positive reviews per game, keyed by app name.
positive_appname_df = {
    game: positive_df.loc[positive_df['app_name'] == game]
    for game in available_games
}

In [None]:
# (rows, columns) of the positive reviews for every game.
{
    game: positive_df.loc[positive_df['app_name'] == game].shape
    for game in available_games
}

In [None]:
# Cap each game's positive reviews at 10000 rows (fixed seed); smaller frames pass through unchanged.
positive_appname_df_sample = {
    appnames: (
        reviews.sample(n=10000, random_state=42) if len(reviews) > 10000 else reviews
    )
    for appnames, reviews in positive_appname_df.items()
}

In [None]:
# Per-game modelling output keyed by app name; the modelling loop skips games already stored here.
positive_themes_results = {}

In [None]:
# Model every game that has no stored result yet, so re-running this cell
# resumes instead of recomputing finished games.
for appnames in list(positive_appname_df_sample):
    if appnames in positive_themes_results:
        continue
    positive_themes_results[appnames] = prepare_model_attributes(
        positive_appname_df_sample[appnames], appnames
    )

In [None]:
# Split each game's output into its results bundle and its topic model, both keyed by game.
pos_save_results = {games: output['saved_results'] for games, output in positive_themes_results.items()}
pos_save_models = {games: output['saved_models'] for games, output in positive_themes_results.items()}

In [13]:
def create_topic_2d_embeddings(temp_model,temp_metadata):
    """Project a model's topic embeddings to 2D and store them on ``temp_metadata``.

    Mutates ``temp_metadata`` in place and returns ``None``. Two keys are set:
        ``'topic_embeddings_2d'``: UMAP projection (2 components, cosine
        metric, ``random_state=42``) of the selected topic embeddings with
        the first selected row dropped.
        ``'topic_embeddings'``: the model's full ``topic_embeddings_``.

    Args:
        temp_model: Fitted topic model exposing ``get_topic_freq()`` and
            ``topic_embeddings_``.
        temp_metadata: Dict containing a ``'summarised_topics'`` frame with a
            ``Topic`` column.

    Raises:
        ValueError: If a topic reported by ``get_topic_freq()`` is missing from
            ``temp_metadata['summarised_topics']['Topic']``.
    """
    topic_ids = sorted(temp_model.get_topic_freq().Topic.to_list())
    known_topics = temp_metadata['summarised_topics']['Topic'].values.tolist()
    row_positions = np.array([known_topics.index(topic_id) for topic_id in topic_ids])

    # Drop the first selected row before projecting — presumably the -1
    # outlier topic, which sorts first; TODO confirm for models without outliers.
    selected = temp_model.topic_embeddings_[row_positions][1:]
    projector = UMAP(n_neighbors=2, n_components=2, metric='cosine', random_state=42)

    temp_metadata['topic_embeddings_2d'] = projector.fit_transform(selected)
    temp_metadata['topic_embeddings'] = temp_model.topic_embeddings_

In [None]:
# neg_save_models / neg_save_results (and the pos_* pair) are keyed directly by
# app name, so they are indexed with `appname` only; the former extra
# ['saved_models'] / ['saved_results'] lookup raised KeyError.
for appname in neg_save_models.keys():
    create_topic_2d_embeddings(neg_save_models[appname], neg_save_results[appname])
    create_topic_2d_embeddings(pos_save_models[appname], pos_save_results[appname])

In [None]:
# NOTE(review): neg_models / pos_models / neg_metadata / pos_metadata are only
# assigned by the "Load models" cell further down, so this cell fails on a
# fresh kernel unless that cell has been run first.
for appname in neg_models.keys():
    create_topic_2d_embeddings(neg_models[appname], neg_metadata[appname])
    # Pair the positive model with the positive metadata; writing into
    # neg_metadata here overwrote the negative embeddings stored one line above.
    create_topic_2d_embeddings(pos_models[appname], pos_metadata[appname])

## **Save models**

In [None]:
import pickle

# Persist both sentiment splits. The "Load models" section reads the negative
# pickles as well as the positive ones, so writing only the positive files
# left the load step without its negative inputs on a fresh run.
pickle_targets = [
    ('../results/saved_data/negative_themes/negative_data.pkl', neg_save_results),
    ('../results/saved_models/negative_themes/negative_models.pkl', neg_save_models),
    ('../results/saved_data/positive_themes/positive_data.pkl', pos_save_results),
    ('../results/saved_models/positive_themes/positive_models.pkl', pos_save_models),
]
for pickle_path, payload in pickle_targets:
    # Binary write mode is required by pickle.
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

## **Load models**

In [4]:
import pickle


def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle.load can execute arbitrary code, so only point this at files
    # written by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Per-game model dictionaries for the negative and positive splits.
neg_models = _read_pickle('../results/saved_models/negative_themes/negative_models.pkl')
pos_models = _read_pickle('../results/saved_models/positive_themes/positive_models.pkl')

# Matching per-game result/metadata dictionaries.
neg_metadata = _read_pickle('../results/saved_data/negative_themes/negative_data.pkl')
pos_metadata = _read_pickle('../results/saved_data/positive_themes/positive_data.pkl')


In [14]:
# Attach 2D topic embeddings to the loaded metadata of every game, for both sentiment splits.
for appname in neg_models:
    create_topic_2d_embeddings(neg_models[appname], neg_metadata[appname])
    create_topic_2d_embeddings(pos_models[appname], pos_metadata[appname])

In [None]:
# Spot-check the 2D topic embeddings that create_topic_2d_embeddings stored for one game.
# The previous lookup used an empty-string key, which always raised KeyError.
neg_metadata['Call of Duty: Mobile']['topic_embeddings_2d']

In [15]:
import pickle

# Write the loaded (and now 2D-embedding-enriched) dictionaries back to the
# same files they were read from. Paths match the load cell exactly: the
# doubled slash in the negative data path is removed, as is the duplicate import.
with open('../results/saved_data/negative_themes/negative_data.pkl', 'wb') as f:
    pickle.dump(neg_metadata, f)

with open('../results/saved_models/negative_themes/negative_models.pkl', 'wb') as f:
    pickle.dump(neg_models, f)

with open('../results/saved_data/positive_themes/positive_data.pkl', 'wb') as f:
    pickle.dump(pos_metadata, f)

with open('../results/saved_models/positive_themes/positive_models.pkl', 'wb') as f:
    pickle.dump(pos_models, f)

## Save and Load BERTopic Model

In [None]:
# Save the topic model with safetensors serialization, including its c-TF-IDF
# and the name of the sentence-transformers embedding model.
# NOTE(review): `topic_model` is never assigned at notebook level in the cells
# shown here (it only exists as a function-local / dict value) — confirm which
# fitted model this is meant to save before relying on this cell.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("../models/topic_model_wot", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

In [None]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-transformer that is handed to BERTopic.load below.
loaded_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the model stored under ../models/topic_model_wot/ and attach the embedding model to it.
loaded_model = BERTopic.load("../models/topic_model_wot/", embedding_model=loaded_embedding_model)

In [None]:
# Keep only the reviews from the latest month bucket of the World of Tanks Blitz subset.
temp_df = wot_df_timestamp.loc[
    wot_df_timestamp['month'] == max(wot_df_timestamp['month'])
]

In [None]:
# Assign the selected reviews to topics with the reloaded model.
topics, probs  = loaded_model.transform(temp_df['review'].values)

In [None]:
# Display the reloaded model's topic overview table.
loaded_model.get_topic_info()

In [None]:
# Render the topic hierarchy of the reloaded model, using its custom topic labels.
loaded_model.visualize_hierarchy(custom_labels=True)