## **Import Libraries**

In [1]:
import datetime
import pandas as pd
import numpy as np
import os
import openai
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import OpenAI
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def transform_date_to_monthly_date(date):
    """Snap a ``YYYY-MM-DD`` date string to the first day of its month.

    Args:
        date: Date string in ``%Y-%m-%d`` format.

    Returns:
        The first day of the same month, formatted as ``%Y-%m-%d``.

    Raises:
        ValueError: If ``date`` does not match ``%Y-%m-%d``.
    """
    iso_format = "%Y-%m-%d"
    parsed = datetime.datetime.strptime(date, iso_format)
    # Rebuild the timestamp with the day pinned to 1; year and month are kept.
    month_start = datetime.datetime(parsed.year, parsed.month, 1)
    return month_start.strftime(iso_format)

## Data collection and split into positive and negative reviews

In [None]:
# Load the game reviews and tag every row with the first day of its month
df = pd.read_csv('../data/Mid_DS_NLP.csv')
df['month'] = df['dt'].apply(transform_date_to_monthly_date)

# Every distinct game present in the data set
available_games = df['app_name'].unique()

# Ratings 1-2 form the negative subset and ratings 4-5 the positive one;
# any other rating value ends up in neither subset.
negative_df = df.loc[df['rating'].isin([1, 2])]
positive_df = df.loc[df['rating'].isin([4, 5])]

## **Feature Extraction**

In [None]:
def get_embeddings(temp_df):
    """Encode the reviews of ``temp_df`` into sentence embeddings up front.

    Args:
        temp_df: DataFrame with a 'review' column holding the documents.

    Returns:
        Tuple ``(embedding_model, embeddings)``: the loaded
        "all-MiniLM-L6-v2" SentenceTransformer and the result of encoding
        the 'review' column with it.
    """
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    review_texts = temp_df['review'].values
    return encoder, encoder.encode(review_texts, show_progress_bar=True)

def get_umap(n_neighbors=15,n_components=5, min_dist=0.0, metric='cosine', random_state=42):
    """Create the UMAP model that shrinks the embeddings before clustering.

    Every argument is forwarded unchanged to the ``UMAP`` constructor.

    Returns:
        An unfitted ``UMAP`` instance.
    """
    return UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=min_dist,
        metric=metric,
        random_state=random_state,
    )

def get_hdbscan_model(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True):
    """Create the HDBSCAN clusterer that groups the reduced embeddings into topics.

    Every argument is forwarded unchanged to the ``HDBSCAN`` constructor.

    Returns:
        An unfitted ``HDBSCAN`` instance.
    """
    return HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric=metric,
        cluster_selection_method=cluster_selection_method,
        prediction_data=prediction_data,
    )

def get_vectorizer(ngram_range=(1, 2), min_df=1):
    """Create the CountVectorizer used for the default topic representation.

    English stop words are removed and word n-grams within ``ngram_range``
    are counted.

    Args:
        ngram_range: ``(min_n, max_n)`` bounds of the n-grams to extract.
        min_df: Minimum document frequency forwarded to ``CountVectorizer``.
            The default of 1 keeps every term, which is what this helper has
            always done; pass a larger value to drop infrequent words.

    Returns:
        An unfitted ``CountVectorizer``.
    """
    return CountVectorizer(stop_words="english", ngram_range=ngram_range, min_df=min_df)

def get_representation_model():
    """Create the representation models that turn topics into readable labels.

    The OpenAI API key is read from the ``openai_key`` environment variable
    and set on the ``openai`` module before the representation is built.

    Returns:
        dict with a single "OpenAI" entry: a GPT-3.5 chat representation
        prompted to answer with a label of at most 5 words.

    Raises:
        KeyError: If the ``openai_key`` environment variable is not set.
    """
    openai.api_key = os.environ['openai_key']

    # [DOCUMENTS] and [KEYWORDS] are placeholder tags left in the prompt text.
    label_prompt = """
    I have a topic that contains the following documents:
    [DOCUMENTS]
    The topic is described by the following keywords: [KEYWORDS]

    Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
    topic: <topic label>
    """

    # GPT-3.5 is the only representation model in use.
    return {
        "OpenAI": OpenAI(model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=label_prompt),
    }

def create_topic_model_embeddings(temp_df):
    """Embed the reviews and assemble a BERTopic model around the sub-models.

    The model is only constructed here; nothing in this function fits it.

    Args:
        temp_df: DataFrame with a 'review' column holding the documents.

    Returns:
        dict with
            'topic_model': the configured ``BERTopic`` instance, and
            'embeddings': a dict holding the 'embedding_model' and the
            pre-computed 'embeddings' of the reviews.
    """
    embedding_model, embeddings = get_embeddings(temp_df)

    topic_model = BERTopic(
        # Pipeline models, each built with its default settings
        embedding_model=embedding_model,
        umap_model=get_umap(),
        hdbscan_model=get_hdbscan_model(),
        vectorizer_model=get_vectorizer(),
        representation_model=get_representation_model(),
        # Hyperparameters
        top_n_words=10,
        verbose=True,
    )

    embedding_bundle = {'embedding_model': embedding_model, 'embeddings': embeddings}
    return {'topic_model': topic_model, 'embeddings': embedding_bundle}

def topic_modeling_labeling(temp_df,colname,topic_model,embeddings):
    """Fit the topic model, apply the OpenAI labels and compute topics over time.

    Args:
        temp_df: DataFrame holding the documents in ``colname`` and their
            timestamps in the 'dt' column.
        colname: Name of the column that contains the review text.
        topic_model: Topic model to fit; its "OpenAI" topic aspect supplies
            the custom labels.
        embeddings: Pre-computed document embeddings passed to ``fit_transform``.

    Returns:
        Tuple ``(topics, probs, topics_over_time)``.
    """
    reviews = temp_df[colname].values
    topics, probs = topic_model.fit_transform(reviews, embeddings)

    # One label per topic: the first field of every aspect entry, joined by " | ".
    chatgpt_topic_labels = {}
    for topic, values in topic_model.topic_aspects_["OpenAI"].items():
        first_fields = list(zip(*values))[0]
        chatgpt_topic_labels[topic] = " | ".join(first_fields)
    # Topic -1 always gets a fixed name, whatever label was generated for it.
    chatgpt_topic_labels[-1] = "Outlier Topic"
    topic_model.set_topic_labels(chatgpt_topic_labels)

    # Dynamic topic modelling: the 'dt' timestamps are grouped into 20 bins.
    timestamps = temp_df['dt'].values
    topics_over_time = topic_model.topics_over_time(reviews, timestamps, nr_bins=20)

    return topics, probs, topics_over_time


def prepare_model_attributes(temp_df,appnames):
    """Fit, label and summarise a topic model for the reviews of one app.

    Args:
        temp_df: Reviews of a single app. The 'review' column is embedded and
            modeled; ``topic_modeling_labeling`` also reads the 'dt' column.
        appnames: App name, used only in the progress messages.

    Returns:
        dict with two entries:
            'saved_results': dict holding 'topics_over_time',
                'reduced_embeddings', 'data' (a copy of ``temp_df`` extended
                with the 'Topic' and 'Probability' columns) and
                'summarised_topics'.
            'saved_models': the fitted topic model.
    """
    print(f"Start topic modeling for {appnames} \n Shape: {temp_df.shape}")
    print("----------------------------------------------------------------")

    # The frames passed in are filtered slices of the full review table.
    # Adding columns to such a slice triggers pandas' SettingWithCopyWarning
    # and changes the frame the caller still holds, so annotate a private copy.
    temp_df = temp_df.copy()

    result = create_topic_model_embeddings(temp_df)
    print('Create topic model embeddings - Done')

    topic_model = result['topic_model']
    embeddings = result['embeddings']['embeddings']
    # 2D projection of the document embeddings. It is computed independently
    # of the topic model; no random_state is set, so it can differ between runs.
    reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
    print('Create reduced embeddings - Done')

    topics, probs, topics_over_time = topic_modeling_labeling(temp_df, 'review', topic_model, embeddings)
    print('Create topic labeling - Done')

    temp_df['Topic'] = topics
    temp_df['Probability'] = probs

    summarised_topics = topic_model.get_topic_info()[['Topic', 'CustomName', 'Representation']]
    print('Create summarised topics - Done')

    print("Topics: ", summarised_topics)

    print(f"Finished topic modeling for {appnames}")
    print("----------------------------------------------------------------\n\n")
    return {
        'saved_results': {
            'topics_over_time': topics_over_time,
            'reduced_embeddings': reduced_embeddings,
            'data': temp_df,
            'summarised_topics': summarised_topics,
        },
        'saved_models': topic_model,
    }


## Topic Modeling and Labeling for Game App Reviews

**Objective**:
Generate a random sample of 13,000 reviews over time for each game app to create a high-performance generative model with efficient local execution.

**Key Steps**:

 1. **Topic Modeling**:
     - Employ advanced techniques for topic modeling to extract meaningful themes from the game app reviews.
     - Utilize algorithms that enhance the identification of key topics within the large dataset. 
 2. **Labeling**:
     - Implement a robust labeling system to categorize reviews into relevant topics.
     - Ensure the accuracy of labels for training a precise generative model.
 3. **Data Sampling**:
    - Randomly sample 13,000 reviews per game app over different time periods to capture the evolution of sentiments and themes.
    - Consider stratified sampling to maintain representation across various app versions and user demographics.
 4. **Generative Model Performance**:
    - Evaluate and choose a generative model that excels in execution time on a local environment.
    - Optimize parameters for enhanced performance without compromising model accuracy.

### **Negative reviews per Game**

In [None]:
# Negative reviews of every available game, keyed by game name
negative_appname_df = {
    game: negative_df.loc[negative_df['app_name'] == game]
    for game in available_games
}
# Cap each game at 13000 randomly drawn reviews (fixed seed); games with
# 13000 reviews or fewer keep all of them.
negative_appname_df_sample = {
    appnames: (
        game_reviews
        if len(game_reviews) <= 13000
        else game_reviews.sample(n=13000, random_state=42)
    )
    for appnames, game_reviews in negative_appname_df.items()
}

In [None]:
# Per-app modeling results for the negative reviews, keyed by app name.
# The modeling loop skips apps that already have an entry here.
negative_themes_results = {}

In [None]:
# Run topic modeling for every game that has no stored result yet
for appnames in negative_appname_df_sample:
    if appnames in negative_themes_results:
        continue
    negative_themes_results[appnames] = prepare_model_attributes(negative_appname_df_sample[appnames], appnames)

In [None]:
# Separate the plain result data from the fitted models, both keyed by game
neg_save_results = {
    games: outcome['saved_results']
    for games, outcome in negative_themes_results.items()
}
neg_save_models = {
    games: outcome['saved_models']
    for games, outcome in negative_themes_results.items()
}

### **Positive themes per Game**

In [None]:
# Positive reviews of every available game, keyed by game name
positive_appname_df = {
    game: positive_df.loc[positive_df['app_name'] == game]
    for game in available_games
}
# Cap each game at 10000 randomly drawn reviews (fixed seed); games with
# 10000 reviews or fewer keep all of them.
positive_appname_df_sample = {
    appnames: (
        game_reviews
        if len(game_reviews) <= 10000
        else game_reviews.sample(n=10000, random_state=42)
    )
    for appnames, game_reviews in positive_appname_df.items()
}

In [None]:
# Per-app modeling results for the positive reviews, keyed by app name.
# The modeling loop skips apps that already have an entry here.
positive_themes_results = {}

In [None]:
# Run topic modeling for every game that has no stored result yet
for appnames in positive_appname_df_sample:
    if appnames in positive_themes_results:
        continue
    positive_themes_results[appnames] = prepare_model_attributes(positive_appname_df_sample[appnames], appnames)

In [None]:
# Separate the plain result data from the fitted models, both keyed by game
pos_save_results = {
    games: outcome['saved_results']
    for games, outcome in positive_themes_results.items()
}
pos_save_models = {
    games: outcome['saved_models']
    for games, outcome in positive_themes_results.items()
}

## Create 2D topic embeddings for positive and negative reviews

In [13]:
def create_topic_2d_embeddings(temp_model,temp_metadata):
    """Project a model's topic embeddings to 2D and store them in the metadata.

    Two entries are added to ``temp_metadata`` in place:
    'topic_embeddings_2d' (UMAP projection of the embeddings of every topic
    except the outlier topic -1) and 'topic_embeddings' (the model's complete
    ``topic_embeddings_`` array).

    Args:
        temp_model: Fitted topic model providing ``get_topic_freq()`` and
            ``topic_embeddings_``.
        temp_metadata: Result dict of the same app; its 'summarised_topics'
            frame must contain a 'Topic' column. Modified in place.

    Returns:
        None.
    """
    all_topics = temp_metadata['summarised_topics']['Topic'].values.tolist()

    # Exclude the outlier topic by id. Dropping the first row by position
    # would throw away a real topic whenever the model produced no -1 topic.
    topics = sorted(
        topic for topic in temp_model.get_topic_freq().Topic.to_list() if topic != -1
    )
    # Row lookup assumes topic_embeddings_ follows the row order of
    # 'summarised_topics' -- TODO confirm against the BERTopic version in use.
    # dtype=int keeps an empty topic list usable as an index array.
    indices = np.array([all_topics.index(topic) for topic in topics], dtype=int)
    embeddings = temp_model.topic_embeddings_[indices]
    embeddings = UMAP(n_neighbors=2, n_components=2, metric='cosine', random_state=42).fit_transform(embeddings)

    temp_metadata['topic_embeddings_2d'] = embeddings
    temp_metadata['topic_embeddings'] = temp_model.topic_embeddings_

In [None]:
# neg_save_models / neg_save_results (and the positive pair) are keyed directly
# by app name; the 'saved_models' / 'saved_results' level was already unpacked
# when those dicts were built, so indexing by it again raises KeyError.
for appname in neg_save_models:
    create_topic_2d_embeddings(neg_save_models[appname], neg_save_results[appname])
    create_topic_2d_embeddings(pos_save_models[appname], pos_save_results[appname])

## **Save models**

In [None]:
import pickle

# (destination path, object to store) pairs, written in this order:
# negative data, negative models, positive data, positive models.
pickle_targets = [
    ('../results/saved_data/negative_themes/negative_data.pkl', neg_save_results),
    ('../results/saved_models/negative_themes/negative_models.pkl', neg_save_models),
    ('../results/saved_data/positive_themes/positive_data.pkl', pos_save_results),
    ('../results/saved_models/positive_themes/positive_models.pkl', pos_save_models),
]

for target_path, payload in pickle_targets:
    # Binary write mode is required for pickle output
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)