In [1]:
from bertopic import BERTopic
import pandas as pd

# Functions

In [2]:
def visualize_version(model, df, class_model, versions, threshold=1, width=1200, height=600, remove_outliers=False, sort_yaxes=False):
    """
    Plot the topic frequencies of the selected app versions.

    Parameters:
        model:           fitted topic model; must provide visualize_topics_per_class()
        df:              review dataset (not used by the plot; kept so existing calls keep working)
        class_model:     class model created using the topic_model.topics_per_class() method
        versions:        array object with the version names
        threshold:       remove topics with freq below threshold (a falsy value disables the filter)
        width:           width of the graph
        height:          height of the graph
        remove_outliers: if True, drop the rows of topic -1 before plotting
        sort_yaxes:      if True, order the y-axis categories as given in `versions`
                         instead of by total, descending

    Returns:
        The figure returned by model.visualize_topics_per_class(), with margins,
        size and y-axis ordering applied.
    """
    # Work on a dedicated local rather than rebinding the `df` parameter.
    selected = class_model[class_model['Class'].isin(versions)].sort_values(by=['Frequency'], ascending=False)
    if threshold:
        selected = selected[selected['Frequency'] >= threshold]
    if remove_outliers:
        selected = selected[selected['Topic'] != -1]

    # dict.fromkeys de-duplicates while keeping the highest-frequency-first order.
    topic_list = list(dict.fromkeys(selected['Topic']))

    # Call the method on the instance instead of through the class.
    graph = model.visualize_topics_per_class(selected, topics=topic_list)

    graph.update_layout(margin=dict(l=20, r=20, t=20, b=20), width=width, height=height)

    if sort_yaxes:
        graph.update_yaxes(categoryorder='array', categoryarray=versions)
    else:
        graph.update_yaxes(categoryorder='total descending')

    return graph

In [3]:
def get_documents_version(model, df, version, topic_num, connect='content_corrected'):
    """
    Return the original review texts of one app version assigned to one topic.

    Parameters:
        model:      fitted topic model; must provide get_document_info()
        df:         review dataset with the columns 'content', 'content_corrected',
                    'reviewCreatedVersion' and the column named by `connect`
        version:    value of 'reviewCreatedVersion' to select
        topic_num:  topic id to select
        connect:    column of df that must equal the model's 'Document' value
                    for a row to be kept

    Returns:
        list with the 'content' value of every matching review, in df order.
    """
    # Pair document info and reviews row by row. Joining on the review text
    # instead is many-to-many: identical texts would be multiplied and matched
    # across versions.
    # NOTE(review): relies on get_document_info returning one row per input
    # document, in input order - confirm for the installed BERTopic version.
    doc_info = model.get_document_info(df['content_corrected']).reset_index(drop=True)
    reviews = df.reset_index(drop=True)

    keep = (
        (doc_info['Document'] == reviews[connect])
        & (reviews['reviewCreatedVersion'] == version)
        & (doc_info['Topic'] == topic_num)
    )

    return list(reviews.loc[keep, 'content'])

In [4]:
def get_document_info_expand(model, df):
    """
    Return the model's document info with the review columns of df attached.

    Parameters:
        model:  fitted topic model; must provide get_document_info()
        df:     review dataset with a 'content_corrected' column

    Returns:
        DataFrame with one row per row of df: the document-info columns first,
        followed by the columns of df (pandas adds _x/_y suffixes to column
        names present on both sides).
    """
    # Merge row by row on position. Merging on the review text is many-to-many
    # and multiplies every text that occurs more than once.
    # NOTE(review): relies on get_document_info returning one row per input
    # document, in input order - confirm for the installed BERTopic version.
    doc_info = model.get_document_info(df['content_corrected']).reset_index(drop=True)
    reviews = df.reset_index(drop=True)

    return pd.merge(doc_info, reviews, left_index=True, right_index=True)

# Load data

In [5]:
# Preprocessed (v4) review export of each app. Plain string literals: the paths
# contain no placeholders, so the f-prefix was dropped.
df_netflix = pd.read_csv('preprocessed_data/prep_netflix_v4.csv')
df_youtube = pd.read_csv('preprocessed_data/prep_youtube_v4.csv')
df_whatsapp = pd.read_csv('preprocessed_data/prep_whatsapp_v4.csv')
df_paypal = pd.read_csv('preprocessed_data/prep_paypal_v4.csv')
# NOTE(review): the saved output of this cell echoes the df_amazon line, i.e. a
# warning was raised here - presumably a pandas DtypeWarning; confirm and pin
# `dtype` if so.
df_amazon = pd.read_csv('preprocessed_data/prep_amazon_v4.csv')

  df_amazon = pd.read_csv(f'preprocessed_data/prep_amazon_v4.csv')


# Load old models

In [6]:
# First-generation (v1) HDBSCAN topic models, one per app. The
# "Load new models" cell further down rebinds these same five names.
(
    topic_model_netflix,
    topic_model_youtube,
    topic_model_whatsapp,
    topic_model_paypal,
    topic_model_amazon,
) = map(
    BERTopic.load,
    (
        'models/topicmodel_netflix_hdbscan_v1.model',
        'models/topicmodel_youtube_hdbscan_v1.model',
        'models/topicmodel_whatsapp_hdbscan_v1.model',
        'models/topicmodel_paypal_hdbscan_v1.model',
        'models/topicmodel_amazon_hdbscan_v1.model',
    ),
)

  return _core.array(a, dtype, False, order)
  return _core.array(a, dtype, False, order)


# Load new models (Outliers reduced)

In [6]:
# Second-generation (v2) HDBSCAN topic models, one per app; these are the
# models the analysis cells below operate on.
(
    topic_model_netflix,
    topic_model_youtube,
    topic_model_whatsapp,
    topic_model_paypal,
    topic_model_amazon,
) = map(
    BERTopic.load,
    (
        'models/topicmodel_netflix_hdbscan_v2.model',
        'models/topicmodel_youtube_hdbscan_v2.model',
        'models/topicmodel_whatsapp_hdbscan_v2.model',
        'models/topicmodel_paypal_hdbscan_v2.model',
        'models/topicmodel_amazon_hdbscan_v2.model',
    ),
)

# Visualization

## Netflix Version Specific Analysis

In [None]:
# One class label (the app version) per review, plus the distinct versions
# in ascending numeric order.
classes_complete_netflix = df_netflix['reviewCreatedVersion'].tolist()
versions = sorted(
    set(classes_complete_netflix),
    # Only the dotted part before the first space is numeric,
    # e.g. '8.64.0 build 8 50394' -> [8, 64, 0].
    key=lambda label: [int(part) for part in label.split(' ')[0].split('.')],
)

In [8]:
# Topic representation per app version, over all Netflix reviews.
topics_per_class_netflix = topic_model_netflix.topics_per_class(
    df_netflix['content_corrected'],
    classes=classes_complete_netflix,
)

442it [00:18, 24.09it/s]


In [None]:
# Document info of the Netflix model joined with the review columns.
df_info_netflix = get_document_info_expand(
    model=topic_model_netflix,
    df=df_netflix,
)

In [10]:
# Topic frequencies of a single Netflix version, outlier topic included.
visualize_version(
    model=topic_model_netflix,
    df=df_info_netflix,
    class_model=topics_per_class_netflix,
    versions=['8.64.0 build 8 50394'],
    remove_outliers=False,
)

In [None]:
# Reviews of that Netflix version that were assigned to topic 4.
get_documents_version(
    model=topic_model_netflix,
    df=df_netflix,
    version='8.64.0 build 8 50394',
    topic_num=4,
)

## YouTube Version Specific Analysis

In [11]:
# One class label (the app version) per review, plus the distinct versions
# in ascending numeric order.
classes_complete_youtube = df_youtube['reviewCreatedVersion'].tolist()
versions = sorted(
    set(classes_complete_youtube),
    key=lambda label: [int(part) for part in label.split('.')],
)

In [None]:
# Topic representation per app version, over all YouTube reviews.
topics_per_class_youtube = topic_model_youtube.topics_per_class(
    df_youtube['content_corrected'],
    classes=classes_complete_youtube,
)

In [13]:
# Document info of the YouTube model joined with the review columns.
df_info_youtube = get_document_info_expand(
    model=topic_model_youtube,
    df=df_youtube,
)

In [14]:
# Topic frequencies of a single YouTube version.
visualize_version(
    model=topic_model_youtube,
    df=df_info_youtube,
    class_model=topics_per_class_youtube,
    versions=['18.14.37'],
)

In [None]:
# Reviews of that YouTube version that were assigned to topic 11.
get_documents_version(
    model=topic_model_youtube,
    df=df_youtube,
    version='18.14.37',
    topic_num=11,
)

## WhatsApp Version Specific Analysis

In [15]:
# One class label (the app version) per review, plus the distinct versions
# in ascending numeric order.
classes_complete_whatsapp = df_whatsapp['reviewCreatedVersion'].tolist()
versions = sorted(
    set(classes_complete_whatsapp),
    key=lambda label: [int(part) for part in label.split('.')],
)

In [None]:
# Topic representation per app version, over all WhatsApp reviews.
topics_per_class_whatsapp = topic_model_whatsapp.topics_per_class(
    df_whatsapp['content_corrected'],
    classes=classes_complete_whatsapp,
)

In [17]:
# Document info of the WhatsApp model joined with the review columns.
df_info_whatsapp = get_document_info_expand(
    model=topic_model_whatsapp,
    df=df_whatsapp,
)

In [18]:
# Topic frequencies of a single WhatsApp version.
visualize_version(
    model=topic_model_whatsapp,
    df=df_info_whatsapp,
    class_model=topics_per_class_whatsapp,
    versions=['2.23.7.14'],
)

In [None]:
# Reviews of that WhatsApp version that were assigned to topic 0.
get_documents_version(
    model=topic_model_whatsapp,
    df=df_whatsapp,
    version='2.23.7.14',
    topic_num=0,
)

## PayPal Version Specific Analysis

In [19]:
# One class label (the app version) per review, plus the distinct versions
# in ascending numeric order.
classes_complete_paypal = df_paypal['reviewCreatedVersion'].tolist()
versions = sorted(
    set(classes_complete_paypal),
    key=lambda label: [int(part) for part in label.split('.')],
)

In [None]:
# Topic representation per app version, over all PayPal reviews.
topics_per_class_paypal = topic_model_paypal.topics_per_class(
    df_paypal['content_corrected'],
    classes=classes_complete_paypal,
)

In [21]:
# Document info of the PayPal model joined with the review columns.
df_info_paypal = get_document_info_expand(
    model=topic_model_paypal,
    df=df_paypal,
)

In [22]:
# Topic frequencies of a single PayPal version.
visualize_version(
    model=topic_model_paypal,
    df=df_info_paypal,
    class_model=topics_per_class_paypal,
    versions=['8.37.1'],
)

In [None]:
# Reviews of that PayPal version that were assigned to topic -1.
get_documents_version(
    model=topic_model_paypal,
    df=df_paypal,
    version='8.37.1',
    topic_num=-1,
)

## Amazon Version Specific Analysis

In [23]:
# One class label (the app version) per review, plus the distinct versions
# in ascending numeric order.
classes_complete_amazon = df_amazon['reviewCreatedVersion'].tolist()
versions = sorted(
    set(classes_complete_amazon),
    key=lambda label: [int(part) for part in label.split('.')],
)

In [None]:
# Topic representation per app version, over all Amazon reviews.
topics_per_class_amazon = topic_model_amazon.topics_per_class(
    df_amazon['content_corrected'],
    classes=classes_complete_amazon,
)

In [25]:
# Document info of the Amazon model joined with the review columns.
df_info_amazon = get_document_info_expand(
    model=topic_model_amazon,
    df=df_amazon,
)

In [26]:
# Topic frequencies of a single Amazon version.
visualize_version(
    model=topic_model_amazon,
    df=df_info_amazon,
    class_model=topics_per_class_amazon,
    versions=['26.8.0.100'],
)

In [None]:
# Reviews of that Amazon version that were assigned to topic 5.
get_documents_version(
    model=topic_model_amazon,
    df=df_amazon,
    version='26.8.0.100',
    topic_num=5,
)