# Proof Of Concept

## Pre-Requisites

In [1]:
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [58]:
import plotly.io as pio
import pandas as pd
import nbformat

## Data

In [18]:
df = pd.read_csv("../data/netflix_reviews.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113626 entries, 0 to 113625
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   reviewId              113626 non-null  object
 1   userName              113624 non-null  object
 2   content               113624 non-null  object
 3   score                 113626 non-null  int64 
 4   thumbsUpCount         113626 non-null  int64 
 5   reviewCreatedVersion  96986 non-null   object
 6   at                    113626 non-null  object
 7   appVersion            96986 non-null   object
dtypes: int64(2), object(6)
memory usage: 6.9+ MB


In [19]:
df = df.dropna(subset=['content'])
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113624 entries, 0 to 113623
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   reviewId              113624 non-null  object
 1   userName              113622 non-null  object
 2   content               113624 non-null  object
 3   score                 113624 non-null  int64 
 4   thumbsUpCount         113624 non-null  int64 
 5   reviewCreatedVersion  96985 non-null   object
 6   at                    113624 non-null  object
 7   appVersion            96985 non-null   object
dtypes: int64(2), object(6)
memory usage: 6.9+ MB


In [34]:
df_sample = df.sample(n=1000, random_state=42).reset_index(drop=True)
df_sample.info()
df_sample.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              1000 non-null   object
 1   userName              1000 non-null   object
 2   content               1000 non-null   object
 3   score                 1000 non-null   int64 
 4   thumbsUpCount         1000 non-null   int64 
 5   reviewCreatedVersion  868 non-null    object
 6   at                    1000 non-null   object
 7   appVersion            868 non-null    object
dtypes: int64(2), object(6)
memory usage: 62.6+ KB


Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion
0,1cf9ad0d-1d0b-4b7c-8b4c-d985e59f9ca1,Richard Law,It came pre-insralled I don't want it and it w...,1,0,,2021-09-26 13:57:00,
1,36f9a65a-88cc-431f-a915-4a72baed7ff1,s r,Needs a netflix lite app because this one no l...,3,1,8.7.0 build 9 40060,2021-11-23 17:24:31,8.7.0 build 9 40060
2,4ef70a8f-b049-4e59-a930-96fe0c28b965,Cameron Werner,They make you pay $7.50/month extra to use you...,1,3,8.96.1 build 16 50568,2023-12-31 04:13:27,8.96.1 build 16 50568
3,aa806eab-c919-48fd-a36e-21084b8cce91,Aliza Zaidi,"Good I guess, doesn't have all the great movie...",4,0,7.83.0 build 25 35223,2020-12-06 03:25:20,7.83.0 build 25 35223
4,8c3cca29-888c-478f-ace2-20c1c20757e3,Shiva Lr,Worst app ever app too bad noo clear voice no ...,1,0,8.58.0 build 8 50362,2024-03-19 10:36:45,8.58.0 build 8 50362


## Model

In [35]:
topic_model = BERTopic()
topics, _ = topic_model.fit_transform(df_sample['content'].tolist())

In [36]:
# Get topic information
topics_info = topic_model.get_topic_info()

# Display the topics
print("Topic Information:\n", topics_info)

Topic Information:
     Topic  Count                              Name  \
0      -1    304                  -1_the_to_it_and   
1       0    160                   0_app_and_to_it   
2       1     79               1_it_app_error_open   
3       2     59                2_video_the_fix_to   
4       3     46       3_language_in_hindi_kannada   
5       4     46         4_netflix_love_and_movies   
6       5     45           5_netflix_open_this_and   
7       6     32        6_seasons_shows_season_and   
8       7     32          7_movies_love_shows_good   
9       8     31               8_netflix_to_for_my   
10      9     30           9_payment_card_my_debit   
11     10     28             10_netflix_app_is_and   
12     11     23  11_brightness_video_control_dark   
13     12     21       12_account_my_email_netflix   
14     13     17           13_good_martin_opp_braw   
15     14     17     14_cast_chromecast_to_netflix   
16     15     16        15_log_account_password_in   
17     1

In [37]:
# Get the most frequent words for a specific topic
for topic in topics_info.head(5).Topic:
    print(f"\nTopic {topic}:")
    print(topic_model.get_topic(topic))



Topic -1:
[('the', 0.0351835014585878), ('to', 0.03218639660099444), ('it', 0.03085384492435402), ('and', 0.028787137138304893), ('netflix', 0.026363759893671315), ('is', 0.025500548186002135), ('you', 0.02298012472608654), ('for', 0.0223788431292308), ('of', 0.022307836586967306), ('but', 0.02202515226690454)]

Topic 0:
[('app', 0.05271274946562883), ('and', 0.04041159139001731), ('to', 0.03571332023972341), ('it', 0.033584747546953904), ('movies', 0.03352807145210893), ('the', 0.03306600212972079), ('this', 0.031512671994573825), ('watch', 0.03050937480556882), ('great', 0.02756344424251322), ('of', 0.026117770340250958)]

Topic 1:
[('it', 0.0555007384096586), ('app', 0.048808558661489455), ('error', 0.041614351777782596), ('open', 0.03893972676875555), ('the', 0.03550685774483105), ('update', 0.03261710848509054), ('and', 0.030281719014699834), ('this', 0.029918414395424756), ('to', 0.029282129784543646), ('not', 0.028841935778380096)]

Topic 2:
[('video', 0.047901964283749016), ('

In [65]:
# Display sample reviews for a specific topic
topic_reviews = df_sample.loc[topics == 1, 'content'].head(10)
print("Sample Reviews for Topic 1:\n", topic_reviews)

KeyError: False