In [1]:
import numpy as np
from sklearn.metrics import f1_score, recall_score, accuracy_score
from collections import defaultdict
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

### Data file path

In [2]:
# Location of the page-feedback CSV export: directory first, file name second.
data_path = (
    "/content/drive/MyDrive/Colab Notebooks/Capstone_govt_of_canada/data/"
    "Page feedback-Travel-May17.csv"
)

### Read the CSV file and keep only English rows with confirmed tags and no missing values

In [3]:
# Load the raw feedback export.
df = pd.read_csv(data_path)
# Build both row conditions on the same frame and combine them with `&`.
# The previous chained form, df[mask_a][mask_b], applied a mask computed on the
# unfiltered frame to an already-filtered one, so pandas had to reindex the
# mask and warned about it. The selected rows are the same.
# After filtering, drop every row that still contains a missing value.
df = df[(df['Tags confirmed'] == 'checked') & (df['Lang'] == 'EN')].dropna()
# Alternative left disabled: df = df.replace(np.nan, "?")

### Split the data into train and test

In [4]:
# Hold out 20% of the filtered rows as a dev set, using a fixed random_state.
train_df, dev_df = train_test_split(
    df,
    random_state=11,
    test_size=0.2,
)
# Leave the training frame as the final expression so the cell renders it.
train_df

Unnamed: 0,Unique ID,Date,URL,Page title,Comment,Tags,Refining details,Status,What's wrong,Lang,Tags confirmed
4300,605548020725e7267433a9ce,"20 March, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Mandatory quarantine or isolation – Travel res...,I want to know what I need to go from USA stra...,Restrictions or Requirements,Transit,New,The answer I need is missing,EN,checked
8825,603fc37ea178c1115c685ce0,"3 March, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Mandatory quarantine or isolation – Travel res...,Do I need to have a covid nasal test to travel...,Restrictions or Requirements,Have been vaccinated,New,The information isn’t clear,EN,checked
12078,6030e4e4ff2ed3160c929f35,"20 February, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Find out if you can travel to Canada - Citizen...,I am Canadian citizen returning after seeing m...,Can I enter Canada,Compassionate,New,The answer I need is missing,EN,checked
10811,60364fb0be856917345675f1,"24 February, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Mandatory quarantine or isolation – Travel res...,If i am vaccinated against covid should i make...,Quarantine,Have been vaccinated,New,The answer I need is missing,EN,checked
4908,60528bee23caed19c0880950,"17 March, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Provincial and territorial restrictions - Trav...,A summary of travel restrictions of who or who...,Design / content,Just main topic,New,Other reason,EN,checked
...,...,...,...,...,...,...,...,...,...,...,...
1689,607c2f1229c84e0a18921de3,"18 April, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Driving to Canada requirements checklist – Tr...,I need to know \r\n\r\nWhy if I live alone I h...,Hotels,Just main topic,New,I can't find the information,EN,checked
5257,604f0b9023caed19c087fbde,"15 March, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Find out if you can travel to Canada - Foreign...,Visiting Canada for business partnerships. Sen...,Can I enter Canada,"""Foreigners, work permit, CoPR""",New,The answer I need is missing,EN,checked
9933,603a57a4e70c3811d8519fd2,"27 February, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Mandatory quarantine or isolation – Travel res...,Question: 1) I will be by Moderna vaccines bef...,Hotels,Have been vaccinated,New,The answer I need is missing,EN,checked
6976,60476c0aab4a361274ea75c7,"9 March, 2021",travel.gc.ca/travel-covid/travel-restrictions/...,Find out if you can travel to Canada - Citizen...,I'm Karan Verma with my mother Meena Verma wit...,Can I enter Canada,"""Foreigners, work permit, CoPR""",New,Other reason,EN,checked


### Install and import BERTopic (please restart the runtime after installing)

In [5]:
!pip install bertopic



In [6]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

##### Read the comment column as docs list

In [7]:
# Collect the comments as a plain Python list of strings. Both
# SentenceTransformer.encode and BERTopic.fit_transform document a list of
# strings as their input, whereas `.values` handed them a NumPy object array.
docs = df['Comment'].tolist()

#### Load the model and create embeddings from docs

In [8]:
# Sentence-embedding model, selected by its pretrained model name.
sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
# Encode every document up front so the vectors can be handed to the topic model.
embeddings = sentence_model.encode(
    sentences=docs,
    show_progress_bar=True,
)

HBox(children=(FloatProgress(value=0.0, max=244733649.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Batches', max=289.0, style=ProgressStyle(description_widt…




### Fit the topic model on the docs and their precomputed embeddings

In [9]:
# Topic-model configuration.
# Options left disabled in this run: nr_topics=22, n_neighbors=15, n_components=5.
topic_model = BERTopic(
    min_topic_size=30,
    n_gram_range=(1, 2),
    top_n_words=20,
    calculate_probabilities=True,
    verbose=True,
)
# Fit on the documents together with the embeddings computed earlier, and keep
# the per-document topic assignments and probabilities that come back.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

2021-05-28 22:19:03,716 - BERTopic - Reduced dimensionality with UMAP
2021-05-28 22:19:04,910 - BERTopic - Clustered UMAP embeddings with HDBSCAN


### Add the clustering of topics to the dataframe

In [10]:
# Attach the topic id assigned to each comment as a `topic` column.
df = df.assign(topic=topics)

#### Output the dataframe to an Excel file

In [None]:
# Write the dataframe, including the `topic` column, to a workbook in the working directory.
df.to_excel(excel_writer="output.xlsx")

#### Get the topic information

In [11]:
# Summary table of the fitted topics; the output below has one row per topic
# with its Topic id, Count and Name.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,1,4524,1_canada_in_and_from
1,-1,1454,-1_the_for_test_have
2,20,593,20_not_no_information_is
3,26,325,26_vaccine_if_have_vaccinated
4,3,325,3_days_hotel_stay_day
5,25,231,25_where_how_looking_test
6,7,214,7_hotel_quarantine_stay_the hotel
7,23,203,23_my_and_she_her
8,27,141,27_vaccinated_vaccinated what_what_been vaccin...
9,19,134,19_not_this_it_but


In [12]:
# Full topic representation: as the output below shows, a dict keyed by topic
# id whose values are lists of (term, weight) tuples.
topic_model.get_topics()

{-1: [('the', 0.01439515411893185),
  ('for', 0.013901189664463158),
  ('test', 0.013868984770863808),
  ('have', 0.01329424551332015),
  ('quarantine', 0.012657740177577828),
  ('and', 0.012620845505386313),
  ('if', 0.01233754889849972),
  ('is', 0.012264025242369803),
  ('my', 0.011309547601893837),
  ('do', 0.010632052274810152),
  ('are', 0.010419790837814212),
  ('of', 0.010175560807168233),
  ('need', 0.010086051375793196),
  ('what', 0.009829660215373974),
  ('hotel', 0.009805572904011057),
  ('covid', 0.009787016265943698),
  ('not', 0.009569699767282237),
  ('can', 0.009241065102942399),
  ('be', 0.008817763450297262),
  ('from', 0.008716285884729903),
  ('it', 0.008483828537912807),
  ('on', 0.008384121674597693),
  ('travel', 0.008381543133110963),
  ('in', 0.008159540492408355),
  ('we', 0.008064688332717226),
  ('need to', 0.008025620188057515),
  ('that', 0.007985390298263407),
  ('you', 0.007985175464638297),
  ('with', 0.007912853438006892),
  ('vaccinated', 0.00770289

### Plot the distribution of the target (`Tags`) within each topic

In [13]:
# Altair is the charting library used for the plot in the next cell.
import altair as alt
# Turn off Altair's max-rows limit on chart data so the whole of `df` can be plotted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [14]:
# One bar chart of `Tags` counts per topic, laid out four facets to a row.
(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('Tags'),
        y=alt.Y('count()'),
        color=alt.Color('Tags'),
        tooltip=alt.Tooltip('Tags'),
    )
    .facet('topic', columns=4)
)

# alt.Chart(df).mark_bar().encode(
#     alt.X('Tags'),
#     y='count()')