In [None]:
# Create a function that returns the required locale encoding, i.e. UTF-8

import locale
def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally.

    The ``do_setlocale`` argument is accepted so the signature matches
    ``locale.getpreferredencoding``; its value is ignored.
    """
    encoding = "UTF-8"
    return encoding


# Swap the stdlib function for the stub above so that later calls to
# locale.getpreferredencoding() report UTF-8.
locale.getpreferredencoding = getpreferredencoding

In [None]:
!pip install bertopic
!pip install flair
!apt-get -qq install -y libfluidsynth1

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3

In [None]:
# Data processing
import pandas as pd
import numpy as np
# Text preprocessing
import nltk
# Fetch the NLTK resources, in the same order as before
for corpus_name in ('stopwords', 'omw-1.4', 'wordnet'):
    nltk.download(corpus_name)
# WordNet lemmatizer instance used by the preprocessing cell
wn = nltk.WordNetLemmatizer()
# Topic model
from bertopic import BERTopic
# Dimension reduction
from umap import UMAP
# Clustering
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
# Count vectorization
from sklearn.feature_extraction.text import CountVectorizer
# Sentence transformer
from sentence_transformers import SentenceTransformer
# Hugging Face pipeline and Flair embeddings
from transformers.pipelines import pipeline
from flair.embeddings import TransformerDocumentEmbeddings, WordEmbeddings, DocumentPoolEmbeddings, StackedEmbeddings

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Location of the articles CSV
ARTICLES_CSV_PATH = '/content/nytimes_articles.csv'
# Load the articles into a DataFrame and display it
df = pd.read_csv(ARTICLES_CSV_PATH)
df

Unnamed: 0,Article Title,Abstract,Lead_Paragraph
0,An A.I. Pioneer on What We Should Really Fear,“Some people naïvely think if we teach A.I. ‘D...,“Some people naïvely think if we teach A.I. ‘D...
1,The Case for Longtermism,We are living through an extraordinary and pre...,Imagine living the life of every human being w...
2,How Tech Giants Are Devising Real Ethics for A...,Four people involved in the creation of an ind...,"SAN FRANCISCO — For years, science-fiction mov..."
3,New Research Center to Explore Ethics of Artif...,"The president of Carnegie Mellon University, w...",Carnegie Mellon University plans to announce o...
4,Artificial Intelligence as a Threat,Smarter technology requires smarter humans to ...,Ebola sounds like the stuff of nightmares. Bir...
5,Study to Examine Effects of Artificial Intelli...,A study hosted by Stanford University will exa...,Scientists have begun what they say will be a ...
6,Artificial Intelligence Is Far From Matching H...,At an event sponsored by the Office of Science...,SEATTLE — Never mind Terminator-like killer ro...
7,Singapore’s Governing Framework for Artificial...,Could the island state’s new instrument provid...,
8,"Tech’s Ethical ‘Dark Side’: Harvard, Stanford ...",Schools that helped produce some of Silicon Va...,"PALO ALTO, Calif. — The medical profession has..."
9,Can Artificial Intelligence Keep Your Home Sec...,Security companies are hoping to harness the p...,Home security is expected to be a $47.5 billio...


In [None]:
# Get the dataset information: index range, column names, non-null counts,
# dtypes and memory usage (useful for spotting columns with missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Article Title   50 non-null     object
 1   Abstract        50 non-null     object
 2   Lead_Paragraph  48 non-null     object
dtypes: object(3)
memory usage: 1.3+ KB


In [None]:
# Load NLTK's default English stopword list (the actual removal happens in the
# preprocessing cell that follows)
stopwords = nltk.corpus.stopwords.words('english')
# Report how many stopwords there are and print the full list
print(f'There are {len(stopwords)} default stopwords. They are {stopwords}')

There are 179 default stopwords. They are ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'no

In [None]:
# Handle NaN values in the 'Lead_Paragraph' column by replacing them with an empty string
df['Lead_Paragraph'] = df['Lead_Paragraph'].fillna('')

# A set gives constant-time membership checks (the list was scanned once per token).
stopword_set = set(stopwords)


def remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens whose lower-cased form is a stopword.

    Tokens are produced by `str.split()`, so punctuation stays attached to words
    (e.g. "it." is not recognised as the stopword "it").
    """
    return ' '.join(token for token in text.split() if token.lower() not in stopword_set)


def lemmatize_text(text):
    """Return `text` with every whitespace-separated token passed through the WordNet lemmatizer.

    No stopword filter is applied here: the input has already been filtered
    case-insensitively, so a second (case-sensitive) filter could never match.
    """
    return ' '.join(wn.lemmatize(token) for token in text.split())


# Remove stopwords
df['lp_without_stop_words'] = df['Lead_Paragraph'].apply(remove_stopwords)
# Lemmatization
df['lp_lemmatized'] = df['lp_without_stop_words'].apply(lemmatize_text)
# NOTE(review): rows whose lead paragraph was missing are now empty strings and are
# still passed on to the topic model -- confirm that is intended.
# Take a look at the data
df.head()

Unnamed: 0,Article Title,Abstract,Lead_Paragraph,lp_without_stop_words,lp_lemmatized
0,An A.I. Pioneer on What We Should Really Fear,“Some people naïvely think if we teach A.I. ‘D...,“Some people naïvely think if we teach A.I. ‘D...,“Some people naïvely think teach A.I. ‘Don’t k...,“Some people naïvely think teach A.I. ‘Don’t k...
1,The Case for Longtermism,We are living through an extraordinary and pre...,Imagine living the life of every human being w...,Imagine living life every human ever existed —...,Imagine living life every human ever existed —...
2,How Tech Giants Are Devising Real Ethics for A...,Four people involved in the creation of an ind...,"SAN FRANCISCO — For years, science-fiction mov...","SAN FRANCISCO — years, science-fiction moviema...","SAN FRANCISCO — years, science-fiction moviema..."
3,New Research Center to Explore Ethics of Artif...,"The president of Carnegie Mellon University, w...",Carnegie Mellon University plans to announce o...,Carnegie Mellon University plans announce Wedn...,Carnegie Mellon University plan announce Wedne...
4,Artificial Intelligence as a Threat,Smarter technology requires smarter humans to ...,Ebola sounds like the stuff of nightmares. Bir...,Ebola sounds like stuff nightmares. Bird flu S...,Ebola sound like stuff nightmares. Bird flu SA...


In [None]:
# Initiate UMAP (dimension reduction); seeded so the projection is repeatable
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)
# Clustering model
# hdbscan_model = HDBSCAN(min_cluster_size=5, min_samples = 5,
# metric='euclidean', prediction_data=True)
# KMeans initialisation is randomised; seed it (same seed as UMAP) so the topic
# assignments are reproducible from run to run.
kmeans_model = KMeans(n_clusters=9, random_state=100)

# NOTE(review): the embedding models and the vectorizer created below are only
# referenced by the commented-out BERTopic configuration further down; the
# active configuration does not pass `embedding_model` or `vectorizer_model`.
# Initiate a sentence transformer model
sentence_model = SentenceTransformer("paraphrase-albert-small-v2")
# Initiate a pretrained model
hf_model = pipeline("feature-extraction", model="distilroberta-base")

# Initiate a pretrained embedding model
roberta_model = TransformerDocumentEmbeddings('roberta-base')
# Initiate another pretrained embedding model
# (the variable is called "glove", but the embedding id loaded here is 'crawl')
glove_embedding = WordEmbeddings('crawl')
document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])
# Stack the two pretrained embedding models
stacked_embeddings = StackedEmbeddings(
    embeddings=[roberta_model, document_glove_embeddings]
)

# Count vectorizer
vectorizer_model = CountVectorizer(min_df=10)

# Alternative BERTopic configuration (kept for reference)
# topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True,hdbscan_model=kmeans_model,
#                        embedding_model=stacked_embeddings,min_topic_size=5, n_gram_range=(1, 3),diversity=0.8)#vectorizer_model=vectorizer_model)# Other options for embedding_model are sentence_model, hf_model,roberta_model

# Initiate BERTopic; the KMeans model is supplied through the `hdbscan_model` slot.
# NOTE(review): `calculate_probabilities=True` may have no effect when the
# clustering model is KMeans -- confirm against the BERTopic documentation.
topic_model = BERTopic(umap_model=umap_model,
                       language="english",
                       calculate_probabilities=True,
                       hdbscan_model=kmeans_model,
                       n_gram_range=(1, 3))
# Run BERTopic model on the lemmatized lead paragraphs, passed as a plain list
# of strings (the documented input type for fit_transform)
documents = df['lp_lemmatized'].tolist()
topics, probabilities = topic_model.fit_transform(documents)

In [None]:
# Get the list of topics: one row per topic with its document count, generated
# name, representation terms and representative documents
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,9,0_girl_people_it_lawson,"[girl, people, it, lawson, toy, woman, ago, sh...","[seven year ago, three researcher University T..."
1,1,8,1_human_system_people_warfare,"[human, system, people, warfare, sciencefictio...","[SAN FRANCISCO — years, science-fiction moviem..."
2,2,7,2_artificial intelligence_intelligence_artific...,"[artificial intelligence, intelligence, artifi...","[SAN FRANCISCO — July, China unveiled plan bec..."
3,3,7,3_artificial intelligence_intelligence_artific...,"[artificial intelligence, intelligence, artifi...","[SAN FRANCISCO — Google fired one engineers, B..."
4,4,5,4_world_make_idea_mind,"[world, make, idea, mind, computer, place amon...",[article part latest Artificial Intelligence s...
5,5,5,5_new_mind_computer_artificial,"[new, mind, computer, artificial, artificial i...",[mathematician Norbert Wiener founded science ...
6,6,3,6_morning_good morning_good_,"[morning, good morning, good, , , , , , , ]","[, Good morning., ]"
7,7,3,7_new_technology_week_times,"[new, technology, week, times, half, tech, wri...","[new documentary Anthony Bourdain’s life, “Roa..."
8,8,3,8_company_impact two key_two key areas_said we...,"[company, impact two key, two key areas, said ...",[technology company running fast future creati...


In [None]:
# Get top 10 terms for a topic (here topic 0) as (term, score) pairs
topic_model.get_topic(0)

[('girl', 0.02257436365680437),
 ('people', 0.018155725816704348),
 ('it', 0.017979224736109387),
 ('lawson', 0.017979224736109387),
 ('toy', 0.017979224736109387),
 ('woman', 0.017979224736109387),
 ('ago', 0.016119571745854665),
 ('shed', 0.01297378633510905),
 ('back', 0.01297378633510905),
 ('dark', 0.01297378633510905)]

In [None]:
# Visualize top topic keywords
# (top_n_topics presumably caps how many topics are drawn; 12 exceeds the 9 clusters requested from KMeans)
topic_model.visualize_barchart(top_n_topics=12)

In [None]:
# Visualize term rank decrease
# (presumably how quickly term scores drop with rank within each topic -- confirm in the BERTopic docs)
topic_model.visualize_term_rank()

In [None]:
# Visualize intertopic distance for the fitted topics
topic_model.visualize_topics()

In [None]:
# Visualize connections between topics using hierarchical clustering
# (considering at most 10 topics, per top_n_topics)
topic_model.visualize_hierarchy(top_n_topics=10)

In [None]:
# Visualize the similarity between topics as a heatmap
topic_model.visualize_heatmap()

In [None]:
# Get the topic predictions (a copy of the topic assignments stored on the fitted model)
topic_prediction = topic_model.topics_[:]
# Save the predictions in the dataframe the model was fitted on.
# (`papers_nips` was never defined in this notebook and raised a NameError;
# the documents came from `df`, so the predictions belong there.)
df['topic_prediction'] = topic_prediction
# Take a look at the data
df.head()

In [None]:
# How many of the closest topics to retrieve
num_of_topics = 3
# Free-text query to compare against the fitted topics
new_review = "I like the new headphone. Its sound quality is great."
# Look up the topics most similar to the query text
similar_topics, similarity = topic_model.find_topics(new_review, top_n=num_of_topics)
# Report the matched topic ids together with their rounded similarity scores
print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')