# NLP Final Project - Topic Modeling

For this final project, we have a collection of ~200K news articles on our favorite topics: data science, machine learning, and artificial intelligence. Our task is to identify which industries and job lines are going to be most impacted by AI over the next several years, based on the information and insights we can extract from this text corpus.

Goal: provide actionable recommendations on what can be done with AI to automate jobs, improve employee productivity, and generally make AI adoption successful. Please pay attention to the introduction of novel technologies and algorithms, such as AI for image generation and Conversational AI, as they represent an entire paradigm shift in the adoption of AI technologies and data science in general.


## Loading Data

In [1]:
# Core analysis / plotting stack used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Colab-only: expose Google Drive under /content/drive so the project files are readable
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the tokenized articles (a JSON Lines file) on the mounted Google Drive
file_path = '/content/drive/My Drive/nlp_final/tokenized_text.json'

# Every line of the file is one JSON record, hence lines=True with orient='records'
df = pd.read_json(path_or_buf=file_path, lines=True, orient='records')

# Preview the first few rows
df.head()


Unnamed: 0,text,cleaned_text,topic_tokens,entity_tokens
0,\n\nauckland.scoop.co.nz » AUT boosts AI exper...,aucklandscoopconz AUT boosts AI expertise wi...,"[aucklandscoopconz, aut, boost, ai, expertis, ...","[aucklandscoopconz, AUT, boosts, AI, expertise..."
1,"\n\nObservation, Simulation, And AI Join Force...",Observation Simulation And AI Join Forces To ...,"[observ, simul, ai, join, forc, reveal, clear,...","[Observation, Simulation, And, AI, Join, Force..."
2,\n\nApplitools Visual AI Reaches One Billion I...,Applitools Visual AI Reaches One Billion Imag...,"[applitool, visual, ai, reach, one, billion, i...","[Applitools, Visual, AI, Reaches, One, Billion..."
3,\n\nData Science and Machine-Learning Platform...,Data Science and MachineLearning Platforms Ma...,"[data, scienc, machinelearn, platform, market,...","[Data, Science, and, MachineLearning, Platform..."
4,\n\nHealthcare Artificial Intelligence Market ...,Healthcare Artificial Intelligence Market Ana...,"[healthcar, artifici, intellig, market, analys...","[Healthcare, Artificial, Intelligence, Market,..."


In [None]:
# (rows, columns) of the loaded frame
df.shape

(145456, 4)

## Create Dictionary & Corpus

In [None]:
# Keep only what the topic models need: the token lists and the cleaned article text
df = df.loc[:, ['topic_tokens', 'cleaned_text']]
df.shape

(145456, 2)

In [None]:
# Spot-check one arbitrary row: the tokens should have been deserialized as a Python list
print(type(df['topic_tokens'][69009]))

<class 'list'>


In [None]:
# !pip install gensim

In [2]:
from gensim import corpora

# Map every distinct token in the tokenized articles to an integer id
dictionary = corpora.Dictionary(documents=df['topic_tokens'])

# Prune the vocabulary: drop tokens seen in fewer than 15 documents
# or in more than 50% of all documents
dictionary.filter_extremes(no_above=0.5, no_below=15)

# Bag-of-words corpus: one list of (token_id, count) pairs per article
corpus = list(map(dictionary.doc2bow, df['topic_tokens']))


## Save corpus

In [None]:
import json

# Write the corpus to the same Drive location the load cell below reads from.
# Previously this saved 'corpus.json' to the working directory while the
# loader expected it under the Drive project folder, so a fresh run could not
# find the file without a manual copy.
corpus_path = '/content/drive/My Drive/nlp_final/corpus.json'

# One row per document; each cell is a [token_id, count] pair and shorter
# documents are padded with nulls up to the longest document.
df_corpus = pd.DataFrame(corpus)
df_corpus.to_json(corpus_path, orient='records', lines=True)

In [2]:
# Saved bag-of-words corpus (JSON Lines) in the Drive project folder
file_path = '/content/drive/My Drive/nlp_final/corpus.json'

# Read it back: one row per document, one column per [token_id, count] pair
loaded_corpus = pd.read_json(path_or_buf=file_path, lines=True, orient='records')
loaded_corpus.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7233,7234,7235,7236,7237,7238,7239,7240,7241,7242
0,"[0, 1]","[1, 8]","[2, 12]","[3, 12]","[4, 12]","[5, 12]","[6, 12]","[7, 12]","[8, 12]","[9, 12]",...,,,,,,,,,,
1,"[16, 1]","[18, 1]","[28, 2]","[30, 1]","[39, 1]","[56, 3]","[66, 1]","[69, 1]","[95, 12]","[105, 1]",...,,,,,,,,,,
2,"[0, 1]","[12, 1]","[15, 1]","[16, 3]","[18, 3]","[20, 1]","[25, 1]","[30, 1]","[39, 2]","[45, 1]",...,,,,,,,,,,
3,"[0, 1]","[11, 1]","[12, 4]","[30, 1]","[37, 2]","[39, 1]","[46, 1]","[51, 2]","[67, 1]","[72, 1]",...,,,,,,,,,,
4,"[18, 1]","[22, 2]","[30, 1]","[31, 1]","[37, 1]","[58, 2]","[87, 2]","[90, 1]","[117, 1]","[129, 1]",...,,,,,,,,,,


## BERTopic

In [4]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.2-py2.py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

import numpy as np

### Embedding Model

In [None]:
# Instantiate the sentence-embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the article text itself, one embedding per row of df['cleaned_text'].
# The gensim `corpus` must NOT be used here: it holds (token_id, count) pairs,
# not strings, and the embeddings have to line up with the documents that are
# later passed to BERTopic's fit_transform.
embeddings = embedding_model.encode(df['cleaned_text'].tolist(), show_progress_bar=True)


In [None]:
# Save the embeddings computed in the previous cell so the expensive encode
# step does not have to be repeated. (The variable is `embeddings`; the old
# name `corpus_embeddings` was never defined and raised a NameError.)
np.save('embeddings.npy', embeddings)

In [None]:
# Reload the cached embedding matrix from disk
loaded_embeddings = np.load(file='embeddings.npy')

In [None]:
# Sanity-check the loaded embeddings. Only a handful of vectors are printed:
# looping over the whole array would emit one block of output per document
# and bury the rest of the notebook.
print("Number of sentences:", len(loaded_embeddings))
for i, embedding in enumerate(loaded_embeddings[:3]):
    print(f"\nEmbedding for sentence {i+1} (Dimensions: {len(embedding)}):")
    print(embedding)

### HDBSCAN Clustering

In [None]:
# Clustering component handed to BERTopic further down
hdbscan_model = HDBSCAN(
    min_cluster_size=780,  # minimum number of documents for a group to count as a cluster
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

### Topic Representation

In [None]:
# Bag-of-words step for the topic representation: unigrams and bigrams,
# with English stop words removed
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Class-based TF-IDF weighting with its default settings
ctfidf_model = ClassTfidfTransformer()

## Model

In [None]:
# Assemble the BERTopic pipeline from the components configured above.
# No umap_model is passed, so BERTopic's default dimensionality reduction is used.
# (Fixes the `ctfidf_model=ctfidf_model=ctfidf_model` typo, which was a SyntaxError.)
topic_model = BERTopic(
    embedding_model=embedding_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
)


In [None]:
# Fit the model on the cleaned articles, reusing the precomputed embeddings
# so the documents are not encoded a second time
topics, probs = topic_model.fit_transform(
    documents=df['cleaned_text'],
    embeddings=loaded_embeddings,
)


In [None]:
# `topics` holds the topic assignment of every document:
# first confirm there is exactly one per article ...
print(len(topics) == len(df["cleaned_text"]))
print()
# ... then show the assignments of the first 10 articles
print(topics[:10])

## Topics

In [None]:
# First 10 entries of the probabilities returned by fit_transform.
# NOTE(review): BERTopic is not created with calculate_probabilities=True, so this is
# presumably one probability per document (for its assigned topic) rather than a
# full per-topic distribution — confirm.

probs[:10]

In [None]:
# Summary table of the topics found by the model (one row per topic)

topic_model.get_topic_info()

In [None]:
# All topics with their representative terms.
# NOTE(review): this displays every topic at once; the output can be long.
topic_model.get_topics()

In [None]:
# Document-level view: topic information for each article in df["cleaned_text"]
topic_model.get_document_info(df["cleaned_text"])

In [None]:
# Number of entries returned by get_topics().
# NOTE(review): presumably this count includes the outlier topic -1 — confirm.
print(len(topic_model.get_topics()))


In [None]:
# The visualization helps show where there may be little practical difference between clusters
topic_model.visualize_topics()

Next step: ask ChatGPT to suggest a label for each topic, then create the topic labels from its suggestions.

### Save topics

In [None]:
# Snapshot of the model's topic summary table
topic_info = topic_model.get_topic_info()

# Persist it twice: a CSV without the index column ...
topic_info.to_csv('topic_info.csv', index=False)

# ... and a JSON Lines file with one record per topic
topic_info.to_json('topic_info.json', lines=True, orient='records')


In [None]:
# Map each topic id to its representation, leaving out the outlier topic (-1)
topic_representations = {
    topic_id: topic_model.get_topic(topic_id)
    for topic_id in topic_info['Topic']
    if topic_id != -1
}

# Serialize the mapping to disk as indented JSON
with open('topic_representations.json', 'w') as f:
    f.write(json.dumps(topic_representations, indent=4))


In [None]:
# Attach the model output to every article: its assigned topic and the probability
df['topic'] = topics
df['probability'] = probs

# CSV export is left disabled:
# df.to_csv('document_topic_assignments.csv', index=False)

# Write the per-document assignments as JSON Lines (one record per article)
df.to_json('document_topic_assignments.json', lines=True, orient='records')
