# BerTOPICS

In [131]:
#import packages

import pandas as pd 
import numpy as np
from bertopic import BERTopic

## Step 1. Load Data


In [132]:
#load data  
data = pd.read_csv('incidecoder_descr_clean_sinpunc.csv', sep=",")
data = data.drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)

data.head()

Unnamed: 0,names,links,description
0,Aluminum Starch Octenylsuccinate,https://incidecoder.com//ingredients/aluminum-...,A handy helper ingredient that comes in a whit...
1,Amaranthus Caudatus Seed Extract,https://incidecoder.com//ingredients/amaranthu...,Its the seed extract of a plant called amarant...
2,Aminomethyl Propanol,https://incidecoder.com//ingredients/aminometh...,An alkaline high pH aka basic material that is...
3,Aminopropyl Ascorbyl Phosphate,https://incidecoder.com//ingredients/aminoprop...,A vitamin C derivative thats created by combin...
4,Ammonium Acryloyldimethyltaurate/​VP Copolymer,https://incidecoder.com//ingredients/ammonium-...,A kind of polymer big molecule from repeated s...


### Clean Data

In [133]:
ignore_words =["skin", "also", "give", "feel", "used", "growth", "help", "agent", "mix", "water", "oil", "ingredient", "thats", "contains", "formula", "liquid"]
nouns=['NN','NNS','NNP']


In [134]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aveiser/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [135]:
import nltk

def clean (text, ignore_words=ignore_words, nouns=nouns):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    clean_text = [word for word in lemmatized if not word in ignore_words] #ignore words
    tagged = nltk.pos_tag(clean_text)
    base_clean = [x[0] for x in tagged if (y in x for y in nouns)]
    return base_clean

In [136]:
# Apply to all texts
data['description_vect'] = data.description.apply(clean)
data['description_vect'] = data['description_vect'].astype('str')

data.head()

Unnamed: 0,names,links,description,description_vect
0,Aluminum Starch Octenylsuccinate,https://incidecoder.com//ingredients/aluminum-...,A handy helper ingredient that comes in a whit...,"['handy', 'helper', 'come', 'white', 'powder',..."
1,Amaranthus Caudatus Seed Extract,https://incidecoder.com//ingredients/amaranthu...,Its the seed extract of a plant called amarant...,"['seed', 'extract', 'plant', 'called', 'amaran..."
2,Aminomethyl Propanol,https://incidecoder.com//ingredients/aminometh...,An alkaline high pH aka basic material that is...,"['alkaline', 'high', 'ph', 'aka', 'basic', 'ma..."
3,Aminopropyl Ascorbyl Phosphate,https://incidecoder.com//ingredients/aminoprop...,A vitamin C derivative thats created by combin...,"['vitamin', 'c', 'derivative', 'created', 'com..."
4,Ammonium Acryloyldimethyltaurate/​VP Copolymer,https://incidecoder.com//ingredients/ammonium-...,A kind of polymer big molecule from repeated s...,"['kind', 'polymer', 'big', 'molecule', 'repeat..."


## Step 2. Create Model

In [137]:
# create model 
 
model = BERTopic(nr_topics=2, verbose=True)
 
#convert to list 
docs = data.description_vect.to_list()
 
topics, probabilities = model.fit_transform(docs)

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

2022-06-23 23:14:09,605 - BERTopic - Transformed documents to Embeddings
2022-06-23 23:14:14,619 - BERTopic - Reduced dimensionality
2022-06-23 23:14:14,738 - BERTopic - Clustered reduced embeddings
2022-06-23 23:14:15,695 - BERTopic - Reduced number of topics from 17 to 3


## Step 3. Select Top Topics

In [138]:
model.get_topic_freq().head(11)

Unnamed: 0,Topic,Count
0,-1,736
1,0,134
2,1,77


## Step 4. Select One Topic

In [139]:
#You can select a specific topic and get the top n words for that topic and their c-TF-IDF scores.

model.get_topic(-1)


[('acid', 0.052730035226851726),
 ('product', 0.027767043862344944),
 ('molecule', 0.025642516803919727),
 ('study', 0.02442020153382861),
 ('one', 0.023719577368676404),
 ('nice', 0.023533238619789495),
 ('property', 0.02215350928386708),
 ('antioxidant', 0.020192116871265705),
 ('sunscreen', 0.019995341504483812),
 ('thing', 0.019939099624832052)]

## Step 5:Topic Modeling Visualization

BerTopic allows you to visualize the topics that were generated in a way very similar to LDAvis. This will allow you to get more insights into the topic's quality. In this article, we will look at three methods to visualize the topics.

In [140]:
#Visualize Topics 
#model.visualize_topics()

In [142]:
#Visualize Terms
model.visualize_barchart()

In [50]:
#Visualize Topic Similarity 
#model.visualize_heatmap()

In [None]:
#Topic Reduction

#You can set the number of topics you want by setting the argument "nr_topics" with a number of topics you want.
model = BERTopic(nr_topics=20) 

#Another option is to reduce the number of topics automatically. You need to set "nr_topics" to "auto" before training the model.
model = BERTopic(nr_topics="auto")

#The last option is to reduce the number of topics after training the model. 
#This is a great option if retraining the model will take many hours.
#new_topics, new_probs = model.reduce_topics(docs, topics, probabilities, nr_topics=15)

## Step 6:Make Prediction 

In [None]:
topics, probs = model.transform(new_docs)

## Step 7:Save Model

In [None]:
#model.save("my_topics_model")

## Step 8:Load Model

In [None]:
#BerTopic_model = BERTopic.load("my_topics_model")

## Extras

### Language
if you have a dataset for a specific language(by default, it supports the English model) you can choose the language by setting the language parameter while configuring the model.

In [None]:
#model = BERTopic(language="German")

If you have a mixture of languages in your documents, you can set language="multilingual" to support more than 50 languages.