# Sklearn's LDA

## Importing data

In [93]:
import pandas as pd

data = pd.read_csv('incidecoder_descr_clean_sinpunc.csv', sep=",")
data = data.drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)

data.head()

Unnamed: 0,names,links,description
0,Aluminum Starch Octenylsuccinate,https://incidecoder.com//ingredients/aluminum-...,A handy helper ingredient that comes in a whit...
1,Amaranthus Caudatus Seed Extract,https://incidecoder.com//ingredients/amaranthu...,Its the seed extract of a plant called amarant...
2,Aminomethyl Propanol,https://incidecoder.com//ingredients/aminometh...,An alkaline high pH aka basic material that is...
3,Aminopropyl Ascorbyl Phosphate,https://incidecoder.com//ingredients/aminoprop...,A vitamin C derivative thats created by combin...
4,Ammonium Acryloyldimethyltaurate/​VP Copolymer,https://incidecoder.com//ingredients/ammonium-...,A kind of polymer big molecule from repeated s...


In [94]:
data.isnull().sum()

names          0
links          0
description    0
dtype: int64

## Preprocessing 

In [95]:
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return lemmatized


prueba = clean(data['description'][0])
prueba

['handy',
 'helper',
 'ingredient',
 'come',
 'white',
 'powder',
 'form',
 'work',
 'anticaking',
 'oilabsorbing',
 'agent',
 'also',
 'give',
 'product',
 'good',
 'spreadability',
 'long',
 'lasting',
 'velvet',
 'touch',
 'characteristic',
 'popular',
 'skincare',
 'makeup',
 'product']

In [96]:
# Apply to all texts
data['description_vect'] = data.description.apply(clean)
data['description_vect'] = data['description_vect'].astype('str')

data.head()

Unnamed: 0,names,links,description,description_vect
0,Aluminum Starch Octenylsuccinate,https://incidecoder.com//ingredients/aluminum-...,A handy helper ingredient that comes in a whit...,"['handy', 'helper', 'ingredient', 'come', 'whi..."
1,Amaranthus Caudatus Seed Extract,https://incidecoder.com//ingredients/amaranthu...,Its the seed extract of a plant called amarant...,"['seed', 'extract', 'plant', 'called', 'amaran..."
2,Aminomethyl Propanol,https://incidecoder.com//ingredients/aminometh...,An alkaline high pH aka basic material that is...,"['alkaline', 'high', 'ph', 'aka', 'basic', 'ma..."
3,Aminopropyl Ascorbyl Phosphate,https://incidecoder.com//ingredients/aminoprop...,A vitamin C derivative thats created by combin...,"['vitamin', 'c', 'derivative', 'thats', 'creat..."
4,Ammonium Acryloyldimethyltaurate/​VP Copolymer,https://incidecoder.com//ingredients/ammonium-...,A kind of polymer big molecule from repeated s...,"['kind', 'polymer', 'big', 'molecule', 'repeat..."


## Latent Dirichlet Allocation model

In [97]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

data_vectorized = vectorizer.fit_transform(data['description_vect'])

lda_model = LatentDirichletAllocation(n_components=2)

lda_vectors = lda_model.fit_transform(data_vectorized)

## Visualize potential topics

In [98]:
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names_out()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])

In [99]:
print_topics(lda_model, vectorizer)

Topic 0:
[('oil', 729.607336320494), ('skin', 507.2778622041426), ('also', 405.26128885365137), ('acid', 324.0695822093308), ('used', 294.73082565544104), ('product', 228.47208389922545), ('ingredient', 227.40443867595687), ('nice', 211.93276214438524), ('agent', 200.7250337335521), ('formula', 199.62425664816178)]
Topic 1:
[('skin', 1098.7221377958435), ('acid', 480.9304177906553), ('also', 344.73871114633465), ('study', 261.35287484456825), ('ingredient', 211.59556132402932), ('cell', 197.3971712429659), ('one', 180.0417519611157), ('property', 167.33400633182038), ('manufacturer', 164.0882667175132), ('antioxidant', 163.27033803175087)]


## Predict the document-topic mixture of a new text

In [100]:
example = data['description'][0]
example

'A handy helper ingredient that comes in a white powder form and works as an anticaking and oilabsorbing agent It also gives products good spreadability long lasting and velvet touch characteristics It is popular both in skincare and makeup products'

In [101]:
example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

ValueError: Iterable over raw text documents expected, string object received.

# BerTOPICS

In [34]:
#import packages

import pandas as pd 
import numpy as np
from bertopic import BERTopic

2022-06-23 21:00:21.242509: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-23 21:00:21.242625: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Step 1. Load Data


In [51]:
#load data  
data = pd.read_csv('incidecoder_descr_clean_sinpunc.csv', sep=",")
data = data.drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)

data.head()

Unnamed: 0,names,links,description
0,Aluminum Starch Octenylsuccinate,https://incidecoder.com//ingredients/aluminum-...,A handy helper ingredient that comes in a whit...
1,Amaranthus Caudatus Seed Extract,https://incidecoder.com//ingredients/amaranthu...,Its the seed extract of a plant called amarant...
2,Aminomethyl Propanol,https://incidecoder.com//ingredients/aminometh...,An alkaline high pH aka basic material that is...
3,Aminopropyl Ascorbyl Phosphate,https://incidecoder.com//ingredients/aminoprop...,A vitamin C derivative thats created by combin...
4,Ammonium Acryloyldimethyltaurate/​VP Copolymer,https://incidecoder.com//ingredients/ammonium-...,A kind of polymer big molecule from repeated s...


### Clean Data

In [115]:
ignore_words =["skin", "also", "give", "feel", "used", "growth", "help", "agent", "mix", "water", "oil", "ingredient", "thats", "contains", "formula", "liquid"]

In [116]:
def clean (text, ignore_words=ignore_words):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    clean_text = [word for word in lemmatized if not word in ignore_words] #ignore words
    return clean_text

In [117]:
# Apply to all texts
data['description_vect'] = data.description.apply(clean)
data['description_vect'] = data['description_vect'].astype('str')

data.head()

Unnamed: 0,names,links,description,description_vect
0,Aluminum Starch Octenylsuccinate,https://incidecoder.com//ingredients/aluminum-...,A handy helper ingredient that comes in a whit...,"['handy', 'helper', 'come', 'white', 'powder',..."
1,Amaranthus Caudatus Seed Extract,https://incidecoder.com//ingredients/amaranthu...,Its the seed extract of a plant called amarant...,"['seed', 'extract', 'plant', 'called', 'amaran..."
2,Aminomethyl Propanol,https://incidecoder.com//ingredients/aminometh...,An alkaline high pH aka basic material that is...,"['alkaline', 'high', 'ph', 'aka', 'basic', 'ma..."
3,Aminopropyl Ascorbyl Phosphate,https://incidecoder.com//ingredients/aminoprop...,A vitamin C derivative thats created by combin...,"['vitamin', 'c', 'derivative', 'created', 'com..."
4,Ammonium Acryloyldimethyltaurate/​VP Copolymer,https://incidecoder.com//ingredients/ammonium-...,A kind of polymer big molecule from repeated s...,"['kind', 'polymer', 'big', 'molecule', 'repeat..."


## Step 2. Create Model

In [118]:
# create model 
 
model = BERTopic(nr_topics=2, verbose=True)
 
#convert to list 
docs = data.description_vect.to_list()
 
topics, probabilities = model.fit_transform(docs)

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

2022-06-23 22:05:26,468 - BERTopic - Transformed documents to Embeddings
2022-06-23 22:05:34,193 - BERTopic - Reduced dimensionality
2022-06-23 22:05:34,276 - BERTopic - Clustered reduced embeddings
2022-06-23 22:05:35,457 - BERTopic - Reduced number of topics from 20 to 3


## Step 3. Select Top Topics

In [119]:
model.get_topic_freq().head(11)

Unnamed: 0,Topic,Count
0,-1,804
1,0,72
2,1,71


## Step 4. Select One Topic

In [120]:
#You can select a specific topic and get the top n words for that topic and their c-TF-IDF scores.

model.get_topic(-1)


[('acid', 0.0487039860924556),
 ('product', 0.02819684396617328),
 ('property', 0.028129432930425507),
 ('one', 0.02509047428305393),
 ('antioxidant', 0.0249162109953704),
 ('nice', 0.023874478575526818),
 ('study', 0.022779405568237227),
 ('extract', 0.02225411199905449),
 ('molecule', 0.02201833639800329),
 ('good', 0.02060698600343736)]

## Step 5:Topic Modeling Visualization

BerTopic allows you to visualize the topics that were generated in a way very similar to LDAvis. This will allow you to get more insights into the topic's quality. In this article, we will look at three methods to visualize the topics.

In [121]:
#Visualize Topics 
#model.visualize_topics()

In [122]:
#Visualize Terms
model.visualize_barchart()

In [50]:
#Visualize Topic Similarity 
#model.visualize_heatmap()

In [None]:
#Topic Reduction

#You can set the number of topics you want by setting the argument "nr_topics" with a number of topics you want.
model = BERTopic(nr_topics=20) 

#Another option is to reduce the number of topics automatically. You need to set "nr_topics" to "auto" before training the model.
model = BERTopic(nr_topics="auto")

#The last option is to reduce the number of topics after training the model. 
#This is a great option if retraining the model will take many hours.
#new_topics, new_probs = model.reduce_topics(docs, topics, probabilities, nr_topics=15)

## Step 6:Make Prediction 

In [None]:
topics, probs = model.transform(new_docs)

## Step 7:Save Model

In [None]:
#model.save("my_topics_model")

## Step 8:Load Model

In [None]:
#BerTopic_model = BERTopic.load("my_topics_model")

## Extras

### Language
if you have a dataset for a specific language(by default, it supports the English model) you can choose the language by setting the language parameter while configuring the model.

In [None]:
#model = BERTopic(language="German")

If you have a mixture of languages in your documents, you can set language="multilingual" to support more than 50 languages.