<a href="https://colab.research.google.com/github/davidelgas/DataSciencePortfolio/blob/main/nlp/lda/notebooks/NLP_with_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Topic Modeling with Latent Dirichlet Allocation (LDA)
https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

## Corpus Creation

The corpus used was assembled using Beautiful Soup to scrape a pubic forum specific to the BMW E9 (www.e9coupe.com). This active forum has been exsitence since 2003. The data was compiled and stored in a Snowflake database for multiple NLP projects, including LDA, GRU and LSTM. Furture ideas include supplementing the forum text with an existing users guide specific to this model.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install joblib==1.0.1

Collecting joblib==1.0.1
  Downloading joblib-1.0.1-py3-none-any.whl (303 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/303.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/303.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/303.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.1/303.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: joblib
  Attempting uninstall: joblib
    Found existing installation: joblib 1.3.2
    Uninstalling joblib-1.3.2:
      Successfully uninstalled joblib-1.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
imbalanced-learn 0.10.1 requires joblib

In [3]:
!pip install snowflake-connector-python
import snowflake.connector

import pandas as pd
import os

from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('wordnet')

!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [4]:
# Step 1:
# Load data

# Set the snowflake account and login information
path_to_credentials = '/content/drive/MyDrive/credentials/snowflake_credentials'

# Load the credentials
with open(path_to_credentials, 'r') as file:
    for line in file:
        key, value = line.strip().split('=')
        os.environ[key] = value

conn = snowflake.connector.connect(
    user=os.environ.get('USER'),
    password=os.environ.get('PASSWORD'),
    account=os.environ.get('ACCOUNT'),
)

# Create a cursor object
cur = conn.cursor()

# Select source data
query = """
SELECT * FROM "E9_CORPUS"."E9_CORPUS_SCHEMA"."E9_FORUM_CORPUS";
"""
cur.execute(query)

# Load data into a df.
e9_forum_corpus = cur.fetch_pandas_all()

# Close the cursor and the connection
cur.close()
conn.close()

# Step 2: Preprocess the Data
df = e9_forum_corpus[['THREAD_ALL_POSTS']].copy()
df.dropna(inplace=True)

# Combine Gensim's STOPWORDS with your additional stopwords
additional_stopwords = {'car', 'csi', 'cs', 'csl','e9'} # Very corpus specific
all_stopwords = STOPWORDS.union(additional_stopwords)

def preprocess(text):
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    tokens = tokenizer.tokenize(text.lower())
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens if token not in all_stopwords and len(token) > 1]
    return lemmatized

df['processed'] = df['THREAD_ALL_POSTS'].map(preprocess)

# Step 3: Vectorization
dictionary = Dictionary(df['processed'])
corpus = [dictionary.doc2bow(doc) for doc in df['processed']]

# Step 4: Train the LDA Model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=42, passes=10)

# Step 5: Review the Topics
for idx, topic in lda.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

# Step 6: Assign Documents to Topics
topics = [lda[doc] for doc in corpus]
df['topics'] = topics

# Step 7: Prepare the visualization data
vis_data = gensimvis.prepare(lda, corpus, dictionary)

# Visualize
pyLDAvis.display(vis_data)

  and should_run_async(code)


Topic: 0 
Words: 0.027*"paint" + 0.012*"radio" + 0.010*"use" + 0.010*"compressor" + 0.009*"rust" + 0.007*"metal" + 0.007*"coat" + 0.006*"cover" + 0.006*"product" + 0.006*"water"

Topic: 1 
Words: 0.014*"petri" + 0.011*"momo" + 0.008*"380mm" + 0.004*"squeak" + 0.004*"400mm" + 0.003*"380" + 0.003*"extinguisher" + 0.002*"lithium" + 0.002*"halon" + 0.002*"prototipo"

Topic: 2 
Words: 0.070*"http" + 0.064*"com" + 0.051*"www" + 0.021*"ebay" + 0.016*"bmw" + 0.013*"image" + 0.013*"cgi" + 0.010*"broken" + 0.010*"external" + 0.010*"php"

Topic: 3 
Words: 0.011*"original" + 0.011*"thanks" + 0.010*"part" + 0.009*"seat" + 0.009*"bmw" + 0.008*"email" + 0.008*"look" + 0.008*"wheel" + 0.007*"know" + 0.007*"rear"

Topic: 4 
Words: 0.023*"switch" + 0.021*"wire" + 0.017*"light" + 0.015*"wheel" + 0.010*"relay" + 0.009*"tire" + 0.008*"turn" + 0.008*"battery" + 0.008*"wiring" + 0.007*"coil"

Topic: 5 
Words: 0.009*"rear" + 0.007*"new" + 0.006*"need" + 0.006*"bolt" + 0.006*"thanks" + 0.005*"use" + 0.005*"doo