<a href="https://colab.research.google.com/github/davidelgas/DataSciencePortfolio/blob/main/NLP_with_LDA/noteboooks/NLP_with_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Topic Modeling with Latent Dirichlet Allocation (LDA)
https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

## Corpus Creation

The corpus used was assembled using Beautiful Soup to scrape a pubic forum specific to the BMW E9 (www.e9coupe.com). This active forum has been exsitence since 2003. The data was compiled and stored in a Snowflake database for multiple NLP projects, including LDA, GRU and LSTM. Furture ideas include supplementing the forum text with an existing users guide specific to this model.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

  and should_run_async(code)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Getting a conflict with some of the libraries. This seems to help.

!pip install joblib==1.0.1

  and should_run_async(code)


Collecting joblib==1.0.1
  Using cached joblib-1.0.1-py3-none-any.whl (303 kB)
Installing collected packages: joblib
  Attempting uninstall: joblib
    Found existing installation: joblib 1.4.2
    Uninstalling joblib-1.4.2:
      Successfully uninstalled joblib-1.4.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
imbalanced-learn 0.10.1 requires joblib>=1.1.1, but you have joblib 1.0.1 which is incompatible.
pyldavis 3.4.1 requires joblib>=1.2.0, but you have joblib 1.0.1 which is incompatible.
scikit-learn 1.2.2 requires joblib>=1.1.1, but you have joblib 1.0.1 which is incompatible.[0m[31m
[0mSuccessfully installed joblib-1.0.1


In [2]:
# Install required libraries
!pip install snowflake-connector-python
!pip install pyLDAvis
!pip install gensim
!pip install sumy

# Import libraries
import snowflake.connector
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup

# Gensim libraries
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# NLTK libraries
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

nltk.download('wordnet')

# PyLDAvis for visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Sumy libraries for summarization
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.tokenizers import Tokenizer


# Install required libraries
!pip install snowflake-connector-python
!pip install gensim
!pip install nltk

# Import libraries
import snowflake.connector
import pandas as pd
import os

# Gensim libraries
from gensim.parsing.preprocessing import STOPWORDS

# NLTK libraries
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

nltk.download('wordnet')

import random
from nltk.tokenize import sent_tokenize
import pandas as pd


import nltk
nltk.download('stopwords')


# re
try:
    import re
except ImportError:
    !pip install re
    import re



Collecting snowflake-connector-python
  Downloading snowflake_connector_python-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting asn1crypto<2.0.0,>0.24.0 (from snowflake-connector-python)
  Downloading asn1crypto-1.5.1-py2.py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting tomlkit (from snowflake-connector-python)
  Downloading tomlkit-0.12.5-py3-none-any.whl (37 kB)
Installing collected packages: asn1crypto, tomlkit, snowflake-connector-python
Successfully installed asn1crypto-1.5.1 snowflake-connector-python-3.10.1 tomlkit-0.12.5
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m


[nltk_data] Downloading package wordnet to /root/nltk_data...




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Create Corpus
# Fetch and process forum threads

BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/Data_sets/e9/'

def forum_thread_ids():
    threads = 1000  # Set the number of incremental threads to process here

    file_path = os.path.join(BASE_PATH, 'e9_forum_thread_ids.csv')

    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        e9_forum_thread_ids = pd.read_csv(file_path)
        last_thread_id = int(e9_forum_thread_ids['thread_id'].iloc[-1])
    else:
        e9_forum_thread_ids = pd.DataFrame(columns=['thread_id'])
        last_thread_id = 0

    next_thread_id = last_thread_id + 1
    new_urls = [{'thread_id': thread_id} for thread_id in range(next_thread_id, next_thread_id + threads)]

    new_df = pd.DataFrame(new_urls)
    e9_forum_thread_ids = pd.concat([e9_forum_thread_ids, new_df], ignore_index=True)
    e9_forum_thread_ids.to_csv(file_path, index=False)

    print(f"Starting with thread_id {last_thread_id}")
    print(f"Processing additional {threads} thread(s)")
    print(f"Ending with thread_id {next_thread_id + threads - 1}")

    return new_df

def forum_thread_url(df):
    if df.empty:
        print("No new threads to process.")
        return pd.DataFrame()

    pages = 1

    for index, row in df.iterrows():
        thread_id = row['thread_id']
        thread_url = f"https://e9coupe.com/forum/threads/{thread_id}"
        for i in range(1, pages + 1):
            page_url = f"{thread_url}/?page={i}"
            response = requests.get(page_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').get_text()
            thread_title = title.split('|')[0].strip()
            df.at[index, 'thread_url'] = page_url
            df.at[index, 'thread_title'] = thread_title

    df.to_csv(os.path.join(BASE_PATH, 'e9_forum_thread_url.csv'), index=False)
    return df

def forum_thread_first_post(df):
    data = []

    for thread_id, thread_url, thread_title in zip(df['thread_id'], df['thread_url'], df['thread_title']):
        response = requests.get(thread_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        first_post = soup.find('article', class_='message-body')
        post_content = first_post.get_text(strip=True) if first_post else "No content found"
        data.append({'thread_id': thread_id, 'thread_first_post': post_content})

    forum_first_post = pd.DataFrame(data)
    forum_first_post.to_csv(os.path.join(BASE_PATH, 'e9_forum_first_post.csv'), index=False)
    return forum_first_post

def forum_thread_all_post(df):
    post_data = []
    for index, row in df.iterrows():
        response = requests.get(row['thread_url'])
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article', class_='message--post')
        for article in articles:
            post_timestamp = article.find('time')['datetime'] if article.find('time') else 'N/A'
            content = article.find('div', class_='bbWrapper').get_text(strip=True)
            post_data.append({'thread_id': row['thread_id'], 'post_raw': content})

    e9_forum_posts = pd.DataFrame(post_data)
    e9_forum_posts['thread_all_posts'] = e9_forum_posts['post_raw'].astype(str)
    e9_forum_thread_all_post = e9_forum_posts.groupby('thread_id')['thread_all_posts'].agg(lambda x: ' '.join(x)).reset_index()
    e9_forum_thread_all_post.to_csv(os.path.join(BASE_PATH, 'e9_forum_thread_all_post.csv'), index=False)
    return e9_forum_thread_all_post

def forum_corpus(e9_forum_thread_url, e9_forum_thread_first_post, e9_forum_thread_all_post):
    agg_df_1 = pd.merge(e9_forum_thread_url, e9_forum_thread_first_post, on='thread_id', how='left')
    agg_df_2 = pd.merge(agg_df_1, e9_forum_thread_all_post, on='thread_id', how='left')

    e9_forum_corpus = agg_df_2.dropna()
    corpus_path = os.path.join(BASE_PATH, 'e9_forum_corpus.csv')
    if os.path.exists(corpus_path) and os.path.getsize(corpus_path) > 0:
        existing_corpus = pd.read_csv(corpus_path)
        e9_forum_corpus = pd.concat([existing_corpus, e9_forum_corpus]).drop_duplicates().reset_index(drop=True)

    e9_forum_corpus.columns = e9_forum_corpus.columns.str.upper()
    e9_forum_corpus.to_csv(os.path.join(BASE_PATH, 'e9_forum_corpus_dirty.csv'), index=False)
    return e9_forum_corpus

def main():
    e9_forum_thread_ids = forum_thread_ids()
    e9_forum_thread_url_df = forum_thread_url(e9_forum_thread_ids)
    e9_forum_thread_first_post_df = forum_thread_first_post(e9_forum_thread_url_df)
    e9_forum_thread_all_post_df = forum_thread_all_post(e9_forum_thread_url_df)
    e9_forum_corpus_df = forum_corpus(e9_forum_thread_url_df, e9_forum_thread_first_post_df, e9_forum_thread_all_post_df)
    print(f"Output saved to {os.path.join(BASE_PATH, 'e9_forum_corpus_dirty.csv')}")

if __name__ == "__main__":
    main()


  and should_run_async(code)


Starting with thread_id 9000
Processing additional 1000 thread(s)
Ending with thread_id 10000


In [3]:
# Save corpus
# Create and populate data in Snowflake


BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/Data_sets/e9/'
CREDENTIALS_PATH = '/content/drive/MyDrive/Colab Notebooks/credentials/snowflake_credentials'

# Load the e9_forum_corpus DataFrame from the CSV file
e9_forum_corpus = pd.read_csv(BASE_PATH + 'e9_forum_corpus_dirty.csv')

def load_credentials(credentials_path):
    """Load Snowflake credentials from a file and set them as environment variables."""
    with open(credentials_path, 'r') as file:
        for line in file:
            key, value = line.strip().split('=')
            os.environ[key] = value

def connect_to_snowflake():
    """Establish a connection to the Snowflake database."""
    return snowflake.connector.connect(
        user=os.environ.get('USER'),
        password=os.environ.get('PASSWORD'),
        account=os.environ.get('ACCOUNT')
    )

def create_db_and_schema(cur):
    """Create the database and schema in Snowflake."""
    try:
        cur.execute("CREATE DATABASE IF NOT EXISTS e9_corpus")
        cur.execute("USE DATABASE e9_corpus")
        cur.execute("CREATE SCHEMA IF NOT EXISTS e9_corpus_schema")
        print("Database and schema created successfully.")
    except Exception as e:
        print(f"Error creating database and schema: {e}")

def create_table_if_not_exists(cur):
    """Create the e9_forum_corpus_dirty table if it does not exist."""
    try:
        cur.execute("""
        CREATE TABLE IF NOT EXISTS e9_corpus.e9_corpus_schema.e9_forum_corpus_dirty (
            THREAD_ID NUMBER(38,0),
            THREAD_URL STRING,
            THREAD_TITLE STRING,
            THREAD_FIRST_POST STRING,
            THREAD_ALL_POSTS STRING
        )
        """)
        print("e9_forum_corpus_dirty table created successfully.")
    except Exception as e:
        print(f"Error creating table: {e}")

def fetch_existing_thread_ids(cur):
    """Fetch existing THREAD_IDs from the e9_forum_corpus_dirty table."""
    query = "SELECT THREAD_ID FROM e9_corpus.e9_corpus_schema.e9_forum_corpus_dirty"
    cur.execute(query)
    existing_ids = cur.fetch_pandas_all()
    return existing_ids['THREAD_ID'].tolist()

def insert_data_into_table(cur, df):
    """Insert data from the DataFrame into the e9_forum_corpus_dirty table."""
    for index, row in df.iterrows():
        row = row.where(pd.notnull(row), None)
        insert_command = f"""
        INSERT INTO e9_corpus.e9_corpus_schema.e9_forum_corpus_dirty
        (THREAD_ID, THREAD_URL, THREAD_TITLE, THREAD_FIRST_POST, THREAD_ALL_POSTS)
        VALUES (%s, %s, %s, %s, %s)
        """
        try:
            cur.execute(insert_command, (
                row['THREAD_ID'], row['THREAD_URL'], row['THREAD_TITLE'],
                row['THREAD_FIRST_POST'], row['THREAD_ALL_POSTS']
            ))
        except Exception as e:
            print(f"Error inserting data: {e}")

def fetch_data_from_table(cur):
    """Fetch all data from the e9_forum_corpus_dirty table."""
    query = "SELECT * FROM e9_corpus.e9_corpus_schema.e9_forum_corpus_dirty"
    cur.execute(query)
    return cur.fetch_pandas_all()

def main():
    # Load Snowflake credentials
    load_credentials(CREDENTIALS_PATH)

    # Connect to Snowflake
    conn = connect_to_snowflake()
    cur = conn.cursor()

    # Create the database, schema, and table if they don't exist
    create_db_and_schema(cur)
    create_table_if_not_exists(cur)

    # Fetch existing thread IDs
    existing_ids = fetch_existing_thread_ids(cur)

    # Identify new entries
    new_entries = e9_forum_corpus[~e9_forum_corpus['THREAD_ID'].isin(existing_ids)]

    # Calculate the starting THREAD_ID of the Snowflake table
    starting_thread_id = max(existing_ids) if existing_ids else 0

    # Calculate ending_thread_id and threads_processed
    ending_thread_id = new_entries['THREAD_ID'].max() if not new_entries.empty else starting_thread_id
    threads_processed = len(new_entries) if not new_entries.empty else 0

    print(f"Starting with thread_id {starting_thread_id}")
    print(f"Processing additional {threads_processed} thread(s)")
    print(f"Ending with thread_id {ending_thread_id}")

    # Insert only new entries into the table
    insert_data_into_table(cur, new_entries)
    conn.commit()
    print("New data inserted into e9_forum_corpus_dirty table.")

    # Fetch data from the table
    e9_forum_corpus_df = fetch_data_from_table(cur)
    print("Additional entries: " + str(len(new_entries)))
    print("Total entries: " + str(len(e9_forum_corpus_df)))

    # Close cursor and connection
    cur.close()
    conn.close()

if __name__ == "__main__":
    main()


  and should_run_async(code)


Database and schema created successfully.
e9_forum_corpus_dirty table created successfully.
Starting with thread_id 8000
Processing additional 974 thread(s)
Ending with thread_id 9000
New data inserted into e9_forum_corpus_dirty table.
Additional entries: 974
Total entries: 7039


In [4]:
#Process data for LDA

def load_credentials(path_to_credentials):
    with open(path_to_credentials, 'r') as file:
        for line in file:
            key, value = line.strip().split('=')
            os.environ[key] = value

def fetch_data_from_snowflake():
    conn = snowflake.connector.connect(
        user=os.environ.get('USER'),
        password=os.environ.get('PASSWORD'),
        account=os.environ.get('ACCOUNT'),
    )

    cur = conn.cursor()

    query = """
    SELECT THREAD_TITLE, THREAD_FIRST_POST FROM "E9_CORPUS"."E9_CORPUS_SCHEMA"."E9_FORUM_CORPUS_DIRTY";
    """
    cur.execute(query)
    e9_forum_corpus = cur.fetch_pandas_all()

    cur.close()
    conn.close()

    return e9_forum_corpus

def engineer_data(e9_forum_corpus):
    #e9_forum_corpus['THREAD_TITLE_EXP'] = e9_forum_corpus['THREAD_TITLE'] + " " + e9_forum_corpus['THREAD_FIRST_POST']
    e9_forum_corpus['THREAD_TITLE_EXP'] = e9_forum_corpus['THREAD_TITLE']
    return e9_forum_corpus

def preprocess_data(df):
    df = df[['THREAD_TITLE_EXP']].copy()
    df.dropna(inplace=True)
    df['THREAD_TITLE_EXP'] = df['THREAD_TITLE_EXP'].astype(str)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english')).union({'car', 'csi', 'cs', 'csl', 'e9', 'coupe', 'http', 'https', 'www', 'ebay', 'bmw', 'html'})

    # Function to remove URLs from text
    def remove_urls(text):
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', text)

    # Function to preprocess text
    def preprocess(text):
        text = remove_urls(text)
        return [lemmatizer.lemmatize(word) for word in text.lower().split() if word not in stop_words]

    df['processed'] = df['THREAD_TITLE_EXP'].map(preprocess)
    return df

def vectorize_data(df):
    dictionary = Dictionary(df['processed'])
    corpus = [dictionary.doc2bow(doc) for doc in df['processed']]
    return df, dictionary, corpus

def train_lda_model(corpus, dictionary, num_topics=10, random_state=42, passes=10):
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=random_state, passes=passes)
    return lda

def review_topics(lda):
    for idx, topic in lda.print_topics(-1):
        print(f"Topic: {idx} \nWords: {topic}\n")

def assign_topics(lda, corpus, df):
    topics = [lda[doc] for doc in corpus]
    df['topics'] = topics
    return df

def prepare_visualization_data(lda, corpus, dictionary):
    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    return vis_data

def score_text_block(text, topic_words):
    if isinstance(text, str):
        return sum(1 for word in topic_words if word in text)
    return 0

def create_representative_sentence(lda, df, topic_id, top_n=10):
    topic_words = [word for word, prob in lda.show_topic(topic_id, topn=top_n)]
    df['score'] = df['THREAD_TITLE_EXP'].map(lambda x: score_text_block(x, topic_words))
    representative_sentence = df.loc[df['score'].idxmax(), 'THREAD_TITLE_EXP']
    return representative_sentence


  and should_run_async(code)


In [5]:
# Create LDA

# Main sequence
path_to_credentials = '/content/drive/MyDrive/Colab Notebooks/credentials/snowflake_credentials'

# Load credentials
load_credentials(path_to_credentials)

# Fetch data from Snowflake
e9_forum_corpus = fetch_data_from_snowflake()

# Engineer the data
e9_forum_corpus = engineer_data(e9_forum_corpus)

# Preprocess the data
df = preprocess_data(e9_forum_corpus)

# Vectorize the data
df, dictionary, corpus = vectorize_data(df)

# Train the LDA Model
lda = train_lda_model(corpus, dictionary)

# Review the Topics
review_topics(lda)

# Assign Documents to Topics
df = assign_topics(lda, corpus, df)

# Prepare the visualization data
vis_data = prepare_visualization_data(lda, corpus, dictionary)

# Visualize
pyLDAvis.display(vis_data)



  and should_run_async(code)


Topic: 0 
Words: 0.018*"2800cs" + 0.016*"73" + 0.014*"sale" + 0.014*"center" + 0.013*"fan" + 0.013*"price" + 0.013*"spring" + 0.012*"wtb:" + 0.011*"turn" + 0.010*"automatic"

Topic: 1 
Words: 0.050*"wheel" + 0.027*"alpina" + 0.027*"steering" + 0.022*"light" + 0.018*"weber" + 0.016*"rear" + 0.016*"looking" + 0.012*"restoration" + 0.012*"another" + 0.011*"interior"

Topic: 2 
Words: 0.027*"?" + 0.020*"kit" + 0.019*"tool" + 0.018*"car" + 0.016*"box" + 0.015*"!" + 0.014*"wiper" + 0.010*"-" + 0.010*"carpet" + 0.009*"auction"

Topic: 3 
Words: 0.062*"part" + 0.048*"sale" + 0.022*"rhd" + 0.014*"show" + 0.014*"pic" + 0.014*"sunroof" + 0.013*"coupe" + 0.013*"seal" + 0.009*"motronic" + 0.008*"shift"

Topic: 4 
Words: 0.024*"help" + 0.019*"need" + 0.016*"bay" + 0.016*"problem" + 0.015*"area" + 0.013*"cover" + 0.013*"plate" + 0.012*"a/c" + 0.010*"body" + 0.009*"valve"

Topic: 5 
Words: 0.044*"rear" + 0.034*"front" + 0.033*"window" + 0.026*"question" + 0.021*"bumper" + 0.018*"side" + 0.017*"-" + 0.

In [6]:
#Create representative sentences for each topic

num_topics = lda.num_topics
representative_sentences = []

for topic_id in range(num_topics):
    try:
        sentence = create_representative_sentence(lda, e9_forum_corpus, topic_id)
        representative_sentences.append({'Topic': topic_id, 'Representative Sentence': sentence})
        print(f"Topic {topic_id}: {sentence}")
    except IndexError as e:
        print(f"Error with topic {topic_id}: {e}")

# Save to CSV
output_df = pd.DataFrame(representative_sentences)
output_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Data_sets/e9/representative_sentences.csv', index=False)


  and should_run_async(code)


Topic 0: 1971 Bmw 2800cs for sale
Topic 1: looking for a CSL steering wheel
Topic 2: ??? Who Makes The BEST carpet kits ????
Topic 3: In LA area Saturday looking at coupes: any for sale?
Topic 4: Does anybody in the Jersey area know this car on e-bay?
Topic 5: FS: Set of outer trims front door/rear window ! new chrome !
Topic 6: 3.5 m30 head removal question
Topic 7: Parting out compete 72 3.0csi - ALL parts
Topic 8: does anyone know a source for scheel-man seats?
Topic 9: E9, E3, E12 Headers 4-sale
