<a href="https://colab.research.google.com/github/davidelgas/DataSciencePortfolio/blob/main/NLP_corpus_creation/notebooks%20/NLP_Corpus_Development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 Data Collection and Preprocessing





In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
# Kept in its own cell: credentials seem to propagate more reliably that way.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install libraries

from IPython.display import Image

# Data Collection
import os

!pip3 install pandas
import pandas as pd

!pip3 install requests
import requests

!pip3 install beautifulsoup4
from bs4 import BeautifulSoup

!pip install snowflake-connector-python
import snowflake.connector

# Data Preprocessing
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from gensim.parsing.preprocessing import STOPWORDS

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import re

!pip install langdetect
from langdetect import detect

from transformers import BertTokenizer, BertModel, pipeline

import torch

!pip3 install numpy
import numpy as np

!pip install faiss-cpu
import faiss

!pip install langdetect
from langdetect import detect


Data Collection



The project leverages user-generated content from a domain-specific online forum as the training corpus. This data is largely unstructured, with minimal metadata available. The following tools were considered to gather the source text for the corpus:


### Web Scraping
- **Tools:** Beautiful Soup, online SaaS products
    - **Pros:**
        - **Direct Access to Targeted Data:** Enables precise extraction of user-generated content from specific sections or threads within the forum.
        - **Efficiency in Data Collection:** Automated scripts can gather large volumes of data in a short amount of time, making it suitable for assembling significant datasets for NLP.
    - **Cons:**
        - **Potential for Incomplete Data:** May miss embedded content or dynamically loaded data, depending on the website’s structure.
        - **Ethical and Legal Considerations:** Scraping data from forums may raise concerns about user privacy and must adhere to the terms of service of the website.
        - **Very Platform Dependent:** Forum-specific solutions result in forum-specific data schemas that must be reverse engineered for successful text extraction.

### Forum-specific APIs
- **Tools:** Python (`requests` library for API calls and `json` library for handling responses)
    - **Pros:**
        - **Structured and Reliable Data Retrieval:** APIs provide structured data, making it easier to process and integrate into your project.
        - **Efficient and Direct Access:** Directly accessing the forum's data through its API is efficient, bypassing the need for HTML parsing.
        - **Compliance and Ethical Data Use:** Utilizing APIs respects the forum's data use policies and ensures access is in line with user agreements.
    - **Cons:**
        - **Rate Limiting:** APIs often have limitations on the number of requests that can be made in a certain timeframe, which could slow down data collection.
        - **API Changes:** Dependence on the forum's API structure means that changes or deprecation could disrupt your data collection pipeline.
        - **Access Restrictions:** Some data or functionalities might be restricted or require authentication, posing additional challenges for comprehensive data collection.


**Conclusion: I will be using Beautiful Soup to create my corpus.**


In [None]:

# Base directory (on the mounted Google Drive) where the CSV files produced by
# this cell are saved. Keep the trailing slash: some paths below are built by
# plain string concatenation (BASE_PATH + file name).
BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/Data_sets/e9/'

# Create URLs from the thread_ids and save to a CSV
def create_urls(threads=1, file_path=None):
    """Generate the next batch of sequential thread IDs and append them to a CSV.

    The CSV acts as a cursor: the last ``thread_id`` stored in it marks where
    the previous run stopped, so every call continues from there.

    Args:
        threads: Number of new, consecutive thread IDs to generate. Must be
            at least 1.
        file_path: CSV file used to persist the IDs. Defaults to
            ``BASE_PATH + 'e9_forum_thread_ids.csv'``.

    Returns:
        A tuple ``(last_thread_id, last_thread_id_processed,
        e9_forum_thread_ids)``: the last ID recorded before this call (0 when
        there is none), the last ID generated by this call, and a DataFrame
        with a single ``thread_id`` column holding the new IDs.

    Raises:
        ValueError: If ``threads`` is smaller than 1.
    """
    if file_path is None:
        file_path = BASE_PATH + 'e9_forum_thread_ids.csv'

    if threads < 1:
        raise ValueError("threads must be at least 1")

    # Resume from the last stored ID. The header is only needed when the file
    # is missing or empty; deciding that from the file *size* (not mere
    # existence) keeps an empty pre-existing file from ending up header-less.
    last_thread_id = 0
    write_header = True
    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        stored_ids = pd.read_csv(file_path)
        write_header = False
        if not stored_ids.empty:
            last_thread_id = int(stored_ids['thread_id'].iloc[-1])

    first_new_id = last_thread_id + 1
    urls = [{'thread_id': thread_id}
            for thread_id in range(first_new_id, first_new_id + threads)]
    last_thread_id_processed = urls[-1]['thread_id']

    # Convert the list of dictionaries into a DataFrame
    e9_forum_thread_ids = pd.DataFrame(urls)

    # Append only the new IDs to the CSV file
    e9_forum_thread_ids.to_csv(file_path, mode='a', header=write_header, index=False)

    print("Starting with thread_id " + str(last_thread_id))
    print("Processing additional " + str(threads) + " threads")
    print("Ending with thread_id " + str(last_thread_id_processed))

    return last_thread_id, last_thread_id_processed, e9_forum_thread_ids

# Ingest thread_ids and return title, id and URL
def fetch_thread_data(df):
    """Look up the title and URL of every thread in ``df`` and append them to a CSV.

    For each ``thread_id`` the thread page is downloaded and the text of its
    ``<title>`` tag before the first ``|`` is stored as the thread title.

    Args:
        df: DataFrame with a ``thread_id`` column. It is modified in place:
            ``thread_title`` and ``thread_url`` columns are filled in.

    Returns:
        The same DataFrame object, with ``thread_title`` and ``thread_url``
        set for every row.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched
            (including when the request times out).
    """
    # Define the file path inside the function using the correct base path
    file_path = BASE_PATH + 'e9_forum_threads.csv'

    # Set the number of pages to process
    pages = 1

    # Seconds to wait for the server; without a timeout a stalled connection
    # would hang the notebook indefinitely.
    timeout = 30

    # Write the header only when the file is missing or empty.
    write_header = not (os.path.exists(file_path) and os.path.getsize(file_path) > 0)

    for index, row in df.iterrows():
        thread_id = row['thread_id']
        thread_url = f"https://e9coupe.com/forum/threads/{thread_id}"
        for i in range(1, pages + 1):
            page_url = f"{thread_url}/?page={i}"  # Construct the page URL
            response = requests.get(page_url, timeout=timeout)
            soup = BeautifulSoup(response.text, 'html.parser')

            # A response without a <title> tag must not abort the whole batch.
            title_tag = soup.find('title')
            title = title_tag.get_text() if title_tag else "No title found"

            df.at[index, 'thread_title'] = title.split('|')[0].strip()
            df.at[index, 'thread_url'] = page_url

    df.to_csv(file_path, mode='a', header=write_header, index=False)

    return df

# Fetch the first post of each thread
def fetch_first_post_content(df):
    """Download each thread page and extract the text of its first post.

    The first ``<article class="message-body">`` element on the page is
    treated as the first post. When no such element exists the placeholder
    ``"No content found"`` is stored instead.

    Args:
        df: DataFrame with ``thread_id``, ``thread_url`` and ``thread_title``
            columns.

    Returns:
        A new DataFrame with ``thread_id``, ``thread_title`` and
        ``thread_first_post`` columns, one row per input thread. The same
        rows are appended to ``e9_forum_threads_decorated.csv``.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched
            (including when the request times out).
    """
    # Define the file path inside the function using the correct base path
    file_path = BASE_PATH + 'e9_forum_threads_decorated.csv'

    # Seconds to wait for the server; without a timeout a stalled connection
    # would hang the notebook indefinitely.
    timeout = 30

    # Write the header only when the file is missing or empty.
    write_header = not (os.path.exists(file_path) and os.path.getsize(file_path) > 0)

    data = []

    for thread_id, thread_url, thread_title in zip(df['thread_id'], df['thread_url'], df['thread_title']):
        response = requests.get(thread_url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        first_post = soup.find('article', class_='message-body')
        if first_post:
            post_content = first_post.get_text(strip=True)
        else:
            post_content = "No content found"  # Handle case where no post content is found

        data.append({'thread_id': thread_id, 'thread_title': thread_title, 'thread_first_post': post_content})

    # Explicit columns keep the schema stable even when there are no rows.
    e9_forum_threads_decorated = pd.DataFrame(
        data, columns=['thread_id', 'thread_title', 'thread_first_post']
    )

    # Export and save result
    e9_forum_threads_decorated.to_csv(file_path, mode='a', header=write_header, index=False)

    return e9_forum_threads_decorated


# Original UDF to fetch and parse thread posts
def fetch_and_parse_thread(df):
    """Download every thread page in ``df`` and collect the text of its posts.

    Each ``<article class="message--post">`` element is treated as one post.
    Its text is read from the nested ``<div class="bbWrapper">`` and its
    timestamp from the ``datetime`` attribute of the nested ``<time>`` tag
    (``'N/A'`` when unavailable). Articles without a ``bbWrapper`` div are
    skipped.

    Args:
        df: DataFrame with ``thread_id`` and ``thread_url`` columns.

    Returns:
        A DataFrame with ``thread_id``, ``post_timestamp`` and ``post_raw``
        columns, one row per post. The result also overwrites
        ``e9_forum_posts.csv`` under ``BASE_PATH``.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched
            (including when the request times out).
    """
    # Seconds to wait for the server; without a timeout a stalled connection
    # would hang the notebook indefinitely.
    timeout = 30

    post_data = []
    for _, row in df.iterrows():
        response = requests.get(row['thread_url'], timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        for article in soup.find_all('article', class_='message--post'):
            body = article.find('div', class_='bbWrapper')
            if body is None:
                # No post body to extract; skip instead of crashing on None.
                continue

            time_tag = article.find('time')
            post_timestamp = time_tag.get('datetime', 'N/A') if time_tag else 'N/A'

            post_data.append({
                'thread_id': row['thread_id'],
                'post_timestamp': post_timestamp,
                'post_raw': body.get_text(strip=True)
            })

    # Explicit columns so an empty result still has 'post_raw' (and the other
    # columns) instead of raising KeyError on the next line.
    e9_forum_posts = pd.DataFrame(
        post_data, columns=['thread_id', 'post_timestamp', 'post_raw']
    )

    e9_forum_posts['post_raw'] = e9_forum_posts['post_raw'].astype(str)

    # Define the output path
    output_path = os.path.join(BASE_PATH, 'e9_forum_posts.csv')

    # Export and save result
    e9_forum_posts.to_csv(output_path, index=False)

    return e9_forum_posts



# Define the UDF to process the data
def create_forum_corpus(e9_forum_posts, e9_forum_threads_decorated):
    """Combine per-thread metadata with the concatenated text of all posts.

    Args:
        e9_forum_posts: DataFrame with ``thread_id`` and ``post_raw`` columns,
            one row per post.
        e9_forum_threads_decorated: DataFrame with a ``thread_id`` column, one
            row per thread. Its ``thread_id`` column is cast to ``int64`` in
            place.

    Returns:
        The left join of ``e9_forum_threads_decorated`` with a
        ``thread_all_posts`` column (all posts of the thread joined by single
        spaces). The result also overwrites ``e9_forum_corpus.csv`` under
        ``BASE_PATH``.
    """
    # One row per thread: every post of the thread joined by a single space.
    posts_by_thread = e9_forum_posts.groupby('thread_id')['post_raw']
    thread_text = posts_by_thread.agg(' '.join).reset_index()
    thread_text = thread_text.rename(columns={'post_raw': 'thread_all_posts'})

    # Both join keys must share the int64 dtype before merging.
    for frame in (e9_forum_threads_decorated, thread_text):
        frame['thread_id'] = frame['thread_id'].astype('int64')

    # Left join keeps threads that have no collected posts.
    e9_forum_corpus = e9_forum_threads_decorated.merge(
        thread_text, on='thread_id', how='left'
    )

    # Export and save result
    e9_forum_corpus.to_csv(os.path.join(BASE_PATH, 'e9_forum_corpus.csv'), index=False)

    return e9_forum_corpus


def main():
    """Run the scraping pipeline once for the next batch of thread IDs."""
    # Step 1: reserve the next thread IDs (the two ID bounds are not needed here).
    _, _, thread_ids = create_urls()

    # Step 2: resolve each ID to its title and URL.
    threads = fetch_thread_data(thread_ids)

    # Step 3: the opening post of every thread.
    threads_decorated = fetch_first_post_content(threads)

    # Step 4: every post of every thread.
    posts = fetch_and_parse_thread(threads)

    # Step 5: merge everything into the corpus file.
    create_forum_corpus(posts, threads_decorated)


# Call main() when this code is executed directly (as a notebook cell or a
# script) rather than imported as a module.
if __name__ == "__main__":
    main()

In [None]:

# Directory on the mounted Google Drive that holds the project's CSV files.
# Keep the trailing slash: the path below is built by string concatenation.
BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/Data_sets/e9/'
# File from which load_credentials() reads the Snowflake credentials.
CREDENTIALS_PATH = '/content/drive/MyDrive/Colab Notebooks/credentials/snowflake_credentials'

# Load the e9_forum_corpus DataFrame from the CSV file
# (read at cell execution time; main() below inserts this DataFrame into Snowflake).
e9_forum_corpus = pd.read_csv(BASE_PATH + 'e9_forum_corpus.csv')

def load_credentials(credentials_path):
    """Load ``KEY=VALUE`` pairs from a file and set them as environment variables.

    Blank lines and lines starting with ``#`` are ignored. Only the first
    ``=`` separates key from value, so values may themselves contain ``=``
    (common in passwords and tokens).

    Args:
        credentials_path: Path to a text file with one ``KEY=VALUE`` per line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank, non-comment line has no ``=`` or an
            empty key.
    """
    with open(credentials_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            entry = line.strip()
            if not entry or entry.startswith('#'):
                continue

            key, separator, value = entry.partition('=')
            if not separator or not key:
                # The line content is deliberately left out of the message:
                # it may contain a secret.
                raise ValueError(
                    f"{credentials_path}:{line_number}: expected KEY=VALUE"
                )
            os.environ[key] = value

def connect_to_snowflake():
    """Open a Snowflake connection using credentials from the environment.

    Reads the ``USER``, ``PASSWORD`` and ``ACCOUNT`` environment variables; a
    variable that is not set is passed to the connector as ``None``.

    Returns:
        The object returned by ``snowflake.connector.connect``.
    """
    env = os.environ
    credentials = {
        'user': env.get('USER'),
        'password': env.get('PASSWORD'),
        'account': env.get('ACCOUNT'),
    }
    return snowflake.connector.connect(**credentials)

def create_db_and_schema(cur):
    """Create the ``e9_corpus`` database and ``e9_corpus_schema`` schema if missing.

    The statements run in order on ``cur``. Any exception is caught and
    reported with ``print``; it is not re-raised, and the remaining
    statements are skipped.

    Args:
        cur: Cursor-like object exposing ``execute(sql)``.
    """
    statements = (
        "CREATE DATABASE IF NOT EXISTS e9_corpus",
        "USE DATABASE e9_corpus",
        "CREATE SCHEMA IF NOT EXISTS e9_corpus_schema",
    )
    try:
        for statement in statements:
            cur.execute(statement)
        print("Database and schema created successfully.")
    except Exception as error:
        print(f"Error creating database and schema: {error}")

def create_table_if_not_exists(cur):
    """Issue ``CREATE TABLE IF NOT EXISTS`` for the ``e9_forum_corpus`` table.

    Any exception raised by the cursor is caught and reported with ``print``;
    it is not re-raised.

    Args:
        cur: Cursor-like object exposing ``execute(sql)``.
    """
    # Fully qualified name, so no "USE SCHEMA" is required beforehand.
    ddl = """
        CREATE TABLE IF NOT EXISTS e9_corpus.e9_corpus_schema.e9_forum_corpus (
            thread_id NUMBER(38,0),
            thread_title VARCHAR(16777216),
            thread_first_post VARCHAR(16777216),
            thread_all_posts VARCHAR(16777216)
        )
        """
    try:
        cur.execute(ddl)
        print("e9_forum_corpus table created successfully.")
    except Exception as error:
        print(f"Error creating table: {error}")

def insert_data_into_table(cur, df):
    """Insert every row of ``df`` into the ``e9_forum_corpus`` table, one at a time.

    Missing values in a row are replaced by ``None`` before the insert. A
    failure on one row is reported with ``print`` and does not stop the
    remaining rows from being inserted.

    Args:
        cur: Cursor-like object exposing ``execute(sql, params)``.
        df: DataFrame with ``thread_id``, ``thread_title``,
            ``thread_first_post`` and ``thread_all_posts`` columns.
    """
    columns = ('thread_id', 'thread_title', 'thread_first_post', 'thread_all_posts')

    # The statement is the same for every row, so it is built once.
    statement = """
        INSERT INTO e9_corpus.e9_corpus_schema.e9_forum_corpus
        (thread_id, thread_title, thread_first_post, thread_all_posts)
        VALUES (%s, %s, %s, %s)
        """

    for _, record in df.iterrows():
        # Swap missing values for None before binding.
        cleaned = record.where(record.notna(), None)
        try:
            values = tuple(cleaned[name] for name in columns)
            cur.execute(statement, values)
        except Exception as error:
            print(f"Error inserting data: {error}")

def fetch_data_from_table(cur):
    """Select every row of the ``e9_forum_corpus`` table.

    Args:
        cur: Cursor-like object exposing ``execute(sql)`` and
            ``fetch_pandas_all()``.

    Returns:
        Whatever ``cur.fetch_pandas_all()`` returns for the query.
    """
    cur.execute("SELECT * FROM e9_corpus.e9_corpus_schema.e9_forum_corpus")
    return cur.fetch_pandas_all()

def main():
    """Load the corpus DataFrame into Snowflake and print a summary of the table.

    Steps: load credentials into the environment, connect, create the
    database/schema/table if needed, insert the module-level
    ``e9_forum_corpus`` DataFrame, commit, then read the table back and print
    its ``info()``.

    The cursor and the connection are always closed, even when a step raises.
    """
    # Load Snowflake credentials
    load_credentials(CREDENTIALS_PATH)

    # Connect to Snowflake
    conn = connect_to_snowflake()
    try:
        cur = conn.cursor()
        try:
            # Create the database, schema, and table if they don't exist
            create_db_and_schema(cur)
            create_table_if_not_exists(cur)

            # Insert data into the table
            insert_data_into_table(cur, e9_forum_corpus)
            conn.commit()
            print("Data inserted into e9_forum_corpus table.")

            # Fetch data from the table
            e9_forum_corpus_df = fetch_data_from_table(cur)
            e9_forum_corpus_df.info()
        finally:
            # Release the cursor even if a step above failed.
            cur.close()
    finally:
        # Release the connection even if a step above failed.
        conn.close()

# Call main() when this code is executed directly (as a notebook cell or a
# script) rather than imported as a module.
if __name__ == "__main__":
    main()
