In [None]:
import fitz  # PyMuPDF
import os
import glob
import pandas as pd

# Define the directories
pdf_dir = "../../data/pdf/"
txt_path = "../../data/txt/pdf_contents.txt"

# Create output directory if it doesn't exist
os.makedirs(os.path.dirname(txt_path), exist_ok=True)

# Get a list of all PDF files in the directory.
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the order of sections in the combined file stable between runs and machines.
pdf_files = sorted(glob.glob(os.path.join(pdf_dir, "*.pdf")))

# One formatted section per PDF; joined once at the end instead of growing a
# single string with repeated concatenation
sections = []

# Loop through all PDF files
for pdf_file in pdf_files:
    # The context manager closes the document even if text extraction raises
    with fitz.open(pdf_file) as pdf_document:
        # Extract text from each page of the PDF
        content = "".join(page.get_text() for page in pdf_document)

    # Add a header for each PDF, followed by its whitespace-trimmed text
    sections.append(
        f"--- Contents of {os.path.basename(pdf_file)} ---\n"
        + content.strip() + "\n\n"
    )

combined_content = "".join(sections)

# Write the combined content to the text file
with open(txt_path, "w", encoding="utf-8") as txt_file:
    txt_file.write(combined_content)

print(f"Combined PDF contents saved to {txt_path}")

In [None]:
import os
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Extra stop words that clean_text merges into NLTK's English stop-word list.
# Entries must be lowercase: tokens are lowercased before the stop-word check.
ADDITIONAL_STOPWORDS = {"bank", "fdic"}

def clean_text(file_path, output_path):
    """
    Clean a text file line by line and write the result to ``output_path``.

    For every input line: digits are removed, the line is tokenized with NLTK,
    and a token is dropped when it is a stop word (NLTK English list plus
    ADDITIONAL_STOPWORDS, compared in lowercase), when it consists solely of
    punctuation other than periods, or when it is a single character. The
    remaining tokens are lowercased, Porter-stemmed and joined with spaces.
    The output keeps one line per input line, so a line that loses every
    token becomes an empty line.

    Args:
        file_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 text file to write. Its parent
            directory is created when it does not exist.
    """
    import nltk

    # Ensure NLTK resources are downloaded
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    # The tokenizer model is a separate resource from the stop-word list, so
    # it needs its own check; otherwise word_tokenize fails later on.
    try:
        word_tokenize("probe")
    except LookupError:
        nltk.download('punkt')
        nltk.download('punkt_tab')  # resource name looked up by newer NLTK releases

    stop_words.update(ADDITIONAL_STOPWORDS)

    # Punctuation characters to drop; periods are deliberately left out.
    # A set gives per-character membership: testing a token against the
    # punctuation *string* is a substring test, which lets multi-character
    # punctuation tokens such as "--", "``" or "''" slip through.
    punctuation_to_remove = set(string.punctuation) - {'.'}

    # Initialize the stemmer
    stemmer = PorterStemmer()

    cleaned_lines = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Remove numbers from the line
            line = re.sub(r'\d+', '', line)
            # Tokenize line
            tokens = word_tokenize(line)
            # Filter and process tokens
            tokens = [
                stemmer.stem(token.lower()) for token in tokens
                if token.lower() not in stop_words
                and not all(char in punctuation_to_remove for char in token)
                and len(token) > 1
            ]
            # Reconstruct cleaned sentence
            cleaned_lines.append(' '.join(tokens))

    # Save cleaned content to a new file.
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(cleaned_lines))

    print(f"Cleaned text saved to {output_path}")

# Input: combined PDF text; output: cleaned, stemmed text
input_file_path = '../../data/txt/pdf_contents.txt'
output_file_path = '../../data/txt/cleaned_contents.txt'

# Clean the combined text and write the result to disk
clean_text(file_path=input_file_path, output_path=output_file_path)

In [None]:
import re
from collections import Counter
from nltk import ngrams
import matplotlib.pyplot as plt

def read_file(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

def preprocess_text(text):
    """
    Normalise text for n-gram counting.

    Lowercases the input, collapses every run of non-word characters into a
    single space, and trims leading/trailing whitespace.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\W+', ' ', lowered)
    return collapsed.strip()

def get_ngrams(text, n):
    """Split ``text`` on whitespace and return the list of its n-grams of size ``n``."""
    return list(ngrams(text.split(), n))

def visualize_ngrams(ngrams_freq, title, top_n=10):
    """
    Draw a horizontal bar chart of the most frequent n-grams.

    Args:
        ngrams_freq: Counter whose keys are n-grams (sequences of word
            strings) and whose values are their counts.
        title: Title shown above the chart.
        top_n: Maximum number of n-grams to plot.

    Returns:
        None. When the counter is empty a notice is printed and no figure
        is created.
    """
    top_ngrams = ngrams_freq.most_common(top_n)
    if not top_ngrams:
        # zip(*[]) yields nothing, so the two-name unpacking below would
        # raise ValueError on an empty counter
        print(f"No n-grams to display for {title}")
        return

    labels, counts = zip(*top_ngrams)
    labels = [' '.join(label) for label in labels]  # Join n-grams with spaces

    plt.figure(figsize=(10, 6))
    plt.barh(labels, counts, color='skyblue')
    plt.xlabel('Frequency')
    plt.title(title)
    plt.gca().invert_yaxis()  # most frequent n-gram at the top
    plt.show()

# Cleaned corpus to analyse
file_path = '../../data/txt/cleaned_contents.txt'  # Replace with your file path

# Load and normalise the corpus once; every n-gram size reuses the same text
text = preprocess_text(read_file(file_path))

# Chart the most frequent n-grams for each size, from unigrams up to 4-grams
for n in (1, 2, 3, 4):
    ngrams_list = get_ngrams(text, n)
    ngrams_freq = Counter(ngrams_list)
    visualize_ngrams(ngrams_freq, f'Top {n}-grams', top_n=15)

In [None]:
import os
import re
import nltk
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from gensim.models.coherencemodel import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models
from IPython.display import display
import pyLDAvis.gensim_models as gensimvis

# Ensure NLTK dependencies are available
# 'punkt' backs word_tokenize and 'stopwords' backs the stop-word list used
# during cleaning.
# NOTE(review): 'wordnet' is not referenced by any code visible in this
# notebook — confirm whether it is still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
def preprocess_for_lda(input_file_path):
    """
    Tokenize the cleaned text file for LDA modeling.

    Each non-blank line of the file is treated as one document: it is
    stripped of surrounding whitespace and split into word tokens. Blank
    lines are skipped.

    Returns a list with one token list per non-blank line.
    """
    with open(input_file_path, 'r', encoding='utf-8') as source:
        stripped_lines = [raw_line.strip() for raw_line in source.readlines()]

    # Keep only lines that still have content after stripping
    documents = []
    for stripped in stripped_lines:
        if stripped:
            documents.append(word_tokenize(stripped))

    return documents

In [None]:
# Cleaned corpus that feeds the topic model
input_file_path = '../../data/txt/cleaned_contents.txt'

# Tokenize it: one token list per non-blank line
processed_lines = preprocess_for_lda(input_file_path=input_file_path)

In [None]:
# Build the token <-> id dictionary from the tokenized documents
dictionary = corpora.Dictionary(processed_lines)

# Convert every document to its bag-of-words representation
corpus = list(map(dictionary.doc2bow, processed_lines))

# Fit the LDA topic model; the settings are gathered in one place for easy tuning
lda_settings = dict(
    num_topics=4,
    id2word=dictionary,
    passes=10,
    random_state=493,  # fixed seed
)
lda_model = LdaModel(corpus=corpus, **lda_settings)

In [None]:
# Print the topics of the fitted model, one per line, limited to the
# seven top words of each topic
topics = lda_model.print_topics(num_words=7)  # num_words shows top words in each topic
for topic in topics:
    print(topic)

In [None]:
def get_dominant_topic(bow):
    """
    Return the id of the most probable topic for one bag-of-words document.

    Queries the notebook-level ``lda_model`` for the document's topic
    distribution and picks the entry with the highest probability. Returns
    None when the model reports no topics for the document.
    """
    topic_probs = lda_model.get_document_topics(bow)
    if not topic_probs:
        return None
    # Entries are (topic id, probability) pairs: rank by the probability
    best_entry = max(topic_probs, key=lambda entry: entry[1])
    return best_entry[0]

# Dominant topic for every document, in the same order as processed_lines
# (corpus is built from processed_lines element by element)
topic_numbers = [get_dominant_topic(bow) for bow in corpus]

# Create the DataFrame.
# Each document is stored as a plain space-joined string. Writing the token
# lists directly would make to_csv serialise their Python repr
# ("['a', 'b']"), which cannot be re-tokenized cleanly once the file is
# read back.
df = pd.DataFrame({
    'Topic Number': topic_numbers,
    'Sentence': [' '.join(tokens) for tokens in processed_lines]
})
df.to_csv('../../data/txt/lda_topics.csv', index=False)

In [None]:
def compute_coherence_scores(processed_lines, start=2, limit=6):
    """
    Fit one LDA model per topic count and score each with c_v coherence.

    Topic counts run from ``start`` to ``limit`` inclusive. The models are
    trained on the notebook-level ``corpus`` and ``dictionary``;
    ``processed_lines`` supplies the tokenized texts for the coherence
    computation. Each score is printed as soon as it is available.

    Returns a list of (num_topics, coherence_score) tuples in ascending
    order of topic count.
    """
    coherence_scores = []
    for num_topics in range(start, limit + 1):
        # Candidate model for this topic count (local; does not replace the
        # notebook-level lda_model)
        candidate_model = gensim.models.LdaModel(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=10,
            random_state=493,
        )
        scorer = CoherenceModel(
            model=candidate_model,
            texts=processed_lines,
            dictionary=dictionary,
            coherence='c_v',
        )
        coherence_score = scorer.get_coherence()
        coherence_scores.append((num_topics, coherence_score))
        print(f"Number of Topics: {num_topics}, Coherence Score: {coherence_score}")

    return coherence_scores

In [None]:
# Compute coherence scores for models with 2 through 6 topics (both bounds
# inclusive); the result is a list of (num_topics, score) tuples
coherence_scores = compute_coherence_scores(processed_lines, start=2, limit=6)

In [None]:
def plot_coherence_scores(coherence_scores):
    """
    Plot coherence score against number of topics as a dashed line chart.

    ``coherence_scores`` is a sequence of entries whose first item is the
    topic count and whose second item is the coherence score.
    """
    topic_counts = []
    coherence_values = []
    for entry in coherence_scores:
        topic_counts.append(entry[0])
        coherence_values.append(entry[1])

    plt.figure(figsize=(8, 5))
    plt.plot(topic_counts, coherence_values, marker='o', linestyle='--', color='b')
    plt.title('Coherence Scores by Number of Topics')
    plt.xlabel('Number of Topics')
    plt.ylabel('Coherence Score')
    plt.grid()
    plt.show()

In [None]:
# Plot coherence scores against the number of topics to compare the
# candidate topic counts evaluated above
plot_coherence_scores(coherence_scores)

In [None]:
# Turn on pyLDAvis notebook rendering, then build the visualisation for the
# fitted model. The prepare(...) call is deliberately the last bare
# expression so that its result is displayed as this cell's output.
pyLDAvis.enable_notebook()
gensimvis.prepare(lda_model, corpus, dictionary)

In [None]:
def get_sub_ngrams(text, n):
    """
    Tokenize ``text`` with NLTK's word tokenizer and return the list of
    its n-grams of size ``n``.
    """
    tokens = word_tokenize(text)
    grams = ngrams(tokens, n)
    return list(grams)

def visualize_ngrams(ngrams_freq, title, top_n=10):
    """
    Show the ``top_n`` most frequent n-grams as a horizontal bar chart.

    ``ngrams_freq`` is a Counter keyed by n-grams (sequences of word
    strings). If it is empty, a notice is printed and no chart is drawn.
    """
    ranked = ngrams_freq.most_common(top_n)
    if len(ranked) == 0:
        # Nothing to plot for this n-gram size
        print(f"No n-grams to display for {title}")
        return

    # Split the (n-gram, count) pairs into bar labels and bar lengths
    bar_labels = [' '.join(pair[0]) for pair in ranked]
    bar_counts = tuple(pair[1] for pair in ranked)

    plt.figure(figsize=(10, 6))
    plt.barh(bar_labels, bar_counts, color='skyblue')
    plt.xlabel('Frequency')
    plt.title(title)
    plt.gca().invert_yaxis()
    plt.show()

In [None]:
# Load topics DataFrame (one row per document with its dominant topic)
topics_df = pd.read_csv('../../data/txt/lda_topics.csv')  # Ensure the path is correct

# Rows without a topic or without text cannot contribute n-grams; blank
# fields come back from read_csv as NaN, which the tokenizer cannot handle
labelled_df = topics_df.dropna(subset=['Topic Number', 'Sentence'])

# Visualize n-grams for each topic.
# Counts are aggregated over all rows of a topic, giving one set of charts
# per topic rather than one set per row.
for topic_number, topic_rows in labelled_df.groupby('Topic Number'):
    # The column is read as float when any row lacked a topic; show a clean integer
    topic_number = int(topic_number)

    print(f"Visualizing n-grams for Topic {topic_number}...")

    for n in range(1, 5):  # Unigrams through 4-grams
        ngrams_freq = Counter()
        for content in topic_rows['Sentence']:
            # Count row by row so that no n-gram spans two documents
            ngrams_freq.update(get_sub_ngrams(str(content), n))
        visualize_ngrams(ngrams_freq, f'Topic {topic_number} - Top {n}-grams', top_n=15)