# Topic Modeling

In [1]:
import pandas as pd

In [2]:

import re
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [3]:
# Load the topic dataset from its CSV file into a DataFrame
dataset_path = "data/topic_dataset_10k.csv"
df = pd.read_csv(dataset_path)

In [4]:
# Preview the first five rows (five is head()'s default row count)
df.head()

Unnamed: 0,id,timestamp,user,text,ticker,topic_label,engagement
0,topic_0,2025-02-01T12:16:33.957527,user_889,$NVDA launch reported — investors cautious,NVDA,product_launch,6
1,topic_1,2025-04-10T12:16:33.957527,user_1234,$JPM insider sell reported — market reaction m...,JPM,insider_trading,10
2,topic_2,2025-05-07T12:16:33.957527,user_4,$TSLA filing reported — market reaction muted,TSLA,insider_trading,5
3,topic_3,2025-08-09T12:16:33.957527,user_1597,$AAPL gdp reported — stock jumps,AAPL,macro,8
4,topic_4,2024-12-01T12:16:33.957527,user_563,$MSFT layoff reported — investors cautious,MSFT,hiring,2


In [5]:
# Filler words from the generator template, laid out one template phrase per
# line (e.g. "market reaction muted"); they are added to the stop-word set.
custom_stop_words = [
    'reported',
    'market', 'reaction', 'muted',
    'stock', 'jumps',
    'investors', 'cautious',
    'analysts', 'note',
]


In [6]:
# 2b. Merge scikit-learn's English stop words with the custom list above
stop_words = set(ENGLISH_STOP_WORDS) | set(custom_stop_words)


In [7]:
# 2c. Create a preprocessing function
def preprocess_text(text, stopwords=None):
    """Reduce one raw post to a space-separated string of content words.

    Steps, in order: lowercase, drop ``$TICKER`` symbols, delete every
    character that is not ``a-z`` or whitespace, then discard stop words.

    Args:
        text: Raw post text. Any value that is not a ``str`` (for example
            ``None`` or the float ``NaN`` pandas produces for an empty CSV
            cell) is treated as an empty document instead of raising.
        stopwords: Optional collection of lowercase words to discard. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned text, or an empty string when nothing survives.
    """
    # Missing / non-text cells have no .lower(); treat them as empty documents
    if not isinstance(text, str):
        return ''
    if stopwords is None:
        stopwords = stop_words

    # Lowercase
    text = text.lower()
    # Remove tickers like $AAPL
    text = re.sub(r'\$\w+', '', text)
    # Remove punctuation and numbers, keep only letters and spaces
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize on whitespace and remove stop words
    tokens = [word for word in text.split() if word not in stopwords]
    # Join back into a string
    return ' '.join(tokens)

In [8]:




# 2d. Build the 'cleaned_text' column by cleaning every entry of 'text'
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Before/after preview of the first rows
print("--- Preprocessing Complete ---", "Example of cleaned text:", sep="\n")
preview = df[['text', 'cleaned_text']].head()
print(preview)
print("\n")


--- Preprocessing Complete ---
Example of cleaned text:
                                                text  cleaned_text
0         $NVDA launch reported — investors cautious        launch
1  $JPM insider sell reported — market reaction m...  insider sell
2      $TSLA filing reported — market reaction muted        filing
3                   $AAPL gdp reported — stock jumps           gdp
4         $MSFT layoff reported — investors cautious        layoff




## TF-IDF + NMF

We'll vectorize the texts with TF-IDF and run NMF to extract topics.
We will print the top words for each topic and assign a dominant topic to each document.


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np

In [10]:
# --- 3. TF-IDF Vectorization ---
# No stop_words argument is passed here: stop words are already stripped by
# preprocess_text when the 'cleaned_text' column is built.
vectorizer = TfidfVectorizer(
    min_df=2,     # ignore terms that occur in fewer than 2 documents
    max_df=0.95,  # ignore terms that occur in more than 95% of documents
)

In [11]:
# Fit the vectorizer on the cleaned text and build the TF-IDF matrix
# in a single step
cleaned_corpus = df['cleaned_text']
W = vectorizer.fit_transform(cleaned_corpus)

# Terms of the fitted vocabulary; used below to label each topic's top words
feature_names = vectorizer.get_feature_names_out()


In [12]:
# --- 4. Topic Modeling (NMF) ---

# One NMF component per original topic (there are 10)
n_topics = 10
model = NMF(
    n_components=n_topics,
    init='nndsvd',
    random_state=42,
)

In [13]:
# Fit NMF on the TF-IDF matrix and keep both factor matrices
W_topics = model.fit_transform(X=W)  # document-topic matrix
H_topics = model.components_         # topic-term matrix



In [14]:
# --- 5. Display Results ---

def print_top_words(H, feature_names, n_top_words=10):
    """Print one line per topic listing its highest-weighted terms.

    Args:
        H: Iterable of per-topic weight vectors (each must provide
            ``argsort``), e.g. the rows of a topic-term matrix.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to list per topic, heaviest first.
    """
    for topic_idx, weights in enumerate(H):
        # Ascending argsort, reversed, then truncated => heaviest terms first
        ranked_columns = weights.argsort()[::-1][:n_top_words]
        top_words = []
        for column in ranked_columns:
            top_words.append(feature_names[column])
        print(f"Topic {topic_idx}: {', '.join(top_words)}")

# List the ten strongest terms of every NMF topic
print("--- Top Words (After Cleaning) ---")
print_top_words(H=H_topics, feature_names=feature_names, n_top_words=10)
print("\n")


--- Top Words (After Cleaning) ---
Topic 0: insider, sell, lawsuit, logistics, cpi, yield, gdp, merger, inflation, acquire
Topic 1: supplier, released, regulatory, recession, yield, forecast, estimates, filing, fine, form
Topic 2: hiring, shareholder, estimates, form, fine, recruitment, gdp, rollout, yield, economy
Topic 3: compliance, rollout, new, product, forecast, merger, spinoff, buyout, guidance, gdp
Topic 4: probe, recruitment, logistics, headcount, delay, guidance, outlook, beat, yield, filing
Topic 5: port, fine, dividend, released, deal, inflation, acquire, earnings, eps, gdp
Topic 6: quarter, form, delay, buyback, deal, launch, spinoff, acquire, earnings, revenue
Topic 7: filing, guidance, launch, payout, layoff, return, cash, forecast, recession, hike
Topic 8: unveiled, shortage, chip, released, gdp, merger, recession, profits, spinoff, inflation
Topic 9: talent, buyback, estimates, cash, return, forecast, lawsuit, regulatory, integration, yield




In [15]:
# --- 6. Assign Topics to Documents ---
# A document's dominant topic is the column holding its largest topic weight.
# The stored values are NMF component indices, not 'topic_label' values.
dominant_topics = np.argmax(W_topics, axis=1)
df['pred_topic'] = dominant_topics

In [16]:

print("--- Example Predictions (After Cleaning) ---")
# Ten reproducibly sampled rows: original text, cleaned text, true label and
# the predicted NMF topic
preview_columns = ['text', 'cleaned_text', 'topic_label', 'pred_topic']
print(df[preview_columns].sample(n=10, random_state=42))

--- Example Predictions (After Cleaning) ---
                                                   text cleaned_text  \
6252    $NVDA earnings reported — market reaction muted     earnings   
4684         $BAC lawsuit reported — investors cautious      lawsuit   
1731             $MSFT lawsuit reported — analysts note      lawsuit   
4742              $AMZN buyout reported — analysts note       buyout   
4521        $NVDA economy reported — investors cautious      economy   
6340       $GOOG dividend reported — investors cautious     dividend   
576                     $JPM gdp reported — stock jumps          gdp   
5202  $MSFT shareholder reported — market reaction m...  shareholder   
6363               $AMZN acquire reported — stock jumps      acquire   
439      $AMZN outlook reported — market reaction muted      outlook   

     topic_label  pred_topic  
6252    earnings           8  
4684  regulation           9  
1731  regulation           9  
4742      merger           7  
4521   

# Logistic Regression

In [17]:
# Vectorize 'cleaned_text' into the TF-IDF feature matrix X; this re-fits the
# shared `vectorizer` on every row of the column
documents = df['cleaned_text']
X = vectorizer.fit_transform(documents)

In [18]:
# --- 4. Define Labels (y) ---
# Classification target: the true topic label of each row
label_column = 'topic_label'
y = df[label_column]

In [19]:
# --- 5. Train-Test Split ---
# Split the cleaned text rather than a TF-IDF matrix fitted on every row:
# fitting TF-IDF before splitting lets test-set vocabulary and IDF statistics
# leak into the training features. The split itself (80/20, stratified by
# label, seeded) is unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42, stratify=y
)

# Fresh, unfitted vectorizer configured exactly like `vectorizer`; it learns
# its vocabulary and IDF weights from the training rows only, and the test
# rows are merely transformed with what was learned.
clf_vectorizer = TfidfVectorizer(**vectorizer.get_params())
X_train = clf_vectorizer.fit_transform(text_train)
X_test = clf_vectorizer.transform(text_test)

In [20]:
# Report how many rows ended up in each split
print(
    f"Training set size: {X_train.shape[0]}",
    f"Test set size: {X_test.shape[0]}",
    sep="\n",
)
print("\n")

Training set size: 8000
Test set size: 2000




In [21]:
# Initialize the classifier (seeded, up to 1000 solver iterations).
# Note: this rebinds `model`, which held the NMF model earlier in the notebook.
model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)

In [22]:
# Fit the classifier on the training features and labels
model.fit(X=X_train, y=y_train)
print("Model training complete.")
print("\n")

Model training complete.




In [23]:
# --- 7. Evaluate Model ---
print("--- Model Evaluation ---")
# Predict a label for every row of the test set
y_pred = model.predict(X=X_test)


--- Model Evaluation ---


In [24]:
# Share of test rows whose predicted label equals the true label.
# NOTE(review): the recorded run shows 100.00%. After cleaning, each post
# appears to reduce to its keyword, which presumably maps one-to-one onto the
# label - confirm before reading this score as evidence of model quality.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\n")

Model Accuracy: 100.00%




In [25]:
# Per-class precision, recall and F1 on the test set
print("Classification Report:")
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)
print("\n")

Classification Report:
                 precision    recall  f1-score   support

       dividend       1.00      1.00      1.00       197
       earnings       1.00      1.00      1.00       200
       guidance       1.00      1.00      1.00       193
         hiring       1.00      1.00      1.00       198
insider_trading       1.00      1.00      1.00       195
          macro       1.00      1.00      1.00       205
         merger       1.00      1.00      1.00       204
 product_launch       1.00      1.00      1.00       204
     regulation       1.00      1.00      1.00       202
   supply_chain       1.00      1.00      1.00       202

       accuracy                           1.00      2000
      macro avg       1.00      1.00      1.00      2000
   weighted avg       1.00      1.00      1.00      2000





In [26]:
# Pull the test-set rows out of df (matched by index label) and attach the
# classifier's prediction to each of them in a new frame
test_indices = y_test.index
example_df = df.loc[test_indices].assign(predicted_label=y_pred)

In [27]:
# Display ten reproducibly sampled test rows with true and predicted labels
example_columns = ['text', 'cleaned_text', 'topic_label', 'predicted_label']
print(example_df[example_columns].sample(n=10, random_state=42))

                                                  text cleaned_text  \
6553                $BAC layoff reported — stock jumps       layoff   
6042               $AAPL Form 4 reported — stock jumps         form   
215                 $BAC deal reported — analysts note         deal   
7775  $META estimates reported — market reaction muted    estimates   
3738    $AMZN spinoff reported — market reaction muted      spinoff   
4781                 $AMZN fine reported — stock jumps         fine   
4868                 $MSFT port reported — stock jumps         port   
104          $GOOG compliance reported — analysts note   compliance   
6787    $BAC new product reported — investors cautious  new product   
2018   $AMZN shareholder reported — investors cautious  shareholder   

          topic_label  predicted_label  
6553           hiring           hiring  
6042  insider_trading  insider_trading  
215            merger           merger  
7775         guidance         guidance  
3738         