PREPROCESSING

In [1]:
'''Imports'''
import json
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy
from tqdm import tqdm
import warnings
import os
from rapidfuzz import process, fuzz

In [2]:
'''Reading Data'''
data_dir_comments = r"C:\Users\gungo\OneDrive\Desktop\stocks_comments.ndjson"
data_dir_sub = r"C:\Users\gungo\OneDrive\Desktop\stocks_submissions.ndjson"
df_com = pd.read_json(data_dir_comments, lines=True)
df_sub = pd.read_json(data_dir_sub, lines=True)

  df_com = pd.read_json(data_dir_comments, lines=True)


ValueError: Expected object or value

In [None]:
'''Reducing Data'''
df_com_reduced = df_com[['created_utc','score','body']]
df_sub_reduced = df_sub[['created_utc','score','selftext']]

# Rename columns
df_com_reduced = df_com_reduced.rename(columns={'body': 'text'})
df_sub_reduced = df_sub_reduced.rename(columns={'selftext': 'text'})

# Concatenate
df_merged = pd.concat([df_com_reduced, df_sub_reduced], ignore_index=True)

In [None]:
'''Pre-Processing'''
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    """Cleans, tokenizes, removes stopwords, and stems text."""
    if not isinstance(text, str):
        return ""
    #text = text.lower()       Not needed for NER. Actually makes it worse
    text = re.sub(r'&amp;#x200B;', '', text)
    text = re.sub('&amp;', '', text) # remove some special characters from the data &amp; corresponds to &
    text = re.sub(r'\s+', ' ', text)  # eliminate duplicate whitespaces using regex
    text = re.sub(r'\[[^]]*\]', '', text)  # remove text in square brackets
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'\binc\b', '', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    #text = ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)
    return text

def preprocess(df):
    """Preprocesses the 'body' or 'selftext' column and removes '[removed]' entries."""
    
    text_col = 'text'

    # Remove NaN and '[removed]' rows
    df = df[df[text_col].notna()]
    df = df[~df[text_col].str.contains(r'\[removed\]', na=False)]
    df = df[~df[text_col].str.contains(r'\[deleted\]', na=False)]

    # Apply text preprocessing
    df['processed_text'] = df[text_col].apply(preprocess_text)

    return df.reset_index(drop=True)

pre_processed_df = preprocess(df_merged)

In [None]:
'''Display'''
pre_processed_df.head(20)

Unnamed: 0,created_utc,score,text,processed_text
0,1654041658,-1,Musk is a clown. He knew 50% of his followers ...,musk clown. knew 50% followers bots. knew twit...
1,1654041696,100,What's the cumulative short loss? $50 billion ...,what's cumulative short loss? $50 billion coun...
2,1654041706,2,"Quantum computing is physics, but physics isn'...","quantum computing physics, physics business. p..."
3,1654041743,62,MANGA,manga
4,1654041839,8,AMD?\n\nThey sell on the merits of their produ...,"amd? sell merits products, open source softwar..."
5,1654041840,12,Highly coincidental that this drastic drop in ...,highly coincidental drastic drop price happene...
6,1654041851,2,"Of course you can time the market, on a macro ...","course time market, macro basis - follow fed. ..."
7,1654041856,1,However the issue is with the decay. It may s...,however issue decay. may show 100% gains hits ...
8,1654041865,13,They exclude the 5% they know about.\n\nAnd it...,exclude 5% know about. matter. advertisers get...
9,1654041907,11,The board dgaf what Dorsey days.,board dgaf dorsey days.


NAMED ENTITY RECOGNITION

In [None]:
top100_path = r'C:\Users\gungo\OneDrive\Dokumente\GitHub\NLP-Group-10\Top_100.csv'
Top_100 = pd.read_csv(top100_path, encoding='latin1')

Top_100.columns = [col.strip().lower() for col in Top_100.columns]

# Create dictionaries for fast lookups
ticker_to_name = dict(zip(Top_100['symbol'].str.upper(), Top_100['name']))
valid_tickers = set(ticker_to_name.keys())
company_names = [name.lower() for name in ticker_to_name.values()]
name_to_ticker = {name.lower(): symbol for symbol, name in ticker_to_name.items()}

In [None]:

#model
nlp = spacy.load('en_core_web_sm')

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

#to be ran per comment
def extract_ner_entities(model, text, similarity_threshold=90):
    
    BLACKLIST = {'ev', 'covid', 'etf', 'nyse', 'sec', 'spac', 'fda', 'treasury', 'covid-19', 'rrsp', 'tfsa','fed'}
    doc = model(text)
    detected_companies = []

    #Detect companies via spaCy NER
    for ent in doc.ents:
        if ent.label_ == "ORG" and ent.text.lower() not in BLACKLIST:
            org_name = ent.text.strip()
            # Fuzzy match against official company names from csv file
            match, score, _ = process.extractOne(org_name.lower(), company_names, scorer=fuzz.token_sort_ratio)
            if score >= similarity_threshold:
                matched_ticker = name_to_ticker[match]
                canonical_name = ticker_to_name[matched_ticker]
                detected_companies.append(canonical_name)
            else:
                #keeps companies not in csv file maybe delete later
                detected_companies.append(org_name)

    # --- Match stock tickers in text ---
    for token in doc:
        token_text = token.text.strip()

        # Handle tickers with $ prefix, e.g. $AAPL
        if token_text.startswith("$"):
            token_text = token_text[1:]

        # Check if it’s a valid ticker symbol
        if token_text in valid_tickers:
            company_name = ticker_to_name.get(token_text)
            detected_companies.append(company_name)


    return list(set(detected_companies))


def get_dict_top_companies(dataset, column_name, top_companies=10):
    company_counter = dict()
    for companies in dataset[column_name]:
        for company in companies:
            if company in company_counter:  
                company_counter[company] += 1
            else:
                company_counter[company] = 1
    sorted_dict = dict(sorted(company_counter.items(), key=lambda x: x[1], reverse=True))
    top = dict()
    for company, count in list(sorted_dict.items())[:top_companies]:
        top[company] = count

    return top


In [None]:
# Ensure every cell is a string (NaN -> "")
pre_processed_df['processed_text'] = pre_processed_df['processed_text'].fillna("").astype(str)

# Safe wrapper so extract_ner_entities always receives a string
def safe_extract(text):
    if not text or not isinstance(text, str):
        return []
    return extract_ner_entities(nlp, text, similarity_threshold=60)

pre_processed_df["Companies"] = pre_processed_df['processed_text'].apply(safe_extract)

In [None]:
# --- Submissions ---
top_dict = get_dict_top_companies(pre_processed_df, "Companies")
top_set = set(top_dict.keys())
print(f"Top companies: {sorted(top_set)}")

masked = pre_processed_df['Companies'].apply(lambda lst: bool(top_set.intersection(lst)))
filtered_df = pre_processed_df[masked].copy()

exploded_sub = filtered_df.explode('Companies')

# Keep only rows for top companies 
exploded_sub = exploded_sub[exploded_sub['Companies'].isin(top_set)].copy()

# Create separate dataframes for each top company 
dfs_by_company = {}
os.makedirs("companies_csv", exist_ok=True)

for company in top_set:
    dfs_by_company[company] = exploded_sub[exploded_sub['Companies'] == company].copy()
    file_path = os.path.join("companies_csv", f"{company}_submissions.csv")
    dfs_by_company[company].to_csv(file_path, index=False)
    print(f"{company}: {len(dfs_by_company[company])} rows saved to {file_path}")

print("\nPreview of each company's SUBMISSIONS dataframe:\n")
for company, df_company in dfs_by_company.items():
    print(f"=== {company.upper()} ({len(df_company)} rows) ===")
    display(df_company.head(3))   
    print("\n")

filtered_df.to_csv("filtered_top_companies.csv", index=False)  #not really needed, just the dataframe with all top companies submissions
print("Saved filtered_top_companies.csv")
