In [1]:
# import relevant libraries
import pandas as pd
from utilities import *

In [2]:
# Create a mapping of stock symbols to company names removing common prefixes and suffixes  
# normalize to lowercase
dtypes = {
    'Symbol': str,
    'Security Name': str
}

nasdaq_listed = pd.read_csv('nasdaqlisted.txt', sep='|', usecols=['Symbol', 'Security Name'], dtype=dtypes)
nasdaq_listed['Security Name'] = nasdaq_listed['Security Name'].astype(str).map(get_company_name)

other_listed = pd.read_csv('otherlisted.txt', sep='|', usecols=['ACT Symbol', 'Security Name'])
other_listed['Security Name'] = other_listed['Security Name'].astype(str).map(get_company_name)
other_listed = other_listed.rename(columns={'ACT Symbol': 'Symbol'})

all_symbols = pd.concat([nasdaq_listed, other_listed], ignore_index=True)
all_symbols['Symbol'] = all_symbols['Symbol'].astype(str).map(str.lower)
# Export the concatenated dataframe to CSV
all_symbols.to_csv('data/all_stock_symbols.csv', index=False)

In [3]:
stocks_posts = pd.read_csv('data/stocks/hot/year=2025/month=02/day=19/posts.csv')

In [4]:
stocks_posts.head(5)
all_symbols[all_symbols['Symbol'] == 'dava']

Unnamed: 0,Symbol,Security Name
6095,dava,endava


In [5]:
stocks_posts.head()

Unnamed: 0,title,url,subreddit,author,score,num_comments,created_utc,post_name
0,Rate My Portfolio - r/Stocks Quarterly Thread ...,https://www.reddit.com/r/stocks/comments/1h403...,stocks,AutoModerator,50,481,1733047000.0,t3_1h403u4
1,"r/Stocks Daily Discussion Wednesday - Feb 19, ...",https://www.reddit.com/r/stocks/comments/1it2f...,stocks,AutoModerator,6,12,1739961000.0,t3_1it2fy5
2,Does anyone else feel uneasy about investing g...,https://www.reddit.com/r/stocks/comments/1it2h...,stocks,hekatonkhairez,493,277,1739961000.0,t3_1it2hds
3,GameStop looks to sell its Canadian and French...,https://www.reddit.com/r/stocks/comments/1isr9...,stocks,1slinkydink1,1231,460,1739923000.0,t3_1isr9u7
4,What’s up with UnitedHealth Group falling 4.29...,https://www.reddit.com/r/stocks/comments/1isxk...,stocks,Celestial_Inferno,99,85,1739941000.0,t3_1isxkx9


In [6]:
import string

def find_stock_symbols_in_title(title, all_symbols):
    # Replace punctuation with spaces in the title
    title_no_punct = title.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    
    # Split the title into words
    words = title_no_punct.lower().split()
     
    # Get the list of stock symbols
    stock_symbols = all_symbols.tolist()
    
    # Find matching symbols
    matching_symbols = [word for word in words if word in stock_symbols]
    
    return matching_symbols

In [7]:
stocks_posts.drop(columns=['matching_symbols', 'matching_titles'], inplace=True)

KeyError: "['matching_symbols', 'matching_titles'] not found in axis"

In [8]:
stocks_posts['matching_symbols'] = stocks_posts['title'].apply(lambda x: find_stock_symbols_in_title(x, all_symbols['Symbol']))
stocks_posts['matching_titles'] = stocks_posts['title'].apply(lambda x: find_stock_symbols_in_title(x, all_symbols['Security Name']))

In [9]:
stocks_posts.head(15)

Unnamed: 0,title,url,subreddit,author,score,num_comments,created_utc,post_name,matching_symbols,matching_titles
0,Rate My Portfolio - r/Stocks Quarterly Thread ...,https://www.reddit.com/r/stocks/comments/1h403...,stocks,AutoModerator,50,481,1733047000.0,t3_1h403u4,"[rate, r]",[]
1,"r/Stocks Daily Discussion Wednesday - Feb 19, ...",https://www.reddit.com/r/stocks/comments/1it2f...,stocks,AutoModerator,6,12,1739961000.0,t3_1it2fy5,[r],[]
2,Does anyone else feel uneasy about investing g...,https://www.reddit.com/r/stocks/comments/1it2h...,stocks,hekatonkhairez,493,277,1739961000.0,t3_1it2hds,"[else, all, u, s]",[]
3,GameStop looks to sell its Canadian and French...,https://www.reddit.com/r/stocks/comments/1isr9...,stocks,1slinkydink1,1231,460,1739923000.0,t3_1isr9u7,[],"[gamestop, star]"
4,What’s up with UnitedHealth Group falling 4.29...,https://www.reddit.com/r/stocks/comments/1isxk...,stocks,Celestial_Inferno,99,85,1739941000.0,t3_1isxkx9,[up],[]
5,Feel like there is no clear plays right now,https://www.reddit.com/r/stocks/comments/1isij...,stocks,FireHamilton,540,653,1739901000.0,t3_1isijjm,[now],[]
6,BAE Systems and Rheinmetall a buy?,https://www.reddit.com/r/stocks/comments/1it2f...,stocks,Napoleonade,12,15,1739961000.0,t3_1it2f72,[a],[]
7,Goog - undervalued mag 7,https://www.reddit.com/r/stocks/comments/1isld...,stocks,DOGEtothemoon21,184,116,1739907000.0,t3_1isldc2,"[goog, mag]",[]
8,Opinions on BlackBerry?,https://www.reddit.com/r/stocks/comments/1isso...,stocks,brethezio,57,42,1739927000.0,t3_1issobj,[on],"[on, blackberry]"
9,Finding new stocks,https://www.reddit.com/r/stocks/comments/1iss7...,stocks,Nearby_Initial8772,26,48,1739926000.0,t3_1iss7uw,[],[new]


In [10]:
import spacy

nlp = spacy.load("en_core_web_trf")

In [17]:
def ner_title_extract_orgs(title): 
    doc = nlp(title)
    return [ent.text for ent in doc.ents if ent.label_ == 'ORG']

In [18]:
stocks_posts['matching_orgs'] = stocks_posts['title'].apply(lambda x: ner_title_extract_orgs(x))

In [19]:
stocks_posts.head(15)

Unnamed: 0,title,url,subreddit,author,score,num_comments,created_utc,post_name,matching_symbols,matching_titles,matching_orgs
0,Rate My Portfolio - r/Stocks Quarterly Thread ...,https://www.reddit.com/r/stocks/comments/1h403...,stocks,AutoModerator,50,481,1733047000.0,t3_1h403u4,"[rate, r]",[],[]
1,"r/Stocks Daily Discussion Wednesday - Feb 19, ...",https://www.reddit.com/r/stocks/comments/1it2f...,stocks,AutoModerator,6,12,1739961000.0,t3_1it2fy5,[r],[],[]
2,Does anyone else feel uneasy about investing g...,https://www.reddit.com/r/stocks/comments/1it2h...,stocks,hekatonkhairez,493,277,1739961000.0,t3_1it2hds,"[else, all, u, s]",[],[]
3,GameStop looks to sell its Canadian and French...,https://www.reddit.com/r/stocks/comments/1isr9...,stocks,1slinkydink1,1231,460,1739923000.0,t3_1isr9u7,[],"[gamestop, star]","[GameStop, Toronto Star]"
4,What’s up with UnitedHealth Group falling 4.29...,https://www.reddit.com/r/stocks/comments/1isxk...,stocks,Celestial_Inferno,99,85,1739941000.0,t3_1isxkx9,[up],[],[UnitedHealth Group]
5,Feel like there is no clear plays right now,https://www.reddit.com/r/stocks/comments/1isij...,stocks,FireHamilton,540,653,1739901000.0,t3_1isijjm,[now],[],[]
6,BAE Systems and Rheinmetall a buy?,https://www.reddit.com/r/stocks/comments/1it2f...,stocks,Napoleonade,12,15,1739961000.0,t3_1it2f72,[a],[],"[BAE Systems, Rheinmetall]"
7,Goog - undervalued mag 7,https://www.reddit.com/r/stocks/comments/1isld...,stocks,DOGEtothemoon21,184,116,1739907000.0,t3_1isldc2,"[goog, mag]",[],[Goog]
8,Opinions on BlackBerry?,https://www.reddit.com/r/stocks/comments/1isso...,stocks,brethezio,57,42,1739927000.0,t3_1issobj,[on],"[on, blackberry]",[BlackBerry]
9,Finding new stocks,https://www.reddit.com/r/stocks/comments/1iss7...,stocks,Nearby_Initial8772,26,48,1739926000.0,t3_1iss7uw,[],[new],[]


In [20]:
stocks_posts.drop(columns=['url', 'subreddit', 'author'], inplace=True)

In [21]:
stocks_posts.head(15)

Unnamed: 0,title,score,num_comments,created_utc,post_name,matching_symbols,matching_titles,matching_orgs
0,Rate My Portfolio - r/Stocks Quarterly Thread ...,50,481,1733047000.0,t3_1h403u4,"[rate, r]",[],[]
1,"r/Stocks Daily Discussion Wednesday - Feb 19, ...",6,12,1739961000.0,t3_1it2fy5,[r],[],[]
2,Does anyone else feel uneasy about investing g...,493,277,1739961000.0,t3_1it2hds,"[else, all, u, s]",[],[]
3,GameStop looks to sell its Canadian and French...,1231,460,1739923000.0,t3_1isr9u7,[],"[gamestop, star]","[GameStop, Toronto Star]"
4,What’s up with UnitedHealth Group falling 4.29...,99,85,1739941000.0,t3_1isxkx9,[up],[],[UnitedHealth Group]
5,Feel like there is no clear plays right now,540,653,1739901000.0,t3_1isijjm,[now],[],[]
6,BAE Systems and Rheinmetall a buy?,12,15,1739961000.0,t3_1it2f72,[a],[],"[BAE Systems, Rheinmetall]"
7,Goog - undervalued mag 7,184,116,1739907000.0,t3_1isldc2,"[goog, mag]",[],[Goog]
8,Opinions on BlackBerry?,57,42,1739927000.0,t3_1issobj,[on],"[on, blackberry]",[BlackBerry]
9,Finding new stocks,26,48,1739926000.0,t3_1iss7uw,[],[new],[]
