In [3]:
!pip install vaderSentiment

Collecting vaderSentiment
  Using cached vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
[0m

In [7]:
import re
import nltk
import praw
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

True

In [8]:
def preprocess_df(df):
    def preprocess_text(text):
        
        # Remove specific unwanted characters
        text = re.sub(r'[^A-Za-z0-9\s,.!?;:()\'\"-]', '', text)
        
        # Strip whitespace
        text = text.strip()
        
        return text
    
    try:
        df['title'] = df['title'].fillna('')
        df['title'] = df['title'].apply(preprocess_text)

        # Create a new column with shifted values
        df['title_prior'] = df['title'].shift(1)
        # Drop rows where the value in 'ColumnToCheck' is the same as in 'ShiftedColumn'
        df = df[df['title'] != df['title_prior']]
        # drop the 'ShiftedColumn'
        df = df.drop('title_prior', axis=1)
        
    except:
        print(f'No title found, skipping')
    
    
    # handle blank 
    df['selftext'] = df['selftext'].fillna('') 
    
    # preprocess selftext
    df['selftext'] = df['selftext'].apply(preprocess_text)
    
    # localize the UTC time stamp
    df['created_EST_date'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC').dt.tz_convert('America/New_York').dt.date
    
    
    return df

In [9]:
def find_tickers(df):
 
    # Load tickers from a CSV file
    stocks = pd.read_csv('nasdaq_screener_1700463382148.csv')

    # Directly convert the 'Symbol' column to a set
    tickers_set = set(stocks['Symbol'])

    # Use set union to add additional tickers
    additional_tickers = {'BBBY'}
    tickers_set = tickers_set.union(additional_tickers)
    
    # Adjust the pattern to optionally include a leading '$'
    # pattern = r'\b\$?(?:' + '|'.join(tickers_as_strings) + r')\b'
    # pattern = r'\b\$?(?:\(?)(?:' + '|'.join(tickers_as_strings) + r')(?:\)?)\b'
    pattern = r'\b[A-Z]{2,5}\b'
    compiled_pattern = re.compile(pattern)
    
    
    
    blacklist = {
#                 {'I', 'ELON', 'WSB', 'THE', 'A', 'ROPE', 'YOLO', 'TOS', 'CEO', 'DD', 'IT', 'OPEN', 'ATH', 'PM', 'IRS', 'FOR',
#              'DEC', 'BE',
                'IMO',# 'ALL', 'RH', 'EV', 'TOS', 'CFO', 'CTO',
                'DD',
                #'BTFD', 'WSB', 'OK', 'PDT', 'RH', 'KYS', 'FD',
#              'TYS', 
                'US',
                'USA',
                # 'IT', 'ATH', 'RIP', 'BMW', 'GDP', 'OTM', 'ATM', 'ITM', 
                'IMO', 'LOL', 'AM', 'BE', 'PR', 'PRAY',
#              'PT', 'FBI', 'SEC', 'GOD', 'NOT', 'POS', 'FOMO', 'TL;DR',
                'EDIT', 'STILL', 'WTF', 'RAW', 'PM', 'LMAO', 'LMFAO',
#              'ROFL', 'EZ', 'RED', 'BEZOS', 'TICK', 'IS', 'PM', 'LPT', 'GOAT', 'FL', 'CA', 'IL', 'MACD', 'HQ', 'OP', 'PS', 'AH',
#              'TL', 'JAN', 'FEB', 'JUL', 'AUG', 'SEP', 'SEPT', 'OCT', 'NOV', 'FDA', 'IV', 'ER', 'IPO', 'MILF', 'BUT', 'SSN', 'FIFA',
#              'USD', 'CPU', 'AT', 'GG', 'Mar', 
            
#                # Jake added
                'RUN', # common
                'SAY', # common
                'EOD', # end of day
                'BIG', # common
                'LOW', # low / high
                'RSI', #relative strenght
                'DT', #double top
                'HUGE',
                'U', # you
                'AI', # Artificial Intelligence
                'DC', # washington DC
                'J', # as in J Powell
                'ES', # E-mini SP future
                'F', # f*ck
                'GO',
                'UK', # United Kingdom
                'EU', # european union
                'RH', # Robinhood, not Restoration Hardware
                'E', # E*trade brokerage
                'L', # L for loss, P&L etc
                'R', # common 
                'K', # OK
                'B', # common in BBBY odd spacing (spam?)
                'TD', # TD Ameritrade brokerage
                'RYAN', # Ryan Cohen, CEO of GME
                'NYC', # New York City
                'REG', # reg SHO 
                'SHO', # reg SHO 
                'NEXT', # common
                'FREE', # spam
                'DM', # direct message
                'TV', # television
                'ENS', # ethereum name service, spam
                'IRS', # internal revenue service
                'PR', # public relations
                'IQ', # intelligence quotient
                'VS', # versus
                'PT', # price target
                'IBKR', # interactive brokers
                'GOOD', # common
                'OPEN', # market open
                'FCF', # free cash flow
        
                 
                }
    
    combined_blacklist = set(blacklist) | set(word.upper() for word in stopwords.words('english'))
       
    
    def find_tickers(text, compiled_pattern, tickers_set, blacklist_set):
        # Find all matches
        potential_tickers = compiled_pattern.findall(text)
        # Filter matches against the tickers list and ensure they are not in the blacklist
        return list(set([ticker for ticker in potential_tickers if ticker in tickers_set and ticker not in combined_blacklist]))

    try:
        df['title_tickers'] = df['title'].apply(lambda x: find_tickers(x, compiled_pattern, tickers_set, combined_blacklist))
    except KeyError:
        print('title not found, working with comments?')
        
    
    df['selftext_tickers'] = df['selftext'].apply(lambda x: find_tickers(x, compiled_pattern, tickers_set, combined_blacklist))
    
    df['tickers'] = [list(set(x + y)) for x, y in zip(df['title_tickers'], df['selftext_tickers'])]
    
    return df

In [10]:
def add_vader_sentiment(df):
    
    vader = SentimentIntensityAnalyzer()

    added_words = {
            'citron': -4.0,  
            'hidenburg': -4.0,        
            'moon': 4.0,
            'highs': 2.0,
            'mooning': 4.0,
            'long': 2.0,
            'short': -2.0,
            'call': 4.0,
            'calls': 4.0,    
            'put': -4.0,
            'puts': -4.0,    
            'break': 2.0,
            'tendie': 2.0,
            'tendies': 2.0,
            'town': 2.0,     
            'overvalued': -3.0,
            'undervalued': 3.0,
            'buy': 4.0,
            'sell': -4.0,
            'gone': -1.0,
            'gtfo': -1.7,
            'paper': -1.7,
            'bullish': 3.7,
            'bearish': -3.7,
            'bagholder': -1.7,
            'stonk': 1.9,
            'green': 1.9,
            'money': 1.2,
            'print': 2.2,
            'rocket': 2.2,
            'bull': 2.9,
            'bear': -2.9,
            'pumping': -1.0,
            'sus': -3.0,
            'offering': -2.3,
            'rip': -4.0,
            'downgrade': -3.0,
            'upgrade': 3.0,     
            'maintain': 1.0,          
            'pump': 1.9,
            'hot': 1.5,
            'drop': -2.5,
            'rebound': 1.5,  
            'crack': 2.5,
            '🚀': 3, # Jake ADDED THESE
            '🌕': 3, # Jake ADDED THESE
            'YOLO': 4, # Jake ADDED THESE
            'ripping': 3,# Jake ADDED THESE
            'regarded': 0, # Jake ADDED THESE
            'squeeze':3, # Jake ADDED THESE
            }
    
    vader.lexicon.update(added_words)

    def safe_sentiment(text):
        try:
            # Ensure the input is a non-empty string
            if not isinstance(text, str) or not text.strip():
                return 0

            # Analyze the sentiment
            sentiment_dict = vader.polarity_scores(text)
            return sentiment_dict.get('compound', 0)
        except Exception as e:
            print(f"Error processing text: '{text}' (type: {type(text)}). Error: {e}")
            return 0

    # Apply the function
    try:
        df['title_sentiment'] = df['title'].apply(safe_sentiment)
    except:
        print('Titles not found, is this a comments file?')
        df['title_sentiment'] = 0
        
    df['selftext_sentiment'] = df['selftext'].apply(safe_sentiment)
    
    
    def non_zero_average(row):
        sentiments = [row['title_sentiment'], row['selftext_sentiment']]
        non_zero_sentiments = [s for s in sentiments if s != 0]

        if not non_zero_sentiments:
            return 0  # Return 0 if both sentiments are zero

        return sum(non_zero_sentiments) / len(non_zero_sentiments)

    # Apply the function to calculate overall sentiment
    df['overall_sentiment'] = df.apply(non_zero_average, axis=1)
    
    df['score_weighted_sentiment'] = df['overall_sentiment'] * df['score']

    return df

In [11]:
def get_reddit_praw_submissions(limit):
    
    client_id = 'aWEYVIaAoJGlCPja3awh0A'
    secret = 'gOR5FfkvsTH3MJ0IHRSImToTwt0PSQ'

    reddit = praw.Reddit(
                        client_id=client_id,
                        client_secret=secret,
                        user_agent="MADS/0.1 by TeamSafari",
                    )
    submissions_data = []

    for submission in reddit.subreddit("wallstreetbets").new(limit=limit):
        # print(dir(submission))
        data = {
            'id': submission.id,
            'subreddit_id': submission.subreddit_id,
            'subreddit': submission.subreddit,
            'author': submission.author,
            'created_utc': submission.created_utc,
            'permalink': submission.permalink,
            'title': submission.title,
            'selftext': submission.selftext,
            'num_comments': submission.num_comments,
            'score': submission.score,
            'flair': submission.link_flair_text,
            'removal_reason':submission.removal_reason,
            

            # Add more fields as needed
        }
        submissions_data.append(data)

    df = pd.DataFrame(submissions_data)
    return df

In [12]:
df = get_reddit_praw_submissions(limit=1000)

In [13]:
praw_df = preprocess_df(df)
praw = find_tickers(praw_df)
praw = add_vader_sentiment(praw_df)

In [14]:
praw

Unnamed: 0,id,subreddit_id,subreddit,author,created_utc,permalink,title,selftext,num_comments,score,flair,removal_reason,created_EST_date,title_tickers,selftext_tickers,tickers,title_sentiment,selftext_sentiment,overall_sentiment,score_weighted_sentiment
0,18e3lhy,t5_2th52,wallstreetbets,Snoo-27151,1.702090e+09,/r/wallstreetbets/comments/18e3lhy/4_things_th...,4 Things That Could Cause Stocks to Plunge,The first is all about the number 4607. Thats ...,2,1,News,,2023-12-08,[],[SP],[SP],0.0000,-0.9557,-0.95570,-0.9557
1,18e3koh,t5_2th52,wallstreetbets,Grinkol,1.702090e+09,/r/wallstreetbets/comments/18e3koh/watching_le...,Watching Leave the world behind... Puts on TSL...,Tldr watch it on Netflix and you'll see why...,1,1,Meme,,2023-12-08,"[TSLA, NFLX]",[],"[TSLA, NFLX]",-0.0516,0.0000,-0.05160,-0.0516
2,18e3kgd,t5_2th52,wallstreetbets,B3stAuD1t0rofA11tiME,1.702090e+09,/r/wallstreetbets/comments/18e3kgd/check_out_t...,Check Out These Shitty Christmas Tree Ornaments,,2,1,Meme,,2023-12-08,[],[],[],-0.5574,0.0000,-0.55740,-0.5574
3,18e3dci,t5_2th52,wallstreetbets,Snoo-27151,1.702089e+09,/r/wallstreetbets/comments/18e3dci/why_we_expe...,Why We Expect the Job Markets Slowdown to Rene...,Nonfarm payroll employment increased at a 1.6 ...,3,1,News,,2023-12-08,[],[],[],0.0000,0.1699,0.16990,0.1699
4,18e39nc,t5_2th52,wallstreetbets,lavatonic,1.702089e+09,/r/wallstreetbets/comments/18e39nc/i_love_ya_2...,"I love ya, 2 MARA positions",,3,2,YOLO,,2023-12-08,[MARA],[],[MARA],0.6369,0.0000,0.63690,1.2738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,180hlin,t5_2th52,wallstreetbets,devpassion,1.700575e+09,/r/wallstreetbets/comments/180hlin/is_it_wise_...,Is it Wise to Invest in Every Stock in the Mar...,"Hey fellow investors, I've been pondering the ...",44,0,Discussion,,2023-11-21,[],[],[],0.5255,0.7220,0.62375,0.0000
893,180h4a3,t5_2th52,wallstreetbets,Superduke1010,1.700574e+09,/r/wallstreetbets/comments/180h4a3/wanna_a_spe...,Wanna a Spec Cannabis stock with Pharma Potent...,Getting mercilessly manipulated and is poised ...,11,5,DD,,2023-11-21,[],[],[],0.0000,0.7003,0.70030,3.5015
894,180h2pc,t5_2th52,wallstreetbets,thejungleboomer,1.700574e+09,/r/wallstreetbets/comments/180h2pc/charting_an...,Charting and TradingView,Anyone else notice how they are almost identic...,12,2,Discussion,,2023-11-21,[],[],[],0.0000,0.7147,0.71470,1.4294
895,180h1qe,t5_2th52,wallstreetbets,Natural_Tea484,1.700574e+09,/r/wallstreetbets/comments/180h1qe/need_advice...,"Need advice, buy SP 500 now or wait for a nega...",SP 500 is almost all time high now. Should I ...,59,0,Discussion,,2023-11-21,[SP],[SP],[SP],0.3182,0.9622,0.64020,0.0000


In [22]:
# create cumulative_sentiment_sorted_df
exploded_df = praw.explode('tickers')
cumulative_sentiment = exploded_df.groupby('tickers')['overall_sentiment'].sum().reset_index() # Group by 'tickers'
cumulative_sentiment.columns = ['Ticker', 'Cumulative Overall Sentiment'] # Rename columns for clarity
cumulative_sentiment_sorted = cumulative_sentiment.sort_values(by='Cumulative Overall Sentiment', ascending=False)
# display(cumulative_sentiment_sorted)

# Group by 'tickers' and sum the 'score_weighted_sentiment'
cumulative_weighted_sentiment = exploded_df.groupby('tickers')['score_weighted_sentiment'].sum().reset_index()
cumulative_weighted_sentiment.columns = ['Ticker', 'Cumulative Weighted Sentiment'] # Rename columns for clarity
cumulative_weighted_sentiment_sorted = cumulative_weighted_sentiment.sort_values(by='Cumulative Weighted Sentiment', ascending=False)
cumulative_weighted_sentiment_sorted['Date'] = praw['created_EST_date']
cumulative_weighted_sentiment_sorted['Date'].fillna(praw['created_EST_date'].unique()[0],inplace=True)
# display(cumulative_weighted_sentiment_sorted)

daily_sentiment_df = cumulative_sentiment_sorted.merge(cumulative_weighted_sentiment_sorted, on='Ticker')

In [23]:
daily_sentiment_df

Unnamed: 0,Ticker,Cumulative Overall Sentiment,Cumulative Weighted Sentiment,Date
0,SAVE,8.10645,303.49115,2023-12-06
1,NVDA,7.58940,4016.17505,2023-12-07
2,META,4.60945,6752.27870,2023-12-07
3,MSFT,4.24275,767.54190,2023-12-07
4,AMD,4.04835,-301.14940,2023-12-08
...,...,...,...,...
194,BBVA,-0.95110,-10.46210,2023-12-08
195,TGS,-0.95110,-10.46210,2023-12-06
196,SUPV,-0.95110,-10.46210,2023-12-06
197,YPF,-0.95110,-10.46210,2023-12-05
