In [3]:
#!pip install vaderSentiment

In [5]:
import re
import nltk
import praw
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
#nltk.download('stopwords', quiet=True)

In [5]:
def preprocess_df(df):
    def preprocess_text(text):
        
        # Remove specific unwanted characters
        text = re.sub(r'[^A-Za-z0-9\s,.!?;:()\'\"-]', '', text)
        
        # Strip whitespace
        text = text.strip()
        
        return text
    
    try:
        df['title'] = df['title'].fillna('')
        df['title'] = df['title'].apply(preprocess_text)

        # Create a new column with shifted values
        df['title_prior'] = df['title'].shift(1)
        # Drop rows where the value in 'ColumnToCheck' is the same as in 'ShiftedColumn'
        df = df[df['title'] != df['title_prior']]
        # drop the 'ShiftedColumn'
        df = df.drop('title_prior', axis=1)
        
    except:
        print(f'No title found, skipping')
    
    
    # handle blank 
    df['selftext'] = df['selftext'].fillna('') 
    
    # preprocess selftext
    df['selftext'] = df['selftext'].apply(preprocess_text)
    
    # localize the UTC time stamp
    df['created_EST_date'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC').dt.tz_convert('America/New_York').dt.date
    
    
    return df

In [6]:
def find_tickers(df):
 
    # Load tickers from a CSV file
    stocks = pd.read_csv('r/root/Git_Repo/SIADS_Capstone_Group17/Data/nasdaq_screener.csv')

    # Directly convert the 'Symbol' column to a set
    tickers_set = set(stocks['Symbol'])

    # Use set union to add additional tickers
    additional_tickers = {'BBBY'}
    tickers_set = tickers_set.union(additional_tickers)
    
    # Adjust the pattern to optionally include a leading '$'
    # pattern = r'\b\$?(?:' + '|'.join(tickers_as_strings) + r')\b'
    # pattern = r'\b\$?(?:\(?)(?:' + '|'.join(tickers_as_strings) + r')(?:\)?)\b'
    pattern = r'\b[A-Z]{2,5}\b'
    compiled_pattern = re.compile(pattern)
    
    
    
    blacklist = {
#                 {'I', 'ELON', 'WSB', 'THE', 'A', 'ROPE', 'YOLO', 'TOS', 'CEO', 'DD', 'IT', 'OPEN', 'ATH', 'PM', 'IRS', 'FOR',
#              'DEC', 'BE',
                'IMO',# 'ALL', 'RH', 'EV', 'TOS', 'CFO', 'CTO',
                'DD',
                #'BTFD', 'WSB', 'OK', 'PDT', 'RH', 'KYS', 'FD',
#              'TYS', 
                'US',
                'USA',
                # 'IT', 'ATH', 'RIP', 'BMW', 'GDP', 'OTM', 'ATM', 'ITM', 
                'IMO', 'LOL', 'AM', 'BE', 'PR', 'PRAY',
#              'PT', 'FBI', 'SEC', 'GOD', 'NOT', 'POS', 'FOMO', 'TL;DR',
                'EDIT', 'STILL', 'WTF', 'RAW', 'PM', 'LMAO', 'LMFAO',
#              'ROFL', 'EZ', 'RED', 'BEZOS', 'TICK', 'IS', 'PM', 'LPT', 'GOAT', 'FL', 'CA', 'IL', 'MACD', 'HQ', 'OP', 'PS', 'AH',
#              'TL', 'JAN', 'FEB', 'JUL', 'AUG', 'SEP', 'SEPT', 'OCT', 'NOV', 'FDA', 'IV', 'ER', 'IPO', 'MILF', 'BUT', 'SSN', 'FIFA',
#              'USD', 'CPU', 'AT', 'GG', 'Mar', 
            
#                # Jake added
                'RUN', # common
                'SAY', # common
                'EOD', # end of day
                'BIG', # common
                'LOW', # low / high
                'RSI', #relative strenght
                'DT', #double top
                'HUGE',
                'U', # you
                'AI', # Artificial Intelligence
                'DC', # washington DC
                'J', # as in J Powell
                'ES', # E-mini SP future
                'F', # f*ck
                'GO',
                'UK', # United Kingdom
                'EU', # european union
                'RH', # Robinhood, not Restoration Hardware
                'E', # E*trade brokerage
                'L', # L for loss, P&L etc
                'R', # common 
                'K', # OK
                'B', # common in BBBY odd spacing (spam?)
                'TD', # TD Ameritrade brokerage
                'RYAN', # Ryan Cohen, CEO of GME
                'NYC', # New York City
                'REG', # reg SHO 
                'SHO', # reg SHO 
                'NEXT', # common
                'FREE', # spam
                'DM', # direct message
                'TV', # television
                'ENS', # ethereum name service, spam
                'IRS', # internal revenue service
                'PR', # public relations
                'IQ', # intelligence quotient
                'VS', # versus
                'PT', # price target
                'IBKR', # interactive brokers
                'GOOD', # common
                'OPEN', # market open
                'FCF', # free cash flow
        
                 
                }
    
    combined_blacklist = set(blacklist) | set(word.upper() for word in stopwords.words('english'))
       
    
    def find_tickers(text, compiled_pattern, tickers_set, blacklist_set):
        # Find all matches
        potential_tickers = compiled_pattern.findall(text)
        # Filter matches against the tickers list and ensure they are not in the blacklist
        return list(set([ticker for ticker in potential_tickers if ticker in tickers_set and ticker not in combined_blacklist]))

    try:
        df['title_tickers'] = df['title'].apply(lambda x: find_tickers(x, compiled_pattern, tickers_set, combined_blacklist))
    except KeyError:
        print('title not found, working with comments?')
        
    
    df['selftext_tickers'] = df['selftext'].apply(lambda x: find_tickers(x, compiled_pattern, tickers_set, combined_blacklist))
    
    df['tickers'] = [list(set(x + y)) for x, y in zip(df['title_tickers'], df['selftext_tickers'])]
    
    return df

In [7]:
def add_vader_sentiment(df):
    
    vader = SentimentIntensityAnalyzer()

    added_words = {
            'citron': -4.0,  
            'hidenburg': -4.0,        
            'moon': 4.0,
            'highs': 2.0,
            'mooning': 4.0,
            'long': 2.0,
            'short': -2.0,
            'call': 4.0,
            'calls': 4.0,    
            'put': -4.0,
            'puts': -4.0,    
            'break': 2.0,
            'tendie': 2.0,
            'tendies': 2.0,
            'town': 2.0,     
            'overvalued': -3.0,
            'undervalued': 3.0,
            'buy': 4.0,
            'sell': -4.0,
            'gone': -1.0,
            'gtfo': -1.7,
            'paper': -1.7,
            'bullish': 3.7,
            'bearish': -3.7,
            'bagholder': -1.7,
            'stonk': 1.9,
            'green': 1.9,
            'money': 1.2,
            'print': 2.2,
            'rocket': 2.2,
            'bull': 2.9,
            'bear': -2.9,
            'pumping': -1.0,
            'sus': -3.0,
            'offering': -2.3,
            'rip': -4.0,
            'downgrade': -3.0,
            'upgrade': 3.0,     
            'maintain': 1.0,          
            'pump': 1.9,
            'hot': 1.5,
            'drop': -2.5,
            'rebound': 1.5,  
            'crack': 2.5,
            '🚀': 3, # Jake ADDED THESE
            '🌕': 3, # Jake ADDED THESE
            'YOLO': 4, # Jake ADDED THESE
            'ripping': 3,# Jake ADDED THESE
            'regarded': 0, # Jake ADDED THESE
            'squeeze':3, # Jake ADDED THESE
            }
    
    vader.lexicon.update(added_words)

    def safe_sentiment(text):
        try:
            # Ensure the input is a non-empty string
            if not isinstance(text, str) or not text.strip():
                return 0

            # Analyze the sentiment
            sentiment_dict = vader.polarity_scores(text)
            return sentiment_dict.get('compound', 0)
        except Exception as e:
            print(f"Error processing text: '{text}' (type: {type(text)}). Error: {e}")
            return 0

    # Apply the function
    try:
        df['title_sentiment'] = df['title'].apply(safe_sentiment)
    except:
        print('Titles not found, is this a comments file?')
        df['title_sentiment'] = 0
        
    df['selftext_sentiment'] = df['selftext'].apply(safe_sentiment)
    
    
    def non_zero_average(row):
        sentiments = [row['title_sentiment'], row['selftext_sentiment']]
        non_zero_sentiments = [s for s in sentiments if s != 0]

        if not non_zero_sentiments:
            return 0  # Return 0 if both sentiments are zero

        return sum(non_zero_sentiments) / len(non_zero_sentiments)

    # Apply the function to calculate overall sentiment
    df['overall_sentiment'] = df.apply(non_zero_average, axis=1)
    
    df['score_weighted_sentiment'] = df['overall_sentiment'] * df['score']

    return df

In [8]:
def get_reddit_praw_submissions(limit):
    
    client_id = 'aWEYVIaAoJGlCPja3awh0A'
    secret = 'gOR5FfkvsTH3MJ0IHRSImToTwt0PSQ'

    reddit = praw.Reddit(
                        client_id=client_id,
                        client_secret=secret,
                        user_agent="MADS/0.1 by TeamSafari",
                    )
    submissions_data = []

    for submission in reddit.subreddit("wallstreetbets").new(limit=limit):
        # print(dir(submission))
        data = {
            'id': submission.id,
            'subreddit_id': submission.subreddit_id,
            'subreddit': submission.subreddit,
            'author': submission.author,
            'created_utc': submission.created_utc,
            'permalink': submission.permalink,
            'title': submission.title,
            'selftext': submission.selftext,
            'num_comments': submission.num_comments,
            'score': submission.score,
            'flair': submission.link_flair_text,
            'removal_reason':submission.removal_reason,
            

            # Add more fields as needed
        }
        submissions_data.append(data)

    df = pd.DataFrame(submissions_data)
    return df

In [9]:
df = get_reddit_praw_submissions(limit=1000)

In [31]:
praw_df = preprocess_df(df)
praw_df = find_tickers(praw_df)
praw_df = add_vader_sentiment(praw_df)

In [32]:
praw_df

Unnamed: 0,id,subreddit_id,subreddit,author,created_utc,permalink,title,selftext,num_comments,score,flair,removal_reason,created_EST_date,title_tickers,selftext_tickers,tickers,title_sentiment,selftext_sentiment,overall_sentiment,score_weighted_sentiment
0,18enqaz,t5_2th52,wallstreetbets,Ok-Atmosphere-6272,1.702158e+09,/r/wallstreetbets/comments/18enqaz/what_is_goi...,What is going on with BLUE?,Just curious what peoples thoughts are about t...,3,3,Discussion,,2023-12-09,[BLUE],[],[BLUE],0.0000,0.8305,0.83050,2.4915
1,18enliw,t5_2th52,wallstreetbets,FunnyGagYT,1.702157e+09,/r/wallstreetbets/comments/18enliw/eu_ai_act_e...,"EU AI Act, EU's New Landmark AI Act: What Does...",,1,3,News,,2023-12-09,[],[],[],0.0772,0.0000,0.07720,0.2316
2,18enepz,t5_2th52,wallstreetbets,Dull-Menu-5023,1.702157e+09,/r/wallstreetbets/comments/18enepz/anyone_feel...,Anyone feel like its 2021 all over again?,Both the stock market and crypto are rapidly c...,9,2,Discussion,,2023-12-09,[],[],[],0.3612,0.2500,0.30560,0.6112
3,18enbtj,t5_2th52,wallstreetbets,realstocknear,1.702157e+09,/r/wallstreetbets/comments/18enbtj/created_a_f...,Created a free website that simplifies stock a...,,5,7,Chart,,2023-12-09,[],[],[],0.6486,0.0000,0.64860,4.5402
4,18emba6,t5_2th52,wallstreetbets,Naive-Historian-2110,1.702154e+09,/r/wallstreetbets/comments/18emba6/if_my_autis...,If my autism is correct SPY will crash...,x200B;\n\nIf declining interest in real estate...,22,18,Meme,,2023-12-09,[],[],[],-0.4019,0.0772,-0.16235,-2.9223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,180o7ck,t5_2th52,wallstreetbets,LasisuKibiras,1.700593e+09,/r/wallstreetbets/comments/180o7ck/gay_bears_g...,Gay bears get fcked,We can see a new pattern forming up that will ...,13,43,Meme,,2023-11-21,[],[],[],0.0000,0.5719,0.57190,24.5917
895,180o13a,t5_2th52,wallstreetbets,steveneverforgotme,1.700592e+09,/r/wallstreetbets/comments/180o13a/100_green_d...,100 green days from here on out,Finally found a good strategy with a 100 succe...,11,170,Meme,,2023-11-21,[],[],[],0.4404,0.9308,0.68560,116.5520
896,180npxe,t5_2th52,wallstreetbets,realstocknear,1.700591e+09,/r/wallstreetbets/comments/180npxe/how_things_...,How things are changing just in 1 Week,,4,219,Meme,,2023-11-21,[],[],[],0.0000,0.0000,0.00000,0.0000
897,180nm1q,t5_2th52,wallstreetbets,realstocknear,1.700591e+09,/r/wallstreetbets/comments/180nm1q/just_when_w...,just when we thought this week couldn't get an...,,1200,9434,News,,2023-11-21,[],[],[],0.0191,0.0000,0.01910,180.1894


In [35]:
# create cumulative_sentiment_sorted_df
exploded_df = praw_df.explode('tickers')
cumulative_sentiment = exploded_df.groupby('tickers')['overall_sentiment'].sum().reset_index() # Group by 'tickers'
cumulative_sentiment.columns = ['Ticker', 'Cumulative Overall Sentiment'] # Rename columns for clarity
cumulative_sentiment_sorted = cumulative_sentiment.sort_values(by='Cumulative Overall Sentiment', ascending=False)
# display(cumulative_sentiment_sorted)

# Group by 'tickers' and sum the 'score_weighted_sentiment'
cumulative_weighted_sentiment = exploded_df.groupby('tickers')['score_weighted_sentiment'].sum().reset_index()
cumulative_weighted_sentiment.columns = ['Ticker', 'Cumulative Weighted Sentiment'] # Rename columns for clarity
cumulative_weighted_sentiment_sorted = cumulative_weighted_sentiment.sort_values(by='Cumulative Weighted Sentiment', ascending=False)
cumulative_weighted_sentiment_sorted['Date'] = praw_df['created_EST_date']
cumulative_weighted_sentiment_sorted['Date'].fillna(praw_df['created_EST_date'].unique()[0],inplace=True)
# display(cumulative_weighted_sentiment_sorted)

daily_sentiment_df = cumulative_sentiment_sorted.merge(cumulative_weighted_sentiment_sorted, on='Ticker')

In [36]:
daily_sentiment_df

Unnamed: 0,Ticker,Cumulative Overall Sentiment,Cumulative Weighted Sentiment,Date
0,SAVE,8.10645,310.38380,2023-12-07
1,NVDA,7.58940,4020.85095,2023-12-07
2,META,5.32785,6872.84705,2023-12-07
3,MSFT,4.24275,807.51960,2023-12-07
4,AMD,4.04835,-292.11000,2023-12-09
...,...,...,...,...
191,PAM,-0.95110,-7.60880,2023-12-07
192,SUPV,-0.95110,-7.60880,2023-12-06
193,BBVA,-0.95110,-7.60880,2023-12-09
194,YPF,-0.95110,-7.60880,2023-12-06


In [37]:
# Explode the DataFrame if 'tickers' column contains lists
exploded_df = praw_df.explode('tickers')

expected_columns = ['tickers', 'title', 'permalink', 'overall_sentiment']

# Check if all expected columns are in the DataFrame
if not all(column in exploded_df.columns for column in expected_columns):
    raise ValueError("DataFrame does not contain the expected columns.")

# Aggregate submission details
submission_details = exploded_df.groupby('tickers').apply(
    lambda x: [(title, permalink, sentiment) 
               for title, permalink, sentiment in zip(x['title'], x['permalink'], x['overall_sentiment'])]
).reset_index(name='Submissions')

# Rename columns for clarity
submission_details.columns = ['Ticker', 'Submissions']

# Merge with daily_sentiment_df
daily_sentiment_df = daily_sentiment_df.merge(submission_details, on='Ticker', how='left')

In [38]:
daily_sentiment_df

Unnamed: 0,Ticker,Cumulative Overall Sentiment,Cumulative Weighted Sentiment,Date,Submissions
0,SAVE,8.10645,310.38380,2023-12-07,"[(SAVE merger question, /r/wallstreetbets/comm..."
1,NVDA,7.58940,4020.85095,2023-12-07,[(AMD AI Chip release - Up 6.5 in the last 24h...
2,META,5.32785,6872.84705,2023-12-07,[(The guy who went full send on META calls sho...
3,MSFT,4.24275,807.51960,2023-12-07,"[(Trying to recoup my loss: Day 1, /r/wallstre..."
4,AMD,4.04835,-292.11000,2023-12-09,"[(Thank you Lisa Su AMD Gainz, /r/wallstreetb..."
...,...,...,...,...,...
191,PAM,-0.95110,-7.60880,2023-12-07,"[(Shorting select Argentinian companies, /r/wa..."
192,SUPV,-0.95110,-7.60880,2023-12-06,"[(Shorting select Argentinian companies, /r/wa..."
193,BBVA,-0.95110,-7.60880,2023-12-09,"[(Shorting select Argentinian companies, /r/wa..."
194,YPF,-0.95110,-7.60880,2023-12-06,"[(Shorting select Argentinian companies, /r/wa..."


## For use in dashboard

In [7]:
import re
import nltk
import praw
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
#nltk.download('stopwords', quiet=True)

In [13]:
def PRAW_for_dashboard():
    def preprocess_df(df):
        def preprocess_text(text):

            # Remove specific unwanted characters
            text = re.sub(r'[^A-Za-z0-9\s,.!?;:()\'\"-]', '', text)

            # Strip whitespace
            text = text.strip()

            return text

        try:
            df['title'] = df['title'].fillna('')
            df['title'] = df['title'].apply(preprocess_text)

            # Create a new column with shifted values
            df['title_prior'] = df['title'].shift(1)
            # Drop rows where the value in 'ColumnToCheck' is the same as in 'ShiftedColumn'
            df = df[df['title'] != df['title_prior']]
            # drop the 'ShiftedColumn'
            df = df.drop('title_prior', axis=1)

        except:
            print(f'No title found, skipping')


        # handle blank 
        df['selftext'] = df['selftext'].fillna('') 

        # preprocess selftext
        df['selftext'] = df['selftext'].apply(preprocess_text)

        # localize the UTC time stamp
        df['created_EST_date'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC').dt.tz_convert('America/New_York').dt.date


        return df
    
    def find_tickers(df):
 
        # Load tickers from a CSV file
        # stocks = pd.read_csv(r'/root/Git Repo/SIADS_Capstone_Group17/Data/nasdaq_screener.csv')
        stocks = pd.read_csv(r'./Data/nasdaq_screener.csv')

        # Directly convert the 'Symbol' column to a set
        tickers_set = set(stocks['Symbol'])

        # Use set union to add additional tickers
        additional_tickers = {'BBBY'}
        tickers_set = tickers_set.union(additional_tickers)

        # Adjust the pattern to optionally include a leading '$'
        # pattern = r'\b\$?(?:' + '|'.join(tickers_as_strings) + r')\b'
        # pattern = r'\b\$?(?:\(?)(?:' + '|'.join(tickers_as_strings) + r')(?:\)?)\b'
        pattern = r'\b[A-Z]{2,5}\b'
        compiled_pattern = re.compile(pattern)



        blacklist = {
    #                 {'I', 'ELON', 'WSB', 'THE', 'A', 'ROPE', 'YOLO', 'TOS', 'CEO', 'DD', 'IT', 'OPEN', 'ATH', 'PM', 'IRS', 'FOR',
    #              'DEC', 'BE',
                    'IMO',# 'ALL', 'RH', 'EV', 'TOS', 'CFO', 'CTO',
                    'DD',
                    #'BTFD', 'WSB', 'OK', 'PDT', 'RH', 'KYS', 'FD',
    #              'TYS', 
                    'US',
                    'USA',
                    # 'IT', 'ATH', 'RIP', 'BMW', 'GDP', 'OTM', 'ATM', 'ITM', 
                    'IMO', 'LOL', 'AM', 'BE', 'PR', 'PRAY',
    #              'PT', 'FBI', 'SEC', 'GOD', 'NOT', 'POS', 'FOMO', 'TL;DR',
                    'EDIT', 'STILL', 'WTF', 'RAW', 'PM', 'LMAO', 'LMFAO',
    #              'ROFL', 'EZ', 'RED', 'BEZOS', 'TICK', 'IS', 'PM', 'LPT', 'GOAT', 'FL', 'CA', 'IL', 'MACD', 'HQ', 'OP', 'PS', 'AH',
    #              'TL', 'JAN', 'FEB', 'JUL', 'AUG', 'SEP', 'SEPT', 'OCT', 'NOV', 'FDA', 'IV', 'ER', 'IPO', 'MILF', 'BUT', 'SSN', 'FIFA',
    #              'USD', 'CPU', 'AT', 'GG', 'Mar', 

    #                # Jake added
                    'RUN', # common
                    'SAY', # common
                    'EOD', # end of day
                    'BIG', # common
                    'LOW', # low / high
                    'RSI', #relative strenght
                    'DT', #double top
                    'HUGE',
                    'U', # you
                    'AI', # Artificial Intelligence
                    'DC', # washington DC
                    'J', # as in J Powell
                    'ES', # E-mini SP future
                    'F', # f*ck
                    'GO',
                    'UK', # United Kingdom
                    'EU', # european union
                    'RH', # Robinhood, not Restoration Hardware
                    'E', # E*trade brokerage
                    'L', # L for loss, P&L etc
                    'R', # common 
                    'K', # OK
                    'B', # common in BBBY odd spacing (spam?)
                    'TD', # TD Ameritrade brokerage
                    'RYAN', # Ryan Cohen, CEO of GME
                    'NYC', # New York City
                    'REG', # reg SHO 
                    'SHO', # reg SHO 
                    'NEXT', # common
                    'FREE', # spam
                    'DM', # direct message
                    'TV', # television
                    'ENS', # ethereum name service, spam
                    'IRS', # internal revenue service
                    'PR', # public relations
                    'IQ', # intelligence quotient
                    'VS', # versus
                    'PT', # price target
                    'IBKR', # interactive brokers
                    'GOOD', # common
                    'OPEN', # market open
                    'FCF', # free cash flow


                    }

        combined_blacklist = set(blacklist) | set(word.upper() for word in stopwords.words('english'))


        def find_tickers(text, compiled_pattern, tickers_set, blacklist_set):
            # Find all matches
            potential_tickers = compiled_pattern.findall(text)
            # Filter matches against the tickers list and ensure they are not in the blacklist
            return list(set([ticker for ticker in potential_tickers if ticker in tickers_set and ticker not in combined_blacklist]))

        try:
            df['title_tickers'] = df['title'].apply(lambda x: find_tickers(x, compiled_pattern, tickers_set, combined_blacklist))
        except KeyError:
            print('title not found, working with comments?')

        df['selftext_tickers'] = df['selftext'].apply(lambda x: find_tickers(x, compiled_pattern, tickers_set, combined_blacklist))
        df['tickers'] = [list(set(x + y)) for x, y in zip(df['title_tickers'], df['selftext_tickers'])]

        return df
    
    def add_vader_sentiment(df):

        vader = SentimentIntensityAnalyzer()

        added_words = {
                'citron': -4.0,  
                'hidenburg': -4.0,        
                'moon': 4.0,
                'highs': 2.0,
                'mooning': 4.0,
                'long': 2.0,
                'short': -2.0,
                'call': 4.0,
                'calls': 4.0,    
                'put': -4.0,
                'puts': -4.0,    
                'break': 2.0,
                'tendie': 2.0,
                'tendies': 2.0,
                'town': 2.0,     
                'overvalued': -3.0,
                'undervalued': 3.0,
                'buy': 4.0,
                'sell': -4.0,
                'gone': -1.0,
                'gtfo': -1.7,
                'paper': -1.7,
                'bullish': 3.7,
                'bearish': -3.7,
                'bagholder': -1.7,
                'stonk': 1.9,
                'green': 1.9,
                'money': 1.2,
                'print': 2.2,
                'rocket': 2.2,
                'bull': 2.9,
                'bear': -2.9,
                'pumping': -1.0,
                'sus': -3.0,
                'offering': -2.3,
                'rip': -4.0,
                'downgrade': -3.0,
                'upgrade': 3.0,     
                'maintain': 1.0,          
                'pump': 1.9,
                'hot': 1.5,
                'drop': -2.5,
                'rebound': 1.5,  
                'crack': 2.5,
                '🚀': 3, # Jake ADDED THESE
                '🌕': 3, # Jake ADDED THESE
                'YOLO': 4, # Jake ADDED THESE
                'ripping': 3,# Jake ADDED THESE
                'regarded': 0, # Jake ADDED THESE
                'squeeze':3, # Jake ADDED THESE
                }

        vader.lexicon.update(added_words)

        def safe_sentiment(text):
            try:
                # Ensure the input is a non-empty string
                if not isinstance(text, str) or not text.strip():
                    return 0

                # Analyze the sentiment
                sentiment_dict = vader.polarity_scores(text)
                return sentiment_dict.get('compound', 0)
            except Exception as e:
                print(f"Error processing text: '{text}' (type: {type(text)}). Error: {e}")
                return 0

        # Apply the function
        try:
            df['title_sentiment'] = df['title'].apply(safe_sentiment)
        except:
            print('Titles not found, is this a comments file?')
            df['title_sentiment'] = 0

        df['selftext_sentiment'] = df['selftext'].apply(safe_sentiment)


        def non_zero_average(row):
            sentiments = [row['title_sentiment'], row['selftext_sentiment']]
            non_zero_sentiments = [s for s in sentiments if s != 0]

            if not non_zero_sentiments:
                return 0  # Return 0 if both sentiments are zero

            return sum(non_zero_sentiments) / len(non_zero_sentiments)

        # Apply the function to calculate overall sentiment
        df['overall_sentiment'] = df.apply(non_zero_average, axis=1)

        df['score_weighted_sentiment'] = df['overall_sentiment'] * df['score']

        return df
    
    def get_reddit_praw_submissions(limit):
    
        client_id = 'aWEYVIaAoJGlCPja3awh0A'
        secret = 'gOR5FfkvsTH3MJ0IHRSImToTwt0PSQ'

        reddit = praw.Reddit(
                            client_id=client_id,
                            client_secret=secret,
                            user_agent="MADS/0.1 by TeamSafari",
                        )
        submissions_data = []

        for submission in reddit.subreddit("wallstreetbets").new(limit=limit):
            # print(dir(submission))
            data = {
                'id': submission.id,
                'subreddit_id': submission.subreddit_id,
                'subreddit': submission.subreddit,
                'author': submission.author,
                'created_utc': submission.created_utc,
                'permalink': submission.permalink,
                'title': submission.title,
                'selftext': submission.selftext,
                'num_comments': submission.num_comments,
                'score': submission.score,
                'flair': submission.link_flair_text,
                'removal_reason':submission.removal_reason,


                # Add more fields as needed
            }
            submissions_data.append(data)

        df = pd.DataFrame(submissions_data)
        return df
    
    
    df = get_reddit_praw_submissions(limit=1000)
    praw_df = preprocess_df(df)
    praw_df = find_tickers(praw_df)
    praw_df = add_vader_sentiment(praw_df)
    
    
    # display(praw_df)
    
    
    # create cumulative_sentiment_sorted_df
    exploded_df = praw_df.explode('tickers')
    cumulative_sentiment = exploded_df.groupby('tickers')['overall_sentiment'].sum().reset_index() # Group by 'tickers'
    cumulative_sentiment.columns = ['Ticker', 'Cumulative Overall Sentiment'] # Rename columns for clarity
    cumulative_sentiment_sorted = cumulative_sentiment.sort_values(by='Cumulative Overall Sentiment', ascending=False)
    # display(cumulative_sentiment_sorted)

    # Group by 'tickers' and sum the 'score_weighted_sentiment'
    cumulative_weighted_sentiment = exploded_df.groupby('tickers')['score_weighted_sentiment'].sum().reset_index()
    cumulative_weighted_sentiment.columns = ['Ticker', 'Cumulative Weighted Sentiment'] # Rename columns for clarity
    cumulative_weighted_sentiment_sorted = cumulative_weighted_sentiment.sort_values(by='Cumulative Weighted Sentiment', ascending=False)
    cumulative_weighted_sentiment_sorted['Date'] = praw_df['created_EST_date']
    cumulative_weighted_sentiment_sorted['Date'].fillna(praw_df['created_EST_date'].unique()[0],inplace=True)
    # display(cumulative_weighted_sentiment_sorted)

    daily_sentiment_df = cumulative_sentiment_sorted.merge(cumulative_weighted_sentiment_sorted, on='Ticker')
    
    # Explode the DataFrame if 'tickers' column contains lists
    # exploded_df = praw_df.explode('tickers')

    expected_columns = ['tickers', 'title', 'permalink', 'overall_sentiment']

    # Check if all expected columns are in the DataFrame
    if not all(column in exploded_df.columns for column in expected_columns):
        raise ValueError("DataFrame does not contain the expected columns.")

    # Base URL to prepend to permalinks
    base_url = 'https://www.reddit.com/'
    
    # Aggregate submission details with the base URL prepended to permalinks
    submission_details = exploded_df.groupby('tickers').apply(
        lambda x: [(title, base_url + permalink, sentiment) 
                   for title, permalink, sentiment in zip(x['title'], x['permalink'], x['overall_sentiment'])]
    ).reset_index(name='Submissions')

    # Rename columns for clarity
    submission_details.columns = ['Ticker', 'Submissions']

    # Merge with daily_sentiment_df
    daily_sentiment_df = daily_sentiment_df.merge(submission_details, on='Ticker', how='left')
    
    return daily_sentiment_df, exploded_df

In [14]:
%%time
output1, output2 = PRAW_for_dashboard()
output1

CPU times: user 1.89 s, sys: 0 ns, total: 1.89 s
Wall time: 13.8 s


Unnamed: 0,Ticker,Cumulative Overall Sentiment,Cumulative Weighted Sentiment,Date,Submissions
0,SAVE,8.89415,314.09580,2023-12-07,[(Odds of the SAVE case being decided on or be...
1,NVDA,6.59190,4004.81645,2023-12-07,[(AMD AI Chip release - Up 6.5 in the last 24h...
2,META,5.32785,6937.98870,2023-12-08,[(The guy who went full send on META calls sho...
3,MSFT,4.24275,807.29140,2023-12-07,"[(Trying to recoup my loss: Day 1, https://www..."
4,TSLA,3.97220,2090.27530,2023-12-06,[(Only took me 9 days to lose 290k yoloing on ...
...,...,...,...,...,...
194,BMA,-0.95110,-8.55990,2023-12-09,"[(Shorting select Argentinian companies, https..."
195,BBVA,-0.95110,-8.55990,2023-12-09,"[(Shorting select Argentinian companies, https..."
196,NOV,-0.95110,-8.55990,2023-12-07,"[(Shorting select Argentinian companies, https..."
197,YPF,-0.95110,-8.55990,2023-12-06,"[(Shorting select Argentinian companies, https..."


In [20]:
output3 = output2[["tickers", "title", "permalink", "overall_sentiment"]].dropna()

(199,)

In [12]:
output["Submissions"].values[0].dropna()

[('Odds of the SAVE case being decided on or before January 19th?',
  'https://www.reddit.com//r/wallstreetbets/comments/18epb7p/odds_of_the_save_case_being_decided_on_or_before/',
  0.7877000000000001),
 ('SAVE merger question',
  'https://www.reddit.com//r/wallstreetbets/comments/18cl5jf/save_merger_question/',
  0.772),
 ('SAVE trial bullish quotes from the judge',
  'https://www.reddit.com//r/wallstreetbets/comments/18by8rd/save_trial_bullish_quotes_from_the_judge/',
  0.89165),
 ('What happens to SAVE if JetBlue merger is blocked by DOJ?',
  'https://www.reddit.com//r/wallstreetbets/comments/18bjkra/what_happens_to_save_if_jetblue_merger_is_blocked/',
  0.64795),
 ('What are your thoughts on the SpiritJetblue merger court case given the HawaiianAlaska deal?',
  'https://www.reddit.com//r/wallstreetbets/comments/18aq3j1/what_are_your_thoughts_on_the_spiritjetblue/',
  0.7603),
 ('SAVE put assignment',
  'https://www.reddit.com//r/wallstreetbets/comments/189i3um/save_put_assignment/

In [10]:
output.iloc[0,4]

[('Odds of the SAVE case being decided on or before January 19th?',
  'https://www.reddit.com//r/wallstreetbets/comments/18epb7p/odds_of_the_save_case_being_decided_on_or_before/',
  0.7877000000000001),
 ('SAVE merger question',
  'https://www.reddit.com//r/wallstreetbets/comments/18cl5jf/save_merger_question/',
  0.772),
 ('SAVE trial bullish quotes from the judge',
  'https://www.reddit.com//r/wallstreetbets/comments/18by8rd/save_trial_bullish_quotes_from_the_judge/',
  0.89165),
 ('What happens to SAVE if JetBlue merger is blocked by DOJ?',
  'https://www.reddit.com//r/wallstreetbets/comments/18bjkra/what_happens_to_save_if_jetblue_merger_is_blocked/',
  0.64795),
 ('What are your thoughts on the SpiritJetblue merger court case given the HawaiianAlaska deal?',
  'https://www.reddit.com//r/wallstreetbets/comments/18aq3j1/what_are_your_thoughts_on_the_spiritjetblue/',
  0.7603),
 ('SAVE put assignment',
  'https://www.reddit.com//r/wallstreetbets/comments/189i3um/save_put_assignment/