# Introduction

For the final section of this study, we conduct a sentiment analysis on all the posts in both subreddits. The sentiment of posts that mention a particular stock will then be analyzed and compared against the stock's future performance.

# Imports and Settings

In [1]:
## library imports

# data processing imports
import pandas as pd
import seaborn as sns

# sentiment analysis imports
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from emoji import demojize

# import json

from sklearn import preprocessing

# Data Imports

<span style="color:red">Note: To access the pre-scraped datasets, you will need to extract the .rar files from the 'data_compressed' folder and put the extracted files in the 'data' folder</span>.

In [2]:
# load the cleaned posts from the saved csv and parse `created` into datetimes in one chain
df = (
    pd.read_csv('data/df_cleaned.csv')
    .assign(created=lambda posts: pd.to_datetime(posts['created']))
)
df.head()  # preview the first five rows of the loaded dataframe

Unnamed: 0,author,created_utc,id,is_self,num_comments,score,selftext,title,upvote_ratio,created,subreddit,subreddit_stocks,title_char_len,title_word_len,score_pmaw
0,cloudboyy,1627751015,ovatu6,False,1,1,,Any thoughts on OSTK? It sold out of the wedge...,1.0,2021-08-01 01:03:35,wsb,0,208,40,1
1,itbc1info,1627751020,ovatwa,False,2,2,,"U.S. prosecutors charge Trevor Milton, founder...",1.0,2021-08-01 01:03:40,wsb,0,61,8,1
2,dadryp,1627751064,ovaudw,True,0,1,,Stocks have been horrible for me 2021,1.0,2021-08-01 01:04:24,stocks,1,37,7,1
3,insta_man,1627751065,ovauee,True,40,7,Long story I started investing in the market w...,Want to get back into investing. Any tips?,1.0,2021-08-01 01:04:25,stocks,1,42,8,1
4,PenIslandGaylien,1627751202,ovavxl,True,1,2,So I already contributed 6k to my Roth in 2021...,Roth Income Limits,1.0,2021-08-01 01:06:42,wsb,0,18,3,1


# Data Cleaning

In [3]:
# Blank out removed ('[removed]') and missing selftexts.
# The result is assigned back to the column instead of calling
# `df['selftext'].replace(..., inplace=True)`: an in-place method on a column
# selection is chained assignment, which pandas warns about and which does not
# modify `df` once Copy-on-Write is enabled.
df['selftext'] = df['selftext'].replace('[removed]', '').fillna('')

# Combine title and selftext; both are used for the sentiment analysis.
# NOTE: this overwrites `title`, so re-running this cell without reloading `df`
# appends the selftext a second time.
df['title'] = df['title'] + " | " + df['selftext']

In [4]:
# replace emoji via `demojize`, then turn the underscores and colons in the
# resulting text into spaces so the words are separated
title_text = df['title'].apply(demojize)
for delimiter in ('_', ':'):
    title_text = title_text.str.replace(delimiter, ' ')
df['title'] = title_text

In [5]:
# Adding wsb/reddit flavor to vader to improve sentiment analysis, score: 4.0 to -4.0
# source: https://medium.com/nerd-for-tech/wallstreetbets-sentiment-analysis-on-stock-prices-using-natural-language-processing-ed1e9e109a37
# Keys are lowercase words; values are the sentiment scores added to the VADER lexicon.
new_words = {
    'citron': -4.0,
    'hindenburg': -4.0,  # correct spelling; the original key was misspelled and never matched it
    'hidenburg': -4.0,   # misspelled key from the source list, kept for backward compatibility
    'moon': 4.0,
    'highs': 2.0,
    'mooning': 4.0,
    'long': 2.0,
    'short': -2.0,
    'call': 4.0,
    'calls': 4.0,
    'put': -4.0,
    'puts': -4.0,
    'break': 2.0,
    'tendie': 2.0,
    'tendies': 2.0,
    'town': 2.0,
    'overvalued': -3.0,
    'undervalued': 3.0,
    'buy': 4.0,
    'sell': -4.0,
    'gone': -1.0,
    'gtfo': -1.7,
    'paper': -1.7,
    'bullish': 3.7,
    'bearish': -3.7,
    'bagholder': -1.7,
    'stonk': 1.9,
    'green': 1.9,
    'money': 1.2,
    'print': 2.2,
    'rocket': 2.2,
    'bull': 2.9,
    'bear': -2.9,
    'pumping': -1.0,
    'sus': -3.0,
    'offering': -2.3,
    'rip': -4.0,
    'downgrade': -3.0,
    'upgrade': 3.0,
    'maintain': 1.0,
    'pump': 1.9,
    'hot': 1.5,
    'drop': -2.5,
    'rebound': 1.5,
    'crack': 2.5,
}

In [6]:
# extend the VADER lexicon further with the NTUSD-Fin market sentiment dictionary
# source: http://mx.nthu.edu.tw/~chungchichen/papers/NTUSD-Fin_Market_Sentiment_Dictionary_for_Financial_Social_Media_Data.pdf
ntusd_fin = pd.read_json('data/NTUSD_Fin_word_v1.0.json')[['token', 'market_sentiment']]  # pre-downloaded .json file

# map each token to its market sentiment score; the scores are used as-is (no rescaling)
new_tokens = dict(zip(ntusd_fin['token'], ntusd_fin['market_sentiment']))


In [7]:
# create the sentiment intensity analyzer whose lexicon is extended below
vader = SentimentIntensityAnalyzer()

# merge in the custom words from the two sources;
# order matters: entries in new_tokens overwrite same-key entries from new_words
for extra_lexicon in (new_words, new_tokens):
    vader.lexicon.update(extra_lexicon)

len(vader.lexicon)  # size of the updated lexicon

14566

In [8]:
# score every row's `title` text with the extended VADER analyzer;
# the result of each `polarity_scores` call is stored per row
df['polarity_scores'] = df['title'].apply(vader.polarity_scores)

In [9]:
df.loc[:, ['polarity_scores']]  # show only the polarity score column

Unnamed: 0,polarity_scores
0,"{'neg': 0.131, 'neu': 0.377, 'pos': 0.491, 'co..."
1,"{'neg': 0.297, 'neu': 0.519, 'pos': 0.184, 'co..."
2,"{'neg': 0.36, 'neu': 0.547, 'pos': 0.093, 'com..."
3,"{'neg': 0.272, 'neu': 0.404, 'pos': 0.324, 'co..."
4,"{'neg': 0.203, 'neu': 0.577, 'pos': 0.22, 'com..."
...,...
272411,"{'neg': 0.309, 'neu': 0.511, 'pos': 0.18, 'com..."
272412,"{'neg': 0.262, 'neu': 0.181, 'pos': 0.557, 'co..."
272413,"{'neg': 0.0, 'neu': 0.61, 'pos': 0.39, 'compou..."
272414,"{'neg': 0.432, 'neu': 0.525, 'pos': 0.043, 'co..."


# Data Export

In [10]:
# save the sentiment analysis result: only the polarity score column,
# with the dataframe index written alongside it
sentiment_out = df[['polarity_scores']]
sentiment_out.to_csv('data/df_sentiment.csv', index=True)