In [45]:
import pandas as pd
import emoji
import re 
import os
import pprint

# packages for text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk import FreqDist

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/fredschaefer/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [69]:
df_raw = pd.read_pickle('data/reddit_extract.pkl')
df_raw['comment'] = df_raw['comment'].str.lower()
df_raw

Unnamed: 0,comment,dt,sub
0,* for help finding the latest skeptics discuss...,2021-10-20 17:00:31,qcekzf
1,*bitcoin drops 2%*\n\npeople with $35 in bitco...,2021-10-21 08:01:56,qcekzf
2,3 straight weeks of green for btc then one bad...,2021-10-21 07:47:30,qcekzf
3,the sentiment changes in this sub are fucking ...,2021-10-21 08:45:35,qcekzf
4,if you think this is what the real alt season ...,2021-10-21 06:50:41,qcekzf
...,...,...,...
6240,you're right. dao and staking in schedule to c...,2022-01-08 13:39:31,rylwuh
6241,oh that's great. hope it works out!,2022-01-08 10:57:24,rylwuh
6242,it sure was booming when t was at the helm and...,2022-01-08 06:35:10,rylwuh
6243,did you see the clip of him taking a nap at co...,2022-01-08 07:27:47,rylwuh


In [70]:
# filter to BTC comments
df_raw = df_raw[df_raw['comment'].str.contains('btc|bitcoin')]
df_raw

Unnamed: 0,comment,dt,sub
1,*bitcoin drops 2%*\n\npeople with $35 in bitco...,2021-10-21 08:01:56,qcekzf
2,3 straight weeks of green for btc then one bad...,2021-10-21 07:47:30,qcekzf
5,btc ath ✅\n\neth ath ✅\n\nalt season - **l...,2021-10-21 06:49:27,qcekzf
11,"if btc hits $69,000 and eth $4,200 reddit is g...",2021-10-21 01:38:02,qcekzf
14,i don’t know wtf i’m doing. i just buy in to b...,2021-10-21 11:55:47,qcekzf
...,...,...,...
6176,how so ? back then doge but now btc eth dot lrc,2022-01-08 15:09:07,rylwuh
6187,2017 brother. still here. i totally agree with...,2022-01-08 02:14:59,rylwuh
6214,i imagine that you are a billionaire with your...,2022-01-08 14:24:29,rylwuh
6228,thats what i am doing started yesterday. bit g...,2022-01-08 07:23:12,rylwuh


Initialize Sentiment Intensity Analyzer and update with some custom crypto slang.

In [97]:
sia = SIA()
crypto_words = {
    'bear':-5,
    'bearish':-5,
    'ramen':-5,
    'dump':-5,
    'crash':-5,
    'paper':-5,
    'paperhand':-5,
    'rekt':-5,
    'red':-5,
    'winter':-5,
    'fud':-3,
    'hodl':-3,
    'shitcoin':-3,
    'short':-3,
    'wick':-3,
    'dip':-3,
    'crab':-1,
    'sell':-1,
    'dca':0,
    'diamond':1,
    'diamondhand':1,
    'buy':1,
    'ath':3,
    'ape':3,
    'bull':5,
    'lambo':5,
    'moon':5,
    'pump':5,
    'pamp':5,
    'fomo':5,
    'rocket':5,
    'green':5    
}

sia.lexicon.update(crypto_words)

Loop through comments, create sentiment score and save in dataframe

In [100]:
df_collect = pd.DataFrame()

for sub in df_raw['sub'].unique().tolist() :
    results = []

    # create list from the comments in dataframe for selected submission
    comment_str_raw = df_raw[df_raw['sub']==sub]['comment'].tolist()

    # remove emoji
    comment_str_noemoji = [emoji.get_emoji_regexp().sub(u'',comment).lower() for comment in comment_str_raw] 
    comment_str_noemoji

    # create polarity score
    for comment in comment_str_noemoji :
        comment = comment.replace("\n", "")
        pol_score = sia.polarity_scores(comment)
        pol_score['comment'] = comment
        results.append(pol_score)

    df = pd.DataFrame(results)  
    df['sub'] = sub
    
    df_collect = df_collect.append(df)

df_collect

Unnamed: 0,neg,neu,pos,compound,comment,sub
0,0.189,0.631,0.180,0.1779,*bitcoin drops 2%*people with $35 in bitcoin: ...,qcekzf
1,0.229,0.435,0.336,0.5696,3 straight weeks of green for btc then one bad...,qcekzf
2,0.000,0.385,0.615,0.8402,btc ath eth ath alt season - **loading...**,qcekzf
3,0.353,0.647,0.000,-0.7906,"if btc hits $69,000 and eth $4,200 reddit is g...",qcekzf
4,0.101,0.798,0.101,-0.2500,i don’t know wtf i’m doing. i just buy in to b...,qcekzf
...,...,...,...,...,...,...
531,0.000,1.000,0.000,0.0000,how so ? back then doge but now btc eth dot lrc,rylwuh
532,0.025,0.745,0.229,0.8932,2017 brother. still here. i totally agree with...,rylwuh
533,0.000,0.867,0.133,0.3182,i imagine that you are a billionaire with your...,rylwuh
534,0.032,0.859,0.109,0.5927,thats what i am doing started yesterday. bit g...,rylwuh


Classify each comment as positive (=1), neutral (=0), or negative (-1)

In [101]:
thresh = .1

df_collect['label'] = 0
df_collect.loc[df_collect['compound'] > 1*thresh, 'label'] = 1
df_collect.loc[df_collect['compound'] < -1*thresh, 'label'] = -1

df_collect.groupby(['sub'])['label'].value_counts(normalize=True).unstack(fill_value=0)

label,-1,0,1
sub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
p9o1td,0.16651,0.381938,0.451552
qcekzf,0.237844,0.27167,0.490486
r9t7ks,0.264463,0.294215,0.441322
rylwuh,0.311567,0.261194,0.427239


To do: 
* tokenize
* normalized word count by sub
* word cloud by sub