In [17]:
import pandas as pd
import numpy as np
import os
import spacy
import json
import pathlib
import time
import gensim
import itertools
import pickle
import seaborn as sns
from helpers import preprocess_tweets, preprocess_tweet, preprocess_reddit, preprocess_reddits
from simpletransformers.language_representation import RepresentationModel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

## Download links for data

- Bitcoin_tweets.csv -> https://www.kaggle.com/kaushiksuresh147/bitcoin-tweets
- GoEmotions.csv -> https://www.kaggle.com/datasets/debarshichanda/goemotions
- sentiment140 -> http://help.sentiment140.com/for-students/
- GoogleNews-vectors-etc... (Word2Vec) -> Canvas(?)
- NRC Emotion Lexicon -> https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm
- nrc_emotion_lexicon_dict -> Google Drive
- BTC-USD -> https://finance.yahoo.com/quote/BTC-USD/

In [3]:
# Number of token slots per sentence: the encoders below truncate longer sentences and pad shorter ones to this length.
# The 90th percentile length (after removing stopwords and punctuation) was about 14 when I tested
# and the 99th percentile was 18, so a cutoff of 25 leaves almost every sentence untruncated.
MAX_SENTENCE_LENGTH = 25
# Number of leading df_gosentiment rows that are embedded and used for the train/test split.
TRAINING_SIZE = 60000

# Import Datasets

## Stanford 140

In [4]:
# df_sf_train = pd.read_csv(
#     os.path.abspath('data/sentiment140-train.csv'), 
#     encoding='ISO-8859-1', 
#     header=None, 
#     names=['polarity', 'id', 'date', 'query', 'user', 'text'],
#     usecols=['polarity', 'text']
# )

# df_sf_test = pd.read_csv(
#     os.path.abspath('data/sentiment140-test.csv'), 
#     encoding='ISO-8859-1', 
#     header=None, 
#     names=['polarity', 'id', 'date', 'query', 'user', 'text'],
#     usecols=['polarity', 'text']
# )

# df_sf_train['polarity'] = df_sf_train['polarity'].replace(4, 1)
# df_sf_test['polarity'] = df_sf_test['polarity'].replace(4, 1)

# df_sf_train['text'] = df_sf_train['text'].apply(preprocess_tweet)
# df_sf_test['text'] = df_sf_test['text'].apply(preprocess_tweet)

In [5]:
# df_sf_test.tail()

## GoEmotion

In [10]:
# Raw GoEmotions export (see the download links at the top of the notebook).
df_goemotion = pd.read_csv(os.path.abspath('data/GoEmotions.csv'))

In [11]:
len(df_goemotion)

211225

In [12]:
df_goemotion.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
len(df_goemotion)

211225

In [14]:
# Work on a copy so the raw GoEmotions frame stays untouched by the column edits below.
df_gosentiment = df_goemotion.copy()

In [15]:
# Group the 28 fine-grained emotion columns into three coarse sentiment classes.
positive_emotions = ['admiration', 'amusement', 'approval', 'caring', 'curiosity', 'desire', 'excitement', 'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief']
negative_emotions = ['anger', 'annoyance', 'disappointment', 'disapproval', 'disgust', 'fear', 'grief', 'nervousness', 'remorse', 'sadness', 'embarrassment']
neutral_emotions = ['neutral', 'confusion', 'realization', 'surprise']

# A group flag is 1 when at least one emotion of that group is set on the row:
# sum the group's columns, then cap the count at 1.
df_gosentiment['Positive'] = df_gosentiment[positive_emotions].sum(axis=1).clip(upper=1)
df_gosentiment['Negative'] = df_gosentiment[negative_emotions].sum(axis=1).clip(upper=1)
df_gosentiment['Neutral'] = df_gosentiment[neutral_emotions].sum(axis=1).clip(upper=1)

# Columns that are not used once the group flags exist.
also_drop_columns = ['subreddit', 'id', 'link_id', 'author', 'parent_id', 'rater_id']

df_gosentiment = df_gosentiment.drop(
    columns=positive_emotions + negative_emotions + neutral_emotions + also_drop_columns
)

# Polarity: +1 if any positive emotion, otherwise -1 if any negative emotion, otherwise 0.
# np.select takes the first matching condition, so a row flagged both positive and
# negative is labelled +1. Vectorized to avoid a Python-level loop over every row.
df_gosentiment['Polarity'] = np.select(
    [df_gosentiment['Positive'] == 1, df_gosentiment['Negative'] == 1],
    [1, -1],
    default=0,
)

df_gosentiment = df_gosentiment.astype({
    'Positive': 'int',
    'Negative': 'int',
    'Neutral': 'int',
})

In [10]:
df_gosentiment.head(20)

Unnamed: 0,text,created_utc,example_very_unclear,Positive,Negative,Neutral,Polarity
0,That game hurt.,1548381000.0,False,0,1,0,-1
1,>sexuality shouldn’t be a grouping category I...,1548084000.0,True,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",1546428000.0,False,0,0,1,0
3,Man I love reddit.,1547965000.0,False,1,0,0,1
4,"[NAME] was nowhere near them, he was by the Fa...",1546669000.0,False,0,0,1,0
5,Right? Considering it’s such an important docu...,1548280000.0,False,1,0,0,1
6,"He isn't as big, but he's still quite popular....",1546320000.0,False,0,1,0,-1
7,That's crazy; I went to a super [RELIGION] hig...,1546536000.0,False,1,0,0,1
8,that's adorable asf,1548764000.0,False,1,0,0,1
9,"""Sponge Blurb Pubs Quaw Haha GURR ha AAa!"" fin...",1546984000.0,False,1,0,0,1


## NRC Emotion Lexicon

In [11]:
# with open(os.path.abspath('data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'), 'r') as f:
#     lines = f.readlines()
#     for line in lines:
#         word, emotion, 

# Read the word-level NRC lexicon as a tab-separated table with columns (word, emotion, intensity).
# NOTE(review): header=0 together with names= makes pandas treat the first line of the file as a
# header row and replace it with `names`. If the file has no header line, its first
# entry is silently discarded — confirm against the file.
nrc_df = pd.read_csv(os.path.abspath('data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'), sep='\t', header=0, names=['word', 'emotion', 'intensity'])

In [12]:
# nrc_dict = {}

# # Iterate over nrc_df
# for index, row in nrc_df.iterrows():
#     # Get the word and emotion
#     word = row['word']
#     emotion = row['emotion']
#     intensity = row['intensity']
#     # If the word is not in the dict yet
#     if word not in nrc_dict:
#         # Initialize the word in the dict
#         nrc_dict[word] = []
#     # Add the emotion to the word
#     if intensity == 1:
#         nrc_dict[word].append(emotion)

# # Write nrc_dict to file
# with open(os.path.abspath('data/nrc_emotion_lexicon_dict.json'), 'w') as f:
#     f.write(json.dumps(nrc_dict))

In [13]:
# Load the precomputed word -> [emotions] mapping. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it to the garbage collector.
with open(os.path.abspath('data/nrc_emotion_lexicon_dict.json')) as f:
    nrc_dict = json.load(f)

## Bitcoin Tweets

# Creating Sentence Vector

- Word embeddings
- PoS
- Positive/Neutral word 000110110

## Load Spacy

In [14]:
# Shared spaCy pipeline; the encoder functions below use it for tokenization, lemmas,
# part-of-speech tags and the stop-word / punctuation / whitespace flags.
nlp = spacy.load('en_core_web_sm')

## Word Embedding

### BERT

In [15]:
# model = RepresentationModel(
#     model_type="bert",
#     model_name="bert-base-uncased",
#     use_cuda=False
# )

# model = RepresentationModel(
#     model_type="roberta",
#     model_name="roberta-base",
#     use_cuda=False
# )

### Word2Vec

In [16]:
# Pretrained GoogleNews word2vec vectors in binary format.
word2vec_path = os.path.abspath('data/GoogleNews-vectors-negative300.bin')
# limit=750000 caps how many vectors are read from the file.
word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=750000)

In [17]:
def encode_word2vec_from_scratch(sentences):
    """Placeholder for embeddings from a Word2Vec model trained on `sentences`.

    No training is implemented: the argument is handed back untouched.
    """
    # Disabled idea: gensim.models.Word2Vec(sentences, size=300, window=5, min_count=5, workers=4)
    return sentences

def encode_bert(sentences, model):
    """Encode `sentences` with a representation model.

    Delegates to ``model.encode_sentences`` with ``combine_strategy=None`` and
    returns whatever that call returns.
    """
    embeddings = model.encode_sentences(sentences, combine_strategy=None)
    return embeddings

def encode_word2vec(sentences, max_length=MAX_SENTENCE_LENGTH):
    """Embed each sentence as a fixed-size matrix of word2vec vectors.

    Sentences are parsed with the module-level spaCy pipeline ``nlp``. Stop words,
    whitespace and punctuation tokens are skipped. Every remaining token is looked
    up in ``word_embedding_model`` by its lowercased lemma first and by its
    lowercased surface form second; a token found under neither keeps an all-zero
    vector. Each sentence is truncated or zero-padded to ``max_length`` tokens.

    Args:
        sentences: Iterable of sentence strings.
        max_length: Number of token slots per sentence.

    Returns:
        A float64 array of shape ``(len(sentences), max_length, vector_size)``,
        where ``vector_size`` is the dimensionality of the embedding model. The
        dtype and the 3-D shape are the same for every input, including an empty
        list, so the result can always be concatenated on the last axis.
    """
    model = word_embedding_model
    # Read the dimensionality from the model instead of hard-coding 300.
    dim = model.vector_size

    sentences = list(sentences)
    # Pre-filled with zeros, so padding slots and out-of-vocabulary tokens need no work.
    sentences_vector = np.zeros((len(sentences), max_length, dim), dtype=np.float64)

    # nlp.pipe processes the texts in batches instead of one call per sentence.
    for row, doc in enumerate(nlp.pipe(sentences)):
        position = 0

        for token in doc:
            if position >= max_length:
                # Remaining tokens would be truncated anyway.
                break
            if token.is_stop or token.is_space or token.is_punct:
                continue

            lemma = token.lemma_.lower()
            text = token.text.lower()
            if lemma in model:
                sentences_vector[row, position] = model[lemma]
            elif text in model:
                sentences_vector[row, position] = model[text]
            # An unknown token still occupies its slot, as a zero vector.
            position += 1

    return sentences_vector

## Part of Speech Embedding

In [18]:
def encode_other_features(sentences, max_length=MAX_SENTENCE_LENGTH, fit_vectorizer=False):
    """Encode per-token part-of-speech and NRC emotion features.

    Sentences are parsed with the module-level spaCy pipeline ``nlp``. For every
    token that is not a stop word, whitespace or punctuation, a feature dict is
    built with the token's ``pos`` tag plus one ``True`` entry per emotion that
    ``nrc_dict`` lists for the token's lowercased lemma. Each sentence is
    truncated or padded with empty dicts to ``max_length`` tokens, and the dicts
    are one-hot encoded with a ``DictVectorizer``.

    Args:
        sentences: Iterable of sentence strings.
        max_length: Number of token slots per sentence.
        fit_vectorizer: When True, fit a new ``DictVectorizer`` on this input and
            save it to ``dict_vectorizer.pkl``; otherwise load the vectorizer
            from that file.

    Returns:
        An array of shape ``(len(sentences), max_length, n_features)``.

    Side effects:
        Prints the number of sentences; reads or writes ``dict_vectorizer.pkl``
        in the current working directory.
    """
    vectors = []

    # nlp.pipe processes the texts in batches instead of one call per sentence.
    for doc in nlp.pipe(sentences):
        vector = []

        for token in doc:
            if len(vector) >= max_length:
                # Remaining tokens would be truncated anyway.
                break
            if token.is_stop or token.is_space or token.is_punct:
                continue

            features = {'pos': token.pos_}
            # Lowercase the lemma before the lexicon lookup, matching how
            # encode_word2vec normalizes lemmas, so capitalized lemmas are not missed.
            for emotion in nrc_dict.get(token.lemma_.lower(), []):
                features[emotion] = True

            vector.append(features)

        # Pad with empty dicts, which the vectorizer encodes as all-zero rows.
        vector.extend({} for _ in range(max_length - len(vector)))
        vectors.append(vector)

    print(len(vectors))

    # One flat list of token dicts, in sentence order, for fitting and transforming.
    flat = list(itertools.chain.from_iterable(vectors))

    if fit_vectorizer:
        dict_vectorizer = DictVectorizer()
        dict_vectorizer.fit(flat)

        with open('dict_vectorizer.pkl', 'wb') as f:
            pickle.dump(dict_vectorizer, f)
    else:
        # NOTE: unpickling executes arbitrary code; only load a file written by this notebook.
        with open('dict_vectorizer.pkl', 'rb') as f:
            dict_vectorizer = pickle.load(f)

    n_features = len(dict_vectorizer.feature_names_)

    if not flat:
        # DictVectorizer.transform rejects an empty sample list; return the empty 3-D shape directly.
        return np.zeros((len(vectors), max_length, n_features), dtype=dict_vectorizer.dtype)

    # A single transform over all tokens, then fold back into (sentence, token, feature).
    encoded = dict_vectorizer.transform(flat).toarray()
    return encoded.reshape(len(vectors), max_length, n_features)

# Machine Learning Approach

## On GoSentiment Data

In [21]:
start_time = time.time()

# Only the first TRAINING_SIZE comments are embedded.
sentences = df_gosentiment['text'].iloc[:TRAINING_SIZE].tolist()

# Optional tweet-style cleaning, currently disabled:
# sentences = preprocess_tweets(sentences)
sentences_embedded = encode_word2vec(sentences)
# fit_vectorizer=True fits a new DictVectorizer on this data and saves it to disk.
features_embedded = encode_other_features(sentences, fit_vectorizer=True)

# Stack word vectors and POS/emotion features along the per-token feature axis.
combined_embedded = np.concatenate([sentences_embedded, features_embedded], axis=-1)

# Flatten (tokens, features) into one row per sentence for the classifier.
combined_embedded_2d = combined_embedded.reshape(len(combined_embedded), -1)

# Optional cache of the flattened matrix:
# with open('combined_embedded_gosentiment.pkl', 'wb') as f:
#     pickle.dump(combined_embedded_2d, f)

print(
    sentences_embedded.shape,
    features_embedded.shape,
    combined_embedded.shape,
    combined_embedded_2d.shape,
    sep='\n',
)

60000
(60000, 25, 300)
(60000, 25, 27)
(60000, 25, 327)
(60000, 8175)


In [22]:
# Labels for exactly the rows that were embedded above.
labels = df_gosentiment['Polarity'].iloc[:TRAINING_SIZE].tolist()

# 80/20 split with a fixed seed so the reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    combined_embedded_2d,
    labels,
    test_size=0.2,
    random_state=42,
)

# `model` is reused by the Reddit prediction cells further down.
model = LogisticRegression(solver='newton-cg', multi_class='ovr', max_iter=250).fit(X_train, y_train)

# Persist the fitted classifier.
with open('LogisticRegression-60000.pkl', 'wb') as f:
    pickle.dump(model, f)

pred = model.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          -1       0.49      0.40      0.44      2651
           0       0.50      0.53      0.51      4107
           1       0.64      0.67      0.66      5242

    accuracy                           0.56     12000
   macro avg       0.54      0.53      0.54     12000
weighted avg       0.56      0.56      0.56     12000



In [23]:
# with open('LogisticRegression-60000.pkl', 'rb') as f:
#     model = pickle.load(f)

In [24]:
# labels = df_gosentiment.Polarity.to_list()[:TRAINING_SIZE]

# X_train, X_test, y_train, y_test = train_test_split(combined_embedded_2d, labels, test_size=0.2, random_state=42)

# model_2 = SVC(kernel='linear')

# model_2.fit(X_train, y_train)

# with open('SVCModel-60000.pkl', 'wb') as f:
#     pickle.dump(model_2, f)

# pred_2 = model_2.predict(X_test)

# print(classification_report(y_test, pred))

# VADER

In [25]:
# df_vader = df_sf_train.sample(frac=0.01).reset_index(drop=True)
# df_vader.text = df_vader.text.astype('string')

# analyzer = SentimentIntensityAnalyzer()
# #Add VADER metrics to dataframe
# df_vader['compound'] = [analyzer.polarity_scores(v)['compound'] for v in df_vader['text']]
# df_vader['neg'] = [analyzer.polarity_scores(v)['neg'] for v in df_vader['text']]
# df_vader['neu'] = [analyzer.polarity_scores(v)['neu'] for v in df_vader['text']]
# df_vader['pos'] = [analyzer.polarity_scores(v)['pos'] for v in df_vader['text']]

# Reddit Data

### Bitcoin Reddit

In [26]:
# with open('LogisticRegression-60000.pkl', 'rb') as f:
#     model = pickle.load(f)

In [27]:
# Absolute paths of the per-coin Reddit submission CSVs, keyed by coin name.
# Note that the Dogecoin file name carries a different date range (0710) than the others (0817).
CRYPTO_PATHS = {
    'Bitcoin': os.path.abspath('data/reddit-crypto/Bitcoin_12htop100_DailySub_0101_to_0817_PushShift_raw.csv'),
    'Dogecoin': os.path.abspath('data/reddit-crypto/doge_12htop100_DailySub_0101_to_0710_PushShift.csv'),
    'Solana': os.path.abspath('data/reddit-crypto/Solana_12htop100_DailySub_0101_to_0817_PushShift_raw.csv'),
    'Shiba': os.path.abspath('data/reddit-crypto/Shiba_Inu_12htop100_DailySub_0101_to_0817_PushShift_raw.csv')
}

In [28]:
# Load the Bitcoin submissions; `reddit_df` is rebound later for the Shiba data.
reddit_df = pd.read_csv(CRYPTO_PATHS['Bitcoin'])

In [29]:
len(reddit_df)

42088

In [30]:
# Drop posts whose body is the literal '[removed]' placeholder. The explicit .copy()
# makes the result an independent frame, so the column assignments below do not
# write into a slice of the original.
reddit_df = reddit_df[reddit_df.selftext != '[removed]'].copy()

# Replace missing titles and bodies with empty strings. Assigning the filled column
# back replaces `reddit_df.title.fillna('', inplace=True)`, which is chained assignment:
# it raises SettingWithCopy warnings and does not update the frame under Copy-on-Write.
reddit_df['title'] = reddit_df['title'].fillna('')
reddit_df['selftext'] = reddit_df['selftext'].fillna('')

In [31]:
# # Get number of rows per day
# reddit_df.groupby('date', as_index=False)['title'].count()

In [32]:
# Title and body joined into one text field, separated by ': '.
reddit_df['fulltext'] = reddit_df['title'] + ': ' + reddit_df['selftext']

In [33]:
# Apply the project's Reddit text cleaner (from helpers) to every combined text.
reddit_df['fulltext'] = reddit_df['fulltext'].apply(preprocess_reddit)

In [34]:
# Cast fulltext to pandas' dedicated string dtype. No rows are removed here.
reddit_df.fulltext = reddit_df.fulltext.astype('string')

In [35]:
reddit_df.head(10)

Unnamed: 0,date,title,subreddit,selftext,score,upvote_ratio,total_awards_received,full_link,link_flair_text,author,id,permalink,url,num_comments,fulltext
0,2021-01-01,Habt ihr Literatueempfehlungen um sich in kryp...,Finanzen,Suche gute Bücher etc. zu Kryptowährungen (nic...,1,1.0,0,https://www.reddit.com/r/Finanzen/comments/kog...,Investieren,True_Divide5477,kogyn0,/r/Finanzen/comments/kogyn0/habt_ihr_literatue...,https://www.reddit.com/r/Finanzen/comments/kog...,42,Habt ihr Literatueempfehlungen um sich in kryp...
1,2021-01-01,Ethereum or bitcoin?,binance,"Hi, I have a question that's maybe kind of dum...",1,1.0,0,https://www.reddit.com/r/binance/comments/koex...,General,Unknown_Investor,koexj2,/r/binance/comments/koexj2/ethereum_or_bitcoin/,https://www.reddit.com/r/binance/comments/koex...,7,"Ethereum or bitcoin?: Hi, I have a question th..."
2,2021-01-01,I want some help,wirexappofficial,Guys what is the exchange rate for bitcoin i t...,1,1.0,0,https://www.reddit.com/r/wirexappofficial/comm...,,pelllos,ko8lq4,/r/wirexappofficial/comments/ko8lq4/i_want_som...,https://www.reddit.com/r/wirexappofficial/comm...,3,I want some help: Guys what is the exchange ra...
3,2021-01-01,Buying Bitcoin in Revolut,BitcoinBeginners,Do you recommend buying bitcoin in Revolut in ...,1,1.0,0,https://www.reddit.com/r/BitcoinBeginners/comm...,,nevermindera,kogkf8,/r/BitcoinBeginners/comments/kogkf8/buying_bit...,https://www.reddit.com/r/BitcoinBeginners/comm...,13,Buying Bitcoin in Revolut: Do you recommend bu...
4,2021-01-01,Top 15 Cryptocurrency by Market Capitalization...,IOTAmarkets,,1,1.0,0,https://www.reddit.com/r/IOTAmarkets/comments/...,,accappatoiviola,kog7yv,/r/IOTAmarkets/comments/kog7yv/top_15_cryptocu...,https://youtu.be/71ExZk1YWWE,5,Top 15 Cryptocurrency by Market Capitalization...
5,2021-01-01,What will be the point of Monero when all futu...,Monero,I understand the argument for Monero now since...,1,1.0,0,https://www.reddit.com/r/Monero/comments/kodpv...,,r-bitcoin,kodpv4,/r/Monero/comments/kodpv4/what_will_be_the_poi...,https://www.reddit.com/r/Monero/comments/kodpv...,19,What will be the point of Monero when all futu...
6,2021-01-01,BCH Merchant Adoptions,btc,"Hey all, \nAs I continue to be a BCH ambassad...",1,1.0,0,https://www.reddit.com/r/btc/comments/kof6xw/b...,,shanytc,kof6xw,/r/btc/comments/kof6xw/bch_merchant_adoptions/,https://www.reddit.com/r/btc/comments/kof6xw/b...,18,"BCH Merchant Adoptions: Hey all, As I continue..."
7,2021-01-01,Can’t receive Crypto to Shakepay from external...,BitcoinCA,Trying to move some BTC to sell \n\nMobile app...,1,1.0,0,https://www.reddit.com/r/BitcoinCA/comments/ko...,,alanpartridge69,ko3ate,/r/BitcoinCA/comments/ko3ate/cant_receive_cryp...,https://www.reddit.com/r/BitcoinCA/comments/ko...,7,Can’t receive Crypto to Shakepay from external...
8,2021-01-01,"Tony has said that he looks at players like ""S...",survivor,,1,1.0,0,https://www.reddit.com/r/survivor/comments/kog...,Meme,mysteryfan420,kog6lb,/r/survivor/comments/kog6lb/tony_has_said_that...,https://i.redd.it/37dlv8gpkr861.png,6,"Tony has said that he looks at players like ""S..."
9,2021-01-01,Redditors who have asked their employers to pa...,Bitcoin,,1,1.0,0,https://www.reddit.com/r/Bitcoin/comments/kolm...,,thebrazengeek,kolmrl,/r/Bitcoin/comments/kolmrl/redditors_who_have_...,https://www.reddit.com/r/AskReddit/comments/ko...,15,Redditors who have asked their employers to pa...


In [36]:
sentences = list(reddit_df['fulltext'])
sentences_embedded = encode_word2vec(sentences)
# The default fit_vectorizer=False reloads the DictVectorizer saved during training.
features_embedded = encode_other_features(sentences)

# Stack word vectors and POS/emotion features along the per-token feature axis.
combined_embedded = np.concatenate([sentences_embedded, features_embedded], axis=-1)

# Flatten (tokens, features) into one row per post for the classifier.
combined_embedded_2d = combined_embedded.reshape(len(combined_embedded), -1)

# Cache the flattened feature matrix on disk.
with open('combined_embedded_bitcoinreddit.pkl', 'wb') as f:
    pickle.dump(combined_embedded_2d, f)

print(
    sentences_embedded.shape,
    features_embedded.shape,
    combined_embedded.shape,
    combined_embedded_2d.shape,
    sep='\n',
)

39757
(39757, 25, 300)
(39757, 25, 27)
(39757, 25, 327)
(39757, 8175)


In [37]:
# Uses `model`, the LogisticRegression fitted in the GoSentiment training cell above.
reddit_sentiment = model.predict(combined_embedded_2d)

In [38]:
# A plain list is assigned by position, so the gaps left in reddit_df's index by the
# '[removed]' filter do not matter.
reddit_df['sentiment'] = list(reddit_sentiment)

In [39]:
# Semicolon-separated output. NOTE(review): assumes data/results/ already exists — confirm.
reddit_df.to_csv(os.path.abspath('data/results/reddit_sentiment.csv'), sep=';', encoding='utf-8', index=False)

### Shiba Inu Reddit

In [40]:
# Same preparation as for the Bitcoin submissions, applied to the Shiba Inu file.
reddit_df = pd.read_csv(CRYPTO_PATHS['Shiba'])

# Drop posts whose body is the literal '[removed]' placeholder. The explicit .copy()
# makes the result an independent frame, so the column assignments below do not
# write into a slice of the original.
reddit_df = reddit_df[reddit_df.selftext != '[removed]'].copy()

# Replace missing titles and bodies with empty strings. Assigning the filled column
# back replaces the chained `.fillna('', inplace=True)` calls, which raise
# SettingWithCopy warnings and do not update the frame under Copy-on-Write.
reddit_df['title'] = reddit_df['title'].fillna('')
reddit_df['selftext'] = reddit_df['selftext'].fillna('')

# Title and body joined into one text field, cleaned, and cast to the string dtype.
reddit_df['fulltext'] = reddit_df.title + ': ' + reddit_df.selftext
reddit_df.fulltext = reddit_df.fulltext.apply(preprocess_reddit)
reddit_df.fulltext = reddit_df.fulltext.astype('string')

In [41]:
sentences = list(reddit_df['fulltext'])
sentences_embedded = encode_word2vec(sentences)
# The default fit_vectorizer=False reloads the DictVectorizer saved during training.
features_embedded = encode_other_features(sentences)

# Stack word vectors and POS/emotion features along the per-token feature axis.
combined_embedded = np.concatenate([sentences_embedded, features_embedded], axis=-1)

# Flatten (tokens, features) into one row per post for the classifier.
combined_embedded_2d = combined_embedded.reshape(len(combined_embedded), -1)

# Cache the flattened feature matrix on disk.
with open('combined_embedded_shibareddit.pkl', 'wb') as f:
    pickle.dump(combined_embedded_2d, f)

print(
    sentences_embedded.shape,
    features_embedded.shape,
    combined_embedded.shape,
    combined_embedded_2d.shape,
    sep='\n',
)

25700
(25700, 25, 300)
(25700, 25, 27)
(25700, 25, 327)
(25700, 8175)


In [42]:
# Uses `model`, the LogisticRegression fitted in the GoSentiment training cell above.
reddit_sentiment = model.predict(combined_embedded_2d)

# Values are assigned by position, one prediction per remaining post.
reddit_df['sentiment'] = reddit_sentiment.tolist()

# Semicolon-separated output, without the index column.
reddit_df.to_csv(
    os.path.abspath('data/results/reddit_shiba_sentiment.csv'),
    sep=';',
    encoding='utf-8',
    index=False,
)

# VADER Classification Report

In [27]:
# NOTE: this rebinds df_gosentiment, replacing the frame derived from GoEmotions.csv earlier
# in the notebook. The file must provide 'Polarity' and 'compound' columns (both are used
# below); 'compound' presumably holds VADER compound scores — confirm how the file was produced.
df_gosentiment = pd.read_csv(os.path.abspath('data/gosenedited.csv'))

def turn_discrete(x):
    """Bucket a continuous score into a polarity label.

    Returns -1 for x < -0.67, 0 for -0.67 <= x < 0.33, and 1 otherwise
    (NaN fails both comparisons and therefore maps to 1).
    """
    # NOTE(review): the cut points are asymmetric around zero (-0.67 vs 0.33) — confirm this is intended.
    if x < -0.67:
        return -1
    return 0 if x < 0.33 else 1

# Discretize the continuous compound score into the -1 / 0 / 1 label space.
df_gosentiment['Compound'] = df_gosentiment['compound'].apply(turn_discrete)

# Gold labels and the discretized predictions, as plain lists.
Y = df_gosentiment['Polarity'].tolist()
pred = df_gosentiment['Compound'].tolist()

print(classification_report(Y, pred))

              precision    recall  f1-score   support

          -1       0.55      0.14      0.22     48037
           0       0.43      0.70      0.53     72293
           1       0.69      0.62      0.65     90895

    accuracy                           0.54    211225
   macro avg       0.56      0.48      0.47    211225
weighted avg       0.57      0.54      0.51    211225

