In [1]:
import pandas as pd
import numpy as np
import os
import spacy
import json
import pathlib
import gensim
import itertools
from helpers import preprocess_tweets, preprocess_tweet
from simpletransformers.language_representation import RepresentationModel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report



## Download links for data

- Bitcoin_tweeets.csv -> https://www.kaggle.com/kaushiksuresh147/bitcoin-tweets
- GoEmotions.csv -> https://www.kaggle.com/datasets/debarshichanda/goemotions
- sentiment140 -> http://help.sentiment140.com/for-students/
- GoogleNews-vectors-etc... (Word2Vec) -> Canvas(?)
- NRC Emotion Lexicon -> https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm
- nrc_emotion_lexicon_dict -> Google Drive
- BTC-USD -> Google drive

In [41]:
# 90th percentile length (after removing stopwords and punctuation) was about 14 when I tested, so this is a good cutoff (99th percentile = 18)
MAX_SENTENCE_LENGTH = 20

# Import Datasets

## Stanford 140

In [3]:
df_sf_train = pd.read_csv(
    os.path.abspath('data/sentiment140-train.csv'), 
    encoding='ISO-8859-1', 
    header=None, 
    names=['polarity', 'id', 'date', 'query', 'user', 'text'],
    usecols=['polarity', 'text']
)

df_sf_test = pd.read_csv(
    os.path.abspath('data/sentiment140-test.csv'), 
    encoding='ISO-8859-1', 
    header=None, 
    names=['polarity', 'id', 'date', 'query', 'user', 'text'],
    usecols=['polarity', 'text']
)

df_sf_train['polarity'] = df_sf_train['polarity'].replace(4, 1)
df_sf_test['polarity'] = df_sf_train['polarity'].replace(4, 1)

df_sf_train['text'] = df_sf_train['text'].apply(preprocess_tweet)
df_sf_test['text'] = df_sf_test['text'].apply(preprocess_tweet)

In [4]:
df_sf_test.tail()

Unnamed: 0,polarity,text
493,0,Ask Programming: LaTeX or InDesign?: submitted...
494,0,"On that note, I hate Word. I hate Pages. I hat..."
495,0,Ahhh... back in a *real* text editing environm...
496,0,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."
497,0,Reading the tweets coming out of Iran... The w...


## GoEmotion

In [57]:
df_goemotion = pd.read_csv(os.path.abspath('data/GoEmotions.csv'))

In [58]:
df_goemotion.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [51]:
len(df_goemotion)

211225

In [70]:
df_gosentiment = df_goemotion.copy()

In [71]:
positive_emotions = ['admiration', 'amusement', 'approval', 'caring', 'curiosity', 'desire', 'excitement', 'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief']
negative_emotions = ['anger', 'annoyance', 'disappointment', 'disapproval', 'disgust', 'fear', 'grief', 'nervousness', 'remorse', 'sadness']
neutral_emotions = ['neutral', 'embarrassment', 'confusion', 'realization', 'surprise']

df_gosentiment['Positive'] = df_gosentiment[positive_emotions].sum(axis=1).apply(lambda x: min(1, x))
df_gosentiment['Negative'] = df_gosentiment[negative_emotions].sum(axis=1).apply(lambda x: min(1, x))
df_gosentiment['Neutral'] = df_gosentiment[neutral_emotions].sum(axis=1).apply(lambda x: min(1, x))

also_drop_columns = ['subreddit', 'id', 'link_id', 'author', 'parent_id', 'rater_id']

df_gosentiment.drop(labels=positive_emotions + negative_emotions + neutral_emotions + also_drop_columns, axis=1, inplace=True)

df_gosentiment['Polarity'] = 0

for index, row in df_gosentiment.iterrows():
    if row['Positive'] == 1:
        df_gosentiment.at[index, 'Polarity'] = 1
    elif row['Negative'] == 1:
        df_gosentiment.at[index, 'Polarity'] = -1

df_gosentiment = df_gosentiment.astype({
    'Positive': 'int',
    'Negative': 'int',
    'Neutral': 'int',
})

In [72]:
df_gosentiment.head(20)

Unnamed: 0,text,created_utc,example_very_unclear,Positive,Negative,Neutral,Polarity
0,That game hurt.,1548381000.0,False,0,1,0,-1
1,>sexuality shouldn’t be a grouping category I...,1548084000.0,True,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",1546428000.0,False,0,0,1,0
3,Man I love reddit.,1547965000.0,False,1,0,0,1
4,"[NAME] was nowhere near them, he was by the Fa...",1546669000.0,False,0,0,1,0
5,Right? Considering it’s such an important docu...,1548280000.0,False,1,0,0,1
6,"He isn't as big, but he's still quite popular....",1546320000.0,False,0,1,0,-1
7,That's crazy; I went to a super [RELIGION] hig...,1546536000.0,False,1,0,0,1
8,that's adorable asf,1548764000.0,False,1,0,0,1
9,"""Sponge Blurb Pubs Quaw Haha GURR ha AAa!"" fin...",1546984000.0,False,1,0,0,1


## NRC Emotion Lexicon

In [5]:
# with open(os.path.abspath('data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'), 'r') as f:
#     lines = f.readlines()
#     for line in lines:
#         word, emotion, 

nrc_df = pd.read_csv(os.path.abspath('data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'), sep='\t', header=0, names=['word', 'emotion', 'intensity'])

In [14]:
# nrc_dict = {}

# # Iterate over nrc_df
# for index, row in nrc_df.iterrows():
#     # Get the word and emotion
#     word = row['word']
#     emotion = row['emotion']
#     intensity = row['intensity']
#     # If the word is not in the dict yet
#     if word not in nrc_dict:
#         # Initialize the word in the dict
#         nrc_dict[word] = []
#     # Add the emotion to the word
#     if intensity == 1:
#         nrc_dict[word].append(emotion)

# # Writee nrc_dict to file
# with open(os.path.abspath('data/nrc_emotion_lexicon_dict.json'), 'w') as f:
#     f.write(json.dumps(nrc_dict))

In [15]:
nrc_dict = json.load(open(os.path.abspath('data/nrc_emotion_lexicon_dict.json')))

## Bitcoin Tweets

In [11]:
btc_df = pd.read_csv(os.path.abspath('data/Bitcoin_tweets.csv'))

  btc_df = pd.read_csv(os.path.abspath('data/Bitcoin_tweets.csv'))


In [12]:
btc_df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...,['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""...","['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,"Guys evening, I have read this article about B...",,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC'],Twitter Web App,False


# Creating Sentence Vector

- Word embeddings
- PoS
- Positive/Neutral word 000110110

## Load Spacy

In [18]:
nlp = spacy.load('en_core_web_sm')

## Word Embedding

### BERT

In [78]:
# model = RepresentationModel(
#     model_type="bert",
#     model_name="bert-base-uncased",
#     use_cuda=False
# )

model = RepresentationModel(
    model_type="roberta",
    model_name="roberta-base",
    use_cuda=False
)

Some weights of the model checkpoint at roberta-base were not used when initializing BertForTextRepresentation: ['roberta.encoder.layer.4.output.dense.bias', 'roberta.encoder.layer.4.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.self.key.weight', 'roberta.encoder.layer.6.attention.self.query.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.2.attention.self.value.bias', 'roberta.encoder.layer.4.attention.self.value.weight', 'roberta.encoder.layer.7.attention.self.value.weight', 'lm_head.decoder.weight', 'roberta.encoder.layer.1.intermediate.dense.bias', 'roberta.embeddings.LayerNorm.weight', 'roberta.encoder.layer.10.output.dense.bias', 'roberta.encoder.layer.10.attention.output.dense.weight', 'roberta.encoder.layer.10.attention.self.value.weight', 'roberta.encoder.layer.9.attention.self.value.weight', 'roberta.encoder.layer.2.attention.self.value.weight', 'roberta.encoder.layer.7.attention.self.query.weight', 'roberta.encoder.layer.1.attention.output.de

### Word2Vec

In [19]:
word2vec_path = os.path.abspath('data/GoogleNews-vectors-negative300.bin')
word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=500000)

In [20]:
def encode_word2vec_from_scratch(sentences):
    # return gensim.models.Word2Vec(sentences, size=300, window=5, min_count=5, workers=4)
    return sentences

def encode_bert(sentences, model):
    return model.encode_sentences(sentences, combine_strategy=None)

def encode_word2vec(sentences):
    model = word_embedding_model

    sentences_vector = []

    for sentence in sentences:
        sent_vector = []

        for token in nlp(sentence):
            if token.is_stop:
                continue
            
            if token.text in model:
                sent_vector.append(model[token.text])
            else:
                sent_vector.append([0] * 300)
        
        if len(sent_vector) > MAX_SENTENCE_LENGTH:
            sent_vector = sent_vector[:MAX_SENTENCE_LENGTH]
        else:
            sent_vector = sent_vector + [[0] * 300] * (MAX_SENTENCE_LENGTH - len(sent_vector))

        sentences_vector.append(sent_vector)
    
    return np.array(sentences_vector)

## Part of Speech Embedding

In [33]:
def encode_other_features(sentences):
    vectors = []

    for sentence in sentences:
        vector = []

        for token in nlp(sentence):
            dictionary = {}

            if not (token.is_stop or token.is_punct):
                dictionary['pos'] = token.pos_
                
                associated_emotions = nrc_dict.get(token.lemma_, [])

                for emotion in associated_emotions:
                    dictionary[emotion] = True
                
                vector.append(dictionary)

        if len(vector) > MAX_SENTENCE_LENGTH:
            vector = vector[:MAX_SENTENCE_LENGTH]
        else:
            vector = vector + [{}] * (MAX_SENTENCE_LENGTH - len(vector))

        vectors.append(vector)
    
    print(len(vectors))

    dict_vectorizer = DictVectorizer()
    dict_vectorizer = dict_vectorizer.fit(list(itertools.chain.from_iterable(vectors)))

    encoded = []

    for vector in vectors:
        encoded.append(dict_vectorizer.transform(vector).toarray())
    
    return np.array(encoded)

# Machine Learning Approach

## On Stanford 140 Data

In [34]:
sentences = list(df_sf_train.text)[:250] + list(df_sf_train.text)[-250:]

sentences = preprocess_tweets(sentences)
sentences_embedded = encode_word2vec(sentences)
features_embedded = encode_other_features(sentences)

# Combine sentences_embedded and features_embedded on the third dimension
combined_embedded = np.concatenate((sentences_embedded, features_embedded), axis=2)

combined_embedded_2d = combined_embedded.reshape(combined_embedded.shape[0], combined_embedded.shape[1] * combined_embedded.shape[2]) # combined_embedded.reshape(combined_embedded.shape[0], -1)

print(sentences_embedded.shape)
print(features_embedded.shape)
print(combined_embedded.shape)
print(combined_embedded_2d.shape)

500
(500, 48, 300)
(500, 48, 28)
(500, 48, 328)
(500, 15744)


In [100]:
labels = df_sf_train.polarity.to_list()[:250] + df_sf_train.polarity.to_list()[-250:]

X_train, X_test, y_train, y_test = train_test_split(combined_embedded_2d, labels, test_size=0.2, random_state=42)

In [101]:
svm = SVC(kernel='linear')

svm.fit(X_train, y_train)

SVC(kernel='linear')

In [102]:
pred = svm.predict(X_test)

In [105]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.62      0.50      0.55        46
           4       0.63      0.74      0.68        54

    accuracy                           0.63       100
   macro avg       0.63      0.62      0.62       100
weighted avg       0.63      0.63      0.62       100



## On GoSentiment Data

In [68]:
sentences = list(df_gosentiment.text)[:5000]

# sentences = preprocess_tweets(sentences)
sentences_embedded = encode_word2vec(sentences)
features_embedded = encode_other_features(sentences)

# Combine sentences_embedded and features_embedded on the third dimension
combined_embedded = np.concatenate((sentences_embedded, features_embedded), axis=2)

combined_embedded_2d = combined_embedded.reshape(combined_embedded.shape[0], combined_embedded.shape[1] * combined_embedded.shape[2]) # combined_embedded.reshape(combined_embedded.shape[0], -1)

print(sentences_embedded.shape)
print(features_embedded.shape)
print(combined_embedded.shape)
print(combined_embedded_2d.shape)

5000
(5000, 28, 300)
(5000, 28, 28)
(5000, 28, 328)
(5000, 9184)


In [73]:
labels = df_gosentiment.Polarity.to_list()[:5000]

X_train, X_test, y_train, y_test = train_test_split(combined_embedded_2d, labels, test_size=0.2, random_state=42)

svm = SVC(kernel='linear')

svm.fit(X_train, y_train)

pred = svm.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          -1       0.35      0.42      0.38       226
           0       0.45      0.42      0.43       354
           1       0.56      0.53      0.54       420

    accuracy                           0.47      1000
   macro avg       0.45      0.46      0.45      1000
weighted avg       0.47      0.47      0.47      1000



# VADER

In [124]:
df_vader = df_sf_train.sample(frac=0.01).reset_index(drop=True)

In [127]:
df_vader.text = df_vader.text.astype('string')

In [129]:
analyzer = SentimentIntensityAnalyzer()
#Add VADER metrics to dataframe
df_vader['compound'] = [analyzer.polarity_scores(v)['compound'] for v in df_vader['text']]
df_vader['neg'] = [analyzer.polarity_scores(v)['neg'] for v in df_vader['text']]
df_vader['neu'] = [analyzer.polarity_scores(v)['neu'] for v in df_vader['text']]
df_vader['pos'] = [analyzer.polarity_scores(v)['pos'] for v in df_vader['text']]

## Random

## Maybe add

Preeprocessing idea

In [None]:
# Defining dictionary containing all emojis with their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [194]:
nrc_df[nrc_df['word'] == 'beautiful']

Unnamed: 0,word,emotion,intensity
11359,beautiful,anger,0
11360,beautiful,anticipation,0
11361,beautiful,disgust,0
11362,beautiful,fear,0
11363,beautiful,joy,1
11364,beautiful,negative,0
11365,beautiful,positive,1
11366,beautiful,sadness,0
11367,beautiful,surprise,0
11368,beautiful,trust,0


anger -> anger, annoyance
anticipation
disgust -> annoyance(?), disapproval, disgust
fear -> embarrassment, fear, nervousness
joy -> amusement, caring, excitement, gratitude, joy, love, optimism
negative
positive
sadness -> disappointment, grief
surprise -> realization
trust -> admiration, approval

none: confusion, curiosity, desire, pride

relief
remorse
sadness
surprise
neutral

In [109]:
df_goemotion.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')