In [None]:
import pandas as pd
import numpy as np
import os
import spacy
import json
import pathlib
import time
import gensim
import itertools
import pickle
import seaborn as sns
from helpers import preprocess_tweets, preprocess_tweet, preprocess_reddit, preprocess_reddits
from simpletransformers.language_representation import RepresentationModel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

## Download links for data

- Bitcoin_tweets.csv -> https://www.kaggle.com/kaushiksuresh147/bitcoin-tweets
- GoEmotions.csv -> https://www.kaggle.com/datasets/debarshichanda/goemotions
- sentiment140 -> http://help.sentiment140.com/for-students/
- GoogleNews-vectors-negative300.bin (Word2Vec) -> Canvas(?)
- NRC Emotion Lexicon -> https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm
- nrc_emotion_lexicon_dict -> Google Drive
- BTC-USD -> Google Drive

In [None]:
# Number of tokens kept per sentence when per-token features are truncated/padded to a fixed length.
# 90th percentile length (after removing stopwords and punctuation) was about 14 when I tested, so this is a good cutoff (99th percentile = 18)
MAX_SENTENCE_LENGTH = 20

# Import Datasets

## Stanford 140

In [None]:
# The Sentiment140 CSVs are read without a header row; column names are supplied
# here and only the label and the tweet text are kept.
df_sf_train = pd.read_csv(
    os.path.abspath('data/sentiment140-train.csv'), 
    encoding='ISO-8859-1', 
    header=None, 
    names=['polarity', 'id', 'date', 'query', 'user', 'text'],
    usecols=['polarity', 'text']
)

df_sf_test = pd.read_csv(
    os.path.abspath('data/sentiment140-test.csv'), 
    encoding='ISO-8859-1', 
    header=None, 
    names=['polarity', 'id', 'date', 'query', 'user', 'text'],
    usecols=['polarity', 'text']
)

# Remap label 4 to 1 in each split.
# The test labels must come from the test frame itself: assigning the train
# column here would align on index and silently overwrite the real test labels.
df_sf_train['polarity'] = df_sf_train['polarity'].replace(4, 1)
df_sf_test['polarity'] = df_sf_test['polarity'].replace(4, 1)

# Clean every tweet with the project helper (see helpers.preprocess_tweet).
df_sf_train['text'] = df_sf_train['text'].apply(preprocess_tweet)
df_sf_test['text'] = df_sf_test['text'].apply(preprocess_tweet)

In [None]:
# Show the last rows of the preprocessed test split as a sanity check.
df_sf_test.tail()

## GoEmotion

In [None]:
# Read the GoEmotions CSV from the local data/ directory (download link listed at the top of the notebook).
df_goemotion = pd.read_csv(os.path.abspath('data/GoEmotions.csv'))

In [None]:
# Preview the first rows of the raw GoEmotions frame.
df_goemotion.head()

In [None]:
# Number of rows in the raw GoEmotions frame.
len(df_goemotion)

In [None]:
# Work on a copy so the raw GoEmotions frame stays untouched by the sentiment bucketing below.
df_gosentiment = df_goemotion.copy()

In [None]:
# Emotion columns grouped into three coarse sentiment buckets.
positive_emotions = ['admiration', 'amusement', 'approval', 'caring', 'curiosity', 'desire', 'excitement', 'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief']
negative_emotions = ['anger', 'annoyance', 'disappointment', 'disapproval', 'disgust', 'fear', 'grief', 'nervousness', 'remorse', 'sadness']
neutral_emotions = ['neutral', 'embarrassment', 'confusion', 'realization', 'surprise']

# A bucket flag is the row-wise sum of its emotion columns, capped at 1.
df_gosentiment['Positive'] = df_gosentiment[positive_emotions].sum(axis=1).apply(lambda x: min(1, x))
df_gosentiment['Negative'] = df_gosentiment[negative_emotions].sum(axis=1).apply(lambda x: min(1, x))
df_gosentiment['Neutral'] = df_gosentiment[neutral_emotions].sum(axis=1).apply(lambda x: min(1, x))

also_drop_columns = ['subreddit', 'id', 'link_id', 'author', 'parent_id', 'rater_id']

# Drop the per-emotion columns and the metadata columns that are no longer needed.
df_gosentiment = df_gosentiment.drop(
    columns=positive_emotions + negative_emotions + neutral_emotions + also_drop_columns
)

# Polarity: +1 when Positive is set, otherwise -1 when Negative is set, otherwise 0.
# Vectorised replacement for a row-by-row loop; the positive mask is applied last
# so it wins when both flags are set (same precedence as an if/elif chain).
df_gosentiment['Polarity'] = 0
df_gosentiment.loc[df_gosentiment['Negative'] == 1, 'Polarity'] = -1
df_gosentiment.loc[df_gosentiment['Positive'] == 1, 'Polarity'] = 1

df_gosentiment = df_gosentiment.astype({
    'Positive': 'int',
    'Negative': 'int',
    'Neutral': 'int',
})

In [None]:
# Inspect the first 20 rows to eyeball the derived Positive/Negative/Neutral/Polarity columns.
df_gosentiment.head(20)

## NRC Emotion Lexicon

In [None]:
# Earlier, unfinished attempt at parsing the lexicon by hand (kept for reference):
# with open(os.path.abspath('data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'), 'r') as f:
#     lines = f.readlines()
#     for line in lines:
#         word, emotion, 

# Read the tab-separated lexicon into three columns: word, emotion, intensity.
# NOTE(review): with header=0 pandas treats the file's first line as a header and
# replaces it with `names`, so that line is not returned as data — confirm the
# file really starts with a header line, otherwise the first entry is lost.
nrc_df = pd.read_csv(os.path.abspath('data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'), sep='\t', header=0, names=['word', 'emotion', 'intensity'])

In [None]:
# nrc_dict = {}

# # Iterate over nrc_df
# for index, row in nrc_df.iterrows():
#     # Get the word and emotion
#     word = row['word']
#     emotion = row['emotion']
#     intensity = row['intensity']
#     # If the word is not in the dict yet
#     if word not in nrc_dict:
#         # Initialize the word in the dict
#         nrc_dict[word] = []
#     # Add the emotion to the word
#     if intensity == 1:
#         nrc_dict[word].append(emotion)

# # Write nrc_dict to file
# with open(os.path.abspath('data/nrc_emotion_lexicon_dict.json'), 'w') as f:
#     f.write(json.dumps(nrc_dict))

In [None]:
# Load the precomputed lexicon dictionary from JSON. It is looked up by lemma and
# iterated as a collection of emotion names in encode_other_features.
# A context manager is used so the file handle is closed deterministically.
with open(os.path.abspath('data/nrc_emotion_lexicon_dict.json')) as f:
    nrc_dict = json.load(f)

## Bitcoin Tweets

In [None]:
# Read the Bitcoin tweets CSV from the local data/ directory (download link listed at the top of the notebook).
btc_df = pd.read_csv(os.path.abspath('data/Bitcoin_tweets.csv'))

In [None]:
# Preview the first rows of the raw Bitcoin tweets frame.
btc_df.head()

# Creating Sentence Vector

- Word embeddings
- PoS
- Positive/Neutral word 000110110

## Load Spacy

In [None]:
# spaCy pipeline used below for tokenisation, lemmas, POS tags and the
# stop-word / whitespace / punctuation flags.
nlp = spacy.load('en_core_web_sm')

## Word Embedding

### BERT

In [None]:
# Alternative BERT configuration, currently disabled:
# model = RepresentationModel(
#     model_type="bert",
#     model_name="bert-base-uncased",
#     use_cuda=False
# )

# RoBERTa representation model, configured with use_cuda=False.
# NOTE(review): the name `model` is rebound to a LogisticRegression in the
# GoSentiment training cell further down, so this object is only reachable
# until that cell runs.
model = RepresentationModel(
    model_type="roberta",
    model_name="roberta-base",
    use_cuda=False
)

### Word2Vec

In [None]:
# Pretrained GoogleNews Word2Vec vectors in binary format.
# `limit=750000` caps how many vectors are read from the file
# (presumably to bound load time and memory — TODO confirm the intent).
word2vec_path = os.path.abspath('data/GoogleNews-vectors-negative300.bin')
word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=750000)

In [None]:
def encode_word2vec_from_scratch(sentences):
    """Placeholder for training Word2Vec on the corpus itself.

    The training call is commented out, so this currently returns
    ``sentences`` unchanged.
    """
    # return gensim.models.Word2Vec(sentences, size=300, window=5, min_count=5, workers=4)
    return sentences

def encode_bert(sentences, model):
    """Encode ``sentences`` with a representation model.

    Args:
        sentences: the texts to encode.
        model: object exposing ``encode_sentences(sentences, combine_strategy=...)``.

    Returns:
        Whatever ``model.encode_sentences`` returns when called with
        ``combine_strategy=None``.
    """
    encoded = model.encode_sentences(sentences, combine_strategy=None)
    return encoded

def encode_word2vec(sentences):
    """Embed sentences as fixed-size matrices of pretrained Word2Vec vectors.

    Each sentence is tokenised with spaCy; stop words, whitespace and
    punctuation are skipped. Every remaining token is looked up by its
    lower-cased lemma, then by its lower-cased surface form, and falls back
    to an all-zero vector when neither is in the vocabulary.

    Every sentence is truncated or zero-padded to exactly
    ``MAX_SENTENCE_LENGTH`` tokens. Without this the per-sentence lists are
    ragged and the result cannot be concatenated along the feature axis with
    the output of ``encode_other_features`` (which is padded the same way).

    Args:
        sentences: iterable of strings.

    Returns:
        np.ndarray of shape
        ``(len(sentences), MAX_SENTENCE_LENGTH, vector_size)``, dtype float32.
    """
    model = word_embedding_model
    embedding_dim = model.vector_size
    # Shared all-zero row used both for out-of-vocabulary tokens and for padding.
    zero_vector = np.zeros(embedding_dim, dtype=np.float32)

    sentences_vector = []

    for sentence in sentences:
        sent_vector = []

        for token in nlp(sentence):
            if token.is_stop or token.is_space or token.is_punct:
                continue

            lemma = token.lemma_.lower()
            surface = token.text.lower()
            if lemma in model:
                sent_vector.append(model[lemma])
            elif surface in model:
                sent_vector.append(model[surface])
            else:
                sent_vector.append(zero_vector)

        # Force a fixed length: cut long sentences, zero-pad short ones.
        sent_vector = sent_vector[:MAX_SENTENCE_LENGTH]
        sent_vector.extend([zero_vector] * (MAX_SENTENCE_LENGTH - len(sent_vector)))

        sentences_vector.append(sent_vector)

    if not sentences_vector:
        # Keep the 3-D contract even for empty input.
        return np.empty((0, MAX_SENTENCE_LENGTH, embedding_dim), dtype=np.float32)

    return np.array(sentences_vector, dtype=np.float32)

## Part of Speech Embedding

In [None]:
def encode_other_features(sentences, vectorizer=None):
    """Encode per-token POS tags and NRC emotion flags as fixed-size matrices.

    Each sentence is tokenised with spaCy; stop words, whitespace and
    punctuation are skipped. Every remaining token becomes a feature dict
    holding its POS tag plus one ``True`` flag per emotion that ``nrc_dict``
    lists for its lower-cased lemma (lower-casing matches the lookup used in
    ``encode_word2vec``). Sentences are truncated or padded with empty dicts
    to ``MAX_SENTENCE_LENGTH`` tokens and one-hot encoded with a
    ``DictVectorizer``.

    Args:
        sentences: iterable of strings.
        vectorizer: optional ``DictVectorizer``. If omitted, a new one is
            fitted on this call's data (the original behaviour). If an
            unfitted instance is passed it is fitted in place; if an already
            fitted instance is passed it is reused as-is. Reusing the
            vectorizer fitted on the training data is what keeps the feature
            columns identical between training and inference — a vectorizer
            fitted per call can yield a different column count/order.

    Returns:
        np.ndarray of shape
        ``(len(sentences), MAX_SENTENCE_LENGTH, n_features)``.
    """
    vectors = []

    for sentence in sentences:
        vector = []

        for token in nlp(sentence):
            if token.is_stop or token.is_space or token.is_punct:
                continue

            dictionary = {'pos': token.pos_}

            for emotion in nrc_dict.get(token.lemma_.lower(), []):
                dictionary[emotion] = True

            vector.append(dictionary)

        # Force a fixed length: cut long sentences, pad short ones with empty dicts
        # (an empty dict encodes to an all-zero row).
        vector = vector[:MAX_SENTENCE_LENGTH]
        vector = vector + [{}] * (MAX_SENTENCE_LENGTH - len(vector))

        vectors.append(vector)

    if vectorizer is None:
        vectorizer = DictVectorizer()

    # `vocabulary_` only exists after fit(); fit here only when still unfitted.
    if not hasattr(vectorizer, 'vocabulary_'):
        vectorizer.fit(list(itertools.chain.from_iterable(vectors)))

    encoded = [vectorizer.transform(vector).toarray() for vector in vectors]

    return np.array(encoded)

# Machine Learning Approach

## On Stanford 140 Data

In [None]:
# Small 500-tweet subset: the first 250 and the last 250 training rows.
sentences = df_sf_train.text.iloc[:250].tolist() + df_sf_train.text.iloc[-250:].tolist()

sentences = preprocess_tweets(sentences)
sentences_embedded = encode_word2vec(sentences)
features_embedded = encode_other_features(sentences)

# Stack word embeddings and POS/emotion features along the per-token feature axis.
combined_embedded = np.concatenate([sentences_embedded, features_embedded], axis=2)

# Flatten (sentence, token, feature) -> (sentence, token * feature) for the sklearn estimators.
combined_embedded_2d = combined_embedded.reshape(
    combined_embedded.shape[0],
    combined_embedded.shape[1] * combined_embedded.shape[2],
)

# One shape per line, in the order: embeddings, extra features, combined, flattened.
print(
    *(array.shape for array in (sentences_embedded, features_embedded, combined_embedded, combined_embedded_2d)),
    sep='\n',
)

In [None]:
# First ten training tweets: raw text in testx, their Word2Vec encoding in testz.
testx = df_sf_train.text.iloc[:10].tolist()
testz = encode_word2vec(list(testx))

In [None]:
def tokenz(sentence):
    """Return the spaCy tokens of ``sentence`` that ``encode_word2vec`` embeds.

    Stop words, whitespace and punctuation are dropped. Punctuation must be
    skipped here as well, otherwise the tokens no longer line up one-to-one
    with the vectors ``encode_word2vec`` produces (it skips punctuation too).
    """
    return [
        word
        for word in nlp(sentence)
        if not (word.is_stop or word.is_space or word.is_punct)
    ]

def caputed_seq(vectors):
    """Flag each vector as "NO" if it equals 300 zeros, otherwise "YES".

    Args:
        vectors: iterable of iterables of numbers.

    Returns:
        list of "YES"/"NO" strings, one per input vector, in order.
    """
    zero_row = [0] * 300
    return ["NO" if list(vector) == zero_row else "YES" for vector in vectors]

def summary(x, y):
    """Print each kept token of ``x`` next to the YES/NO flag of its vector in ``y``.

    Pairs are printed positionally and stop at the shorter of the two
    sequences. Returns None.
    """
    for token, captured in zip(tokenz(x), caputed_seq(y)):
        print(token, captured)

In [None]:
# summary() prints its own output and returns None, so it is called directly
# (wrapping it in print() would add a stray "None" line).
summary(testx[5], testz[5])

In [None]:
# Labels for the same 500 rows used to build the features (first 250 + last 250).
labels = df_sf_train.polarity.iloc[:250].tolist() + df_sf_train.polarity.iloc[-250:].tolist()

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    combined_embedded_2d,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Linear-kernel SVM fitted on the flattened Stanford 140 features.
svm = SVC(kernel='linear')

svm.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out 20% of the Stanford 140 subset.
pred = svm.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the SVM on the held-out split.
print(classification_report(y_test, pred))

## On GoSentiment Data

In [None]:
start_time = time.time()

# First 60,000 GoSentiment texts.
sentences = list(df_gosentiment.text)[:60000]

# sentences = preprocess_tweets(sentences)
sentences_embedded = encode_word2vec(sentences)
features_embedded = encode_other_features(sentences)

# Combine sentences_embedded and features_embedded on the third dimension
combined_embedded = np.concatenate((sentences_embedded, features_embedded), axis=2)

# Flatten (sentence, token, feature) -> (sentence, token * feature) for the sklearn estimators.
combined_embedded_2d = combined_embedded.reshape(combined_embedded.shape[0], combined_embedded.shape[1] * combined_embedded.shape[2]) # combined_embedded.reshape(combined_embedded.shape[0], -1)

print(sentences_embedded.shape)
print(features_embedded.shape)
print(combined_embedded.shape)
print(combined_embedded_2d.shape)

# Report elapsed seconds; start_time was captured above but never used,
# unlike the equivalent Bitcoin cell.
print(time.time() - start_time)

In [None]:
# Polarity labels for the same 60,000 rows used to build the features.
labels = df_gosentiment.Polarity.to_list()[:60000]

X_train, X_test, y_train, y_test = train_test_split(combined_embedded_2d, labels, test_size=0.2, random_state=42)

# NOTE: this rebinds `model`, replacing the RepresentationModel created earlier;
# the later Reddit and Bitcoin cells call predict() on this classifier.
model = LogisticRegression(solver='newton-cg', multi_class='ovr', max_iter=250)

model.fit(X_train, y_train)

# Persist the fitted classifier. pickle.dump requires the target file object as
# its second argument; without it the call raises TypeError.
with open('LogisticRegression-60000.pkl', 'wb') as f:
    pickle.dump(model, f)

pred = model.predict(X_test)

print(classification_report(y_test, pred))

In [None]:
# Polarity labels for the same 60,000 rows used to build the features.
labels = df_gosentiment.Polarity.to_list()[:60000]

X_train, X_test, y_train, y_test = train_test_split(combined_embedded_2d, labels, test_size=0.2, random_state=42)

model_2 = SVC(kernel='linear')

model_2.fit(X_train, y_train)

# Persist the fitted classifier. pickle.dump requires the target file object as
# its second argument; without it the call raises TypeError.
with open('SVCModel-60000.pkl', 'wb') as f:
    pickle.dump(model_2, f)

pred_2 = model_2.predict(X_test)

# Score the SVC's own predictions (pred_2), not `pred` left over from another model.
print(classification_report(y_test, pred_2))

# VADER

In [None]:
# 1% random sample of the Sentiment140 training data for the VADER comparison.
# Seeded (same seed as the train/test splits) so a re-run yields the same sample.
df_vader = df_sf_train.sample(frac=0.01, random_state=42).reset_index(drop=True)

In [None]:
# Cast the text column to pandas' string dtype before scoring it with VADER.
df_vader.text = df_vader.text.astype('string')

In [None]:
analyzer = SentimentIntensityAnalyzer()

# Score every text once and reuse the resulting dict for all four columns,
# instead of running the analyzer four times per text.
vader_scores = [analyzer.polarity_scores(text) for text in df_vader['text']]

# Add VADER metrics to dataframe
for metric in ('compound', 'neg', 'neu', 'pos'):
    df_vader[metric] = [scores[metric] for scores in vader_scores]

## Reddit Data

In [None]:
# Absolute path of the Reddit submissions CSV for each coin.
CRYPTO_PATHS = {
    coin: os.path.abspath(relative_path)
    for coin, relative_path in (
        ('Bitcoin', 'data/reddit-crypto/Bitcoin_12htop100_DailySub_0101_to_0817_PushShift_raw.csv'),
        ('Dogecoin', 'data/reddit-crypto/doge_12htop100_DailySub_0101_to_0710_PushShift.csv'),
        ('Solana', 'data/reddit-crypto/Solana_12htop100_DailySub_0101_to_0817_PushShift_raw.csv'),
    )
}

In [None]:
# Load the Bitcoin subreddit submissions (one of the CRYPTO_PATHS entries).
reddit_df = pd.read_csv(CRYPTO_PATHS['Bitcoin'])

In [None]:
# Drop posts whose body is the literal '[removed]'. The explicit copy makes the
# filtered frame independent, so later column assignments do not act on a slice
# of the original frame.
reddit_df = reddit_df[reddit_df.selftext != '[removed]'].copy()

# Replace missing titles/bodies with empty strings. The columns are assigned back
# instead of calling fillna(inplace=True) on an attribute-accessed column, which
# is chained assignment and is not guaranteed to update the frame.
reddit_df['title'] = reddit_df['title'].fillna('')
reddit_df['selftext'] = reddit_df['selftext'].fillna('')

In [None]:
# Get number of rows per day
reddit_df.groupby('date', as_index=False)['title'].count()

In [None]:
# Join title and body into one text field, separated by ': '.
reddit_df['fulltext'] = reddit_df.title + ': ' + reddit_df.selftext

In [None]:
# Clean every combined post with the project helper (see helpers.preprocess_reddit).
reddit_df.fulltext = reddit_df.fulltext.apply(preprocess_reddit)

In [None]:
# Cast fulltext to pandas' string dtype.
# NOTE(review): this cell was labelled "Remove empty from fulltext", but no rows
# are dropped here — only the dtype changes. Add an explicit filter if empty
# posts should be excluded.
reddit_df.fulltext = reddit_df.fulltext.astype('string')

In [None]:
# Preview the first 10 cleaned Reddit rows.
reddit_df.head(10)

In [None]:
# Build the same (embedding + POS/emotion) feature matrix for every Reddit post.
sentences = reddit_df.fulltext.tolist()
sentences_embedded = encode_word2vec(sentences)
features_embedded = encode_other_features(sentences)

# Stack word embeddings and POS/emotion features along the per-token feature axis.
combined_embedded = np.concatenate([sentences_embedded, features_embedded], axis=2)

# Flatten (sentence, token, feature) -> (sentence, token * feature).
combined_embedded_2d = combined_embedded.reshape(
    combined_embedded.shape[0],
    combined_embedded.shape[1] * combined_embedded.shape[2],
)

# One shape per line, in the order: embeddings, extra features, combined, flattened.
print(
    *(array.shape for array in (sentences_embedded, features_embedded, combined_embedded, combined_embedded_2d)),
    sep='\n',
)

In [None]:
# Predict polarity for every Reddit post with whatever `model` currently is; when
# the notebook is run top to bottom that is the LogisticRegression fitted on GoSentiment.
# NOTE(review): encode_other_features fits a new DictVectorizer on each call, so the
# feature columns built for this data may not match the ones the model was trained
# on — verify the widths/order agree before trusting these predictions.
reddit_sentiment = model.predict(combined_embedded_2d)

In [None]:
# Attach the predicted polarity to each post (positional: one prediction per row).
reddit_df['sentiment'] = list(reddit_sentiment)

In [None]:
# Save the scored posts as a semicolon-separated UTF-8 CSV without the index column.
# NOTE(review): to_csv does not create missing directories — data/results/ must already exist.
reddit_df.to_csv(os.path.abspath('data/results/reddit_sentiment.csv'), sep=';', encoding='utf-8', index=False)

# Bitcoin Data

In [None]:
# Take every second row of the last 200,000 tweets (at most 100,000 rows).
btc_df_recent = btc_df[-200000::2]

In [None]:
# Number of tweets in the recent subset.
len(btc_df_recent)

In [None]:
# Remove 'text' that are null
btc_df_recent = btc_df_recent.dropna(subset=['text'])

In [None]:
start_time = time.time()

# Clean the recent tweets, then build the (embedding + POS/emotion) feature matrix.
sentences = preprocess_tweets(list(btc_df_recent.text))
sentences_embedded = encode_word2vec(sentences)
features_embedded = encode_other_features(sentences)

# Stack word embeddings and POS/emotion features along the per-token feature axis.
combined_embedded = np.concatenate([sentences_embedded, features_embedded], axis=2)

# Cache the 3-D feature tensor on disk.
with open('combined_embedded_fdhskasdg.pkl', 'wb') as f:
    pickle.dump(combined_embedded, f)

# Flatten (sentence, token, feature) -> (sentence, token * feature).
combined_embedded_2d = combined_embedded.reshape(
    combined_embedded.shape[0],
    combined_embedded.shape[1] * combined_embedded.shape[2],
)

# One shape per line, in the order: embeddings, extra features, combined, flattened.
print(
    *(array.shape for array in (sentences_embedded, features_embedded, combined_embedded, combined_embedded_2d)),
    sep='\n',
)

# Elapsed wall-clock seconds for the whole cell.
print(time.time() - start_time)

In [None]:
# Predict polarity for the recent tweets with whatever `model` currently is; when
# the notebook is run top to bottom that is the LogisticRegression fitted on GoSentiment.
# NOTE(review): encode_other_features fits a new DictVectorizer on each call, so the
# feature columns built for this data may not match the ones the model was trained
# on — verify the widths/order agree before trusting these predictions.
btc_sentiments_recent = model.predict(combined_embedded_2d)

In [None]:
# Attach the predicted polarity to each tweet (positional: one prediction per row).
btc_df_recent['sentiment'] = list(btc_sentiments_recent)

In [None]:
# Save the scored tweets as a semicolon-separated UTF-8 CSV (no index column) in the working directory.
btc_df_recent.to_csv('btc_df_recent.csv', sep=';', encoding='utf-8', index=False)

# Random

## Maybe add

Preprocessing idea

In [None]:
# Defining dictionary containing all emojis with their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [None]:
# Spot-check: show all lexicon rows for the word 'beautiful'.
nrc_df[nrc_df['word'] == 'beautiful']

anger -> anger, annoyance
anticipation
disgust -> annoyance(?), disapproval, disgust
fear -> embarrassment, fear, nervousness
joy -> amusement, caring, excitement, gratitude, joy, love, optimism
negative
positive
sadness -> disappointment, grief
surprise -> realization
trust -> admiration, approval

none: confusion, curiosity, desire, pride

relief
remorse
sadness
surprise
neutral

In [None]:
# List the GoEmotions column names (for comparison with the NRC-to-GoEmotions mapping notes above).
df_goemotion.columns