In [None]:
#!pip install --user -r requirements.txt

In [22]:
import warnings
warnings.filterwarnings("ignore")
import random
import time
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sys
import os

# for reddit api
from psaw import PushshiftAPI

# stopword
import nltk
from nltk.corpus import stopwords
nltk.download("vader_lexicon")
nltk.download('stopwords')
sw_nltk = stopwords.words('english')

import re
text_cleaning_regex = "@S+|https?:S+|http?:S|[^A-Za-z0-9]+"

# sklearn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,roc_curve, auc, f1_score,cohen_kappa_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# bert
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, WeightedRandomSampler,random_split
from transformers import BertModel,BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AdamW, get_linear_schedule_with_warmup


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/swu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/swu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# check the device
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [24]:
api = PushshiftAPI()
start_time = int(dt.datetime(2019, 3, 16).timestamp())
end_time = int(dt.datetime(2022, 3, 10).timestamp())

reddit_comments =list(api.search_comments(after = start_time
                                  , before = end_time
                                  , q = "BTC price" # search for a specific word or phrase, not case-sensitive
#                                   , subreddit='BTC'
                                  , filter=['body','created_utc']
                                 ))

In [55]:
df = pd.DataFrame(reddit_comments)

In [57]:
df['created_at'] = pd.to_datetime(df['created_utc'],unit='s')
df = df[["body","created_at"]]

In [58]:
df.head()

Unnamed: 0,body,created_at,n_words
0,Yes. BTC price seems HUGELY speculative to me ...,2022-03-10 02:36:20,66
1,"&gt;yes, there is rampant market manipulation ...",2022-03-09 16:01:44,303
2,I would be happy if the price negated the infl...,2022-03-08 20:09:35,39
3,"I think we mostly agree, but this part is flat...",2022-03-07 23:58:26,215
4,&gt; That is why it forked in 2017: some of us...,2022-03-07 21:39:24,638


## Sentiment analysis

### 1. Preprocessing

In [77]:
def text_preprocessing(text):
    text = re.sub(text_cleaning_regex, ' ', str(text).lower()).strip()
    res=[]
    # Lowercase
    text = text.lower()
    # Remove single letter words
    text = ' '.join( [w for w in text.split() if len(w)>1] )
    
    # Remove ticks and the next character
    text = re.sub("\'\w+", '', text)
    
    # Remove stopword
    text = ' '.join([word for word in text.split() if text.lower() not in sw_nltk])
    # remove the word after @ OR #
    for i in text.split():
        if i.startswith("@") or i.startswith("#"):
            continue
        else:
            res.append(i)
    return ' '.join(res)

In [78]:
df['body']=df['body'].apply(lambda x: text_preprocessing(x))
df['n_words'] = df['body'].apply(lambda x: len(x.split()))

In [80]:
df.head()

Unnamed: 0,body,created_at,n_words,sentiment_dict,compound,neg,pos,confidence score,vader_result
0,yes btc price seems hugely speculative scale v...,2022-03-10 02:36:20,40,"{'neg': 0.184, 'neu': 0.638, 'pos': 0.178, 'co...",-0.3197,-0.184,0.178,0.016575,0
1,gt yes rampant market manipulation everywhere ...,2022-03-09 16:01:44,157,"{'neg': 0.134, 'neu': 0.66, 'pos': 0.206, 'com...",0.9042,-0.134,0.206,0.211765,2
2,would happy price negated inflation seeing cry...,2022-03-08 20:09:35,19,"{'neg': 0.0, 'neu': 0.829, 'pos': 0.171, 'comp...",0.5719,-0.0,0.171,1.0,2
3,think mostly agree part flat wrong gt fork rea...,2022-03-07 23:58:26,132,"{'neg': 0.081, 'neu': 0.812, 'pos': 0.107, 'co...",0.5898,-0.081,0.107,0.138298,2
4,gt forked 2017 us prefer cheap reliable fast b...,2022-03-07 21:39:24,331,"{'neg': 0.171, 'neu': 0.658, 'pos': 0.171, 'co...",-0.8005,-0.171,0.171,0.0,0


### 2. LRSentiA

**training strategy** 
- Generate Labels using an Unsupervised method
- referenece: https://openreview.net/forum?id=kQns9y_JH6, https://www.sciencedirect.com/science/article/pii/S2666827021000074


> LRSentiA is a lexicon and rule-based method that can classify sentiment without using any labeled data. The main purpose of introducing LRSentiA is to generate accurate pseudo-labels so that supervised ML classifiers can be incorporated into SSentiA.

In [81]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
df['sentiment_dict'] = df['body'].apply(lambda x:analyzer.polarity_scores(x))
df['compound']  = df['sentiment_dict'].apply(lambda score_dict: score_dict['compound'])
df['neg']  = df['sentiment_dict'].apply(lambda score_dict: score_dict['neg'])*-1
df['pos']  = df['sentiment_dict'].apply(lambda score_dict: score_dict['pos'])

#  Prediction confidence
df['confidence score'] = abs(df['neg']+df['pos'])/(abs(df['neg'])+abs(df['pos']))
df['confidence score'] = df['confidence score'].replace(np.nan, 0)

In [82]:
def vader_sentiment_result(sent):
    scores = analyzer.polarity_scores(sent)
    if scores["compound"] >=0.05: # positive, reference: https://pypi.org/project/vader-sentiment/
        return 2
    elif scores["compound"] <=-0.05: # negative
        return 0
    else:
        return 1 # netural

In [83]:
df["vader_result"] = df["body"].apply(lambda x: vader_sentiment_result(x))

In [84]:
# use very-high and high confidence groups of positive and negative predictions as training data
df_train = df[df['confidence score']>0.5]
df_test = df[~(df['confidence score']>0.5)]

In [85]:
X_train = df_train.body.values
y_train = df_train.vader_result.values
X_test = df_test.body.values
y_test = df_test.vader_result.values

In [86]:
# Create a function to tokenize the input for encoder 
# Load bert tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
def preprocessing_for_bert(X, y, batch_size = 32):
    input_ids = []
    attention_masks = []
    
    for sent in X:
        encoded_sent = tokenizer.encode_plus(
            text=sent,  
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length = 512,                  # maximum twitter lenght 280
            pad_to_max_length=True,         
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True,      
            truncation=True
            )
        
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))
     
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    
    labels = torch.tensor(y)
    # convert the tensors into a PyTorch Dataset=
    data = TensorDataset(input_ids, attention_masks, labels)
    sampler = RandomSampler(data)
    # feed dataset to training loop
    dataloader = DataLoader(data, # The training samples.
                            sampler=sampler,   # Select batches randomly
                            batch_size=batch_size)
    
    return dataloader

In [87]:
train_dataloader  = preprocessing_for_bert(X_train, y_train, batch_size = 32)
test_dataloader  = preprocessing_for_bert(X_test, y_test, batch_size = 32)

In [88]:
# # create the tensor dataset based on differnet confidence score
# def subset(lower,higher):
#     X = df[(df['confidence score']>lower)&(df['confidence score']<=higher)].tweet.values
#     y = df[(df['confidence score']>lower)&(df['confidence score']<=higher)].vader_result.values
#     return preprocessing_for_bert(X, y, batch_size = 32)

In [89]:
# vh_dataloader = subset(lower=0.75,higher=1)
# h_dataloader = subset(lower=0.55,higher=0.75)
# l_dataloader = subset(lower=0.38,higher=0.55)
# vl_dataloader = subset(lower=0, higher=0.38)

### 3. model training

In [90]:
def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    bert_classifier = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=3,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

#     #Tell PyTorch to run the model on GPU
#     bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),lr=1e-5) # change the learning rate to a lower number 

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs
    
    # Warm up steps is a parameter which is used to lower the learning rate in order to reduce the impact of deviating the model from learning on sudden new data set exposure.

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=50, 
                                                num_training_steps=total_steps)
    
    return bert_classifier, optimizer, scheduler

In [91]:
bert_classifier, optimizer, scheduler = initialize_model(epochs=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [92]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
        
def kappa_score(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return cohen_kappa_scredfore(preds_flat,labels_flat, labels=None, weights=None)

In [93]:
def other_metrics(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    f1_score = f1_score(labels_flat, preds_flat, average='weighted')
    kappa_score = cohen_kappa_score(preds_flat,labels_flat, labels=None, weights=None)
    print(" F1 Score: {0:.2f}".format(f1_score))
    print(" Kappa Score: {0:.2f}".format(kappa_score))

#### Training Loop

In [94]:
# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

def train(model, train_dataloader, epochs):
    """
    Train the BertClassifier model
    """
    train_loss_set = []
    print("Start training...\n")
    
    # We'll store a number of quantities such as training and validation loss, 
    # validation accuracy, and timings.
    training_stats = []
    
    # Measure the total training time for the whole run.
    total_t0 = time.time()
    
    # For each epoch...
    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
        
        t0 = time.time()  # Measure how long the training epoch takes.
        total_train_loss = 0  # Reset the total loss for this epoch.
        model.train()
        
        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            
            model.zero_grad()   
            
            output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
            loss = output.loss
            logits = output.logits
            
            total_train_loss += loss.item()
            # Perform a backward pass to compute gradients
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update the model’s parameters 
            optimizer.step()
            # Update the learning rate
            scheduler.step()
            # Calculate the average loss over all of the batches.
        
        avg_train_loss = total_train_loss / len(train_dataloader) 
        training_time = format_time(time.time() - t0)
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

In [95]:
train(bert_classifier, train_dataloader, epochs=4)

Start training...


Training...
  Batch    40  of    128.    Elapsed: 1:16:49.
  Batch    80  of    128.    Elapsed: 2:32:55.
  Batch   120  of    128.    Elapsed: 3:48:52.

  Average training loss: 0.63
  Training epcoh took: 4:03:16

Training...
  Batch    40  of    128.    Elapsed: 1:15:10.
  Batch    80  of    128.    Elapsed: 2:30:25.
  Batch   120  of    128.    Elapsed: 3:45:54.

  Average training loss: 0.30
  Training epcoh took: 4:00:16

Training...
  Batch    40  of    128.    Elapsed: 1:15:09.
  Batch    80  of    128.    Elapsed: 2:30:25.
  Batch   120  of    128.    Elapsed: 3:45:57.

  Average training loss: 0.25
  Training epcoh took: 4:00:32

Training...
  Batch    40  of    128.    Elapsed: 1:23:01.
  Batch    80  of    128.    Elapsed: 2:46:53.
  Batch   120  of    128.    Elapsed: 4:04:28.

  Average training loss: 0.25
  Training epcoh took: 4:19:08


In [97]:
output_model = './sentiment.pth'

# save model
def save(bert_classifier, optimizer):
    # save
    torch.save({
        'bert_classifier_state_dict': bert_classifier.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)

save(bert_classifier, optimizer)

In [98]:
# # to load
# checkpoint = torch.load('./sentiment.pth')
# bert_classifier.load_state_dict(checkpoint['bert_classifier_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

### Prediction

In [99]:
def predict(model,  prediction_dataloader):
    # Put model in evaluation mode
    model.eval()
    
    # Tracking variables 
    predictions , true_labels = [], []
    
    # Predict 
    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and speeding up predictio
    
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    
    flat_preds = [item for sublist in predictions for item in sublist]
    probs = np.argmax(flat_preds, axis=1).flatten()
    
    return probs

In [100]:
probs = predict(bert_classifier,test_dataloader)

In [113]:
bert_res = pd.DataFrame(probs, columns = ['Sentiment Score'])
final_test = pd.concat([df_test.reset_index(drop=True), bert_res], axis=1)

In [119]:
df_train['Sentiment Score'] = df_train['vader_result']
final_table = pd.concat([final_test, df_train], axis=0)
final_table.to_csv ('./bert_results.csv', index = True, header=True)

In [123]:
final_table[final_table['Sentiment Score']==1]

Unnamed: 0,body,created_at,n_words,sentiment_dict,compound,neg,pos,confidence score,vader_result,Sentiment Score
96,years bch narrative needed switch btc flippeni...,2022-02-05 01:32:11,10,"{'neg': 0.0, 'neu': 0.891, 'pos': 0.109, 'comp...",0.0258,-0.0,0.109,1.0,1,1
99,btc bch btc bch price matter,2022-02-04 22:43:39,6,"{'neg': 0.0, 'neu': 0.82, 'pos': 0.18, 'compou...",0.0258,-0.0,0.18,1.0,1,1
403,btc speculation sustainable seen btc price act...,2021-12-27 13:48:22,18,"{'neg': 0.0, 'neu': 0.939, 'pos': 0.061, 'comp...",0.0258,-0.0,0.061,1.0,1,1
511,need new ath btc pair usdt pair price matter much,2021-12-12 14:46:53,10,"{'neg': 0.0, 'neu': 0.891, 'pos': 0.109, 'comp...",0.0258,-0.0,0.109,1.0,1,1
677,article background information binance used te...,2021-11-15 23:05:46,17,"{'neg': 0.063, 'neu': 0.937, 'pos': 0.0, 'comp...",-0.0191,-0.063,0.0,1.0,1,1
915,bnb moves btc wallet exchange faster cheaper e...,2021-10-26 15:07:03,13,"{'neg': 0.0, 'neu': 0.916, 'pos': 0.084, 'comp...",0.0258,-0.0,0.084,1.0,1,1
969,btc price matter anymore,2021-10-20 19:05:19,4,"{'neg': 0.0, 'neu': 0.732, 'pos': 0.268, 'comp...",0.0258,-0.0,0.268,1.0,1,1
1281,reason tether gets printed unreasonable rates ...,2021-09-14 01:54:34,17,"{'neg': 0.063, 'neu': 0.937, 'pos': 0.0, 'comp...",-0.0191,-0.063,0.0,1.0,1,1
1314,prices need eat based usd price btc fluctuate ...,2021-09-09 02:23:49,10,"{'neg': 0.0, 'neu': 0.891, 'pos': 0.109, 'comp...",0.0258,-0.0,0.109,1.0,1,1
1612,let assume tether actually say 80 backed 40b b...,2021-07-27 15:13:49,14,"{'neg': 0.0, 'neu': 0.922, 'pos': 0.078, 'comp...",0.0258,-0.0,0.078,1.0,1,1
