In [1]:
import transformers
import torch
import math
import pandas as pd
import numpy as np
import emoji
import re
from transformers import (
    RobertaForSequenceClassification, RobertaTokenizer, BertForSequenceClassification, 
    BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer, AdamW
)
import random
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [207]:
pd.read_csv('./data/preprocessing/balanced_tokenized_cleaned_stocktwits.csv')

Unnamed: 0,created_at,body,sentiment,raw_content
14,2020-12-15T14:38:18Z,"['going', 'right', 'throughsupport', 'as', 'if...",0,$MSFT Going right through 214 support as if it...
49,2020-12-15T14:23:04Z,"['nobody', 'gonna', 'buy', 'expensive', 'ass',...",0,$AAPL nobody gonna buy expensive ass iPhones w...
61,2020-12-15T14:12:10Z,"['robinhood', 'peeps', 'gonna', 'be', 'severel...",0,$AAPL Robinhood peeps gonna be severely disapp...
103,2020-12-15T13:33:52Z,"['always', 'dump', 'dump', 'dump']",0,$AAPL always dump dump dump.
106,2020-12-15T13:30:10Z,"['why', 'is', 'this', 'turd', 'not', 'going', ...",0,$AAPL why is this turd not going anywhere. Thi...
...,...,...,...,...
1593006,2022-01-28T15:12:17Z,"['soar', 'baby', 'soar']",1,$TSLA soar baby soar
1116754,2021-11-09T15:28:11Z,"['evs', 'getting', 'decimated', 'did', 'brando...",1,$TSLA $LCID EV&#39;s getting decimated. Did Br...
1911649,2020-02-26T13:57:07Z,"['apparently', 'bears', 'have', 'short', 'term...",1,$TSLA Apparently bears have short term memory
1899134,2022-02-23T18:38:07Z,"['holy', 'shit', 'i', 'bought', 'more', 'calls...",1,$SPY holy shit I bought more calls 5 mins ago ...


In [4]:
tokenizer = RobertaTokenizer.from_pretrained('./data/model')
model = RobertaForSequenceClassification.from_pretrained('./data/model')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
def Sentiment(sent,model=model,tokenizer=tokenizer):
    encoded_dict = tokenizer.encode_plus(
                      sent, 
                      add_special_tokens = True,
                      truncation=True,
                      max_length = 64,
                      padding='max_length',
                      return_attention_mask = True,
                      return_tensors = 'pt')

    input_id = torch.LongTensor(encoded_dict['input_ids']).to(device)
    attention_mask = torch.LongTensor(encoded_dict['attention_mask']).to(device)
    model = model.to(device)

    with torch.no_grad():
        outputs = model(input_id, token_type_ids=None, attention_mask=attention_mask)

    logits = outputs[0]
    index = logits.argmax()
    return index,logits

In [6]:
def process_text(texts):
    # lowercase
    # message = message.lower() # RoBERTa tokenizer is uncased
    # remove URLs
    texts = re.sub(r'https?://\S+', "", texts)
    texts = re.sub(r'www.\S+', "", texts)
    # remove '
    texts = texts.replace('&#39;', "'")
    # remove symbol names
    texts = re.sub(r'(\#)(\S+)', r'hashtag_\2', texts)
    texts = re.sub(r'(\$)([A-Za-z]+)', r'cashtag_\2', texts)
    # remove usernames
    texts = re.sub(r'(\@)(\S+)', r'mention_\2', texts)
    # demojize
    texts = emoji.demojize(texts, delimiters=("", " "))

    return texts.strip()

In [202]:
def checkSenti(sent,return_logits=True):
    labels = ['Bearish','Bullish']
    sent_processed = process_text(sent)
    index,logits = Sentiment(sent_processed)
    if return_logits:
        logit0 = math.exp(logits[0][0])
        logit1 = math.exp(logits[0][1])
        logits = [logit0/(logit0+logit1),logit1/(logit0+logit1)]
        return [labels[index], max(logits)]
    
#     print(labels[index])
    return labels[index]

In [25]:
text.split(' ')

['hello', 'this', 'is', 'daniel', 'how', 'are', 'you']

In [142]:
df

Unnamed: 0,text,value,color
3,Daniel,0.709933,blue
5,Are,0.609174,blue
2,Is,0.601628,blue
4,How,0.581789,blue
1,This,0.553089,blue
0,Hello,0.549464,blue
6,You,0.508298,blue
7,Composition Weight,0.190324,orange


In [204]:
text = 'fuck i just lost all my money'

def final_sentpredict(text):
    sent_txt = []
    sent_val = []
    sent_clr = []
    rounded = []
    txt_color = []
    
    def predict_color(value, label):
        if label == 'Bearish':
            return 1-value, 'red', 'white'
        else:
            return value, 'green', 'white'
        
    for txt in text.split(' '):
        
        # assigning variable and color
        label, value = checkSenti(txt)
        value, color, color2 = predict_color(value, label)
        txt_color.append(color2)
        
        # splitting up each word and making it proper
        sent_txt.append(txt.capitalize())
        
        # setting the bar colors
        sent_clr.append(color)
        
        # appending value and rounding it
        sent_val.append(value)
        rounded.append(round(float(value), 2))
    
    # finding the weight of the composition
    comp_weight = value - np.mean(sent_val)
    
    # append to list
    sent_txt.append('Composition Weight')
    sent_val.append(comp_weight)
    sent_clr.append('orange')
    txt_color.append('black')
    rounded.append(round(comp_weight, 2))
        
    # create and return df
    df = pd.DataFrame({
        'text': sent_txt, 'value': sent_val, 
        'color': sent_clr, 'rounded': rounded,
        'txt_color': txt_color
    }).sort_values('value', ascending=False)
    
    # plotting
    trace = go.Bar(
        x=df.value,
        y=df.text,
        orientation='h',
        text = [f'{l} | {r}' for l, r in zip(df.text, df.rounded)],
        textfont=dict(color=df.txt_color),
        marker=dict(
            color=df.color
        )
    )

    layout = go.Layout(
        title='Sentiment Weight of each word',
        template='plotly_white',
        xaxis=dict(title='Weight'),
        yaxis=dict(title='Text', ticktext=[], tickvals=[])
    )

    fig = go.Figure(trace, layout)
    
    return fig

final_sentpredict(text)

In [209]:
checkSenti('hi')

['Bearish', 0.5202716020901559]