In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import emoji
import re
import nltk
nltk.download('punkt')
import random
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize
from transformers import get_linear_schedule_with_warmup, AutoModelForSequenceClassification
from transformers import (RobertaForSequenceClassification, RobertaTokenizer, 
    BertForSequenceClassification, BertTokenizer, 
    AutoModelForSequenceClassification, AutoTokenizer, AdamW)
from torch.utils.data import DataLoader,SequentialSampler,RandomSampler,TensorDataset,random_split



seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/setone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
tokenizer = RobertaTokenizer.from_pretrained('./data/model')
model = RobertaForSequenceClassification.from_pretrained('./data/model')

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def Sentiment(sent,model=model,tokenizer=tokenizer):
  encoded_dict = tokenizer.encode_plus(
                      sent, 
                      add_special_tokens = True,
                      truncation=True,
                      max_length = 64,
                      padding='max_length',
                      return_attention_mask = True,
                      return_tensors = 'pt')
      
  input_id = torch.LongTensor(encoded_dict['input_ids']).to(device)
  attention_mask = torch.LongTensor(encoded_dict['attention_mask']).to(device)
  model = model.to(device)

  with torch.no_grad():
      outputs = model(input_id, token_type_ids=None, attention_mask=attention_mask)

  logits = outputs[0]
  index = logits.argmax()
  return index,logits


def process_text(texts):
  # lowercase
  # message = message.lower() # RoBERTa tokenizer is uncased
  # remove URLs
  texts = re.sub(r'https?://\S+', "", texts)
  texts = re.sub(r'www.\S+', "", texts)
  # remove '
  texts = texts.replace('&#39;', "'")
  # remove symbol names
  texts = re.sub(r'(\#)(\S+)', r'hashtag_\2', texts)
  texts = re.sub(r'(\$)([A-Za-z]+)', r'cashtag_\2', texts)
  # remove usernames
  texts = re.sub(r'(\@)(\S+)', r'mention_\2', texts)
  # demojize
  texts = emoji.demojize(texts, delimiters=("", " "))

  return texts.strip()

def checkSenti(sent,return_logits=True):
    labels = ['Bearish','Bullish']
    sent_processed = process_text(sent)
    index,logits = Sentiment(sent_processed)
    if return_logits:
        logit0 = math.exp(logits[0][0])
        logit1 = math.exp(logits[0][1])
        logits = [logit0/(logit0+logit1),logit1/(logit0+logit1)]
        return [labels[index], max(logits)]
    return labels[index]

def update_output(value):
    # if n_clicks > 0:
    prediction = checkSenti(value)
    return round(prediction[1]*100, 2)

In [4]:
recession = pd.read_csv('./data/scraped_topics/recession.csv').iloc[:, 1:].dropna()
economy = pd.read_csv('./data/scraped_topics/economy.csv').iloc[:, 1:].dropna()
unemployment = pd.read_csv('./data/scraped_topics/unemployment.csv').iloc[:, 1:].dropna()
inflation = pd.read_csv('./data/scraped_topics/recession.csv').iloc[:, 1:].dropna()
deflation = pd.read_csv('./data/scraped_topics/economy.csv').iloc[:, 1:].dropna()
interest_rates = pd.read_csv('./data/scraped_topics/unemployment.csv').iloc[:, 1:].dropna()
cryptocurrency = pd.read_csv('./data/scraped_topics/unemployment.csv').iloc[:, 1:].dropna()

In [5]:
def add_sentiment(topic, report_every=10000):
    '''
    adds a sentiment based on the model outputted and report every ___ loops
    '''
    start = time.time()
    sentiment = []

    for idx, sent in enumerate(globals()[topic]['Text'].values):
        if idx%report_every == 0:
            print(f'{idx} reached; time: {time.time() - start}')

        sentiment.append(checkSenti(sent)[0])

    globals()[topic]['sentiment'] = sentiment
    globals()[topic].to_csv(f'./data/scraped_topics/{topic}.csv')
    
    return globals()[topic]

In [61]:
add_sentiment('economy')

0 reached; time: 0.003063201904296875
10000 reached; time: 432.5912799835205
20000 reached; time: 861.4565460681915
30000 reached; time: 1269.105901002884
40000 reached; time: 1676.5999720096588
50000 reached; time: 2073.5207948684692


Unnamed: 0,Datetime,Text,sentiment
0,2023-02-24 07:00:25+00:00,"""The Commodities Feed: Brent rebounds"" - ING B...",Bullish
1,2023-02-24 07:00:23+00:00,The effects of Indonesia’s technical barriers ...,Bearish
2,2023-02-24 07:00:17+00:00,$GBX - White Brook Capital - Greenbrier: In An...,Bullish
3,2023-02-24 07:00:07+00:00,"US Industrial Production: Computers, video and...",Bullish
4,2023-02-24 06:59:47+00:00,COMING UP @ 5 PM | 'Govt should acquire land &...,Bullish
...,...,...,...
50002,2023-01-24 09:30:16+00:00,The Euro Strikes Back. https://t.co/JUc3yI65oV...,Bearish
50003,2023-01-24 09:30:15+00:00,"A l’instar de la BRVM, quatre sur cinq des aut...",Bullish
50004,2023-01-24 09:30:07+00:00,"US Industrial Production: Consumer goods, fell...",Bearish
50005,2023-01-24 09:30:04+00:00,"In this week’s episode, the Portfolio Team goe...",Bullish


In [62]:
add_sentiment('unemployment')

0 reached; time: 0.0009047985076904297
10000 reached; time: 395.30963492393494
20000 reached; time: 783.206305027008
30000 reached; time: 1167.3081049919128
40000 reached; time: 1646.3206877708435
50000 reached; time: 2210.4265427589417


Unnamed: 0,Datetime,Text,sentiment
0,2023-02-24 07:36:03+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n#neha...,Bearish
1,2023-02-24 07:32:29+00:00,@_PTLB @IMPraveenDalal Indian economy has coll...,Bearish
2,2023-02-24 07:14:17+00:00,https://t.co/prMYo88PgU\n\n#hunger #afganistan...,Bearish
3,2023-02-24 06:57:40+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n#neha...,Bearish
4,2023-02-24 06:51:43+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n#neha...,Bearish
...,...,...,...
49998,2022-04-01 15:57:44+00:00,Where the HELL is this happening ?!?!?\nWhere ...,Bearish
49999,2022-04-01 15:56:42+00:00,How strange that the grass is all that remains...,Bearish
50000,2022-04-01 15:56:04+00:00,"@bblock29 @bennydiego @GOP Great job, Joe. #un...",Bearish
50001,2022-04-01 15:51:52+00:00,#unemployment is so f*cked up that after you j...,Bearish


In [8]:
add_sentiment('inflation')

0 reached; time: 0.003858804702758789
10000 reached; time: 418.7355389595032
20000 reached; time: 836.927503824234
30000 reached; time: 1248.3500537872314
40000 reached; time: 1680.586086988449
50000 reached; time: 2121.055986881256


Unnamed: 0,Datetime,Text,sentiment
0,2023-02-24 06:12:35+00:00,Unemployment rates are one of the most importa...,Bearish
1,2023-02-24 06:06:00+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n#neha...,Bearish
2,2023-02-24 06:05:17+00:00,#NehaSinghRathor #बेरोजगार #Unemployment #Job,Bearish
3,2023-02-24 06:00:32+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n\n#ne...,Bearish
4,2023-02-24 05:55:44+00:00,@nehafolksinger #unemployment,Bearish
...,...,...,...
49996,2022-04-01 15:16:24+00:00,"U.S. employers added 431,000 new jobs last mon...",Bullish
49997,2022-04-01 15:14:30+00:00,"Just watching #Potus speech about #jobs, #unem...",Bullish
49998,2022-04-01 15:12:40+00:00,#unemployment Now Hiring to spread Covid. Ther...,Bearish
49999,2022-04-01 15:12:31+00:00,@stevenpsloan some facts #fakenews won't repor...,Bearish


In [13]:
add_sentiment('deflation')

0 reached; time: 0.0015392303466796875
10000 reached; time: 426.4573812484741
20000 reached; time: 843.6777451038361
30000 reached; time: 1258.239452123642
40000 reached; time: 1668.4679203033447
50000 reached; time: 2074.3801832199097


Unnamed: 0,Datetime,Text,sentiment
0,2023-02-24 07:00:25+00:00,"""The Commodities Feed: Brent rebounds"" - ING B...",Bullish
1,2023-02-24 07:00:23+00:00,The effects of Indonesia’s technical barriers ...,Bearish
2,2023-02-24 07:00:17+00:00,$GBX - White Brook Capital - Greenbrier: In An...,Bullish
3,2023-02-24 07:00:07+00:00,"US Industrial Production: Computers, video and...",Bullish
4,2023-02-24 06:59:47+00:00,COMING UP @ 5 PM | 'Govt should acquire land &...,Bullish
...,...,...,...
49996,2023-01-24 09:30:16+00:00,The Euro Strikes Back. https://t.co/JUc3yI65oV...,Bearish
49997,2023-01-24 09:30:15+00:00,"A l’instar de la BRVM, quatre sur cinq des aut...",Bullish
49998,2023-01-24 09:30:07+00:00,"US Industrial Production: Consumer goods, fell...",Bearish
49999,2023-01-24 09:30:04+00:00,"In this week’s episode, the Portfolio Team goe...",Bullish


In [14]:
add_sentiment('interest_rates')

0 reached; time: 0.013687849044799805
10000 reached; time: 436.19326281547546
20000 reached; time: 883.1908371448517
30000 reached; time: 1350.2563569545746
40000 reached; time: 1828.3054947853088
50000 reached; time: 2291.705953836441


Unnamed: 0,Datetime,Text,sentiment
0,2023-02-24 07:36:03+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n#neha...,Bearish
1,2023-02-24 07:32:29+00:00,@_PTLB @IMPraveenDalal Indian economy has coll...,Bearish
2,2023-02-24 07:14:17+00:00,https://t.co/prMYo88PgU\n\n#hunger #afganistan...,Bearish
3,2023-02-24 06:57:40+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n#neha...,Bearish
4,2023-02-24 06:51:43+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n#neha...,Bearish
...,...,...,...
49996,2022-04-01 15:57:44+00:00,Where the HELL is this happening ?!?!?\nWhere ...,Bearish
49997,2022-04-01 15:56:42+00:00,How strange that the grass is all that remains...,Bearish
49998,2022-04-01 15:56:04+00:00,"@bblock29 @bennydiego @GOP Great job, Joe. #un...",Bearish
49999,2022-04-01 15:51:52+00:00,#unemployment is so f*cked up that after you j...,Bearish


In [16]:
add_sentiment('cryptocurrency')

0 reached; time: 0.0005469322204589844
10000 reached; time: 430.60196590423584
20000 reached; time: 864.92342877388
30000 reached; time: 1308.8027248382568
40000 reached; time: 1760.91592502594
50000 reached; time: 2209.8901069164276


Unnamed: 0,Datetime,Text,sentiment
0,2023-02-24 07:36:03+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n#neha...,Bearish
1,2023-02-24 07:32:29+00:00,@_PTLB @IMPraveenDalal Indian economy has coll...,Bearish
2,2023-02-24 07:14:17+00:00,https://t.co/prMYo88PgU\n\n#hunger #afganistan...,Bearish
3,2023-02-24 06:57:40+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n#neha...,Bearish
4,2023-02-24 06:51:43+00:00,बेरोज़गार बानी साहेब रोजगार मांगीला... \n#neha...,Bearish
...,...,...,...
49996,2022-04-01 15:57:44+00:00,Where the HELL is this happening ?!?!?\nWhere ...,Bearish
49997,2022-04-01 15:56:42+00:00,How strange that the grass is all that remains...,Bearish
49998,2022-04-01 15:56:04+00:00,"@bblock29 @bennydiego @GOP Great job, Joe. #un...",Bearish
49999,2022-04-01 15:51:52+00:00,#unemployment is so f*cked up that after you j...,Bearish


In [15]:
for i in ['recession', 'economy', 'unemployment', 
          'inflation', 'deflation', 'interest_rates', 'cryptocurrency']:
    print(i)
    print(pd.read_csv(f'./data/scraped_topics/{i}.csv')['sentiment'].value_counts())
    print('--'*25)

recession
Bearish    28893
Bullish    21108
Name: sentiment, dtype: int64
--------------------------------------------------
economy
Bullish    27605
Bearish    22396
Name: sentiment, dtype: int64
--------------------------------------------------
unemployment
Bearish    28897
Bullish    21104
Name: sentiment, dtype: int64
--------------------------------------------------
inflation
Bearish    28893
Bullish    21108
Name: sentiment, dtype: int64
--------------------------------------------------
deflation
Bullish    27605
Bearish    22396
Name: sentiment, dtype: int64
--------------------------------------------------
interest_rates
Bearish    28897
Bullish    21104
Name: sentiment, dtype: int64
--------------------------------------------------
cryptocurrency


ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.


In [7]:
df = pd.read_csv('./data/scraped_topics/inflation.csv')

In [9]:
df['sentiment'].value_counts()

Bearish    28893
Bullish    21108
Name: sentiment, dtype: int64