## Installs and imports

In [None]:
# Mount Google Drive at /content/drive so the CSV paths used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Upgrade pip, then install sentencepiece and transformers into the runtime.
# NOTE(review): no versions are pinned, so a later re-run may resolve to different releases — consider pinning.
!pip install --upgrade pip
!pip install sentencepiece
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-23.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
[0mLooking in indexes: https://pypi.org/simple, https://us

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import numpy as np
from scipy.special import softmax

In [None]:
import pandas as pd

## Data

In [None]:
def preprocess(corpus):
  """Mask user mentions and links in every text of ``corpus``.

  Each text is split on single spaces. A token that starts with '@' and is
  longer than one character is replaced by '@user'; a token that starts with
  'http' is replaced by 'http'. Tokens are then re-joined with single spaces.

  Args:
    corpus: iterable of strings.

  Returns:
    A new list with one masked string per input text, in the same order.
  """
  def _mask(token):
    # Mention check first; a bare '@' is left untouched.
    if len(token) > 1 and token.startswith('@'):
      return '@user'
    if token.startswith('http'):
      return 'http'
    return token

  return [" ".join(_mask(tok) for tok in text.split(" ")) for text in corpus]

In [None]:
# Input selection: exactly one `dataset_path` assignment should be left uncommented.
# dataset_path = '/content/drive/MyDrive/Master thesis data/alphabet_tweets_cleaned.csv'
# dataset_path = '/content/drive/MyDrive/Master thesis data/apple_tweets_cleaned.csv'
dataset_path = '/content/drive/MyDrive/Master thesis data/microsoft_tweets_cleaned.csv'
# Read only the two columns this notebook uses: the tweet date and the tweet text.
df = pd.read_csv(dataset_path, usecols= ['created_at','text'])

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,created_at,text
0,2022-12-30,Most winning trading chat!!! For a limited tim...
1,2022-12-30,My current picks $coin $msft $snow $arkk $cost
2,2022-12-30,Buy the dip still working. Another winning ale...
3,2022-12-30,The first potential hearing for Microsoft $MSF...
4,2022-12-30,Learn To Trade While You're Working From Home&...


In [None]:
# Series of tweet texts; this is what the inference loop iterates over.
dataset = df['text']

In [None]:
# Summary statistics (count / unique / top / freq) for the two loaded columns.
df.describe()

Unnamed: 0,created_at,text
count,842600,842600
unique,1460,643622
top,2022-01-25,$MSFT
freq,5325,3146


In [None]:
# Print the first ten tweet texts.
print(dataset[0:10])

0    Most winning trading chat!!! For a limited tim...
1       My current picks $coin $msft $snow $arkk $cost
2    Buy the dip still working. Another winning ale...
3    The first potential hearing for Microsoft $MSF...
4    Learn To Trade While You're Working From Home&...
5    $SPX $SPY $ES_F $DJI $QQQ $NDX $FB $AMZN $AAPL...
6    Live day trading, detailed analysis on stocks ...
7    $SPY $QQQ $DIS $TSLA $SHOP\n$AMD $AAPL $SQ $AM...
8    TRADEIDEAS NEW YEAR SALE\n30% off new subscrip...
9    Helped me get my account green all time. Thank...
Name: text, dtype: object


In [None]:
# this is a dataset in 8 different languages
for example in [0,870,1740,2610,3480,4350,5220,6090]:
  print(dataset[example])

Most winning trading chat!!! For a limited time, we are opening our trading chatroom to the public! 
  
Alerts given
Trading Chat on Discord!👇

  
Cloud computing stocks are in a massive down trend.
The $SKYY ETF is currently rangebound.
$AMZN $MSFT are a couple of notable inclusions within the ETF.  
$MSFT 📰 OneSoft Solutions Renews Engagement of Sophic Capital for Capital Markets Advisory Services

 s insight appeared 45 seconds early at ⚡  
I've  made 36k  with them . If you really want to make a huge profit on trading .... Choose this professional chat: 

  
$MSFT  *Top analyst price target for the week..📈📉🚀  🚀  
 
      
 
$EVR $MSFT $NFLX $GOOG $AAPL 
2022/12/28 03:24
Meta and Alphabet Are Losing Their Advertising Throne - Meta Platforms  (...
 
Alerts given
Trading Chat on Discord!👇

  su390sVJvb


## Model

In [None]:
# Runtime configuration and model loading.
CUDA = True # set to true if using GPU (Runtime -> Change runtime Type -> GPU)
# Number of tweets sent through the model per forward pass.
BATCH_SIZE = 64
# Checkpoint name from which the tokenizer, config and classifier are all loaded.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
config = AutoConfig.from_pretrained(MODEL) # used for id to label name
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
if CUDA:
  # NOTE(review): with CUDA = True this fails on a runtime without a GPU.
  model = model.to('cuda')
# Switch to evaluation mode; assigning to `_` keeps the model repr out of the cell output.
_ = model.eval()

Downloading (…)lve/main/config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

## Forward

In [None]:
def forward(text, cuda=True, max_length=512):
  """Return softmax class scores for a batch of texts.

  Args:
    text: iterable of raw strings; they are passed through ``preprocess``
      here, so callers do not need to preprocess them first.
    cuda: when True, the encoded batch is moved to 'cuda' before the forward
      pass (the model is expected to already be on that device).
    max_length: token cap used for truncation. Without an explicit value the
      tokenizer warns "no maximum length is provided" and silently skips
      truncation. 512 is assumed to match this checkpoint's position limit —
      TODO confirm against ``config.max_position_embeddings``.

  Returns:
    numpy array of scores normalised with softmax over the last axis
    (presumably one row per input text and one column per class).
  """
  import torch  # local import: the notebook only imports `torch.utils.data` at the top

  text = preprocess(text)
  encoded_input = tokenizer(
      text,
      return_tensors='pt',
      padding=True,
      truncation=True,
      max_length=max_length,
  )
  if cuda:
    encoded_input = encoded_input.to('cuda')

  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    output = model(**encoded_input)

  # `.cpu()` is a no-op for tensors already on the CPU, so one path serves both devices.
  scores = output[0].detach().cpu().numpy()
  return softmax(scores, axis=-1)

In [None]:
# Run the classifier over every tweet, batch by batch, and collect the arg-max class id per tweet.
dl = DataLoader(dataset, batch_size=BATCH_SIZE)
n_batches = len(dl)
# Report progress only occasionally: one line per batch floods the cell output
# (the notebook front-end truncates it) without adding information.
LOG_EVERY = 500
all_preds = []
for idx, batch in enumerate(dl, start=1):
  if idx == 1 or idx % LOG_EVERY == 0 or idx == n_batches:
    print('Batch ', idx, ' of ', n_batches)
  # forward() already runs preprocess() on its input, so the raw batch is passed straight in.
  scores = forward(batch, cuda=CUDA)
  preds = np.argmax(scores, axis=-1)
  all_preds.extend(preds)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[1;30;43mUtdata för streaming har trunkerats till de sista 5000 raderna.[0m
Batch  8167  of  13166
Batch  8168  of  13166
Batch  8169  of  13166
Batch  8170  of  13166
Batch  8171  of  13166
Batch  8172  of  13166
Batch  8173  of  13166
Batch  8174  of  13166
Batch  8175  of  13166
Batch  8176  of  13166
Batch  8177  of  13166
Batch  8178  of  13166
Batch  8179  of  13166
Batch  8180  of  13166
Batch  8181  of  13166
Batch  8182  of  13166
Batch  8183  of  13166
Batch  8184  of  13166
Batch  8185  of  13166
Batch  8186  of  13166
Batch  8187  of  13166
Batch  8188  of  13166
Batch  8189  of  13166
Batch  8190  of  13166
Batch  8191  of  13166
Batch  8192  of  13166
Batch  8193  of  13166
Batch  8194  of  13166
Batch  8195  of  13166
Batch  8196  of  13166
Batch  8197  of  13166
Batch  8198  of  13166
Batch  8199  of  13166
Batch  8200  of  13166
Batch  8201  of  13166
Batch  8202  of  13166
Batch  8203  of  13166
Batch  8204  of  13166
Batch  8205  of  13166
Batch  8206  of  13166
Ba

In [None]:
# this is a dataset in 8 different languages
for example in [0,870,1740,2610,3480,4350,5220,6090]:
  pred = all_preds[example]
  print(dataset[example], '--->', config.id2label[pred])

Most winning trading chat!!! For a limited time, we are opening our trading chatroom to the public! 
   ---> positive
Alerts given
Trading Chat on Discord!👇

   ---> neutral
Cloud computing stocks are in a massive down trend.
The $SKYY ETF is currently rangebound.
$AMZN $MSFT are a couple of notable inclusions within the ETF.   ---> negative
$MSFT 📰 OneSoft Solutions Renews Engagement of Sophic Capital for Capital Markets Advisory Services

 s insight appeared 45 seconds early at ⚡   ---> neutral
I've  made 36k  with them . If you really want to make a huge profit on trading .... Choose this professional chat: 

   ---> positive
$MSFT  *Top analyst price target for the week..📈📉🚀  🚀   ---> neutral
 
      
 
$EVR $MSFT $NFLX $GOOG $AAPL 
2022/12/28 03:24
Meta and Alphabet Are Losing Their Advertising Throne - Meta Platforms  (...
  ---> negative
Alerts given
Trading Chat on Discord!👇

  su390sVJvb ---> neutral


In [None]:
# One-column frame holding the predicted class id for each tweet, in prediction order.
df_labels = pd.DataFrame (all_preds, columns = ['predictions'])

In [None]:
# Look at five randomly chosen predictions.
df_labels.sample(5)

Unnamed: 0,predictions
547028,1
189722,2
31921,1
756284,1
128794,0


In [None]:
# Put the predictions next to the original columns (column-wise concat aligned on the index;
# join='inner' keeps only index values present in both frames).
result = pd.concat([df, df_labels], axis=1, join='inner')
# Random sample of six joined rows as a sanity check.
result.sample(6)

Unnamed: 0,created_at,text,predictions
520582,2021-04-06,Solid morning in options trading so far 👌\n\n🟢...,2
96653,2022-10-25,$MSFT Microsoft Stock To Post Mixed Results In...,1
255000,2022-05-06,Morning \n\n$TSLA: Break over 900 for calls a...,1
478504,2021-06-23,$msft 1t ;-),2
17065,2022-12-17,Free Trade Ideas In Your Inbox Every Week! via...,1
62269,2022-11-17,"“Most winning trading community, Get next winn...",2


In [None]:
def t(x):
    """Attach a sentiment name to a row, based on its numeric prediction.

    Reads ``x["predictions"]`` and writes ``x["label"]``: 1 -> "neutral",
    0 -> "negative", anything else -> "positive". The same ``x`` object is
    modified in place and returned.
    """
    code = x["predictions"]
    x["label"] = "neutral" if code == 1 else ("negative" if code == 0 else "positive")
    return x

def label_to_text(row):
  """Translate a numeric class id into its sentiment name.

  0 -> "negative", 1 -> "neutral", any other value -> "positive".
  """
  return "negative" if row == 0 else "neutral" if row == 1 else "positive"

# Add a human-readable `label` column next to the numeric prediction.
# Mapping the `predictions` column once avoids `DataFrame.apply(..., axis=1)`, which makes a
# Python-level call (and builds a Series) for every single row and is very slow on a frame
# with this many rows. The resulting columns are the same: `predictions` and `label`.
df_labels = df_labels.assign(label=df_labels["predictions"].map(label_to_text))

In [None]:
# Rebuild the joined frame now that df_labels also carries the text `label` column
# (column-wise concat aligned on the index; join='inner' keeps only shared index values).
result = pd.concat([df, df_labels], axis=1, join='inner')
# Random sample of six rows to eyeball the numeric prediction against its label.
result.sample(6)

Unnamed: 0,created_at,text,predictions,label
552621,2021-02-02,$MSFT:\n\nNew Insider Filing on MICROSOFT CORP...,1,neutral
564662,2021-01-16,"I agree, yet... you didn't need any creativity...",1,neutral
412178,2021-10-19,Join now\n$TSLA $NVDA $MU $AMZN $MSFT $BABA $...,1,neutral
813094,2019-05-03,"Princess Cruises : Introduces McKinley ""Mac"" T...",1,neutral
472824,2021-07-03,Highest weekly close\n\n$AAPL *\n$AMZN *\n$FB\...,1,neutral
810306,2019-05-17,Here we go.\nAlgos woke up.\n\n$ES_F $NQ_F $ZB...,1,neutral


In [None]:
# Larger random sample (100 rows) for a manual read-through of the predictions.
result.sample(100)

Unnamed: 0,created_at,text,predictions,label
13939,2022-12-20,$QQQ Weekly. breaking down from the bear fla...,1,neutral
587607,2020-11-24,Elon Musk Surpasses Bill Gates To Become World...,2,positive
36127,2022-12-05,Log into the FREE Trade Ideas Live Trader's Ro...,1,neutral
566440,2021-01-12,Watch ♦️ $TTD ♦️ This could rip fast if passes...,1,neutral
538610,2021-03-04,$WORK: DAILY 1 YEAR VOLATILITY CHART into the ...,1,neutral
...,...,...,...,...
46484,2022-11-29,"It’s really amazing Place,\nLot of thanks to t...",2,positive
307168,2022-02-20,Survivor found in burning ferry off Greek isla...,1,neutral
705744,2020-05-26,$spy $spx $acb $qqq $nyse $amzn $aapl $twtr $n...,1,neutral
621085,2020-09-22,"Here's How Much Investing $1,000 In Oracle...",1,neutral


In [None]:
# Output selection: leave uncommented the path that matches the input file chosen when loading the data.
# filepath_with_labels = '/content/drive/MyDrive/Master thesis data/alphabet_tweets_classified_with_label.csv'
# filepath_with_labels = '/content/drive/MyDrive/Master thesis data/apple_tweets_classified_with_label.csv'
filepath_with_labels = '/content/drive/MyDrive/Master thesis data/microsoft_tweets_classified_with_label.csv'
# Write the joined frame (original columns + predictions + label) to Drive.
# NOTE(review): `index` is not disabled, so the row index is presumably written as an extra unnamed first column — confirm that is wanted.
result.to_csv(filepath_with_labels)