<a href="https://colab.research.google.com/github/clayton-summitt/w266-final/blob/main/Copy_of_XLM_T_Run_a_classifier_on_a_text_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installs and imports

In [2]:
# !pip install --upgrade pip
!pip install -q sentencepiece
!pip install -q transformers

[K     |████████████████████████████████| 1.2 MB 14.8 MB/s 
[K     |████████████████████████████████| 3.1 MB 12.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 53.3 MB/s 
[K     |████████████████████████████████| 61 kB 714 kB/s 
[K     |████████████████████████████████| 895 kB 64.2 MB/s 
[K     |████████████████████████████████| 596 kB 47.0 MB/s 
[?25h

In [3]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import numpy as np
from scipy.special import softmax
from google.colab import files
from google.colab import drive
drive.mount('/content/drive' ,force_remount=True)
import glob
import os
os.chdir("drive/MyDrive/vaccine/data/")

Mounted at /content/drive


In [4]:
os.listdir('fine_tune_sentimnet/results/best_model/')
#((1832669, 768), (1832670, 3))

['config.json', 'pytorch_model.bin', 'training_args.bin']

## Data

In [5]:
def preprocess(corpus):
  outcorpus = []
  for text in corpus:
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    new_text = " ".join(new_text)
    outcorpus.append(new_text)
  return outcorpus

In [None]:
!wget https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/all/test_text.txt

In [6]:
dataset_path = 'train.txt'
dataset = open(dataset_path).read().split('\n')

In [11]:
(dataset.pop())

''

In [13]:
# this is a dataset in 8 different languages
for example in [0,870,1740,2610,3480,4350,5220,6090]:
  print(dataset[example])

"ADPH investigating 44 possible flu related deaths"
"Per lei è più importante il costo del vaccino, e no…"
"investigators are closing in on a Global influenza pollen"
"Dourado evita falar de Flu e diz que não conversou com Corinthians via"
"Both condoms and Sanitary wear are a necessity to women. To think of it, HIV is an incurable *illness…"
"Actually, 9/11 did happen and Elvis really is dead: how the rise of conspiracy theories leads to vaccine skepticis…"
"So bad news, I got influenza"
"Aún con influenza me gusta que llueva"


## Model

In [14]:
CUDA = True # set to true if using GPU (Runtime -> Change runtime Type -> GPU)
BATCH_SIZE = 32
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
config = AutoConfig.from_pretrained(MODEL) # used for id to label name
model = AutoModelForSequenceClassification.from_pretrained('fine_tune_sentimnet/results/best_model/')
if CUDA:
  model = model.to('cuda')
_ = model.eval()

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

## Forward

In [15]:
def forward(text, cuda=True):
  text = preprocess(text)
  encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
  if cuda:
    encoded_input.to('cuda')
    output = model(**encoded_input)
    scores = output[0].detach().cpu().numpy()
  else:
    output = model(**encoded_input)
    scores = output[0].detach().numpy()
  
  scores = softmax(scores, axis=-1)
  return scores

# Training Data Set

In [16]:
dl = DataLoader(dataset, batch_size=BATCH_SIZE)
all_preds = []
all_scores = []
for idx,batch in enumerate(dl):
  print('Batch ',idx+1,' of ',len(dl))
  text = preprocess(batch)
  scores = forward(text, cuda=CUDA)
  all_scores.extend(scores)
  preds = np.argmax(scores, axis=-1)
  all_preds.extend(preds)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Batch  52272  of  57271
Batch  52273  of  57271
Batch  52274  of  57271
Batch  52275  of  57271
Batch  52276  of  57271
Batch  52277  of  57271
Batch  52278  of  57271
Batch  52279  of  57271
Batch  52280  of  57271
Batch  52281  of  57271
Batch  52282  of  57271
Batch  52283  of  57271
Batch  52284  of  57271
Batch  52285  of  57271
Batch  52286  of  57271
Batch  52287  of  57271
Batch  52288  of  57271
Batch  52289  of  57271
Batch  52290  of  57271
Batch  52291  of  57271
Batch  52292  of  57271
Batch  52293  of  57271
Batch  52294  of  57271
Batch  52295  of  57271
Batch  52296  of  57271
Batch  52297  of  57271
Batch  52298  of  57271
Batch  52299  of  57271
Batch  52300  of  57271
Batch  52301  of  57271
Batch  52302  of  57271
Batch  52303  of  57271
Batch  52304  of  57271
Batch  52305  of  57271
Batch  52306  of  57271
Batch  52307  of  57271
Batch  52308  of  57271
Batch  52309  of  57271
Batch  52310  of  57271

In [None]:
# this is a dataset in 8 different languages
for example in [0,870,1740,2610,3480,4350,5220,6090,10000,18000,29000,99000]:
  pred = all_preds[example]
  print(dataset[example], '--->', config.id2label[pred])

In [17]:
#comaparitive scores after finetuning
for example in [0,870,1740,2610,3480,4350,5220,6090,10000,18000,29000,99000]:
  pred = all_preds[example]
  print(dataset[example], '--->', config.id2label[pred])

"ADPH investigating 44 possible flu related deaths" ---> Negative
"Per lei è più importante il costo del vaccino, e no…" ---> Neutral
"investigators are closing in on a Global influenza pollen" ---> Neutral
"Dourado evita falar de Flu e diz que não conversou com Corinthians via" ---> Neutral
"Both condoms and Sanitary wear are a necessity to women. To think of it, HIV is an incurable *illness…" ---> Neutral
"Actually, 9/11 did happen and Elvis really is dead: how the rise of conspiracy theories leads to vaccine skepticis…" ---> Neutral
"So bad news, I got influenza" ---> Negative
"Aún con influenza me gusta que llueva" ---> Positive
"COP7FCTC The 4 BigPharma to WHO are GlaxoSmithKline Novartis Sanofi Pasteur and Merck are the leading vaccine manufacturer" ---> Neutral
"Zambia News - HIV Activist Kasune Challenges MPs to Disclose Status" ---> Neutral
"Improving estimates of district HIV prevalence and burden in South Africa using small area estimation techniques…" ---> Neutral
"Trump Wi

In [None]:
len(all_preds),scores.shape

In [19]:
from numpy import save

In [None]:
save('baseline_sentiment_scores.npy',np.array(all_scores))

In [20]:
save('best_model_sentiment_scores.npy',np.array(all_scores))
save('best_model_sent_pred.npy',np.array(all_preds))


# Test Data Set


In [21]:
test_dataset_path = 'test.txt'
test_dataset = open(test_dataset_path).read().split('\n')

In [22]:
test_dataset.pop()

''

In [24]:
for example in [0,870,1740,2610,3480,4350,5220,6090]:
  print(test_dataset[example])

"health New Vaccine Could Slow Disease That Kills 600 Children a Day"
"What the News Isn’t Saying About Vaccine-Autism Studies"
"Has your child had the flu vaccine yet? Speak to your GP or Pharmacist about it StayWellThisWinter…"
"Nigeria starts vaccine drive to stop meningitis outbreak"
"Why You Should Get the New Shingles Vaccine"
"The flu vaccine is different because there are just so many different viruses.…"
"Russian bots and trolls spread content disseminating anti-vaccine messages. - The American Journal of Public Healt…"


In [25]:
dl = DataLoader(test_dataset, batch_size=BATCH_SIZE)
test_all_preds = []
test_all_scores = []
for idx,batch in enumerate(dl):
  print('Batch ',idx+1,' of ',len(dl))
  text = preprocess(batch)
  scores = forward(text, cuda=CUDA)
  test_all_scores.extend(scores)
  preds = np.argmax(scores, axis=-1)
  test_all_preds.extend(preds)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Batch  10472  of  15471
Batch  10473  of  15471
Batch  10474  of  15471
Batch  10475  of  15471
Batch  10476  of  15471
Batch  10477  of  15471
Batch  10478  of  15471
Batch  10479  of  15471
Batch  10480  of  15471
Batch  10481  of  15471
Batch  10482  of  15471
Batch  10483  of  15471
Batch  10484  of  15471
Batch  10485  of  15471
Batch  10486  of  15471
Batch  10487  of  15471
Batch  10488  of  15471
Batch  10489  of  15471
Batch  10490  of  15471
Batch  10491  of  15471
Batch  10492  of  15471
Batch  10493  of  15471
Batch  10494  of  15471
Batch  10495  of  15471
Batch  10496  of  15471
Batch  10497  of  15471
Batch  10498  of  15471
Batch  10499  of  15471
Batch  10500  of  15471
Batch  10501  of  15471
Batch  10502  of  15471
Batch  10503  of  15471
Batch  10504  of  15471
Batch  10505  of  15471
Batch  10506  of  15471
Batch  10507  of  15471
Batch  10508  of  15471
Batch  10509  of  15471
Batch  10510  of  15471

In [26]:
for example in [0,870,1740,2610,3480,4350,5220,6090,10000,18000,29000,99000]:
  pred = test_all_preds[example]
  print(dataset[example], '--->', config.id2label[pred])

"ADPH investigating 44 possible flu related deaths" ---> Negative
"Per lei è più importante il costo del vaccino, e no…" ---> Neutral
"investigators are closing in on a Global influenza pollen" ---> Neutral
"Dourado evita falar de Flu e diz que não conversou com Corinthians via" ---> Positive
"Both condoms and Sanitary wear are a necessity to women. To think of it, HIV is an incurable *illness…" ---> Neutral
"Actually, 9/11 did happen and Elvis really is dead: how the rise of conspiracy theories leads to vaccine skepticis…" ---> Neutral
"So bad news, I got influenza" ---> Neutral
"Aún con influenza me gusta que llueva" ---> Neutral
"COP7FCTC The 4 BigPharma to WHO are GlaxoSmithKline Novartis Sanofi Pasteur and Merck are the leading vaccine manufacturer" ---> Positive
"Zambia News - HIV Activist Kasune Challenges MPs to Disclose Status" ---> Neutral
"Improving estimates of district HIV prevalence and burden in South Africa using small area estimation techniques…" ---> Neutral
"Trump Wi

In [27]:
save('test_data_best_model_sentiment_scores.npy',np.array(test_all_scores))
save('test_data_best_model_sent_pred.npy', np.array(test_all_preds))