<a href="https://colab.research.google.com/github/dioday45/CS433_Project2/blob/jeremy/notebooks/SiBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the transformers library
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
# Fix the NumPy and PyTorch seeds so repeated runs draw the same random numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f'{device=}')

# Create class for data preparation
class SimpleDataset:
    """Index-based view over a mapping of tokenizer output columns.

    `tokenized_texts` must expose an ``"input_ids"`` entry and an ``items()``
    method whose values can be indexed by position. Item ``idx`` is a dict
    holding the ``idx``-th element of every column.
    """

    def __init__(self, tokenized_texts):
        # Keep a reference only; the encodings are not copied.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The number of examples is the number of encoded input sequences.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Collect the idx-th entry of each column into a single example.
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example

device=device(type='cuda', index=0)


In [3]:
# Load the tokenizer and the sequence-classification model published under
# `model_name`, then wrap the model in a Trainer.
# No TrainingArguments are passed, so the Trainer runs with its defaults; in the
# visible cells it is only used for `trainer.predict(...)`, never for training.
model_name = "siebert/sentiment-roberta-large-english"
# Tokenizer and model are loaded from the same name so they stay matched.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model)

In [8]:
# Read the preprocessed tweets and keep a 1% random sample; the fixed
# `random_state` makes the sample the same on every run.
df = pd.read_csv('processed_data.csv').sample(frac=0.01, random_state=1)

# Texts to classify, as plain Python strings. `astype(str)` also stringifies
# any non-string cell, so nothing non-textual reaches the tokenizer.
pred_texts = df['tweet'].astype(str).tolist()

In [11]:
# Sanity check on the sample: its (rows, columns) shape, then the mean label
# on the next line.
print(df.shape, df['label'].mean(), sep='\n')

(18219, 3)
-0.04495307096986662


In [12]:
# Encode all texts in a single tokenizer call (truncation and padding enabled),
# then wrap the encodings so each example can be fetched by position.
tokenized_texts = tokenizer(
    pred_texts,
    truncation=True,
    padding=True,
)
pred_dataset = SimpleDataset(tokenized_texts)

In [13]:
# Run the model over every example in `pred_dataset`.
# The result is a PredictionOutput; its `.predictions` field holds one row of
# unnormalized per-class scores (presumably logits) for each input text.
predictions = trainer.predict(pred_dataset)

***** Running Prediction *****
  Num examples = 18219
  Batch size = 8


In [7]:
# Display the raw PredictionOutput (score array, label_ids, runtime metrics).
# NOTE(review): the saved output shows only 4 rows and an earlier execution
# count than the prediction cell, so it looks stale — re-run to refresh it.
predictions

PredictionOutput(predictions=array([[-3.7254686,  2.8858628],
       [ 3.9145916, -3.5184455],
       [-3.7518106,  2.913254 ],
       [ 3.9534545, -3.6184878]], dtype=float32), label_ids=None, metrics={'test_runtime': 3.239, 'test_samples_per_second': 1.235, 'test_steps_per_second': 0.309})

In [14]:
# For each text, take the index of the highest-scoring class as its predicted label.
preds = np.argmax(predictions.predictions, axis=-1)

In [17]:
# Assemble one row per text: the text, the predicted class, and the reference label.
# Building from named columns makes pandas raise if the three sequences differ
# in length, instead of silently truncating to the shortest as zip() would.
pred = pd.DataFrame({
    'text': pred_texts,
    'pred': preds,
    # A plain list is used so the sampled frame's shuffled index is not carried over.
    'label': df['label'].tolist(),
})
pred

Unnamed: 0,text,pred,label
0,not back dog aswell night,0,1
1,would like say inspiration aiming get end year,1,1
2,lunch class,1,-1
3,underwood like official,1,-1
4,day sleep eat work wash hair discussion think ...,1,1
...,...,...,...
18214,like sitting there still biology lesson,0,-1
18215,doubt we go though tell get,0,1
18216,like got let begin,1,1
18217,talent show sing song come watch,1,1


In [24]:
def test(x):
  if x==-1:
    return 0
  return x

In [25]:
# Remap the reference labels from -1 to 0 so they use the same 0/1 encoding as
# the predicted classes. `test` is passed directly; wrapping it in a lambda
# added nothing.
pred['label'] = pred['label'].apply(test)

In [1]:
# Display the result frame after the labels were remapped.
# NOTE(review): the saved output is a NameError and the execution count restarted
# at 1, so this cell was presumably run in a fresh kernel before `pred` was
# rebuilt — re-run the notebook top to bottom to refresh it.
pred

NameError: ignored