<a href="https://colab.research.google.com/github/dzanahmed/welcome-ideathon-lshtm/blob/main/code/sentiment_analysis_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Install transformers datasets

In [13]:
! pip install transformers datasets



## Load packages

In [14]:
from transformers import pipeline, Pipeline
import pandas as pd

from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from datasets import Dataset
import numpy as np
from scipy.special import softmax


## Load data into dataframe

In [15]:
url = "https://raw.githubusercontent.com/dzanahmed/welcome-ideathon-lshtm/main/data/interim/vax_tweets_v0.csv?token=GHSAT0AAAAAACB5CGEQMPYIQ2WQ5ZJ7RXBGZFEQCLA"
df = pd.read_csv(url)


## Load dataframe into dataset

#### Bit of data cleaning

In [16]:

# rename first column to tweet_id
df.columns.values[0] = 'tweet_id'

# coerce all columnns to the correct data type.
df['user_followers'] = pd.to_numeric(df['user_followers'], errors='coerce').astype('Int64')
df['user_friends'] = pd.to_numeric(df['user_friends'], errors='coerce').astype('Int64')
df['user_favourites'] = pd.to_numeric(df['user_favourites'], errors='coerce').astype('Int64')
df['user_verified'] = df['user_verified'].astype(bool)
df['is_retweet'] = df['is_retweet'].astype(bool)


#### Load dataframe into huggingface dataset object

In [17]:
df = df[['tweet_id', 'text']]
print(len(df))

df = df.dropna()
print(len(df))


# save dataframe as a dataset
dataset = Dataset.from_pandas(df)
print(len(dataset))

99997
99988
99988


## Load RoBERTa model


In [18]:
model_path = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = TFAutoModelForSequenceClassification.from_pretrained(model_path)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Get Sentiment Analysis Predictions from RoBERTa Model

In [None]:
batch_size = 100
max_batches = int(len(dataset["text"]) / batch_size) + 1
n_batches = max_batches

df["negative_score"] = None
df["neutral_score"] = None
df["positive_score"] = None

for batch in range(n_batches):

  # store batch of tweets
  if batch < max_batches - 1:
    tweets_batch = dataset["text"][(batch * batch_size) : ((batch + 1) * batch_size)]
  else:
    tweets_batch = dataset["text"][(batch * batch_size) : len(dataset["text"])]


  # tokenize batch
  tokens = tokenizer(tweets_batch, padding=True, truncation=True, return_tensors="tf")

  # run batch through model
  output = model(**tokens)

  # extract the output into a dataframe
  for output_row in range(len(output.logits)):
    # convert from logit to softmax (probability) output
    probabilities = softmax(output.logits.numpy()[output_row])

    # store probabilities in correct column of data frame
    df.loc[batch * batch_size + output_row, "negative_score"] = probabilities[0]
    df.loc[batch * batch_size + output_row, "neutral_score"] = probabilities[1]
    df.loc[batch * batch_size + output_row, "positive_score"] = probabilities[2]

  print(batch)

In [37]:
df.head()

Unnamed: 0,tweet_id,text,negative_score,neutral_score,positive_score
0,1,We asked our coworkers why they're getting a C...,0.324697,0.369014,0.306289
1,2,45+ #RURAL #Bengaluru #CovidVaccine Availabili...,0.32574,0.339726,0.334534
2,3,@JoyAnnReid @NIH 👿Questions: Could the vacci...,0.353092,0.343143,0.303765
3,4,Next question is how do you find out where you...,0.352246,0.35727,0.290484
4,5,"If you told your child to get a Covid vaccine,...",0.329444,0.395002,0.275554


## Save Results to csv file

In [None]:
from google.colab import files

df.to_csv('sentiment_analysis.csv', encoding = 'utf-8-sig')
files.download('sentiment_analysis.csv')