<a href="https://colab.research.google.com/github/dzanahmed/welcome-ideathon-lshtm/blob/main/code/sentiment_analysis_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Install transformers datasets

In [1]:
! pip install transformers datasets

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Download

## Load packages

In [35]:
from transformers import pipeline, Pipeline
import pandas as pd

from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from datasets import Dataset
import numpy as np
from scipy.special import softmax


## Load data from google drive file

### Load data into dataframe

In [3]:
url = "https://raw.githubusercontent.com/dzanahmed/welcome-ideathon-lshtm/main/data/raw/vax_tweets.csv?token=GHSAT0AAAAAACB5CGER4UM335Q3HQCVWGYEZFBUW3A"
df = pd.read_csv(url)


  df = pd.read_csv(url)


### Load dataframe into dataset

#### Bit of data cleaning

In [27]:

# rename first column to tweet_id
df.columns.values[0] = 'tweet_id'

# coerce all columnns to the correct data type.
df['user_followers'] = pd.to_numeric(df['user_followers'], errors='coerce').astype('Int64')
df['user_friends'] = pd.to_numeric(df['user_friends'], errors='coerce').astype('Int64')
df['user_favourites'] = pd.to_numeric(df['user_favourites'], errors='coerce').astype('Int64')
df['user_verified'] = df['user_verified'].astype(bool)
df['is_retweet'] = df['is_retweet'].astype(bool)


tweet_id             int64
user_location       object
user_description    object
user_followers       Int64
user_friends         Int64
user_favourites      Int64
user_verified         bool
date                object
text                object
hashtags            object
is_retweet            bool
dtype: object
       tweet_id                                      user_location  \
5953       5954  ['COVIDVaccine', 'MainstreamMedia', 'Operation...   
12980     12981  ['COVIDVaccine', 'COVID19', 'JoeBiden', 'RonDe...   
19947     19948                     ['edutwitter', 'CovidVaccine']   
26433     26434                                    Dundas, Ontario   
32811     32812                                                NaN   
43902     43903                                                NaN   
67249     67250  ['newyorkcityvaccinemandate', 'newyorkcityprot...   
69621     69622                                                NaN   
70969     70970  ['COVIDVaccine', 'COVID19', 'JoeBiden', 's

#### Load dataframe into huggingface dataset object

In [30]:
new_df = df[['tweet_id', 'text']].copy()

new_df = new_df.dropna()


# save dataframe as a dataset
dataset = Dataset.from_pandas(new_df)

# Load RoBERTa model


In [7]:
model_path = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = TFAutoModelForSequenceClassification.from_pretrained(model_path)

Downloading tf_model.h5:   0%|          | 0.00/499M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Example use case from huggingface website

In [7]:
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
# TF
text = "Covid cases are increasing fast!"
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)
scores = output[0][0].numpy()
scores = softmax(scores)

# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

## Define pipeline

In [None]:
from transformers import Pipeline


class MyPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, maybe_arg=2):
        model_input = Tensor(inputs["input_ids"])
        return {"model_input": model_input}

    def _forward(self, model_inputs):
        # model_inputs == {"model_input": model_input}
        outputs = self.model(**model_inputs)
        # Maybe {"logits": Tensor(...)}
        return outputs

    def postprocess(self, model_outputs):
        best_class = model_outputs["logits"].softmax(-1)
        return best_class

# Run RoBERTa model

## Test out pipeline on a subset of the data

In [8]:
basic_pipeline = pipeline("sentiment-analysis", model = model, tokenizer = tokenizer, config = config)
basic_pipeline("")




[{'label': 'neutral', 'score': 0.42374736070632935}]

In [15]:
dataset.features
dataset.train_test_split(test_size = 0.999, shuffle = False)

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'text'],
        num_rows: 100
    })
    test: Dataset({
        features: ['tweet_id', 'text'],
        num_rows: 99900
    })
})

In [42]:
dataset = dataset.map(lambda row: tokenizer(row["text"]), batched=True)

Map:   0%|          | 0/99988 [00:00<?, ? examples/s]

In [43]:
dataset.features

{'tweet_id': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 '__index_level_0__': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}