<a href="https://colab.research.google.com/github/drashyabansel/GenerativeAI/blob/main/Text_Classification_using_HuggingFace_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install datasets



In [7]:
import torch
from datasets import load_dataset

In [8]:
# Load the Rotten Tomatoes movie-review dataset by its Hugging Face Hub ID.
data = load_dataset("cornell-movie-review-data/rotten_tomatoes")
# Display the DatasetDict to inspect its splits, columns and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [10]:
# Peek at the first and last training examples (indices 0 and -1).
data["train"][0,-1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [1, 0]}

In [11]:
from transformers import pipeline

# Hub ID of the task-specific sentiment model
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Load model into pipeline
model_pipeline = pipeline(
    # Name the task explicitly instead of letting it be inferred from the model.
    "text-classification",
    model=model_path,
    tokenizer=model_path,
    # Return a score for every class, not just the top one.
    # NOTE: newer transformers releases deprecate this flag in favour of
    # `top_k=None`, which returns the scores sorted by value rather than in
    # class-id order - check downstream indexing before switching.
    return_all_scores=True,
    # Use the GPU when present; fall back to CPU so the cell still runs on a
    # runtime without an accelerator.
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)
model_pipeline

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


<transformers.pipelines.text_classification.TextClassificationPipeline at 0x78db031f8190>

In [12]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

In [13]:
# Run inference
y_pred = []
for output in tqdm(
    model_pipeline(KeyDataset(data["test"], "text")),
    total=len(data["test"]),
):
    # Look the scores up by label name instead of list position, so the
    # result does not depend on the order in which the pipeline returns them.
    scores = {entry["label"].lower(): entry["score"] for entry in output}
    # The dataset is binary: the model's "neutral" score is ignored and the
    # larger of negative/positive wins (0 = negative, 1 = positive).
    assignment = np.argmax([scores["negative"], scores["positive"]])
    y_pred.append(assignment)

100%|██████████| 1066/1066 [00:13<00:00, 76.46it/s] 


In [14]:
from sklearn.metrics import classification_report
def evaluate_performance(y_true, y_pred):
    """Build the classification report for the two review classes and print it."""
    # Display names for class ids 0 and 1, in that order.
    class_names = ["Negative Review", "Positive Review"]
    report = classification_report(y_true, y_pred, target_names=class_names)
    print(report)

In [15]:
# Compare the current predictions in y_pred with the test split's labels.
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.76      0.88      0.81       533
Positive Review       0.86      0.72      0.78       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066



In [16]:
# Hub ID of a second task-specific model, fine-tuned on SST-2
model_path_2 = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

# Load model into pipeline
model_pipeline_2 = pipeline(
    # Name the task explicitly instead of letting it be inferred from the model.
    "text-classification",
    model=model_path_2,
    tokenizer=model_path_2,
    # Return a score for every class, not just the top one.
    # NOTE: newer transformers releases deprecate this flag in favour of
    # `top_k=None`, which returns the scores sorted by value rather than in
    # class-id order - check downstream indexing before switching.
    return_all_scores=True,
    # Use the GPU when present; fall back to CPU so the cell still runs on a
    # runtime without an accelerator.
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)
model_pipeline_2

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


<transformers.pipelines.text_classification.TextClassificationPipeline at 0x78dc3d1088d0>

In [21]:
# Run inference
y_pred = []
for output in tqdm(
    model_pipeline_2(KeyDataset(data["test"], "text")),
    total=len(data["test"]),
):
    # Look the scores up by label name (case-insensitively) instead of list
    # position, so the result does not depend on the pipeline's output order.
    scores = {entry["label"].lower(): entry["score"] for entry in output}
    # 0 = negative, 1 = positive; a tie resolves to negative.
    assignment = np.argmax([scores["negative"], scores["positive"]])
    y_pred.append(assignment)

100%|██████████| 1066/1066 [00:08<00:00, 129.68it/s]


In [22]:
# Compare the current predictions in y_pred with the test split's labels.
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.89      0.90      0.90       533
Positive Review       0.90      0.89      0.90       533

       accuracy                           0.90      1066
      macro avg       0.90      0.90      0.90      1066
   weighted avg       0.90      0.90      0.90      1066



In [25]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# Turn the review texts into embeddings: train split first, then test split,
# each with its own progress bar.
train_embeddings, test_embeddings = (
    model.encode(data[split]["text"], show_progress_bar=True)
    for split in ("train", "test")
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/267 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

In [26]:
# Dimensions of the train embedding matrix (one row per training review).
train_embeddings.shape

(8530, 768)

In [27]:
# Dimensions of the test embedding matrix (one row per test review).
test_embeddings.shape

(1066, 768)

In [29]:
# First ten components of the first training review's embedding.
train_embeddings[0, :10]

array([ 0.01492994, -0.00554764,  0.01199458, -0.01492721,  0.00627283,
       -0.00367079, -0.02679175,  0.00956376,  0.00648649,  0.01928184],
      dtype=float32)

In [30]:
from sklearn.linear_model import LogisticRegression
# Train a logistic regression on our train embeddings
# (random_state is pinned to 42).
clf = LogisticRegression(random_state=42)
# Features: the precomputed train embeddings; targets: the train split's labels.
clf.fit(train_embeddings, data["train"]["label"])

In [31]:
# Predict previously unseen instances (the test-split embeddings)
y_pred = clf.predict(test_embeddings)
# Report how those predictions compare with the test split's labels.
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



In [33]:
# Create embeddings for our labels: one short description per class.
# The list is ordered negative first, positive second.
label_embeddings = model.encode(["A negative review", "A positive review"])

In [34]:
from sklearn.metrics.pairwise import cosine_similarity
# For every test document, pick the label description whose embedding is most
# similar (cosine); the winning column index becomes the prediction.
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = sim_matrix.argmax(axis=1)

In [35]:
# Compare the current predictions in y_pred with the test split's labels.
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.78      0.77      0.78       533
Positive Review       0.77      0.79      0.78       533

       accuracy                           0.78      1066
      macro avg       0.78      0.78      0.78      1066
   weighted avg       0.78      0.78      0.78      1066



In [36]:
# Create embeddings for our labels with even more specific mentions
# ("very", "movie"); the list is again ordered negative first, positive second.
label_embeddings = model.encode(["A very negative movie review", "A very positive movie review"])

In [37]:
# For every test document, pick the label description whose embedding is most
# similar (cosine); the winning column index becomes the prediction.
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = sim_matrix.argmax(axis=1)

In [38]:
# Compare the current predictions in y_pred with the test split's labels.
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.86      0.73      0.79       533
Positive Review       0.76      0.88      0.82       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066



In [39]:
# Classification with a generative (text-to-text) model.
# No training happens here: the model is only prompted.

# Load our model
generative_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    # Use the GPU when present; fall back to CPU so the cell still runs on a
    # runtime without an accelerator.
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)
generative_pipeline

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cuda:0


<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline at 0x78dae6e051d0>

In [40]:
# Prepare our data: prepend the instruction to every review and store the
# result in a new "t5" column (applied to every split of the DatasetDict).
prompt = "Is the following sentence positive or negative? "
data = data.map(lambda example: {"t5": prompt + example['text']})
# Display the DatasetDict to confirm the new column is present.
data

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
})

In [42]:
# Run inference
y_pred = []
for output in tqdm(
    generative_pipeline(KeyDataset(data["test"], "t5")),
    total=len(data["test"]),
):
    # Normalise case and surrounding whitespace so variants such as
    # "Negative" or "negative " are still recognised as the negative class.
    text = output[0]["generated_text"].strip().lower()
    # Any generation other than "negative" is counted as positive (label 1).
    y_pred.append(0 if text == "negative" else 1)

100%|██████████| 1066/1066 [00:55<00:00, 19.30it/s]


In [43]:
# Compare the current predictions in y_pred with the test split's labels.
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.83      0.85      0.84       533
Positive Review       0.85      0.83      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066

