<a href="https://colab.research.google.com/github/ernselito/Resume_Screening_using_LLMs/blob/master/Classification_With_LLM(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Libraries

In [None]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset
from transformers import pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Loading Data

In [None]:

# Load the "rotten_tomatoes" dataset; later cells read its "train" and "test" splits
data = load_dataset(path="rotten_tomatoes")

### Inspecting the first two rows of the training set

In [None]:
# Display training rows 0 and 1
data["train"][[0, 1]]

# Methodology

Classification with Representation Models

I load the RoBERTa-base model, which was trained on ~124M tweets from January 2018 to December 2021 and fine-tuned for sentiment analysis with the TweetEval benchmark.

In [None]:

# Identifier of the Hugging Face sentiment model; the same path supplies the tokenizer
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Build the inference pipeline. return_all_scores=True is kept on purpose:
# the inference cell indexes each output by position (entries 0 and 2).
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    return_all_scores=True,
)

In [None]:

# Run inference
# For each test text keep only the scores at positions 0 and 2 (read here as
# negative and positive); the entry at position 1 is ignored. The prediction
# is 0 when the negative score is at least as large as the positive one,
# otherwise 1.
y_pred = [
    np.argmax([scores[0]["score"], scores[2]["score"]])
    for scores in tqdm(
        pipe(KeyDataset(data["test"], "text")),
        total=len(data["test"]),
    )
]

Evaluate the performance of the model

In [None]:

def evaluate_performance(y_true, y_pred, target_names=None):
    """Build the classification report for a set of predictions.

    The report is returned, not printed; pass the result to ``print`` to
    render it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        target_names: Display names for the classes, one per class, in the
            order ``classification_report`` expects. Defaults to
            ``["Negative Review", "Positive Review"]``.

    Returns:
        The value produced by ``classification_report`` for these arguments.
    """
    if target_names is None:
        target_names = ["Negative Review", "Positive Review"]
    return classification_report(y_true, y_pred, target_names=list(target_names))


print(evaluate_performance(data["test"]["label"], y_pred))

I load the sentence-transformers model `all-mpnet-base-v2`, which maps sentences and paragraphs to a 768-dimensional dense vector space.

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Encode the "text" column of each split, train first and then test
train_embeddings, test_embeddings = (
    model.encode(data[split_name]["text"], show_progress_bar=True)
    for split_name in ("train", "test")
)

In [None]:
# Dimensions of the training embedding matrix
np.shape(train_embeddings)

I train a logistic regression classifier on the training embeddings

In [None]:

# Fit a logistic-regression classifier on the train-split embeddings
clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, data["train"]["label"])

# Predict labels for the test-split embeddings.
# The report for these predictions is printed in the next cell; the bare
# evaluate_performance(...) call that used to end this cell only echoed the
# same report as an escaped string repr, so it has been dropped.
y_pred = clf.predict(test_embeddings)

In [None]:
# Performance of the second model
second_model_report = evaluate_performance(data["test"]["label"], y_pred)
print(second_model_report)

In [None]:
# Error analysis: find the test examples where the prediction differs from the label
test_labels = data["test"]["label"]
misclassified_indices = [
    i for i, (true, pred) in enumerate(zip(test_labels, y_pred)) if true != pred
]
print(f"Number of misclassified examples: {len(misclassified_indices)}")

# Looking at some errors
# Each row is fetched by index (data["test"][i]) rather than re-reading the
# whole "text" and "label" columns on every loop iteration.
for i in misclassified_indices[:3]:
    example = data["test"][i]
    print(f"Text: {example['text']}")
    print(f"True: {example['label']}, Predicted: {y_pred[i]}")
    print("---")

I compute and plot the confusion matrix of the results

In [None]:
# Confusion matrix of the current predictions against the test labels
cm = confusion_matrix(data["test"]["label"], y_pred)

# Reuse the class names from the classification report so the axes show names
# instead of bare label values
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Negative Review", "Positive Review"],
)

fig, ax = plt.subplots()
disp.plot(ax=ax)
ax.set_title("Confusion matrix on the test split")
plt.show()