[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/danielmlow/llm_course/blob/main/sentiment_analysis_emotion_recognition.ipynb)

# Sentiment analysis and emotion classification (pre-trained)

If inference is very slow, you can enable a GPU in Colab under Runtime → Change runtime type → T4 GPU.

In [None]:
!pip install torch==2.8.0
# maybe: !pip install -q transformers==4.33.2

In [None]:
import pandas as pd
import os
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification, pipeline

# Set to True when running in Google Colab so data is read from / written to Google Drive.
on_colab = False

if not on_colab:
  # Local run: folders relative to the current working directory.
  input_dir, output_dir = './data/input/', './data/output/'
else:
  from google.colab import drive
  project_name = 'project_name'
  drive.mount('/content/drive')
  input_dir = '/content/drive/MyDrive/consulting/matt/er/data/input/'
  output_dir = '/content/drive/MyDrive/consulting/matt/er/data/output/'

# Make sure the output folder exists before any cell writes to it.
os.makedirs(output_dir, exist_ok=True)




In [None]:

# Replace the demo sentences below with your own dataset, e.g.:
#
# docs = pd.read_csv(input_dir + 'dataset.csv')['text_col'].values
# docs = [str(n) for n in docs] # make sure they're all strings

# Small demo corpus of short first-person statements used by the cells below.
docs = [
    "I am happy",
    "I'm happy, but worried about tomorrow",
    "I'm miserable",
    "I'm sad, but hopeful",
    "I am not happy",
    "I wish I were happy",
    "I'm sad, but hopeful",
    "Don't talk to me like that!",
    "Really? I'm shocked!",
]

In [None]:
def huggingface_output_2_df(output_dict, add_to_col_names = None):
    """Convert Hugging Face text-classification output into a table of scores.

    Parameters
    ----------
    output_dict : list
        One entry per document. Each entry is a list of
        ``{'label': ..., 'score': ...}`` dicts; a single such dict per
        document is also accepted.
    add_to_col_names : str, optional
        Prefix prepended to every label to build the column names.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per label. Columns follow the
        order in which labels are first seen. A label missing from a document
        yields NaN in that row. Empty input yields an empty DataFrame.
    """
    # Scores are matched to columns by label name, not by position: pipelines
    # created with top_k=None return each document's labels sorted by score,
    # so the label order can differ from one document to the next.
    column_order = {}  # dict used as an insertion-ordered set of column names
    rows = []
    for doc in output_dict:
        if isinstance(doc, dict):
            # Single top-label result for this document.
            doc = [doc]
        row = {}
        for feature in doc:
            column = feature.get('label')
            if add_to_col_names:
                column = add_to_col_names + column
            column_order.setdefault(column, None)
            row[column] = feature.get('score')
        rows.append(row)
    return pd.DataFrame(rows, columns=list(column_order))

# Extract sentiment analysis


In [None]:
from transformers import pipeline

# No model is named here, so the pipeline uses the library's default for this task.
sentiment_pipeline = pipeline(task="sentiment-analysis")

# Ask for the score of every label per document, not only the top label.
output_dict = sentiment_pipeline(docs, return_all_scores=True)

In [None]:
# One row per document: prefixed sentiment scores plus the source text in a 'docs' column.
feature_vectors = huggingface_output_2_df(output_dict, 'sentiment_').assign(docs=docs)
feature_vectors

In [None]:
from transformers import pipeline

# Spanish example sentences to score.
texts = [
    "Me encanta esta película",
    "No me gustó para nada",
    "Está bien, pero no es espectacular.",
]

# Spanish sentiment pipeline; model and tokenizer are loaded from the same checkpoint.
sentiment_es = pipeline(
    task="sentiment-analysis",
    model="pysentimiento/robertuito-sentiment-analysis",
    tokenizer="pysentimiento/robertuito-sentiment-analysis",
)

# Request the score of every label for each sentence.
results = sentiment_es(texts, return_all_scores=True)




In [None]:
# Tabulate the Spanish sentiment scores and keep the source sentence next to
# them, as the English sentiment table does, so each row can be identified.
feature_vectors = huggingface_output_2_df(results, add_to_col_names = 'sentiment_es_')
feature_vectors['docs'] = texts
feature_vectors

# Emotion recognition

All models trained on the GoEmotions dataset: https://huggingface.co/models?dataset=dataset:go_emotions

Model used here: https://huggingface.co/SamLowe/roberta-base-go_emotions

Metrics listed for this model:

- Accuracy: 0.474
- Precision: 0.575
- Recall: 0.396
- F1: 0.450




In [None]:
from transformers import pipeline

# top_k=None makes the pipeline return a score for every label rather than only the best one.
classifier = pipeline(
    task="text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,
)



In [None]:
# Score every demo document with the emotion classifier and tabulate the result,
# keeping the source text next to the scores as the sentiment table does.
model_outputs = classifier(docs)
feature_vectors = huggingface_output_2_df(model_outputs, add_to_col_names = 'roberta_base_')
feature_vectors['docs'] = docs
feature_vectors

In [None]:
from transformers import pipeline

# Spanish emotion pipeline; model and tokenizer are loaded from the same checkpoint.
emotion_es = pipeline(
    "text-classification",
    model="pysentimiento/robertuito-emotion-analysis",
    tokenizer="pysentimiento/robertuito-emotion-analysis",
    # optionally top_k=None here instead of return_all_scores=True at call time
)

texts = ["Estoy tan feliz hoy", "Estoy muy enfadado con lo que pasó"]
results = emotion_es(texts, return_all_scores=True)

# Convert the results of this cell into a table. Without this step the cell
# would display whatever `feature_vectors` an earlier cell left behind.
feature_vectors = huggingface_output_2_df(results, add_to_col_names = 'emotion_es_')
feature_vectors['docs'] = texts
feature_vectors

In [None]:
%%time

# If you have a lot of data, process it in chunks of `step` documents and save
# each chunk to its own CSV, so a dead session only loses the chunk in progress.
# To pick up where you left off, start the range at the index of the first
# chunk that has not been saved yet.

feature_vectors_all = []
step = 1000
print(len(docs))
for i in range(0, len(docs), step):
  print(str(i).zfill(5))
  docs_i = docs[i:i+step]
  # Use the emotion pipeline defined earlier in the notebook (`classifier`); the
  # name `emotion` was never defined. It was built with top_k=None, so it
  # already returns a score for every label.
  output_dict = classifier(docs_i)
  feature_vectors = huggingface_output_2_df(output_dict, add_to_col_names = 'roberta_base_')
  feature_vectors['event'] = docs_i
  feature_vectors.to_csv(output_dir + f'roberta_base_{str(i).zfill(5)}.csv')
  feature_vectors_all.append(feature_vectors)


# Stitch the per-chunk tables into one frame with a fresh 0..n-1 index and save it.
feature_vectors_all = pd.concat(feature_vectors_all).reset_index(drop=True)
feature_vectors_all.to_csv(output_dir + 'messages_emoroberta.csv') # save