<a href="https://colab.research.google.com/github/danielmlow/tutorials/blob/main/text/sentiment_analysis_emotion_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment analysis and emotion classification (pre-trained)

If inference is very slow, you can enable a GPU in Colab under Runtime > Change runtime type > T4 GPU.

In [1]:
!pip install -q transformers==4.33.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import pandas as pd
import os
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification, pipeline

# Detect Colab by trying to import its Drive helper, so the notebook also runs
# unchanged on a local machine (where google.colab is not installed).
try:
  from google.colab import drive
  on_colab = True
except ImportError:
  on_colab = False

if on_colab:
  project_name = 'project_name'  # placeholder; not referenced by the paths below
  drive.mount('/content/drive')
  # Change these to the data folders in your own Google Drive
  input_dir = '/content/drive/MyDrive/consulting/matt/er/data/input/'
  output_dir = '/content/drive/MyDrive/consulting/matt/er/data/output/'
else:
  input_dir = './data/input/'
  output_dir = './data/output/'

# Create the output folder if it does not exist yet
os.makedirs(output_dir, exist_ok=True)




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:

# Replace the example sentences below with your own dataset, e.g.:

# docs = pd.read_csv(input_dir + 'dataset.csv')['text_col'].values
# docs = [str(n) for n in docs] # make sure they're all strings


# Example documents: one string per document
docs = [
    "I am happy",
    "I'm happy, but worried about tomorrow",
    "I'm miserable",
    "I'm sad, but hopeful",
    "I am not happy",
    "I wish I were happy",
    "I'm sad, but hopeful",
    "Don't talk to me like that!",
    "Really? I'm shocked!",
]

In [7]:
def huggingface_output_2_df(output_dict, add_to_col_names = None):
	"""Convert per-document label scores into a DataFrame with one row per document.

	Parameters
	----------
	output_dict : sequence
		One entry per document; each entry is a sequence of dicts that have a
		'label' key and a 'score' key.
	add_to_col_names : str, optional
		Prefix prepended to every label to build the column names.

	Returns
	-------
	pd.DataFrame
		One column per label, in the order the labels appear for the first
		document, and one row per document. Labels that are absent for a
		document are left missing. Empty if output_dict is empty.
	"""
	if len(output_dict) == 0:
		# Nothing to convert; indexing output_dict[0] below would raise IndexError
		return pd.DataFrame()
	labels = [entry.get('label') for entry in output_dict[0]]
	feature_names = [add_to_col_names + label for label in labels] if add_to_col_names else labels
	rows = []
	for doc in output_dict:
		# Look scores up by label rather than by position, so documents whose
		# labels come back in a different order still land in the right column
		score_by_label = {entry.get('label'): entry.get('score') for entry in doc}
		rows.append([score_by_label.get(label) for label in labels])
	return pd.DataFrame(rows, columns = feature_names)

# Extract sentiment scores


In [10]:
from transformers import pipeline
# Name the checkpoint and revision explicitly instead of relying on the task
# default, so the scores do not depend on which model the library picks.
sentiment_pipeline = pipeline("sentiment-analysis",
                              model="distilbert-base-uncased-finetuned-sst-2-english",
                              revision="af0f99b")
# return_all_scores=True returns the score of every label for each document, not only the top one
output_dict = sentiment_pipeline(docs, return_all_scores=True)



In [11]:
# Tabulate the sentiment scores: one row per document, one "sentiment_<label>" column per label
feature_vectors = huggingface_output_2_df(
    output_dict,
    add_to_col_names='sentiment_',
)
feature_vectors

Unnamed: 0,sentiment_NEGATIVE,sentiment_POSITIVE
0,0.00012,0.99988
1,0.022988,0.977012
2,0.99978,0.00022
3,0.000404,0.999596
4,0.99979,0.00021
5,0.050676,0.949324
6,0.000404,0.999596
7,0.648448,0.351552
8,0.018542,0.981458


# Models trained on Go Emotions

All models: https://huggingface.co/models?dataset=dataset:go_emotions
- macro F1=0.493 https://huggingface.co/arpanghoshal/EmoRoBERTa
- F1=0.45 https://huggingface.co/SamLowe/roberta-base-go_emotions
- Performance unknown, but inference is probably faster: https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion




In [12]:
%%time
tokenizer = RobertaTokenizerFast.from_pretrained("arpanghoshal/EmoRoBERTa")
model = TFRobertaForSequenceClassification.from_pretrained("arpanghoshal/EmoRoBERTa")
emotion = pipeline('sentiment-analysis',
                    model='arpanghoshal/EmoRoBERTa')

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/501M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


CPU times: user 4.55 s, sys: 2.34 s, total: 6.89 s
Wall time: 16.9 s


In [13]:
# Score every emotion label for each document, then tabulate:
# one row per document, one "emoroberta_<label>" column per label
output_dict = emotion(docs, return_all_scores=True)
feature_vectors = huggingface_output_2_df(
    output_dict,
    add_to_col_names='emoroberta_',
)
feature_vectors



Unnamed: 0,emoroberta_admiration,emoroberta_amusement,emoroberta_anger,emoroberta_annoyance,emoroberta_approval,emoroberta_caring,emoroberta_confusion,emoroberta_curiosity,emoroberta_desire,emoroberta_disappointment,...,emoroberta_love,emoroberta_nervousness,emoroberta_optimism,emoroberta_pride,emoroberta_realization,emoroberta_relief,emoroberta_remorse,emoroberta_sadness,emoroberta_surprise,emoroberta_neutral
0,0.004448,0.000923,6.1e-05,0.000282,0.006051,0.003484,3.6e-05,0.000231,0.000112,0.000132,...,0.00052,1.3e-05,0.000367,0.000159,0.000429,0.001209,2.4e-05,4.1e-05,0.000144,0.005801
1,0.000515,0.000779,8.9e-05,0.000321,0.00977,0.022259,0.00024,0.000192,0.000412,0.000946,...,0.001131,0.008207,0.002165,0.00101,0.001613,0.106616,0.000155,0.000482,0.000216,0.001247
2,3.1e-05,0.000238,0.005566,0.005236,0.000218,0.000454,3.3e-05,6.5e-05,0.000112,0.089282,...,0.00026,0.000575,5.6e-05,3.6e-05,0.000829,2.5e-05,0.000351,0.89078,5.2e-05,0.00128
3,1.1e-05,0.000743,0.000156,0.000318,8.7e-05,0.002711,6e-05,0.00011,7.1e-05,0.003636,...,7.5e-05,0.000484,0.001483,4e-06,0.000603,2.6e-05,0.001311,0.984306,5.3e-05,0.001323
4,0.000158,0.00131,0.00106,0.00534,0.009726,0.001106,0.000721,0.00015,0.000111,0.095363,...,0.001784,0.001202,0.000373,3.4e-05,0.029471,0.000122,0.001183,0.192939,0.000433,0.061407
5,0.001547,0.009453,0.00034,0.001082,0.000485,0.001109,3.6e-05,0.000218,0.938436,0.000159,...,0.003144,7.6e-05,0.005524,0.00044,0.000152,6.4e-05,4.5e-05,0.000181,0.000159,0.004064
6,1.1e-05,0.000743,0.000156,0.000318,8.7e-05,0.002711,6e-05,0.00011,7.1e-05,0.003636,...,7.5e-05,0.000484,0.001483,4e-06,0.000603,2.6e-05,0.001311,0.984306,5.3e-05,0.001323
7,4.8e-05,6.7e-05,0.980031,0.012423,0.000131,0.000133,9.2e-05,2.1e-05,1.4e-05,0.00043,...,0.000232,3.5e-05,4.2e-05,0.000151,8.8e-05,1.1e-05,5.4e-05,0.00034,5.1e-05,0.000968
8,4.7e-05,7.2e-05,2.6e-05,7.2e-05,5.3e-05,1.7e-05,0.000153,0.000274,5.3e-05,5.4e-05,...,1.8e-05,2.3e-05,0.000236,1.4e-05,0.001299,2.9e-05,3e-06,8e-06,0.996606,0.000131


In [14]:

# List every column name (the table preview above elides the middle columns)
feature_vectors.columns

Index(['emoroberta_admiration', 'emoroberta_amusement', 'emoroberta_anger',
       'emoroberta_annoyance', 'emoroberta_approval', 'emoroberta_caring',
       'emoroberta_confusion', 'emoroberta_curiosity', 'emoroberta_desire',
       'emoroberta_disappointment', 'emoroberta_disapproval',
       'emoroberta_disgust', 'emoroberta_embarrassment',
       'emoroberta_excitement', 'emoroberta_fear', 'emoroberta_gratitude',
       'emoroberta_grief', 'emoroberta_joy', 'emoroberta_love',
       'emoroberta_nervousness', 'emoroberta_optimism', 'emoroberta_pride',
       'emoroberta_realization', 'emoroberta_relief', 'emoroberta_remorse',
       'emoroberta_sadness', 'emoroberta_surprise', 'emoroberta_neutral'],
      dtype='object')

In [None]:
%%time

# if you have a lot of data, loop through every 1000 files and save in case the session dies. Then you can pick up where you left off.

feature_vectors_all = []
step = 1000
print(len(docs))
for i in range(0, len(docs), step):
  print(str(i).zfill(5))
  docs_i = docs[i:i+step]
  output_dict = emotion(docs_i, return_all_scores=True)
  feature_vectors = huggingface_output_2_df(output_dict, add_to_col_names = 'emoroberta_')
  feature_vectors['event'] = docs_i
  feature_vectors.to_csv(output_dir + f'instagram_messages_emoroberta_{str(i).zfill(5)}.csv')
  feature_vectors_all.append(feature_vectors)


feature_vectors_all = pd.concat(feature_vectors_all).reset_index(drop=True)


In [None]:
# Save the combined table for all documents
feature_vectors_all.to_csv(output_dir + 'messages_emoroberta.csv')