In [1]:
import pandas as pd
import ast

### Data prep

In [33]:
train = pd.read_csv('emoHi-train.csv.zip', index_col=0)

In [34]:
test = pd.read_csv('emoHi-test.csv.zip', index_col=0)

In [35]:
val = pd.read_csv('emoHi-valid.csv.zip', index_col=0)

In [36]:
train['labels'] = train['labels'].str.replace(r'[', '')
train['labels'] = train['labels'].str.replace(r']', '')
train['labels'] = train['labels'].str.strip()

test['labels'] = test['labels'].str.replace(r'[', '')
test['labels'] = test['labels'].str.replace(r']', '')
test['labels'] = test['labels'].str.strip()

val['labels'] = val['labels'].str.replace(r'[', '')
val['labels'] = val['labels'].str.replace(r']', '')
val['labels'] = val['labels'].str.strip()

In [37]:
train['label'] = train['labels'].str.split(' ',expand=True)[0].astype(int)
test['label'] = test['labels'].str.split(' ',expand=True)[0].astype(int)
val['label'] = val['labels'].str.split(' ',expand=True)[0].astype(int)


### BERT Setup

In [39]:
class_labels = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]

In [40]:
class_names = ["admiration", "amusement",
                "anger", "annoyance",
                "approval", "caring",
                "confusion", "curiosity",
                "desire", "disappointment",
                "disapproval", "disgust",
                "embarrassment", "excitement",
                "fear", "gratitude",
                "grief", "joy",
                "love", "nervousness",
                "optimism", "pride",
                "realization", "relief",
                "remorse", "sadness",
                "surprise", "neutral"]

In [41]:
def load_data():
    from datasets import ClassLabel, Features, Value
    from datasets import Dataset
    import pandas as pd
    emotion_features = Features({'text': Value('string'), 'label': ClassLabel(names=class_labels)})
    
    train_dataset = Dataset.from_pandas(train, features = emotion_features)
    test_dataset = Dataset.from_pandas(test, features = emotion_features)
    
    print ("Prepared Train and test data")
    return train_dataset, test_dataset
    

In [42]:
MODEL = "bert-base-multilingual-cased"

In [43]:
def tokenize_function(examples):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [44]:
def prep_tokens():
    train_dataset, test_dataset = load_data()
    print ("Train and Test data load successful")
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_test = test_dataset.map(tokenize_function, batched=True)
    print ("Train and Test data tokenised")
    
    small_train_dataset = tokenized_train.shuffle(seed=42).select(range(1000))
    small_eval_dataset = tokenized_test.shuffle(seed=42).select(range(1000))
    full_train_dataset = tokenized_train
    full_eval_dataset = tokenized_test
    
    return full_train_dataset, full_eval_dataset

In [45]:
def compute_metrics(eval_pred):
    import numpy as np
    from datasets import load_metric
    metric = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average = 'macro')

In [46]:
def prep_model():
    from transformers import AutoModelForSequenceClassification
    from transformers import TrainingArguments, Trainer
    
    small_train_dataset, small_eval_dataset = prep_tokens()
    print (small_train_dataset)
    print ("Tokenised data load successful")
    model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=28)
    training_args = TrainingArguments("test_trainer", evaluation_strategy="epoch", logging_strategy="epoch")
    print ("Starting Training")
    trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset)
    trainer.train()
    trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset, compute_metrics=compute_metrics)
    
    predictions = trainer.predict(small_eval_dataset)
    return predictions[0]
    

In [None]:
a = prep_model()

Prepared Train and test data
Train and Test data load successful


  0%|          | 0/44 [00:00<?, ?ba/s]
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 33.5kB/s]

Downloading: 100%|██████████| 625/625 [00:00<00:00, 913kB/s]

Downloading: 100%|██████████| 972k/972k [00:00<00:00, 20.6MB/s]

Downloading: 100%|██████████| 1.87M/1.87M [00:00<00:00, 26.1MB/s]
100%|██████████| 44/44 [00:27<00:00,  1.63ba/s]
100%|██████████| 6/6 [00:03<00:00,  1.69ba/s]


Train and Test data tokenised
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 43410
})
Tokenised data load successful


Downloading: 100%|██████████| 681M/681M [00:08<00:00, 80.3MB/s]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Bert

Starting Training


The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 43410
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16281
  3%|▎         | 500/16281 [03:24<1:48:05,  2.43it/s]Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
  6%|▌         | 1000/16281 [06:50<1:44:32,  2.44it/s]Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
  9%|▉  

In [20]:
a

array([[ 1.3704215 ,  0.7323609 ,  0.3023718 , ..., -0.20485705,
        -0.48639953,  2.4607034 ],
       [ 1.3704216 ,  0.7323609 ,  0.3023718 , ..., -0.20485717,
        -0.48639953,  2.4607034 ],
       [ 1.3704215 ,  0.73236084,  0.30237174, ..., -0.20485708,
        -0.4863995 ,  2.4607036 ],
       ...,
       [ 1.3704216 ,  0.7323609 ,  0.30237183, ..., -0.2048571 ,
        -0.48639956,  2.4607034 ],
       [ 1.3704216 ,  0.7323609 ,  0.30237183, ..., -0.2048571 ,
        -0.4863995 ,  2.4607034 ],
       [ 1.3704216 ,  0.732361  ,  0.30237183, ..., -0.20485714,
        -0.4863995 ,  2.4607034 ]], dtype=float32)

In [21]:
from scipy.special import softmax

In [22]:
import pandas as pd

In [23]:
softmax_predictions = softmax(a, axis=1)

In [24]:
goemotions_predictions = pd.DataFrame(softmax_predictions, columns = class_labels)

In [25]:
goemotions_predictions['goemotions_class'] = goemotions_predictions.idxmax(axis=1)

In [26]:
goemotions_predictions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,goemotions_class
0,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,0.015066,0.009841,0.013797,0.024816,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
1,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,0.015066,0.009841,0.013797,0.024816,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
2,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,0.015066,0.009841,0.013797,0.024816,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309391,27
3,0.103993,0.054941,0.03574,0.048364,0.061836,0.022447,0.015066,0.009841,0.013797,0.024816,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
4,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,0.015066,0.009841,0.013797,0.024816,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5422,0.103993,0.054941,0.03574,0.048364,0.061836,0.022447,0.015066,0.009841,0.013797,0.024816,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
5423,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,0.015066,0.009841,0.013797,0.024816,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309391,27
5424,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,0.015066,0.009841,0.013797,0.024816,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
5425,0.103993,0.054941,0.03574,0.048364,0.061836,0.022447,0.015066,0.009841,0.013797,0.024816,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27


In [27]:
results = pd.merge(test, goemotions_predictions, left_index=True, right_index=True)

In [28]:
results

Unnamed: 0,id,labels,text,label,0,1,2,3,4,5,...,19,20,21,22,23,24,25,26,27,goemotions_class
0,eecwqtt,25,मुझे आपकी स्थिति के लिए वास्तव में खेद है :( ह...,25,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
1,ed5f85d,0,यह अद्भुत है क्योंकि यह भयानक है। पर साथ नहीं।,0,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
2,een27c3,13,"किंग्स फैन यहाँ, आप लोगों को शुभकामनाएँ! देखने...",13,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309391,27
3,eelgwd1,15,"मुझे यह नहीं पता था, आज मुझे कुछ सिखाने के लिए...",15,0.103993,0.054941,0.03574,0.048364,0.061836,0.022447,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
4,eem5uti,27,वे हज़ारों वर्षों तक भूतिया धरती से ऊब चुके थे...,27,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5422,efeeasc,15,धन्यवाद। मुझे अस्पताल में भर्ती होने के बाद भी...,15,0.103993,0.054941,0.03574,0.048364,0.061836,0.022447,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
5423,ef9c7s3,4,अच्छा यह समझ में आता है।,4,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309391,27
5424,efbiugo,27,डैडी मुद्दे [NAME],27,0.103993,0.054941,0.03574,0.048363,0.061836,0.022447,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27
5425,efbvgp9,0,बहुत खुशी है कि मैंने कुछ महीने पहले उस सब्रेड...,0,0.103993,0.054941,0.03574,0.048364,0.061836,0.022447,...,0.002662,0.022147,0.001422,0.017476,0.002229,0.00934,0.021521,0.01624,0.309390,27


In [29]:
results['goemotions_class'].value_counts()

27    4858
7      569
Name: goemotions_class, dtype: int64

In [30]:
from sklearn.metrics import classification_report

In [31]:
print (classification_report(results['label'], results['goemotions_class']))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       504
           1       0.00      0.00      0.00       252
           2       0.00      0.00      0.00       197
           3       0.00      0.00      0.00       286
           4       0.00      0.00      0.00       318
           5       0.00      0.00      0.00       114
           6       0.00      0.00      0.00       139
           7       0.34      0.84      0.49       233
           8       0.00      0.00      0.00        74
           9       0.00      0.00      0.00       127
          10       0.00      0.00      0.00       220
          11       0.00      0.00      0.00        84
          12       0.00      0.00      0.00        30
          13       0.00      0.00      0.00        84
          14       0.00      0.00      0.00        74
          15       0.00      0.00      0.00       288
          16       0.00      0.00      0.00         6
          17       0.00    