# Imports

In [1]:
import numpy as np
import pandas as pd
import transformers
import tensorflow as tf
import tqdm.notebook as tqdm
import sklearn.model_selection
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Data

Load and prepare your dataset. Dataset should have at least 10k samples in it. Each dataset cannot be used by more than two students.

In [2]:
data = pd.read_parquet('text_emotion_data.parquet').drop(
    [
        'ru_text',
        'id',
        'author',
        'subreddit',
        'link_id',
        'parent_id',
        'created_utc',
        'rater_id',
        'example_very_unclear',
        ],
        axis=1)

In [3]:
id_to_label = {
    0: 'admiration',
    1: 'amusement',
    2: 'anger',
    3: 'annoyance',
    4: 'approval',
    5: 'caring',
    6: 'confusion',
    7: 'curiosity',
    8: 'desire',
    9: 'disappointment',
    10: 'disapproval',
    11: 'disgust',
    12: 'embarrassment',
    13: 'excitement',
    14: 'fear',
    15: 'gratitude',
    16: 'grief',
    17: 'joy',
    18: 'love',
    19: 'nervousness',
    20: 'optimism',
    21: 'pride',
    22: 'realization',
    23: 'relief',
    24: 'remorse',
    25: 'sadness',
    26: 'surprise',
    27: 'neutral',
}

In [4]:
ru_emotions = {
    0: 'восхищение',
    1: 'веселье',
    2: 'злость',
    3: 'раздражение',
    4: 'одобрение',
    5: 'забота',
    6: 'непонимание',
    7: 'любопытство',
    8: 'желание',
    9: 'разочарование',
    10: 'неодобрение',
    11: 'отвращение',
    12: 'смущение',
    13: 'возбуждение',
    14: 'страх',
    15: 'признательность',
    16: 'горе',
    17: 'радость',
    18: 'любовь',
    19: 'нервозность',
    20: 'оптимизм',
    21: 'гордость',
    22: 'осознание',
    23: 'облегчение',
    24: 'раскаяние',
    25: 'грусть',
    26: 'удивление',
    27: 'нейтральность',
    }

In [5]:
translate_emotion = {id_to_label[i]: ru_emotions[i] for i in range(len(id_to_label))}

In [6]:
translate_emotion

{'admiration': 'восхищение',
 'amusement': 'веселье',
 'anger': 'злость',
 'annoyance': 'раздражение',
 'approval': 'одобрение',
 'caring': 'забота',
 'confusion': 'непонимание',
 'curiosity': 'любопытство',
 'desire': 'желание',
 'disappointment': 'разочарование',
 'disapproval': 'неодобрение',
 'disgust': 'отвращение',
 'embarrassment': 'смущение',
 'excitement': 'возбуждение',
 'fear': 'страх',
 'gratitude': 'признательность',
 'grief': 'горе',
 'joy': 'радость',
 'love': 'любовь',
 'nervousness': 'нервозность',
 'optimism': 'оптимизм',
 'pride': 'гордость',
 'realization': 'осознание',
 'relief': 'облегчение',
 'remorse': 'раскаяние',
 'sadness': 'грусть',
 'surprise': 'удивление',
 'neutral': 'нейтральность'}

# Backbone

Load pretrained model from Hugging Face (or some other model repository if it's more convenient). Model should be trained on Feature Extraction task.

In [7]:
backbone = transformers.TFAutoModel.from_pretrained('./bart-base')

All PyTorch model weights were used when initializing TFBartModel.

All the weights of TFBartModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartModel for predictions without further training.


Load tokenizer to be used with the model

In [8]:
tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/bart-base')

# Feature extraction

Since we will not be training the backbone, extract features from your dataset.

Tokenize all your sequences. Truncate/pad the squences for convenience. If the sequences are too large to be stored in memory, lazily save them on disk.

In [9]:
tokenize_seq = []

for text in tqdm.tqdm(data['text']):
    tokenize_seq.append(tokenizer(text))

  0%|          | 0/211225 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1437 > 1024). Running this sequence through the model will result in indexing errors


Run the backbone on the sequences and save the extracted features. The extracted features should be a vector containing information about the whole text. If the features are too large to be stored in memory, lazily save them on disk.

In [10]:
X = tf.keras.preprocessing.sequence.pad_sequences([i['input_ids'] for i in tokenize_seq], maxlen=128, truncating='post', padding='post', value=tokenizer.pad_token_id)

In [11]:
y = data[data.columns.difference(['text'])].values

In [12]:
text_dataset = tf.data.Dataset.from_tensor_slices(X).batch(64)

In [13]:
features = []

for data in tqdm.tqdm(text_dataset):
    features.extend(tf.math.reduce_mean(backbone(data).last_hidden_state, axis=1))

features = np.array(features)

  0%|          | 0/3301 [00:00<?, ?it/s]

In [14]:
features, features.shape

(array([[ 1.9268482 , -1.8083783 ,  1.1634356 , ..., -0.25005838,
          1.684664  ,  0.59326035],
        [ 1.2231605 , -1.1030577 , -0.28428376, ...,  1.1343532 ,
          1.3022391 ,  0.54750144],
        [ 1.9432738 , -1.8364208 ,  0.81785935, ...,  0.1901779 ,
          0.8832531 ,  0.98668087],
        ...,
        [ 2.0871496 , -1.9621882 ,  0.7568142 , ..., -0.07704715,
          1.2768964 ,  0.62696475],
        [ 1.5172942 , -1.0947775 , -0.6264874 , ...,  0.84136087,
          1.7282301 ,  1.0637035 ],
        [ 1.5030369 , -0.8643758 , -0.8128365 , ...,  1.1897686 ,
          1.5710945 ,  0.9328568 ]], dtype=float32),
 (211225, 768))

# Prepare train/test data

Split your data (extracted features and labels) into train and test subsets.

In [15]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(features, y)

In [16]:
X_train_dataset = tf.data.Dataset.from_tensor_slices(X_train)
X_test_dataset = tf.data.Dataset.from_tensor_slices(X_test)
y_train_dataset = tf.data.Dataset.from_tensor_slices(y_train)
y_test_dataset = tf.data.Dataset.from_tensor_slices(y_test)

Prepare `tf.data.Dataset` or some other way for the data to be used during training.

In [17]:
train_dataset = tf.data.Dataset.zip((X_train_dataset, y_train_dataset)).batch(256)
test_dataset = tf.data.Dataset.zip((X_test_dataset, y_test_dataset)).batch(256)

# Build the model

Build a simple model. The model should accept an extracted feature vector and return a vector of class logits (or probabilities). Model should only have a couple (or even 1) layer with weights.

In [18]:
gpus = tf.config.list_physical_devices('GPU')
print(gpus)
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print('GPU enable')
    except:
        print('GPU not enable')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU not enable


In [19]:
inputs = tf.keras.layers.Input(768, name='input', dtype=tf.int32)
x = tf.keras.layers.Dense(512, name='Dense1', activation='relu')(inputs)
x = tf.keras.layers.Dense(len(translate_emotion), name='prediction')(x)
model = tf.keras.Model(inputs=inputs, outputs=x, name='MyModel')

Compile the model. Choose loss and metrics.

In [20]:
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), optimizer='adam', metrics='accuracy')

# Train the model

In [25]:
model.fit(train_dataset, validation_data=test_dataset, epochs=5)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x266cd32e2c0>

# Evaluation

Evalute the model on test data.

In [26]:
model.evaluate(X_test, y_test, batch_size=128)



[420.9265441894531, 0.26189708709716797]

Plot confusion matrix.

In [27]:
ConfusionMatrixDisplay.from_predictions(y_test, model.predict(X_test).argmax(axis=-1))



ValueError: Mix type of y not allowed, got types {'multilabel-indicator', 'binary'}

Write a function to classify a piece of text.

In [None]:
def classify_text(text: str) -> tuple[int | str, np.ndarray]:
    '''Classifes the given `text` using the trained model.

    Arguments:
        text: text to be classified

    Return:
        The assigned label and probabilites of all labels'''

    pass

Evaluate the model on text not present in training and test data (come up with the text yourself). Try to get an input for each class.

# Bonus

Write a function that computes word impact on text label. 

In [None]:
def get_words_impact(text: str) -> list[tuple[str, np.ndarray]]:
    '''Determines word impact on text label.

    Arguments:
        text: Sample text to be used for computation.

    Returns:
        A list of pairs: the word and vector of probability changes for each class'''
    pass

Try to find out words that make text have a specific label.