In [None]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
import re
import string
import nltk
nltk.download("punkt")
nltk.download('wordnet')
nltk.download("stopwords")
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the annotated tweets. Because `names` is supplied, pandas treats every
# row of the file as data; the tweet id column then becomes the index.
df = pd.read_csv(
    './smileannotationsfinal.csv',
    names=['id', 'text', 'category'],
).set_index('id')

In [None]:
# Peek at the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [None]:
# Frequency of each raw annotation category (multi-label rows are joined with "|").
df.category.value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: category, dtype: int64

In [None]:
# Collapse the annotation into a binary target: exactly "happy" stays "happy",
# every other category (including multi-label ones) becomes "unhappy".
df['labels'] = df['category'].map(
    lambda category: "happy" if category == "happy" else "unhappy"
)

In [None]:
# Class balance of the binary happy / unhappy target.
df.labels.value_counts()

unhappy    1948
happy      1137
Name: labels, dtype: int64

In [None]:
# Drop rows whose label contains a literal "|" (multi-label annotations).
# The pattern is a raw string so that "\|" reaches the regex engine as an
# escaped pipe instead of being an invalid escape sequence in a normal string.
# NOTE(review): `labels` only holds "happy"/"unhappy" at this point, so this
# filter removes nothing. If multi-label tweets are meant to be excluded, the
# check probably belongs on `category` -- confirm intent before changing data.
df = df[~df.labels.str.contains(r'\|')]

In [None]:
# df = df[df.labels != 'nocode']

In [None]:
# Re-check the class balance after the filtering step.
df.labels.value_counts()

unhappy    1948
happy      1137
Name: labels, dtype: int64

In [None]:
# Lower-case every string entry; anything that is not a str passes through untouched.
df['text'] = df['text'].map(
    lambda value: value if not isinstance(value, str) else value.lower()
)

In [None]:
# Confirm the text column is now lower-cased.
df.head()

Unnamed: 0_level_0,text,category,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
611857364396965889,@aandraous @britishmuseum @andrewsantonio merc...,nocode,unhappy
614484565059596288,dorian gray with rainbow scarf #lovewins (from...,happy,happy
614746522043973632,@selectshowcase @tate_stives ... replace with ...,happy,happy
614877582664835073,@sofabsports thank you for following me back. ...,happy,happy
611932373039644672,@britishmuseum @tudorhistory what a beautiful ...,happy,happy


In [None]:
def preprocess(text):
  """Normalise a string and split it into word tokens.

  The text is lower-cased, runs of digits are removed, every character that
  is neither a word character nor whitespace is removed, and the remainder is
  tokenised with ``nltk.word_tokenize``.

  Args:
    text: The string to normalise.

  Returns:
    The list of tokens produced by ``nltk.word_tokenize``.
  """
  lowered = text.lower()
  without_digits = re.sub(r'\d+', '', lowered)
  # [^\w\s] keeps underscores, since "_" counts as a word character.
  without_punctuation = re.sub(r'[^\w\s]', '', without_digits)
  return nltk.word_tokenize(without_punctuation)

In [None]:
def remove_stopwords(tokens):
  """Return the tokens that are not in NLTK's English stop-word list.

  Args:
    tokens: Iterable of token strings.

  Returns:
    A new list with the stop words removed; the original order is kept.
  """
  english_stop_words = set(stopwords.words('english'))
  kept_tokens = []
  for token in tokens:
    if token in english_stop_words:
      continue
    kept_tokens.append(token)
  return kept_tokens

In [None]:
def perform_lemmatization(tokens):
  """Lemmatize each token with NLTK's ``WordNetLemmatizer``.

  Args:
    tokens: Iterable of token strings.

  Returns:
    A list with one lemmatized string per input token, in the same order.
  """
  lemmatize = nltk.WordNetLemmatizer().lemmatize
  return list(map(lemmatize, tokens))

In [None]:
def clean_text(text):
  """Run the full cleaning pipeline on one string.

  Applies ``preprocess``, then ``remove_stopwords``, then
  ``perform_lemmatization``, and joins the surviving tokens with single spaces.

  Args:
    text: The raw string to clean.

  Returns:
    The cleaned text as one space-separated string.
  """
  return ' '.join(
      perform_lemmatization(
          remove_stopwords(
              preprocess(text))))

In [None]:
# Clean every tweet; the function is passed directly, no wrapper lambda needed.
df['clean_text'] = df['text'].apply(clean_text)

In [None]:
# Compare the original text with its cleaned counterpart.
df.head()

Unnamed: 0_level_0,text,category,labels,clean_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
611857364396965889,@aandraous @britishmuseum @andrewsantonio merc...,nocode,unhappy,aandraous britishmuseum andrewsantonio merci p...
614484565059596288,dorian gray with rainbow scarf #lovewins (from...,happy,happy,dorian gray rainbow scarf lovewins britishmuse...
614746522043973632,@selectshowcase @tate_stives ... replace with ...,happy,happy,selectshowcase tate_stives replace wish artist...
614877582664835073,@sofabsports thank you for following me back. ...,happy,happy,sofabsports thank following back great hear di...
611932373039644672,@britishmuseum @tudorhistory what a beautiful ...,happy,happy,britishmuseum tudorhistory beautiful jewel por...


In [None]:
# Distinct label strings present in the data.
possible_labels = pd.unique(df['labels'])

In [None]:
# Map each label string to a consecutive integer id, numbered in the order
# the labels appear in `possible_labels`.
label_dict = {
    label_name: label_id
    for label_id, label_name in enumerate(possible_labels)
}

In [None]:
# Encode the string labels as integer class ids. `Series.map` is a plain
# dictionary lookup that yields an integer column directly, whereas
# `Series.replace` relies on implicit downcasting of the object column to end
# up with integers -- and the tensors built later need a numeric dtype.
df['label'] = df.labels.map(label_dict)

In [None]:
# Check that the integer `label` column lines up with the string `labels` column.
df.head()

Unnamed: 0_level_0,text,category,labels,clean_text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
611857364396965889,@aandraous @britishmuseum @andrewsantonio merc...,nocode,unhappy,aandraous britishmuseum andrewsantonio merci p...,0
614484565059596288,dorian gray with rainbow scarf #lovewins (from...,happy,happy,dorian gray rainbow scarf lovewins britishmuse...,1
614746522043973632,@selectshowcase @tate_stives ... replace with ...,happy,happy,selectshowcase tate_stives replace wish artist...,1
614877582664835073,@sofabsports thank you for following me back. ...,happy,happy,sofabsports thank following back great hear di...,1
611932373039644672,@britishmuseum @tudorhistory what a beautiful ...,happy,happy,britishmuseum tudorhistory beautiful jewel por...,1


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# The index holds the tweet ids.
df.index.values

array([611857364396965889, 614484565059596288, 614746522043973632, ...,
       615246897670922240, 613016084371914753, 611566876762640384])

In [None]:
# String labels as a NumPy array, in row order.
df.labels.values

array(['unhappy', 'happy', 'happy', ..., 'happy', 'unhappy', 'unhappy'],
      dtype=object)

In [None]:
# Stratified 85/15 split of the *tweet ids* (the DataFrame index).
# The ids, not the cleaned text, must be split here: the resulting X_train /
# X_val arrays are used further down as row keys in `df.loc[...]` to tag each
# row as train or val, which only works with index labels.
X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                  df.labels.values,
                                                  test_size=0.15,
                                                  random_state=17,
                                                  stratify=df.label.values)

In [None]:
# Inspect the training part of the split.
X_train

array([613371623836680192, 611840652196872192, 610565880112660483, ...,
       613976658723405824, 615448572776026112, 611216576956768256])

In [None]:
# Start with every row unassigned; a scalar is broadcast to all rows.
df['data_type'] = 'not_set'

In [None]:
# Every row should read "not_set" before the split is applied.
df['data_type']

id
611857364396965889    not_set
614484565059596288    not_set
614746522043973632    not_set
614877582664835073    not_set
611932373039644672    not_set
611570404268883969    not_set
614456889863208960    not_set
614016385442807809    not_set
610916556751642624    not_set
614499696015503361    not_set
612869223354925056    not_set
614497346521399296    not_set
613601881441570816    not_set
611443771285188608    not_set
611561745392513024    not_set
613696526297210880    not_set
610746718641102848    not_set
611172872044855296    not_set
612648200588038144    not_set
612999088162783232    not_set
614912375288893440    not_set
610435728565239809    not_set
611897570856714240    not_set
614698049328291844    not_set
612924713883566080    not_set
613407692657397760    not_set
614074786801709057    not_set
611634648846876672    not_set
612736445820858368    not_set
614025292693180416    not_set
614184660906704896    not_set
613359710343929857    not_set
610858270962675712    not_set
6127348

In [None]:
# Tag each row with the split it was assigned to, looked up by index label.
for split_name, split_keys in (('train', X_train), ('val', X_val)):
    df.loc[split_keys, 'data_type'] = split_name

In [None]:
# Row counts per (label, split) to confirm both splits contain both classes.
df.groupby(['labels','label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text,category,clean_text
labels,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
happy,1,train,966,966,966
happy,1,val,171,171,171
unhappy,0,train,1656,1656,1656
unhappy,0,val,292,292,292


In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [None]:
# Tokenizer for the uncased BERT checkpoint; input is lower-cased before tokenisation.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
def _encode_texts(texts):
    """Tokenize a batch of strings into fixed-length PyTorch tensors.

    Every sequence is padded to exactly 256 tokens and longer sequences are
    truncated to that length. `padding='max_length'` replaces the deprecated
    `pad_to_max_length=True`, and `truncation=True` makes explicit the
    truncation that the tokenizer otherwise only applies with a warning.

    Args:
        texts: Iterable of strings to encode.

    Returns:
        The tokenizer's batch encoding; it is indexed below with
        'input_ids' and 'attention_mask'.
    """
    return tokenizer.batch_encode_plus(
        list(texts),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt'
    )


# Select each split once and reuse it for both the texts and the labels so the
# two always stay aligned row-for-row.
train_rows = df[df.data_type=='train']
val_rows = df[df.data_type=='val']

# NOTE(review): the model is fed the lower-cased `text` column, not
# `clean_text` -- confirm that is intended.
encoded_data_train = _encode_texts(train_rows.text.values)
encoded_data_val = _encode_texts(val_rows.text.values)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Bundle ids, masks and labels so each dataset item is an
# (input_ids, attention_mask, label) triple.
train_tensors = (input_ids_train, attention_masks_train, labels_train)
val_tensors = (input_ids_val, attention_masks_val, labels_val)

dataset_train = TensorDataset(*train_tensors)
dataset_val = TensorDataset(*val_tensors)

In [None]:
# Number of training examples.
len(dataset_train)

2622

In [None]:
# Number of validation examples.
len(dataset_val)

463

In [None]:
from transformers import BertForSequenceClassification

In [None]:
# Pre-trained BERT encoder with a classification head sized to the label set.
num_classes = len(label_dict)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_classes,
    output_attentions=False,
    output_hidden_states=False,
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
batch_size = 32

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(dataset_train)
val_sampler = SequentialSampler(dataset_val)

dataloader_train = DataLoader(dataset_train,
                              batch_size=batch_size,
                              sampler=train_sampler)

dataloader_validation = DataLoader(dataset_val,
                                   batch_size=batch_size,
                                   sampler=val_sampler)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# Use PyTorch's own AdamW; the AdamW class shipped with `transformers` is
# deprecated in favour of it. `weight_decay=0.0` is passed explicitly because
# torch's default (0.01) differs from the transformers default (0.0), so the
# update rule stays the same as before.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)



In [None]:
epochs = 3

# The schedule spans every optimisation step of the run: one step per
# training batch, for each epoch. No warm-up phase is used.
total_training_steps = epochs * len(dataloader_train)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
import numpy as np

In [None]:
from sklearn.metrics import f1_score

In [None]:
def f1_score_func(preds, labels):
    """Weighted F1 score of arg-max predictions against the true labels.

    Args:
        preds: Array of per-class scores; the predicted class is the arg-max
            along axis 1.
        labels: Array of true class ids.

    Returns:
        The value of sklearn's ``f1_score`` with ``average='weighted'``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(y_true=true_classes,
                    y_pred=predicted_classes,
                    average='weighted')

In [None]:
def accuracy_per_class(preds, labels):
    """Print, for every class in `labels`, how many of its examples were predicted correctly.

    Class ids are translated back to names through the module-level
    `label_dict`. Nothing is returned; the report is written with `print`.

    Args:
        preds: Array of per-class scores; the predicted class is the arg-max
            along axis 1.
        labels: Array of true class ids.
    """
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_examples = int(in_class.sum())
        n_correct = int((predicted[in_class] == class_id).sum())
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_examples}\n')

In [None]:
import random

# Seed every RNG in play (Python, NumPy, PyTorch CPU and all CUDA devices)
# with the same value so subsequent random draws are repeatable.
seed_val = 17
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model.to(device)

print(device)

cuda


In [None]:
def evaluate(dataloader_val):
    """Run the module-level `model` over a dataloader without computing gradients.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        dataloader_val: Iterable of batches whose first three items are the
            input ids, attention masks and labels, in that order.

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)``: the summed batch
        loss divided by the number of batches, the logits of every example,
        and the matching label ids. The last two are NumPy arrays
        concatenated along axis 0.
    """
    model.eval()

    running_loss = 0
    logits_per_batch = []
    labels_per_batch = []

    with torch.no_grad():
        for batch in dataloader_val:
            on_device = tuple(tensor.to(device) for tensor in batch)

            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=on_device[2])

            # With labels supplied, the first output is the loss and the second the logits.
            running_loss += outputs[0].item()
            logits_per_batch.append(outputs[1].detach().cpu().numpy())
            labels_per_batch.append(on_device[2].cpu().numpy())

    predictions = np.concatenate(logits_per_batch, axis=0)
    true_vals = np.concatenate(labels_per_batch, axis=0)

    return running_loss / len(dataloader_val), predictions, true_vals

In [None]:
# Fine-tune for `epochs` passes over the training data. After every epoch the
# weights are checkpointed to disk and the model is scored on the validation set.
for epoch in tqdm(range(1, epochs+1)):

    # evaluate() leaves the model in eval mode, so switch back every epoch.
    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as returned by the model. `batch` is the
        # (input_ids, attention_mask, labels) tuple, so len(batch) is always 3
        # rather than the batch size; dividing by it under-reported the loss
        # by a factor of three.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/82 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.4976410698599932
Validation loss: 0.3563547104597092
F1 Score (Weighted): 0.8659970283566988


Epoch 2:   0%|          | 0/82 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.2624181998757327
Validation loss: 0.3107425798972448
F1 Score (Weighted): 0.8673091453037115


Epoch 3:   0%|          | 0/82 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.21094101426623216
Validation loss: 0.31053149203459424
F1 Score (Weighted): 0.8862483027935838


In [None]:
# Rebuild the architecture and restore the fine-tuned weights written by the
# training loop after its last epoch. Without the load_state_dict call the
# classification head is newly initialised, so the evaluation that follows
# would score an untrained model instead of the fine-tuned one.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# Load onto the CPU first so the checkpoint opens regardless of the device it
# was saved from; the model is moved to `device` right after.
model.load_state_dict(torch.load(f'finetuned_BERT_epoch_{epochs}.model',
                                 map_location=torch.device('cpu')))

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Only the logits and true labels are needed here; the average loss is dropped.
predictions, true_vals = evaluate(dataloader_validation)[1:]

In [None]:
# Per-class hit counts on the validation set.
accuracy_per_class(predictions, true_vals)

Class: unhappy
Accuracy: 292/292

Class: happy
Accuracy: 0/171

