In [3]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, TweetTokenizer, WhitespaceTokenizer, RegexpTokenizer
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import torch

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#Exercise 1

In [5]:
data = pd.read_csv('train_en.txt', sep='\t')
data['Tokens'] = data['Sentence'].apply(lambda x: word_tokenize(x.lower()))

In [6]:
data.head()

Unnamed: 0,Sentence,Style,Tokens
0,he had steel balls too !,toxic,"[he, had, steel, balls, too, !]"
1,"dude should have been taken to api , he would ...",toxic,"[dude, should, have, been, taken, to, api, ,, ..."
2,"im not gonna sell the fucking picture , i just...",toxic,"[im, not, gon, na, sell, the, fucking, picture..."
3,the garbage that is being created by cnn and o...,toxic,"[the, garbage, that, is, being, created, by, c..."
4,the reason they dont exist is because neither ...,toxic,"[the, reason, they, dont, exist, is, because, ..."


In [7]:
sentences = data['Tokens'].values

In [8]:
model_word2vec = Word2Vec(sentences, vector_size=50, min_count=15, sg=1)

In [9]:
words = model_word2vec.wv.index_to_key
words.append('<UNK>')

In [10]:
word_to_id = {word: i for i, word in enumerate(words)}

In [11]:
data['IDs'] = data['Tokens'].apply(lambda x: [word_to_id.get(word, word_to_id['<UNK>']) for word in x])

In [12]:
data

Unnamed: 0,Sentence,Style,Tokens,IDs
0,he had steel balls too !,toxic,"[he, had, steel, balls, too, !]","[35, 147, 1459, 1323, 92, 12]"
1,"dude should have been taken to api , he would ...",toxic,"[dude, should, have, been, taken, to, api, ,, ...","[295, 102, 37, 128, 1167, 5, 1459, 3, 35, 60, ..."
2,"im not gonna sell the fucking picture , i just...",toxic,"[im, not, gon, na, sell, the, fucking, picture...","[117, 21, 175, 134, 1459, 1, 23, 1090, 3, 2, 3..."
3,the garbage that is being created by cnn and o...,toxic,"[the, garbage, that, is, being, created, by, c...","[1, 518, 8, 7, 122, 1459, 109, 1019, 11, 155, ..."
4,the reason they dont exist is because neither ...,toxic,"[the, reason, they, dont, exist, is, because, ...","[1, 387, 24, 120, 1211, 7, 81, 1098, 7, 6, 145..."
...,...,...,...,...
25035,both sides need to calm down or we are heading...,neutral,"[both, sides, need, to, calm, down, or, we, ar...","[274, 884, 108, 5, 958, 150, 77, 42, 17, 1459,..."
25036,i 'm sitting here in my calm german city conte...,neutral,"[i, 'm, sitting, here, in, my, calm, german, c...","[2, 50, 954, 100, 16, 25, 958, 1164, 540, 1459..."
25037,"dude , get a clue .",neutral,"[dude, ,, get, a, clue, .]","[295, 3, 41, 6, 644, 0]"
25038,"I was so high, it was amazing.",neutral,"[i, was, so, high, ,, it, was, amazing, .]","[2, 34, 39, 448, 3, 13, 34, 329, 0]"


In [13]:
token_ids = data['IDs'].values
avg_seq_len = int(np.mean([len(seq) for seq in token_ids]))
padded_ids = pad_sequences(token_ids, maxlen=avg_seq_len, padding='post')

In [14]:
labels = data['Style'].apply(lambda x: 1 if x == 'toxic' else 0).values

In [15]:
token_ids

array([list([35, 147, 1459, 1323, 92, 12]),
       list([295, 102, 37, 128, 1167, 5, 1459, 3, 35, 60, 29, 86, 62, 323, 33, 45, 1, 155, 1459, 73]),
       list([117, 21, 175, 134, 1459, 1, 23, 1090, 3, 2, 31, 98, 5, 1459, 5, 1, 23, 202, 0]),
       ..., list([295, 3, 41, 6, 644, 0]),
       list([2, 34, 39, 448, 3, 13, 34, 329, 0]),
       list([318, 40, 1459, 0])], dtype=object)

In [16]:
padded_ids

array([[  35,  147, 1459, ...,    0,    0,    0],
       [  60,   29,   86, ...,  155, 1459,   73],
       [   3,    2,   31, ...,   23,  202,    0],
       ...,
       [ 295,    3,   41, ...,    0,    0,    0],
       [   2,   34,   39, ...,    0,    0,    0],
       [ 318,   40, 1459, ...,    0,    0,    0]], dtype=int32)

In [17]:
labels

array([1, 1, 1, ..., 0, 0, 0])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(padded_ids, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [19]:
unk_vector = np.zeros((1, 50))
embedding_matrix = np.vstack([model_word2vec.wv.vectors, unk_vector])

#RNN

In [20]:
model = Sequential()
model.add(Embedding(input_dim=len(words), output_dim=50,
                    weights=[embedding_matrix], trainable=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.8110 - loss: 0.3782 - val_accuracy: 0.9009 - val_loss: 0.2284
Epoch 2/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9075 - loss: 0.2070 - val_accuracy: 0.8959 - val_loss: 0.2365
Epoch 3/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.9092 - loss: 0.1895 - val_accuracy: 0.8934 - val_loss: 0.2753
Epoch 4/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9158 - loss: 0.1657 - val_accuracy: 0.8837 - val_loss: 0.2784
Epoch 5/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9299 - loss: 0.1431 - val_accuracy: 0.8797 - val_loss: 0.2789
Epoch 6/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.9391 - loss: 0.1282 - val_accuracy: 0.8795 - val_loss: 0.2925
Epoch 7/10
[1m251/251[0m 

In [22]:
y_test_pred = (model.predict(X_test) > 0.5).astype("int32")

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [23]:
print(classification_report(y_test, y_test_pred, target_names=['Non-Toxic', 'Toxic']))

              precision    recall  f1-score   support

   Non-Toxic       0.89      0.88      0.88      2489
       Toxic       0.88      0.89      0.89      2519

    accuracy                           0.89      5008
   macro avg       0.89      0.89      0.89      5008
weighted avg       0.89      0.89      0.89      5008



#Excercise 2

In [24]:
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification, DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [25]:
sentences = data['Sentence'].values.tolist()
labels = data['Style'].apply(lambda x: 1 if x == 'toxic' else 0).values

In [26]:
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer_distilbert = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_sentences(tokenizer, sentences, max_length=15):
    input_ids = []
    attention_masks = []
    for sentence in sentences:
        result = tokenizer.encode_plus(sentence, max_length=max_length, truncation=True, padding='max_length')
        input_ids.append(result['input_ids'])
        attention_masks.append(result['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

train_input_ids_roberta, train_attention_masks_roberta = tokenize_sentences(tokenizer_roberta, sentences)
train_input_ids_distilbert, train_attention_masks_distilbert = tokenize_sentences(tokenizer_distilbert, sentences)

X_train_roberta, X_test_roberta, y_train, y_test = train_test_split(train_input_ids_roberta, labels, test_size=0.2, random_state=42)
train_attention_masks_roberta, test_attention_masks_roberta, _, _ = train_test_split(train_attention_masks_roberta, labels, test_size=0.2, random_state=42)

X_train_distilbert, X_test_distilbert, _, _ = train_test_split(train_input_ids_distilbert, labels, test_size=0.2, random_state=42)
train_attention_masks_distilbert, test_attention_masks_distilbert, _, _ = train_test_split(train_attention_masks_distilbert, labels, test_size=0.2, random_state=42)

y_train = np.array(y_train)
y_test = np.array(y_test)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [36]:
from tensorflow.nn import softmax
import tensorflow as tf

def train_and_evaluate(model, X_train, train_masks, X_test, test_masks, y_train, y_test, learning_rate=1e-5, epochs=3):
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit([X_train, train_masks], y_train, epochs=epochs, batch_size=64, validation_split=0.2)

    y_pred = model.predict([X_test, test_masks])
    y_pred_probs = softmax(y_pred.logits, axis=-1)
    y_pred_classes = np.argmax(y_pred_probs, axis=1)

    print(classification_report(y_test, y_pred_classes))

In [28]:
model_roberta = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model_distilbert = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

##Model RoBERTa


In [37]:
train_and_evaluate(model_roberta, X_train_roberta, train_attention_masks_roberta,
                   X_test_roberta, test_attention_masks_roberta, y_train, y_test)

Epoch 1/3
Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

           0       0.86      0.00      0.00      2489
           1       0.50      1.00      0.67      2519

    accuracy                           0.50      5008
   macro avg       0.68      0.50      0.34      5008
weighted avg       0.68      0.50      0.34      5008



##Model DistilBERT

In [38]:
train_and_evaluate(model_distilbert, X_train_distilbert, train_attention_masks_distilbert,
                   X_test_distilbert, test_attention_masks_distilbert, y_train, y_test)

Epoch 1/3
Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

           0       0.50      1.00      0.66      2489
           1       0.00      0.00      0.00      2519

    accuracy                           0.50      5008
   macro avg       0.25      0.50      0.33      5008
weighted avg       0.25      0.50      0.33      5008



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Од некоја причина не добивам подобри резултати со овие модели(не учи моделот, но не можам да ги менувам параметрите, освен со batch_size, кој не ми направи голема промена), но не можам и да си играм со ратата на учење поради измешани верзии во colab. Доста време се чека при тренирање, па затоа тренирам само 5 епохи.