In [1]:
import tensorflow

from cnn import CNN
from reader import Reader
from sklearn.metrics import classification_report, confusion_matrix 
import numpy as np

In [2]:
EPOCHS = 10
BATCH_SIZE = 128
NUM_CLASSES = 3
LEARN_RATE = 0.01
EMBED_SIZE = 50
FILENAME = "data/twitter_data.pkl"
OVERSAMPLING_RATE = 3
VOCAB_LEN = 10000

reader = Reader(filename=FILENAME, num_classes=NUM_CLASSES, vocab_len=VOCAB_LEN)
X, y = reader.load()

mapping = {'racism':2,'sexism':1,'none':0}
y = [mapping[b] for b in y]

# Oversampling before split
# racism = [i for i in range(len(y)) if y[i]==2]
# sexism = [i for i in range(len(y)) if y[i]==1]
# X = X + [X[x] for x in racism]*(OVERSAMPLING_RATE-1)+ [X[x] for x in sexism]*(OVERSAMPLING_RATE-1)
# y = y + [2 for i in range(len(racism))]*(OVERSAMPLING_RATE-1) + [1 for i in range(len(sexism))]*(OVERSAMPLING_RATE-1)

X_train, X_test, y_train, y_test = reader.split(X, y)

# Oversampling after split
# racism = [i for i in range(len(y_train)) if y_train[i]==2]
# sexism = [i for i in range(len(y_train)) if y_train[i]==1]
# X_train = X_train + [X_train[x] for x in racism]*(OVERSAMPLING_RATE-1)+ [X_train[x] for x in sexism]*(OVERSAMPLING_RATE-1)
# y_train = y_train + [2 for i in range(len(racism))]*(OVERSAMPLING_RATE-1) + [1 for i in range(len(sexism))]*(OVERSAMPLING_RATE-1)

X_train, X_test, y_train, y_test, max_len = reader.preprocess(X_train, X_test, y_train, y_test)


In [3]:
model = CNN(max_len=max_len,
          num_classes=NUM_CLASSES, 
          batch_size=BATCH_SIZE, 
          epochs=EPOCHS, 
          embed_size=EMBED_SIZE, 
          vocab_len=VOCAB_LEN,
          checkpoint_path="results/test.ckpt", 
          save_path="results/test.h5")

In [4]:
model.fit(X_train, y_train)

Epoch 1/10
Epoch 00001: saving model to results\test.ckpt
Epoch 2/10
Epoch 00002: saving model to results\test.ckpt
Epoch 3/10
Epoch 00003: saving model to results\test.ckpt
Epoch 4/10
Epoch 00004: saving model to results\test.ckpt
Epoch 5/10
Epoch 00005: saving model to results\test.ckpt
Epoch 6/10
Epoch 00006: saving model to results\test.ckpt
Epoch 7/10
Epoch 00007: saving model to results\test.ckpt
Epoch 8/10
Epoch 00008: saving model to results\test.ckpt
Epoch 9/10
Epoch 00009: saving model to results\test.ckpt
Epoch 10/10
Epoch 00010: saving model to results\test.ckpt


<keras.engine.functional.Functional at 0x199d4945a60>

In [5]:
predictions = model.predict(X_test)
classes = np.argmax(y_test, axis=1)

predictions = list(map(lambda x: int(x), predictions))
print(confusion_matrix(classes, predictions))
print(classification_report(classes, predictions))

[[1031   32   40]
 [ 101  210    1]
 [  61    0  133]]
              precision    recall  f1-score   support

           0       0.86      0.93      0.90      1103
           1       0.87      0.67      0.76       312
           2       0.76      0.69      0.72       194

    accuracy                           0.85      1609
   macro avg       0.83      0.76      0.79      1609
weighted avg       0.85      0.85      0.85      1609

