In [1]:
import pandas as pd
pd.options.display.max_colwidth = 1000
from src.data.make_dataset import MakeDataset
from src.data.preprocessing.preprocess_bert import PreprocessBert
from src.feature_extraction.build_bert_features import BuildBERTFeature
from src.enums import * 

make_dataset = MakeDataset()
data = make_dataset.read_data('../data/raw/all_data_augmented.csv')

data=data[data['of_id'].isnull()]
data=data[data['dataset'].isin(['benevolent', 'hostile', 'other'])]


train_domain=Domain.BHO
test_domain=Domain.BHO
X_train_, y_train_, X_test_, y_test_=make_dataset.get_balanced_data_split(data, train_domain, test_domain)

bf=BuildBERTFeature(output_hidden_states=False, extract=False, embedding_file_name='../src/bert_embeddings/word_embeddings20201121-161340.pkl')
X_train=bf.transform(X_train_)
X_test=bf.transform(X_test_)
y_train=y_train_ 
y_test=y_test_ 

X_train.shape

  return torch._C._cuda_getDeviceCount() > 0


TensorShape([642, 46, 768])

In [2]:
sequence_length=X_train.shape[1]
print('sequence_length {}'.format(sequence_length))

embedding_size=X_train.shape[2]
print('embedding_size {}'.format(embedding_size))

sequence_length 46
embedding_size 768


In [12]:
#conv2d

from tensorflow.keras import Input, layers, Model
import tensorflow as tf

#vocabulary_size=1903
kernel_sizes=[2, ]

#model = tf.keras.Sequential()
input_ = layers.Input(shape=(sequence_length, embedding_size, 1))

pooled_outputs = []
for kernel_size in kernel_sizes:
    conv= layers.Conv2D(filters=128, kernel_size=int(kernel_size), activation='relu', padding='valid')(input_)    
    pooled=layers.MaxPooling2D(pool_size=(2, 2))(conv)
    pooled_outputs.append(pooled)

# Combine all the pooled features
concs=None
if len(kernel_sizes) > 1:
    concs = layers.Concatenate(axis=1)(pooled_outputs)
else:
    concs=pooled_outputs[0]
    
flat =layers.Flatten()(concs)
#drop =layers.Dropout(0.5)(flat)

dense_=layers.Dense(64, activation='relu')(flat)
#dense = layers.Dense(8, activation='relu')(drop)
output_ = layers.Dense(1)(dense_)  
#output_ = layers.Dense(1, activation='softmax')(drop)

model_2d = Model(input_, output_)

model_2d.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

model_2d.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 46, 768, 1)]      0         
_________________________________________________________________
conv2d (Conv2D)              (None, 45, 767, 128)      640       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 22, 383, 128)      0         
_________________________________________________________________
flatten (Flatten)            (None, 1078528)           0         
_________________________________________________________________
dense (Dense)                (None, 64)                69025856  
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 69,026,561
Trainable params: 69,026,561
Non-trainable params: 0
__________________________________________

In [14]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

#old shape : (642, 46, 768)
#new shape (642, 46, 768, 1)
X_train=X_train.numpy().reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_train=tf.convert_to_tensor(X_train)

X_test=X_test.numpy().reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)
X_test=tf.convert_to_tensor(X_test)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train , y_train))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test , y_test))
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)


model_2d.fit(train_dataset, epochs=10)
test_loss, test_acc = model_2d.evaluate(test_dataset)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

import numpy as np

y_pred=np.argmax(model_2d.predict(test_dataset), axis=-1)

from sklearn.metrics import classification_report

print( classification_report( y_test, y_pred,))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.5011796951293945
Test Accuracy: 0.795918345451355
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       147
           1       0.00      0.00      0.00       147

    accuracy                           0.50       294
   macro avg       0.25      0.50      0.33       294
weighted avg       0.25      0.50      0.33       294



  _warn_prf(average, modifier, msg_start, len(result))
