# Install packages


In [2]:
!pip install -q transformers
# !pip install -q tensorflow==2.2-rc1
!pip install -q tf-models-official==2.2.0

[K     |████████████████████████████████| 1.4MB 9.1MB/s 
[K     |████████████████████████████████| 2.9MB 16.4MB/s 
[K     |████████████████████████████████| 890kB 50.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 716kB 8.2MB/s 
[K     |████████████████████████████████| 174kB 33.4MB/s 
[K     |████████████████████████████████| 102kB 9.5MB/s 
[K     |████████████████████████████████| 1.1MB 30.9MB/s 
[K     |████████████████████████████████| 36.7MB 83kB/s 
[K     |████████████████████████████████| 81kB 11.7MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for typing (setup.py) ... [?25l[?25hdone


In [3]:
!pip install keras-lr-multiplier

Collecting keras-lr-multiplier
  Downloading https://files.pythonhosted.org/packages/7d/78/0eed4862a7274fb491b50881dd2f0dac996ff5774dc4a30c4b628fb78b25/keras-lr-multiplier-0.8.0.tar.gz
Building wheels for collected packages: keras-lr-multiplier
  Building wheel for keras-lr-multiplier (setup.py) ... [?25l[?25hdone
  Created wheel for keras-lr-multiplier: filename=keras_lr_multiplier-0.8.0-cp36-none-any.whl size=5719 sha256=807ea887c9a774c1ed470bdec1d8595b85b9bc52b5f89ab1771c8fdacf60a7f6
  Stored in directory: /root/.cache/pip/wheels/2a/a5/a4/340d5432bced221b2bcca324e3257239784dd1220ab7c786e9
Successfully built keras-lr-multiplier
Installing collected packages: keras-lr-multiplier
Successfully installed keras-lr-multiplier-0.8.0


# Import libraries

In [4]:
import os
import time
import datetime
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import TFAutoModel, AutoTokenizer, TFBertForSequenceClassification,AutoConfig
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import GlobalMaxPooling2D,Average,Dot, Dense, Input, GlobalAveragePooling1D, BatchNormalization, Activation, Concatenate, Flatten, Dropout, Conv1D, MaxPooling1D, Add, Lambda, GlobalAveragePooling2D, Reshape, RepeatVector, UpSampling1D 
from tensorflow.keras.models import Model
from keras.layers import LSTM, Bidirectional
from official import nlp
import official.nlp.optimization


# Parameters

In [5]:
base_dir    = '/vlsp2020'
train_path  = os.path.join(base_dir, 'train_rev1_news.csv')
val_path    = os.path.join(base_dir, 'dev_rev1_news.csv')
test_path    = os.path.join(base_dir, 'final_news1.csv')
img_train_path = os.path.join(base_dir, 'train-image-299.npy')
img_val_path = os.path.join(base_dir, 'val-image-299.npy')
img_test_path = os.path.join(base_dir, 'final-image-299.npy')

MAX_LENGTH  = 256
MODEL       = 'NlpHUST/vibert4news-base-cased'
MODEL_NAME  = 'vibert4news-model2'
N_LABELS    = 1

# Read data

In [6]:
df_train = pd.read_csv(train_path)
print(df_train.shape)
print(df_train.info())
df_train[['num_like_post','num_comment_post','num_share_post']] = df_train[['num_like_post','num_comment_post','num_share_post']].astype(int)
display(df_train.head())

(3933, 26)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3933 entries, 0 to 3932
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               3933 non-null   int64  
 1   id                  3933 non-null   int64  
 2   user_name           3933 non-null   object 
 3   post_message        3933 non-null   object 
 4   timestamp_post      3933 non-null   float64
 5   num_like_post       3933 non-null   float64
 6   num_comment_post    3933 non-null   float64
 7   num_share_post      3933 non-null   float64
 8   label               3933 non-null   int64  
 9   cleaned_text        3933 non-null   object 
 10  count_chars         3933 non-null   int64  
 11  count_words         3933 non-null   int64  
 12  count_questionmark  3933 non-null   int64  
 13  count_exclaimmark   3933 non-null   int64  
 14  numHashtags         3933 non-null   int64  
 15  numUrls             3933 non-null   int64  


Unnamed: 0,index,id,user_name,post_message,timestamp_post,num_like_post,num_comment_post,num_share_post,label,cleaned_text,count_chars,count_words,count_questionmark,count_exclaimmark,numHashtags,numUrls,post_month,post_year,post_day,post_hour,post_weekday,cnt_fake,cnt_nonfake,ratio,has_title,has_image
0,2005,2006,99ed0ae1e3149e05968a0b27b4317f1d,Chị Phạm Vũ Tâm An -Trung tâm tư vấn tâm lý Ph...,1590871000.0,0,3,0,0,Chị Phạm Vũ Tâm An -Trung tâm tư vấn tâm lý Ph...,157,38,0,0,0,0,5,2020,30,20,5,0.0,48.0,0.0,0,0
1,1755,1756,a7ed39c9ac4f4eaf9534651d39c03849,Họ chỉ là những người bình thường đang làm côn...,1585020000.0,2089,50,144,0,Họ chỉ là những người bình thường đang làm côn...,222,45,0,0,1,0,3,2020,24,3,1,0.0,1.0,0.0,0,0
2,3607,3608,7c0129565612ac73129fdfa652575e48,# **XÚC ĐỘNG TỰ HÀO VÀ NGƯỠNG MỘ BIẾT ƠN VỀ MỘ...,1589860000.0,10,0,1,0,# **xúc động tự hào và ngưỡng mộ biết ơn về mộ...,6433,1454,0,1,0,0,5,2020,19,3,1,0.0,1.0,0.0,1,0
3,1823,1824,d5c03b35abe8da9485f462af36879c8c,"Tài xế ô tô siêu sang chống đối CSGT, lái xe h...",1584770000.0,4,2,2,0,"Tài xế ô tô siêu sang chống đối csgt, lái xe h...",661,149,0,0,0,0,3,2020,21,6,5,1.0,6.0,0.142857,1,1
4,3289,3290,ae42a3a4976579103ceaeb5d7b78909b,"# **SAU 40 NĂM, 1.600 CHIẾN SĨ ANH HÙNG ĐƯỢC N...",1587979000.0,224,76,63,1,"# **sau 40 năm, 1.600 chiến sĩ anh hùng được n...",1093,224,2,3,0,0,4,2020,27,9,0,1.0,0.0,1.0,1,0


In [7]:
df_val = pd.read_csv(val_path)
print(df_val.shape)
print(df_val.info())
df_val[['num_like_post','num_comment_post','num_share_post']] = df_val[['num_like_post','num_comment_post','num_share_post']].astype(int)
display(df_val.head())

(438, 26)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438 entries, 0 to 437
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               438 non-null    int64  
 1   id                  438 non-null    int64  
 2   user_name           438 non-null    object 
 3   post_message        438 non-null    object 
 4   timestamp_post      438 non-null    float64
 5   num_like_post       438 non-null    float64
 6   num_comment_post    438 non-null    float64
 7   num_share_post      438 non-null    float64
 8   label               438 non-null    int64  
 9   cleaned_text        438 non-null    object 
 10  count_chars         438 non-null    int64  
 11  count_words         438 non-null    int64  
 12  count_questionmark  438 non-null    int64  
 13  count_exclaimmark   438 non-null    int64  
 14  numHashtags         438 non-null    int64  
 15  numUrls             438 non-null    int64  
 16

Unnamed: 0,index,id,user_name,post_message,timestamp_post,num_like_post,num_comment_post,num_share_post,label,cleaned_text,count_chars,count_words,count_questionmark,count_exclaimmark,numHashtags,numUrls,post_month,post_year,post_day,post_hour,post_weekday,cnt_fake,cnt_nonfake,ratio,has_title,has_image
0,3981,3982,4ebf9484c0e0600b46c4ccf7e315e87e,Một trong những vấn đề được nhiều đại biểu qua...,1592406000.0,153,226,40,0,Một trong những vấn đề được nhiều đại biểu qua...,443,97,0,0,0,0,6,2020,17,15,2,0.0,1.0,0.0,0,0
1,3903,3904,808e278b22ec6b96f2faf7447d10cd8e,Giá cổ phiếu suy giảm khiến tỷ phú giàu nhất b...,1584736000.0,12,1,0,0,Giá cổ phiếu suy giảm khiến tỷ phú giàu nhất b...,180,40,0,0,0,0,3,2020,20,20,4,0.0,64.0,0.0,0,0
2,25,26,5e631179c3cc2a90a3afd12b08819770,Lưu ý lưu ý 15/5 - 14/6 🙂🙂🙂\nTrong thời gian r...,1588824000.0,16,5,3,0,Lưu ý lưu ý 15/5 - 14/6 🙂🙂🙂 Trong thời gian ra...,219,51,0,0,0,0,5,2020,7,4,3,5.0,1.0,0.833333,0,0
3,4262,4263,2a205aa672d1d1e9132029d5a163ce62,"Với chiêu lừa ""quái dị"" HTX Ngọc Đăng ở Phú Th...",1590836000.0,16,0,8,1,"Với chiêu lừa ""quái dị"" htx Ngọc Đăng ở Phú Th...",357,83,0,0,0,0,5,2020,30,10,5,1.0,0.0,1.0,0,0
4,2304,2305,8da2ab7d30849156b2105a21b2fb33cf,"Tổ chức đám cưới linh đình cho con trai, gia đ...",1583194000.0,589,41,151,0,"Tổ chức đám cưới linh đình cho con trai, gia đ...",180,37,0,0,3,0,3,2020,3,0,1,0.0,1.0,0.0,0,0


In [8]:
# Get the lists of sentences and their labels.
train_sent      = df_train.cleaned_text.values
train_labels    = df_train.label.values
val_sent        = df_val.cleaned_text.values
val_labels      = df_val.label.values 
#test_sent        = df_test.cleaned_text.values
#test_labels      = df_test.label.values 

In [9]:
#Image
img_train = np.load(img_train_path)
# img_train = np.delete(img_train,duplicated,0)
img_val = np.load(img_val_path)
img_test = np.load(img_test_path)
img_train.shape, img_val.shape, img_test.shape

((3933, 299, 299, 3), (438, 299, 299, 3), (1646, 299, 299, 3))

In [10]:
img_train = img_train.astype('float32')
img_val = img_val.astype('float32')
img_test = img_test.astype('float32')

In [11]:
print(len(train_sent), len(train_labels))
print(len(val_sent), len(val_labels))
#print(len(test_sent), len(test_labels))

3933 3933
438 438


# Tokenization & Input Formatting


In [12]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(MODEL, do_lower_case=False)

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=503.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411325.0, style=ProgressStyle(descripti…




In [13]:
print(' Original: ', train_sent[0])
print('Tokenized: ', tokenizer.tokenize(train_sent[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_sent[0])))

 Original:  Chị Phạm Vũ Tâm An -Trung tâm tư vấn tâm lý Phúc An cho hay chị từng tư vấn tâm lý cho nhiều khách hàng gặp vấn đề xung đột với bạn đời vì thói quen ăn uống.
Tokenized:  ['Chị', 'Phạm', 'Vũ', 'Tâm', 'An', '-', 'Trung', 'tâm', 'tư', 'vấn', 'tâm', 'lý', 'Phúc', 'An', 'cho', 'hay', 'chị', 'từng', 'tư', 'vấn', 'tâm', 'lý', 'cho', 'nhiều', 'khách', 'hàng', 'gặp', 'vấn', 'đề', 'xung', 'đột', 'với', 'bạn', 'đời', 'vì', 'thói', 'quen', 'ăn', 'uống', '.']
Token IDs:  [1577, 831, 870, 1625, 471, 30, 124, 176, 102, 292, 176, 91, 947, 471, 13, 345, 523, 438, 102, 292, 176, 91, 13, 53, 205, 70, 511, 292, 137, 1203, 1075, 15, 298, 587, 235, 2147, 1218, 374, 971, 6]


## Tokenize 

### Train

In [None]:
df_train.columns

Index(['index', 'id', 'user_name', 'post_message', 'timestamp_post',
       'num_like_post', 'num_comment_post', 'num_share_post', 'label',
       'cleaned_text', 'count_chars', 'count_words', 'count_questionmark',
       'count_exclaimmark', 'numHashtags', 'numUrls', 'post_month',
       'post_year', 'post_day', 'post_hour', 'post_weekday', 'cnt_fake',
       'cnt_nonfake', 'ratio', 'has_title', 'has_image'],
      dtype='object')

In [14]:
feature_columns = ['num_like_post', 'num_comment_post', 'num_share_post', 
                   'count_chars', 'count_words', 'count_questionmark',
                   'count_exclaimmark', 'numHashtags', 'numUrls', 'post_month',
                   'post_day', 'post_hour', 'post_weekday', 'cnt_fake',
                   'cnt_nonfake', 'ratio', 'has_image']
FEATURES_NUM = len(feature_columns)
FEATURES_NUM

17

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_train[feature_columns[:-2]])

df_train[feature_columns[:-2]] = scaler.transform(df_train[feature_columns[:-2]])
df_val[feature_columns[:-2]] = scaler.transform(df_val[feature_columns[:-2]])

In [16]:
train_features = df_train[feature_columns]
#process_features(train_features)
train_features = train_features.astype('float32')

val_features = df_val[feature_columns]
#process_features(val_features)
val_features = val_features.astype('float32')

In [17]:
input_ids       = []
attention_masks = []
features = []

for sent in tqdm(train_sent):
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True, 
                        max_length = MAX_LENGTH,         
                        pad_to_max_length = True,
                        return_attention_mask = True,   
                        return_tensors = 'np',   
                        truncation = True,
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

for id,_ in enumerate(train_features['count_chars']):
  f = train_features.iloc[id].values
  features.append(f)

  
id_train        = np.concatenate(input_ids)
mask_train      = np.concatenate(attention_masks)
feature_train   = np.array(features)
y_train         = train_labels 
id_train.shape, mask_train.shape, y_train.shape, feature_train.shape 

100%|██████████| 3933/3933 [00:02<00:00, 1645.64it/s]


((3933, 256), (3933, 256), (3933,), (3933, 17))

### Val

In [18]:
input_ids       = []
attention_masks = []
features        = []

for sent in tqdm(val_sent):
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True, 
                        max_length = MAX_LENGTH,         
                        pad_to_max_length = True,
                        return_attention_mask = True,   
                        return_tensors = 'np',   
                        truncation = True,
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

for id,_ in enumerate(val_features['count_chars']):
  f = val_features.iloc[id].values
  features.append(f)


id_val          = np.concatenate(input_ids)
mask_val        = np.concatenate(attention_masks)
feature_val     = np.array(features)
y_val           = val_labels
id_val.shape, mask_val.shape, y_val.shape, feature_val.shape

100%|██████████| 438/438 [00:00<00:00, 1665.09it/s]


((438, 256), (438, 256), (438,), (438, 17))

## Create iterator for data

In [19]:
BATCH_SIZE      = 16 

X_train         = [
    id_train,
    mask_train,
    feature_train,
    img_train
]
X_val           = [
    id_val,
    mask_val,
    feature_val,
    img_val  
]

# Train TFBertForSequenceClassification Model

In [22]:
from tensorflow.keras.applications import VGG19                                  #Change here for different models
from tensorflow.keras.applications.vgg19 import preprocess_input                 #Change here for different models

def create_cnn(input_shape):
  inputs = Input(shape=input_shape)
  x = Lambda(preprocess_input)(inputs)
  # load the VGG16 network, ensuring the head FC layer sets are left
  # off                                                                           - Change here for different models
  baseModel = VGG19(weights="imagenet", include_top=False, input_tensor=x)

  # construct the head of the model that will be placed on top of the
  # the base model                                           
  headModel = baseModel.output  
  
  ###Attention
  headModel = Reshape((-1,headModel.shape[-1]))(headModel)

  ###Non-attention
  # headModel = GlobalAveragePooling2D()(headModel)                                 
  # headModel = Dense(512)(headModel)
  # headModel = BatchNormalization()(headModel)
  # headModel = Activation("relu")(headModel)
  # headModel = Dropout(0.2)(headModel)

  # headModel = Dense(512)(headModel)
  # headModel = BatchNormalization()(headModel)
  # headModel = Activation("relu")(headModel)
  # headModel = Dropout(0.2)(headModel)

  # headModel = Dense(512)(headModel)
  # headModel = BatchNormalization()(headModel)
  # headModel = Activation("relu")(headModel)
  # headModel = Dropout(0.2)(headModel)

  model = Model(inputs=baseModel.input, outputs=headModel)

  # loop over all layers in the base model and freeze them so they will
  # *not* be updated during the first training process
  #for layer in baseModel.layers[:-8]:
  #  layer.trainable = False

  return model


In [83]:
def create_model(transformer, max_len=256, feature_num=17):
    merge = []

    input_ids           = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_mask      = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')
    extra_features      = Input(shape=(feature_num,), dtype=tf.int32, name='extra_features')
    image               = create_cnn((299,299,3))
    sequence_output     = transformer(input_ids, 
                                  attention_mask=attention_mask)[0]
   #cls_token           = sequence_output[:, 0, :]
    
    dense               = Dense(512)(extra_features)
    bn                  = BatchNormalization()(dense)
    dense               = Activation("relu")(bn)
    #merge.append(cls_token)
    merge.append(dense)
  
   # Yoon Kim model (https://arxiv.org/abs/1408.5882)
    convs = []
    filter_sizes = [2,3,4,5]
    size_pool = 5

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=256, kernel_size=filter_size)(sequence_output)
        l_conv = BatchNormalization()(l_conv)
        l_conv = Activation("relu")(l_conv)
        #l_pool = MaxPooling1D(pool_size=max_len-filter_size+1)(l_conv)
        l_pool = MaxPooling1D(pool_size=size_pool)(l_conv)
        convs.append(l_pool)
        #merge.append(Flatten()(l_pool))

    l2_pool = Concatenate(axis=1)(convs)
    #l2_pool = BatchNormalization()(l2_pool)
    for _ in range(3):
        origin  = l2_pool
        l2_conv = Conv1D(filters=256, kernel_size=size_pool,padding='same')(l2_pool)
        l2_pool = BatchNormalization()(l2_pool)
        l2_pool = Activation("relu")(l2_pool)
        #print(origin.shape, l2_conv.shape)
        # l2_conv = Add()([origin, l2_conv])
        l2_pool = MaxPooling1D(pool_size=size_pool)(l2_conv)
        
    text = Flatten()(l2_pool)
    # text = Dropout(0.2)(text)
    text = Dense(512)(text)
    text = BatchNormalization()(text)
    text = Activation("tanh")(text)                  #change to tanh?
    text = Dropout(0.2)(text)
    
    merge.append(text)
    ##With attention
    img = Dense(512)(image.output) #change to tanh? 
    img = BatchNormalization()(img)
    img = Activation('tanh')(img)

    # # text = RepeatVector(img.shape[1])(text)
    # # attention = Add()([text, img])
    # # attention = Dense(64,activation='tanh')(attention)
    # # attention = Dense(1)(attention)
    attention = Dot(axes=(1,2))([text, img])
    attention = Activation("softmax")(attention)
    att_img = Dot(axes=(1,1))([attention, image.output])
    merge.append(Flatten()(att_img))
    ##Without attention
    # merge.append(image.output)

    #l_merge             = Concatenate(axis=1)(merge)
    l_merge             = Average()(merge)   
    out                 = Dense(N_LABELS, activation='sigmoid')(l_merge) 
    model               = Model(inputs=[input_ids, attention_mask, extra_features, image.input], 
                            outputs=out)
    return model

In [91]:
%%time
EPOCHS          = 40
total_steps     = len(y_train) * BATCH_SIZE
train_data_size = len(y_train)
steps_per_epoch = int(train_data_size / BATCH_SIZE) + 1
num_train_steps = steps_per_epoch * EPOCHS
# warmup_steps    = int(num_train_steps * 0.1)
warmup_steps    = 0

# Create the learning rate scheduler.
decay_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
      initial_learning_rate=2e-5,
      decay_steps=num_train_steps,
      end_learning_rate=0)

warmup_schedule = nlp.optimization.WarmUp(
        initial_learning_rate=2e-5,
        decay_schedule_fn=decay_schedule,
        warmup_steps=warmup_steps)

optimizer       = nlp.optimization.AdamWeightDecay(
        learning_rate=warmup_schedule,
        epsilon=1e-8,
        exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])

#Load bert4news
config = AutoConfig.from_pretrained('/bert4news/config.json')
transformer = TFAutoModel.from_pretrained('/bert4news/pytorch_model.bin', from_pt=True, config=config)

# transformer = TFAutoModel.from_pretrained(MODEL)
model = create_model(transformer, max_len=MAX_LENGTH)
model.compile(optimizer=optimizer,
                loss='binary_crossentropy',
                metrics='accuracy')

model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Model: "functional_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 299, 299, 3)  0           input_3[0][0]                    
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 299, 299, 64) 1792        lambda_2[0][0]                   
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 299, 299, 64) 36928       block1_conv1[0][0]               
______________________________________________________________________________________

In [92]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced', np.unique(train_labels), train_labels)
class_weights = {i : class_weights[i] for i in range(2)}
class_weights

{0: 0.6008249312557287, 1: 2.9795454545454545}

## Train model

In [93]:
from keras.utils import np_utils
from keras.callbacks import Callback, EarlyStopping
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification

class roc_auc_callback(Callback):
    def __init__(self,training_data,validation_data):
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]


    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        y_pred_train = self.model.predict(self.x, verbose=0)
        roc_train = roc_auc_score(self.y, y_pred_train)
        y_pred_val = self.model.predict(self.x_val, verbose=0)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        print('\rroc-auc_train: %s - roc-auc_val: %s' % (str(round(roc_train,4)),str(round(roc_val,4))),end=100*' '+'\n')
        return

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return


In [94]:
n_steps = int(np.ceil(y_train.shape[0] / BATCH_SIZE))

# Checkpoint path
ckpt_path     = f'/checkpoint/{MODEL_NAME}/'
if not os.path.exists(ckpt_path):
    os.makedirs(ckpt_path)
ckpt_path     += 'cp-{epoch:02d}.h5'

# Callback
my_callbacks  = [tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_path, 
                                                    monitor='val_loss', 
                                                    save_weights_only=True,
                                                    save_freq='epoch'),
                 tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1),
                 roc_auc_callback(training_data=(X_train, y_train),validation_data=(X_val, y_val))]

H = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    #steps_per_epoch=n_steps,
    class_weight=class_weights,
    shuffle=True,
    callbacks=my_callbacks
)

Epoch 1/40
roc-auc_train: 0.9214 - roc-auc_val: 0.8658                                                                                                    
Epoch 2/40
roc-auc_train: 0.954 - roc-auc_val: 0.9271                                                                                                    
Epoch 3/40
roc-auc_train: 0.9859 - roc-auc_val: 0.958                                                                                                    
Epoch 4/40
roc-auc_train: 0.9936 - roc-auc_val: 0.952                                                                                                    
Epoch 5/40
roc-auc_train: 0.9964 - roc-auc_val: 0.9583                                                                                                    
Epoch 6/40
roc-auc_train: 0.9982 - roc-auc_val: 0.9611                                                                                                    
Epoch 7/40
roc-auc_train: 0.9982 - roc-auc_val: 0.9639                   

# Load best epoch based on training result

In [87]:
%%time
EPOCHS          = 40
total_steps     = len(y_train) * BATCH_SIZE
train_data_size = len(y_train)
steps_per_epoch = int(train_data_size / BATCH_SIZE) + 1
num_train_steps = steps_per_epoch * EPOCHS
# warmup_steps    = int(num_train_steps * 0.1)
warmup_steps    = 0

# Create the learning rate scheduler.
decay_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
      initial_learning_rate=2e-5,
      decay_steps=num_train_steps,
      end_learning_rate=0)

warmup_schedule = nlp.optimization.WarmUp(
        initial_learning_rate=2e-5,
        decay_schedule_fn=decay_schedule,
        warmup_steps=warmup_steps)

optimizer       = nlp.optimization.AdamWeightDecay(
        learning_rate=warmup_schedule,
        epsilon=1e-8,
        exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])


#Load bert4news
config = AutoConfig.from_pretrained('/content/drive/MyDrive/bert4news/config.json')
transformer = TFAutoModel.from_pretrained('/content/drive/MyDrive/bert4news/pytorch_model.bin', from_pt=True, config=config)

# transformer = TFAutoModel.from_pretrained(MODEL)
model2      = create_model(transformer, max_len=MAX_LENGTH)
model2.compile(optimizer=optimizer,
                loss='binary_crossentropy',
                metrics='accuracy')
    
# model2.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Model: "functional_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 299, 299, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 299, 299, 64) 1792        lambda_1[0][0]                   
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 299, 299, 64) 36928       block1_conv1[0][0]               
_______________________________________________________________________________________

In [88]:
model2.load_weights(os.path.join(base_dir, f'checkpoint/{MODEL_NAME}/cp-05.h5')) #PATH to the best epoch that has highest AUC
#model2.evaluate(test_dataset)

# Predict

In [72]:
test_data    = os.path.join(base_dir, test_path)

In [73]:
df_test_data = pd.read_csv(test_data)
print(df_test_data.shape)
print(df_test_data.info())
df_test_data[['num_like_post','num_comment_post','num_share_post']] = df_test_data[['num_like_post','num_comment_post','num_share_post']].astype(int)

display(df_test_data.head())

(1646, 25)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1646 entries, 0 to 1645
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          1646 non-null   int64  
 1   id                  1646 non-null   int64  
 2   user_name           1646 non-null   object 
 3   post_message        1646 non-null   object 
 4   timestamp_post      1646 non-null   float64
 5   num_like_post       1646 non-null   float64
 6   num_comment_post    1646 non-null   float64
 7   num_share_post      1646 non-null   float64
 8   count_chars         1646 non-null   int64  
 9   count_words         1646 non-null   int64  
 10  count_questionmark  1646 non-null   int64  
 11  count_exclaimmark   1646 non-null   int64  
 12  numHashtags         1646 non-null   int64  
 13  numUrls             1646 non-null   int64  
 14  post_month          1646 non-null   int64  
 15  post_year           1646 non-null   int64  


Unnamed: 0.1,Unnamed: 0,id,user_name,post_message,timestamp_post,num_like_post,num_comment_post,num_share_post,count_chars,count_words,count_questionmark,count_exclaimmark,numHashtags,numUrls,post_month,post_year,post_day,post_hour,post_weekday,cnt_fake,cnt_nonfake,ratio,cleaned_text,has_title,has_image
0,0,6015,fbf39587d668e9ae28afb00b8fc00570,Các lời khai cố tình bị rút ra để áp án tử cho...,1590163000.0,41,3,6,127,33,0,0,0,0,5,2020,22,15,4,1.0,0.0,1.0,Các lời khai cố tình bị rút ra để áp án tử cho...,0,0
1,1,6016,c4acc2118bdcdb5770565362b02d00e0,(NLĐO) – Lực lượng cứu hoả gần như phải thức t...,1586500000.0,4,3,6,137,32,0,0,0,0,4,2020,10,6,4,0.0,0.0,0.0,(nlđo) – Lực lượng cứu hoả gần như phải thức t...,0,0
2,2,6017,180dd4930112053803ccea8556f33e47,Cư dân mạng đang tranh cãi sôi nổi sau quyết đ...,1592102000.0,963,50,6,79,18,0,0,0,0,6,2020,14,2,6,0.0,0.0,0.0,Cư dân mạng đang tranh cãi sôi nổi sau quyết đ...,0,0
3,3,6018,ac4d365f23909091fab2552bdc54f5ef,3 công nhân trung quốc làm cty hòa phát bị nhi...,1596256000.0,41,3,6,442,103,0,0,0,0,8,2020,1,4,5,0.0,0.0,0.0,3 công nhân trung quốc làm cty hòa phát bị nhi...,0,1
4,4,6019,5ba5638b734e397d5263a6b1b9201abd,Một số người p Tây nhìn người Á hay Phi ko thi...,1587462000.0,5,3,6,289,63,0,0,0,0,4,2020,21,9,1,0.0,0.0,0.0,Một số người p Tây nhìn người á hay Phi ko thi...,0,1


In [74]:
# Get the lists of sentences and their labels.
test_data_sent        = df_test_data.cleaned_text.values

In [75]:
df_test_data[feature_columns[:-2]] = scaler.transform(df_test_data[feature_columns[:-2]])

In [76]:
test_data_features = df_test_data[feature_columns]
#process_features(test_data_features)
test_data_features = test_data_features.astype('float32')

In [77]:
input_ids       = []
attention_masks = []
features = []

for sent in tqdm(test_data_sent):
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True, 
                        max_length = MAX_LENGTH,         
                        pad_to_max_length = True,
                        return_attention_mask = True,   
                        return_tensors = 'np',   
                        truncation = True,
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

for id,_ in enumerate(test_data_features['count_chars']):
  f = test_data_features.iloc[id].values
  features.append(f)

  
id_test_data        = np.concatenate(input_ids)
mask_test_data      = np.concatenate(attention_masks)
feature_test_data   = np.array(features)
id_test_data.shape, mask_test_data.shape, feature_test_data.shape

100%|██████████| 1646/1646 [00:01<00:00, 1597.25it/s]


((1646, 256), (1646, 256), (1646, 17))

In [78]:
X_test_data         = [
    id_test_data,
    mask_test_data,
    feature_test_data,
    img_test
]

In [91]:
pred = model2.predict(X_test_data, verbose=1)



In [92]:
pred

array([[0.93628865],
       [0.03140743],
       [0.04029576],
       ...,
       [0.0084993 ],
       [0.00566762],
       [0.03763639]], dtype=float32)

In [134]:
file = open("results.csv","w") 
for i in range(pred.shape[0]):
  line = "{0}, {1}\n".format(df_test_data['id'][i],pred[i][0])
  file.write(line)
file.close()