# Review Helpfulness Classification - BHelp-CoRT_SMALL
* Dataset - Amazon (Toys and Games, CDs and Vinyl)
* Features - Cleaned_Review_Text, Review_Rating
* BHelp-CoRT model with a small transformer encoder (4 layers, hidden size 512, 8 attention heads)

In [None]:
 !pip install transformers

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, LSTM, Embedding, Dropout, Dense, Flatten, Conv1D, GlobalMaxPool1D, Input, concatenate, MaxPooling1D, GlobalMaxPooling1D, MaxPool1D, Concatenate, Multiply, Attention
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.activations import relu, sigmoid
from transformers import TFBertModel, BertTokenizer, BertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow.keras.backend as K
import random
import nltk
import json
import tqdm
import re, os

nltk.download('stopwords')

In [None]:
# Load the preprocessed Amazon "Toys and Games" review table and preview it.
amzn = pd.read_csv('/datasets/datasets/preprocessed_amazon_Toys_and_Games.csv')
print(amzn.shape)  # (rows, columns) of the loaded frame
amzn.head()  # trailing expression: rendered as the cell output in a notebook

In [None]:
# Set Seed
# Seed Python's `random`, NumPy and TensorFlow so that runs are repeatable
# Source of this code: https://dacon.io/codeshare/2363

def seed_everything(seed: int=42) -> None:
  """Seed every random number generator this notebook relies on.

  Seeds Python's ``random`` module, NumPy's global generator and
  TensorFlow's global generator, and exports ``PYTHONHASHSEED``.

  Args:
    seed: Integer seed applied to all generators.
  """
  random.seed(seed)
  np.random.seed(seed)
  # The variable name was previously misspelled ('PYTHONASHSEED'), so the
  # hash seed was never exported. Python reads PYTHONHASHSEED at interpreter
  # start-up, so this assignment only affects processes spawned afterwards.
  os.environ['PYTHONHASHSEED'] = str(seed)
  tf.random.set_seed(seed)
# Fix the global seed once for this notebook run.
my_seed = 42
seed_everything(my_seed)

In [None]:
# Stratified 80/20 split on the helpfulness label; only the star ratings
# ('overall') are taken from the resulting frames.
amzn_train, amzn_test = train_test_split(amzn, test_size=0.2, shuffle=True, random_state=42, stratify=amzn['helpfulness label'])
train_rating = np.array(amzn_train['overall'])
test_rating = np.array(amzn_test['overall'])

# NOTE(review): the pre-tokenized arrays loaded below are assumed to have been
# generated from this same split, in the same row order, so that each rating
# lines up with its input_ids row -- confirm against the preprocessing notebook.
BERT_INPUT_DIR = '/datasets/bert_inputs'


def _load_bert_array(split, field):
  """Load one pre-computed BERT array for the Toys-and-Games dataset.

  Args:
    split: 'train' or 'test'.
    field: File suffix: 'input_ids', 'attention_masks', 'type_ids' or 'label'.

  Returns:
    The array stored in the matching ``.npy`` file.
  """
  # Passing the path (not an open file object) lets NumPy open and close the
  # file itself, so no file handle is leaked.
  return np.load(f'{BERT_INPUT_DIR}/amzn_toys_and_games_cleaned_bert_{split}_{field}.npy')


amzn_train_input_ids = _load_bert_array('train', 'input_ids')
amzn_train_attention_masks = _load_bert_array('train', 'attention_masks')
amzn_train_type_ids = _load_bert_array('train', 'type_ids')
amzn_train_labels = _load_bert_array('train', 'label')

# Order matters: (input_ids, attention_mask, token_type_ids).
amzn_train_inputs = (amzn_train_input_ids, amzn_train_attention_masks, amzn_train_type_ids)

amzn_test_input_ids = _load_bert_array('test', 'input_ids')
amzn_test_attention_masks = _load_bert_array('test', 'attention_masks')
amzn_test_type_ids = _load_bert_array('test', 'type_ids')
amzn_test_labels = _load_bert_array('test', 'label')

amzn_test_inputs = (amzn_test_input_ids, amzn_test_attention_masks, amzn_test_type_ids)

In [None]:
# Setup required to run on a TPU
# The TPU address is read from the COLAB_TPU_ADDR environment variable, so
# this cell raises KeyError when that variable is not set.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
# NOTE(review): newer TensorFlow releases expose this class as
# tf.distribute.TPUStrategy -- check against the installed version.
strategy = tf.distribute.experimental.TPUStrategy(resolver)

In [None]:
# bert_tri_small

class bert_tri_small(tf.keras.Model):
  """BERT text encoder combined with a rating embedding for classification.

  The pooled BERT output is attended to with the rating embedding as the
  query, multiplied element-wise with the attention result, and fed to a
  sigmoid classifier.

  Args:
    model_name: Hugging Face model identifier passed to
      ``TFBertModel.from_pretrained``.
    dir_path: Cache directory for the downloaded checkpoint.
    num_class: Number of output units of the sigmoid classifier.
  """

  def __init__(self, model_name, dir_path, num_class):
    super(bert_tri_small, self).__init__()

    self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path, from_pt=True)
    # The rating embedding must be as wide as the encoder output because the
    # two are combined by dot-product attention; read the width from the
    # loaded config instead of hard-coding 512.
    # 6 rows cover indices 0-5 (presumably 1-5 star ratings -- confirm).
    self.rating_emb = Embedding(6, self.bert.config.hidden_size, embeddings_regularizer=l2(), name='rating_embeddings')
    self.flat = Flatten()
    # Layers are created once here rather than on every forward pass.
    self.attention = Attention()
    self.mul = Multiply()
    self.dropout = Dropout(self.bert.config.hidden_dropout_prob)
    self.clf = Dense(num_class,
                     activation='sigmoid',
                     name='classifier')

  def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
    """Run the forward pass.

    Args:
      inputs: Pair ``(text_inputs, rating)``; ``text_inputs`` is forwarded
        unchanged to the BERT model, ``rating`` is looked up in the rating
        embedding.
      attention_mask: Optional mask forwarded to BERT.
      token_type_ids: Optional segment ids forwarded to BERT.
      training: Whether dropout (inside BERT and in this head) is active.

    Returns:
      Output of the sigmoid classifier.
    """
    text_inputs, rating = inputs
    # Forward `training` explicitly so the encoder's dropout follows the
    # same mode as the head.
    outputs = self.bert(text_inputs, attention_mask=attention_mask, token_type_ids=token_type_ids, training=training)
    rat = self.rating_emb(rating)
    rat = self.flat(rat)
    bert_output = outputs[1]  # second BERT output (pooled representation)
    bert_output = self.dropout(bert_output, training=training)

    # NOTE(review): both tensors passed to Attention look rank-2 here
    # (Flatten output and pooled output). Keras documents 3-D
    # [batch, steps, dim] inputs for Attention -- verify that the scores are
    # not being computed across the batch axis.
    attention = self.attention([rat, bert_output])
    interaction = self.mul([bert_output, attention])

    drop_1 = self.dropout(interaction, training=training)

    output = self.clf(drop_1)
    return output

In [None]:
# Build and compile the classifier, plus its training callbacks, inside the
# distribution strategy's scope.
with strategy.scope():
  cls_model = bert_tri_small(
      model_name='nreimers/BERT-Small-L-4_H-512_A-8',
      dir_path='bert_ckpt',
      num_class=1,
  )
  optimizer = Adam(1e-5)
  loss = tf.keras.losses.BinaryCrossentropy()
  metric = tf.keras.metrics.BinaryAccuracy()
  cls_model.compile(
      optimizer=optimizer,
      loss=loss,
      metrics=[metric],
  )
  # Stop once val_loss has failed to improve by 1e-4 for 3 epochs in a row.
  es = EarlyStopping(
      monitor='val_loss',
      min_delta=0.0001,
      patience=3,
  )
  # Keep only the weights of the epoch with the best val_loss.
  cp = ModelCheckpoint(
      'bert_tri.h5',
      monitor='val_loss',
      verbose=1,
      save_best_only=True,
      save_weights_only=True,
  )

In [None]:
# Fine-tune for up to 4 epochs; 20% of the training arrays are held out for
# validation, which the `es` and `cp` callbacks monitor through val_loss.
result = cls_model.fit([amzn_train_inputs, train_rating], amzn_train_labels, batch_size=32, epochs=4, validation_split=0.2, callbacks=[es,cp])

In [None]:
def draw_plot(history,metric):
  """Plot a metric's training curve next to its validation curve.

  Args:
    history: Object whose ``history`` attribute maps metric names to
      per-epoch values (e.g. the return value of ``fit``).
    metric: Name of the training metric; the validation series is read
      from the key ``'val_' + metric``.
  """
  plt.figure(figsize=(7,7))
  series_names = [metric, 'val_'+metric]
  for series_name in series_names:
    plt.plot(history.history[series_name])
  plt.title('Train / Valid Accuracy',fontsize=15)
  plt.ylabel(metric)
  plt.xlabel('Epochs')
  plt.legend(series_names)
  plt.show()

# Plot the train/validation accuracy curves recorded during training.
draw_plot(result, 'binary_accuracy')

f1_score_list = []
precision_list = []
recall_list = []

# NOTE(review): no load_weights call is visible in this section, so these
# predictions use the weights from the end of training rather than the best
# checkpoint written to 'bert_tri.h5' -- confirm this is intended.
y_pred = cls_model.predict([amzn_test_inputs, test_rating])

# Turn the sigmoid outputs into hard 0/1 labels at a 0.5 threshold in one
# vectorized step; shape and dtype of the prediction array are preserved.
y_pred = (y_pred >= 0.5).astype(y_pred.dtype)

f1_score_list.append(f1_score(amzn_test_labels, y_pred))
precision_list.append(precision_score(amzn_test_labels, y_pred))
recall_list.append(recall_score(amzn_test_labels, y_pred))

#print(cls_model.evaluate([amzn_test_inputs, test_rating], amzn_test_labels, batch_size=32))
print(f'F1-Score : {f1_score_list}')
print(f'Precision : {precision_list}')
print(f'Recall : {recall_list}')