In [1]:
%matplotlib inline

In [2]:
import logging

from platform import python_version
import random

import numpy as np

import torch
import sklearn
import torch.nn as nn
import pandas as pd
import matplotlib

from torch.autograd import Variable

import transformers

from preprocessing import preprocessing
from tokenize_and_pad_text import *
from train_model import KimCNN, train_test_model


from sklearn.metrics import roc_auc_score

In [3]:
random_seed = 42

# Seed every random number generator the notebook uses with the same value:
# Python's `random`, NumPy's global RNG, and PyTorch (default generator and CUDA).
for _seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seeder(random_seed)
# torch.cuda.manual_seed_all(random_seed) # if use multi-GPU

# cuDNN: turn the benchmark autotuner off and request deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [4]:
# Report the interpreter and library versions for this run, one `name==version` per line.
_version_lines = (
    f"python version=={python_version()}",
    f"pandas=={pd.__version__}",
    f"numpy=={np.__version__}",
    f"torch=={torch.__version__}",
    f"sklearn=={sklearn.__version__}",
    f"transformers=={transformers.__version__}",
    f"matplotlib=={matplotlib.__version__}",
)

print('version')
# A single print with a blank line after the last entry, matching the per-line output.
print('\n'.join(_version_lines), end='\n\n')

version
python version==3.6.5
pandas==1.1.5
numpy==1.19.2
torch==1.7.1
sklearn==0.24.1
transformers==3.5.0
matplotlib==3.3.3



In [5]:
# Experiment configuration. main() reads these module-level names directly.

# BERT model/tokenizer classes and checkpoint name, forwarded to
# tokenize_and_pad_text_bert() together with the target column list.
model_class = transformers.BertModel
tokenizer_class = transformers.BertTokenizer
pretrained_weights = 'bert-base-uncased'
target_columns = ['label']

# Forwarded to tokenize_and_pad_text_bert() as max_seq= and batch_size=.
max_seq = 128
bert_batch_size = 16

# Constructor arguments for KimCNN (see main()).
kernel_num = 3
kernel_sizes = [2, 3, 4]
dropout = 0.5
static = True

# NOTE(review): none of the names below are referenced by code visible in this
# notebook (main() calls train_test_model(model) without passing them).
# Presumably train_model defines its own equivalents — confirm these take effect.
n_epochs = 10
batch_size = 64
lr = 0.005
optimizer = torch.optim.Adam
loss_fn = nn.BCELoss()

In [6]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
else:
    print('No GPU available, using the CPU instead.')

# Raise this logger's level so that only ERROR and above are emitted.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

# Location of the dataset; the bare file name is only used for the log line.
data_path = '../reviews.csv'
data_name = data_path.rsplit('/', 1)[-1]
print(f'use {data_name} data')

There are 2 GPU(s) available.
We will use the GPU: Quadro RTX 6000
use reviews.csv data


In [7]:
def main(threshold, data=None):
    """Run one experiment: preprocess, embed with BERT, train and test KimCNN.

    Parameters
    ----------
    threshold : float
        Forwarded to ``preprocessing(..., threshold=threshold)``. Inside this
        function the value ``1`` only selects a different banner message.
    data : pandas.DataFrame, optional
        Raw frame handed to ``preprocessing``. When omitted, the notebook-level
        ``df`` is used, so existing ``main(threshold=...)`` calls keep working.

    Returns
    -------
    The result of ``roc_auc_score(y_test, y_pred, average=None)`` on the test
    split. Returning it lets callers collect scores programmatically instead
    of copying them from the printed line. In a notebook, end the call with
    ``;`` to suppress the echoed value.
    """
    if data is None:
        # Backward-compatible fallback: earlier cells bind the frame to the global `df`.
        data = df

    if threshold == 1:
        print('test with add word!!!')
    else:
        print(f'start threshold {threshold}!!!')

    preprocessing_class = preprocessing(data, threshold=threshold)
    df_train, df_val, df_test = preprocessing_class.preprocessing_all()

    def _embed(frame):
        # Single place for the tokenizer/BERT settings shared by all three splits.
        return tokenize_and_pad_text_bert(frame, device, model_class, tokenizer_class, pretrained_weights,
                                          max_seq=max_seq, batch_size=bert_batch_size,
                                          target_columns=target_columns)

    print('make train data ...')
    x_train, y_train = _embed(df_train)

    print('make valid data ...')
    x_val, y_val = _embed(df_val)

    print('make test data ...')
    x_test, y_test = _embed(df_test)

    # Model dimensions are taken from the embedded training data
    # (axes 1 and 2 of x_train, axis 1 of y_train).
    embed_num = x_train.shape[1]
    embed_dim = x_train.shape[2]
    class_num = y_train.shape[1]

    model = KimCNN(
        embed_num=embed_num,
        embed_dim=embed_dim,
        class_num=class_num,
        kernel_num=kernel_num,
        kernel_sizes=kernel_sizes,
        dropout=dropout,
        static=static,
    )

    model = model.to(device)

    # train and test
    review_classification_model = train_test_model(model)
    review_classification_model.train(x_train, y_train, x_val, y_val)
    y_test_np, y_preds_np = review_classification_model.test(x_test, y_test)

    auc_scores = roc_auc_score(y_test_np, y_preds_np, average=None)

    print(f'threshold : {threshold},\tauc ascores : {auc_scores}')

    # empty_cache() can only release memory that is no longer referenced, so drop
    # the model and the feature tensors first (relevant if they live on the GPU).
    del review_classification_model, model
    del x_train, y_train, x_val, y_val, x_test, y_test
    torch.cuda.empty_cache()

    return auc_scores

|1|0.9|0.8|0.7|0.6|0.5|0.4|0.3|0.2|0.1|
|------|---|---|------|---|---|------|---|---|------|
|0.675862875|0.66581|0.668417|acc1|acc1|acc1|acc1|acc1|acc1|acc1|
|0.6803092499999999|0.670507625|0.6678055|0.661810625|0.653615375|0.6213356250000001|acc1|acc1|acc1|acc1|

In [None]:
# main() reads the module-level `df`, so it has to be bound before the call.
# threshold=1 takes the 'test with add word!!!' branch of main().
# NOTE(review): the CSV is re-read in every run cell, presumably so each run starts
# from an unmodified frame — confirm whether preprocessing changes it in place.
df = pd.read_csv(data_path)

main(threshold=1)

test with add word!!!
make id dictionary and count id frequency of id ...


100%|██████████| 358957/358957 [00:07<00:00, 49326.29it/s]


label is changed!!! (-1, 1) => (0, 1)

length of real review : 322097, length of fake review : 36860

train val test split


  3%|▎         | 596/20000 [00:00<00:03, 5950.09it/s]

len(del_word_lst): 179


100%|██████████| 20000/20000 [00:03<00:00, 5903.10it/s]
100%|██████████| 2000/2000 [00:00<00:00, 6041.10it/s]
100%|██████████| 4000/4000 [00:00<00:00, 5988.98it/s]


make train data ...


 36%|███▌      | 450/1249 [00:28<01:17, 10.33it/s]

In [None]:
# df = pd.read_csv(data_path)

# main(threshold=0.9)

In [None]:
# df = pd.read_csv(data_path)

# main(threshold=0.8)

In [None]:
# df = pd.read_csv(data_path)

# main(threshold=0.7)

In [None]:
# df = pd.read_csv(data_path)

# main(threshold=0.6)

In [None]:
# Rebind the module-level `df` that main() reads, then run with threshold 0.5.
# NOTE(review): reloading from disk presumably guards against in-place changes
# made by an earlier run — confirm.
df = pd.read_csv(data_path)

main(threshold=0.5)

In [None]:
# Rebind the module-level `df` that main() reads, then run with threshold 0.4.
# NOTE(review): reloading from disk presumably guards against in-place changes
# made by an earlier run — confirm.
df = pd.read_csv(data_path)

main(threshold=0.4)

In [None]:
# Rebind the module-level `df` that main() reads, then run with threshold 0.3.
# NOTE(review): reloading from disk presumably guards against in-place changes
# made by an earlier run — confirm.
df = pd.read_csv(data_path)

main(threshold=0.3)

In [None]:
# Rebind the module-level `df` that main() reads, then run with threshold 0.2.
# NOTE(review): reloading from disk presumably guards against in-place changes
# made by an earlier run — confirm.
df = pd.read_csv(data_path)

main(threshold=0.2)

In [None]:
# Rebind the module-level `df` that main() reads, then run with threshold 0.1.
# NOTE(review): reloading from disk presumably guards against in-place changes
# made by an earlier run — confirm.
df = pd.read_csv(data_path)

main(threshold=0.1)