In [2]:
%matplotlib inline

In [3]:
import logging

from platform import python_version
import random

import numpy as np

import torch
import sklearn
import torch.nn as nn
import pandas as pd
import matplotlib

from torch.autograd import Variable

import transformers

from preprocessing import preprocessing
from tokenize_and_pad_text import *
from train_model import KimCNN, train_test_model


from sklearn.metrics import roc_auc_score

In [4]:
# One seed shared by every random number generator this notebook touches.
random_seed = 42

# Python's built-in generator and NumPy's global generator.
random.seed(random_seed)
np.random.seed(random_seed)

# PyTorch generators (CPU and CUDA).
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
# torch.cuda.manual_seed_all(random_seed)  # enable when running on several GPUs

# Ask cuDNN for deterministic behaviour and turn its benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
# Report the interpreter and library versions in one write: each entry goes on
# its own line and the report is followed by a single blank line.
print(
    'version',
    f"python version=={python_version()}",
    f"pandas=={pd.__version__}",
    f"numpy=={np.__version__}",
    f"torch=={torch.__version__}",
    f"sklearn=={sklearn.__version__}",
    f"transformers=={transformers.__version__}",
    f"matplotlib=={matplotlib.__version__}",
    sep='\n',
    end='\n\n',
)

version
python version==3.7.5
pandas==1.2.0
numpy==1.19.4
torch==1.7.1+cu101
sklearn==0.24.0
transformers==3.5.0
matplotlib==3.3.3



In [6]:
# --- Pretrained encoder ------------------------------------------------------
pretrained_weights = 'bert-base-uncased'
model_class, tokenizer_class = transformers.BertModel, transformers.BertTokenizer

# --- Feature extraction (forwarded to tokenize_and_pad_text_bert in main) ----
target_columns = ['label']
max_seq, bert_batch_size = 128, 16

# --- KimCNN constructor arguments (forwarded in main) ------------------------
kernel_num, kernel_sizes = 3, [2, 3, 4]
dropout, static = 0.5, True

# --- Training settings -------------------------------------------------------
# NOTE(review): none of the five names below is referenced again in the visible
# cells; confirm where (or whether) the training code picks them up.
n_epochs, batch_size, lr = 10, 64, 0.005
optimizer = torch.optim.Adam  # the optimizer class itself, not an instance
loss_fn = nn.BCELoss()

In [7]:
# Use the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
else:
    print('No GPU available, using the CPU instead.')

# Raise this logger's threshold so only ERROR and above are reported.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

# Dataset location; the part after the last '/' is used in the log line.
data_path = '../reviews.csv'
data_name = data_path.rsplit('/', 1)[-1]
print(f'use {data_name} data')

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-DGXS-32GB
use reviews.csv data


In [12]:
def _embed_split(split_name, split_df):
    """Announce one data split and run BERT feature extraction on it.

    Args:
        split_name: Word inserted into the progress message
            ('train', 'valid' or 'test').
        split_df: The split, as returned by ``preprocessing_all()``.

    Returns:
        Whatever ``tokenize_and_pad_text_bert`` returns; the caller unpacks it
        as an ``(x, y)`` pair.
    """
    print(f'make {split_name} data ...')
    return tokenize_and_pad_text_bert(
        split_df, device, model_class, tokenizer_class, pretrained_weights,
        max_seq=max_seq, batch_size=bert_batch_size, target_columns=target_columns)


def _run_threshold_experiment(threshold):
    """Preprocess, embed, train and evaluate for a single threshold.

    Reads the notebook-level ``df`` and the configuration globals
    (``kernel_num``, ``kernel_sizes``, ``dropout``, ``static`` and the names
    used by ``_embed_split``).

    Args:
        threshold: Passed to ``preprocessing(df, threshold=...)``; the value 1
            only changes the banner that is printed.

    Returns:
        The value returned by ``roc_auc_score(..., average=None)``.
    """
    if threshold == 1:
        print('test with all word!!!')
    else:
        print(f'start threshold {threshold}!!!')

    preprocessing_class = preprocessing(df, threshold=threshold)
    df_train, df_val, df_test, (real_dict, fake_dict) = preprocessing_class.preprocessing_all()

    # Optional debug export (disabled): words present in both dictionaries.
#     import json
#     all_list = list(set(real_dict.keys()) & set(fake_dict.keys()))
#     with open('all_word.json', 'w') as outfile:
#         json.dump({'word':all_list}, outfile)

    x_train, y_train = _embed_split('train', df_train)
    x_val, y_val = _embed_split('valid', df_val)
    x_test, y_test = _embed_split('test', df_test)

    # The classifier dimensions are read off the extracted training tensors.
    model = KimCNN(
        embed_num=x_train.shape[1],
        embed_dim=x_train.shape[2],
        class_num=y_train.shape[1],
        kernel_num=kernel_num,
        kernel_sizes=kernel_sizes,
        dropout=dropout,
        static=static,
    )
    model = model.to(device)

    # train and test
    review_classification_model = train_test_model(model)
    review_classification_model.train(x_train, y_train, x_val, y_val)
    y_test_np, y_preds_np = review_classification_model.test(x_test, y_test)

    auc_scores = roc_auc_score(y_test_np, y_preds_np, average=None)
    print(f'threshold : {threshold},\tauc scores : {auc_scores}')
    return auc_scores


def main(threshold):
    """Run one full experiment for ``threshold`` and print its ROC AUC.

    The notebook-level ``df`` must be defined before this is called.

    Args:
        threshold: Forwarded to the preprocessing step.

    Returns:
        None. The score is only printed.
    """
    try:
        _run_threshold_experiment(threshold)
    finally:
        # Runs even when the experiment is interrupted or fails. On a normal
        # return the helper's locals (features, model) should already be out
        # of scope, so their cached GPU memory is free to be released.
        torch.cuda.empty_cache()

In [9]:
def return_embedding(word):
    """Run ``word`` through the notebook-level BERT model and return its output.

    Relies on the notebook globals ``tokenizer``, ``bert_model`` and ``device``,
    which must be defined before this function is called.

    Args:
        word: Text to embed; it is split into tokens by ``tokenizer.tokenize``.

    Returns:
        The object returned by ``bert_model(tokens_tensor)`` for a batch of one.

    Raises:
        ValueError: If ``word`` produces no tokens.
    """
    # Split the text into tokens; an empty result cannot be fed to the model.
    tokenized_text = tokenizer.tokenize(word)
    if not tokenized_text:
        raise ValueError(f'no tokens produced for input: {word!r}')

    # Map the token strings to their vocabulary indices and build a
    # batch-of-one tensor on the same device as the rest of the notebook.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)

    # NOTE(review): the token ids are used exactly as produced above; nothing
    # here adds [CLS]/[SEP] markers - confirm that is intended.
    # Only token ids are passed: no segment ids or attention mask are supplied.
    with torch.no_grad():
        outputs = bert_model(tokens_tensor)

    return outputs

# return_embedding('apple')

In [10]:
# Load the pretrained tokenizer and encoder named in the config cell; the
# encoder is moved to `device` and switched to evaluation mode.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
bert_model = model_class.from_pretrained(pretrained_weights).to(device).eval()

|1|0.9|0.8|0.7|0.6|0.5|0.4|0.3|0.2|0.1|
|------|---|---|------|---|---|------|---|---|------|
|0.675862875|0.66581|0.668417|acc1|acc1|acc1|acc1|acc1|acc1|acc1|
|0.6803092499999999|0.670507625|0.6678055|0.661810625|0.653615375|0.6213356250000001|acc1|acc1|acc1|acc1|

In [13]:
# main() reads the notebook-level `df`, so load the CSV before calling it.
df = pd.read_csv(data_path)

# threshold=1 takes the 'test with all word!!!' branch in main().
main(threshold=1)

test with all word!!!
make id dictionary and count id frequency of id ...


100%|██████████| 358957/358957 [00:06<00:00, 53449.75it/s]


label is changed!!! (-1, 1) => (0, 1)

length of real review : 322097, length of fake review : 36860

train val test split


  3%|▎         | 624/20000 [00:00<00:03, 6233.48it/s]

len(del_word_lst): 179


100%|██████████| 20000/20000 [00:02<00:00, 7566.34it/s]
100%|██████████| 2000/2000 [00:00<00:00, 7775.36it/s]
100%|██████████| 4000/4000 [00:00<00:00, 7720.52it/s]


make train data ...


KeyboardInterrupt: 

In [None]:
# df = pd.read_csv(data_path)

# main(threshold=0.9)

In [None]:
# df = pd.read_csv(data_path)

# main(threshold=0.8)

In [None]:
# df = pd.read_csv(data_path)

# main(threshold=0.7)

In [None]:
# df = pd.read_csv(data_path)

# main(threshold=0.6)

In [None]:
# Reload the CSV into the notebook-level `df` that main() reads, then run the
# experiment with threshold 0.5.
# NOTE(review): presumably reloaded so each run starts from the file on disk -
# confirm whether preprocessing modifies `df`.
df = pd.read_csv(data_path)

main(threshold=0.5)

In [None]:
# Reload the CSV into the notebook-level `df` that main() reads, then run the
# experiment with threshold 0.4.
df = pd.read_csv(data_path)

main(threshold=0.4)

In [None]:
# Reload the CSV into the notebook-level `df` that main() reads, then run the
# experiment with threshold 0.3.
df = pd.read_csv(data_path)

main(threshold=0.3)

In [None]:
# Reload the CSV into the notebook-level `df` that main() reads, then run the
# experiment with threshold 0.2.
df = pd.read_csv(data_path)

main(threshold=0.2)

In [None]:
# Reload the CSV into the notebook-level `df` that main() reads, then run the
# experiment with threshold 0.1.
df = pd.read_csv(data_path)

main(threshold=0.1)