In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm_notebook as tqdm

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import BertForSequenceClassification, BertTokenizerFast, AdamW
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel

import torch.optim as optim

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

import plotly.express as px
import seaborn as sns

import time
import warnings
import random
%matplotlib inline
warnings.filterwarnings('ignore')

In [3]:
path = 'model_save/'

In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [6]:
gpt_configuration = GPT2Config.from_pretrained(f'{path}config.json', output_hidden_states=False)
gpt_model = GPT2LMHeadModel.from_pretrained(f"{path}pytorch_model.bin", config=gpt_configuration)

In [7]:
gpt_tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=f'{path}',
                                          vocab_file=f'{path}vocab.json', merges_file=f'{path}merges.txt',
                                          bos_token='<|sot|>', eos_token='<|eot|>', pad_token='<|pad|>')

In [8]:
gpt_tokenizer

PreTrainedTokenizer(name_or_path='model_save/', vocab_size=50257, model_max_len=1024, is_fast=False, padding_side='right', special_tokens={'bos_token': AddedToken("<|sot|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|eot|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})

In [9]:
gpt_model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [11]:
bert_model = torch.load('bert_model_save/pytorch_model.bin')
bert_tokenizer = BertTokenizerFast.from_pretrained('bert_model_save/')

In [56]:
bert_model.eval()

BERT_Arch(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [22]:
bert_tokenizer

PreTrainedTokenizerFast(name_or_path='bert_model_save/', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [115]:
class DetectionGAN(object):
    def __init__(self, dataframe,
                 GPT2model=gpt_model, gpt_tokenizer=gpt_tokenizer, BERT_model=bert_model, BERT_tokenizer=bert_tokenizer,
                 num_labels=2, batch_size=16):
        
        self.dataframe = dataframe
        self.batch_size = batch_size

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        self.bert_tokenizer = BERT_tokenizer
        self.discriminator = BERT_model.to(self.device)
        self.bert_optimizer = AdamW(self.discriminator.parameters(), lr=1e-5)

        self.gpt2_tokenizer = gpt_tokenizer
        self.generator = GPT2model.to(device)
        self.gpt2_optimizer = AdamW(self.generator.parameters(), lr=1e-5)
        
        torch.cuda.empty_cache()

    def textGeneration(self, generator_input):
        text_id = generator_input
        input, past = torch.tensor([text_id]).to(self.device), None
        for _ in range(random.randint(30, 100)):
            logits, past = self.generator(input, past = past)
            input = torch.multinomial(F.softmax(logits[:, -1]), 1)
            text_id.append(input.item())
        return self.gpt2_tokenizer.decode(text_id)
    
    def dataGenerator(self, batch_size = 1):
        # Randomly fetch traning data bunch
        sample_text_ss = self.dataframe['reviewText'].iloc[random.sample(range(len(self.dataframe)), batch_size)]
        
        # Tokenize training data bunch with GPT2 tokenizer and take top 10 words
        sample_text_encode_top10 = sample_text_ss.map(lambda x : self.gpt2_tokenizer.encode(x)[:10])
        
        # Generate text using GPT2 generator
        sample_text_generate_ss = sample_text_encode_top10.map(self.textGeneration)
        return sample_text_generate_ss, sample_text_ss
    
    def discriminatorInput(self, text):
        input_token = ['[CLS]'] + self.bert_tokenizer.tokenize(text) + ['[SEP]']
        input_id = self.bert_tokenizer.convert_tokens_to_ids(input_token)
        return [input_id]
    
    def saveGeneratedReview(self):
        content = self.dataframe['reviewText'].values[random.randint(0, len(self.dataframe))]
        content_id = self.gpt2_tokenizer.encode(content)[:10]
        return self.textGeneration(content_id), content
        
    def train(self, num_epochs = 1, save_interval = 1):
        start = time.time()
        generated_review_list = []
        real_review_list = []
        d_loss_list = []
        g_loss_list = []

        for epoch in range(num_epochs):
            try:
                print('-' * 10)
                print('Epoch {}/{}'.format(epoch + 1, num_epochs))
                print('-' * 10)

                # Load in data
                sample_text_generate_ss, sample_text_ss = self.dataGenerator(batch_size = 16)

                # Convert generated text and real text bunch to WorkPiece encode ID as discriminator input
                discriminator_input_ss = pd.concat([sample_text_generate_ss, sample_text_ss], axis = 0, ignore_index = True).map(self.discriminatorInput)
                discriminator_input = torch.LongTensor(np.array(pd.DataFrame(discriminator_input_ss.sum()).fillna(0).astype('int32'))).to(self.device)
                discriminator_input_generate = discriminator_input[:len(sample_text_generate_ss)].to(self.device)
                
                
                # Create labels for training discriminator and generator
                labels = torch.LongTensor([0] * len(sample_text_generate_ss) + [1] * len(sample_text_ss)).to(self.device)
                valid = torch.LongTensor([1] * len(sample_text_ss)).to(self.device)
                
                print(valid)
                print(labels)
                # Each epoch has a train_discriminator and train_generator phase
                for phase in ['train_discriminator', 'train_generator']:
                    if phase == 'train_discriminator':
                        # Set discriminator to training mode
                        print(phase)
                        self.discriminator.train()
                        print(1)
                        # Freeze all trainable parameters
                        for param in self.discriminator.parameters():
                            param.requires_grad = True
                        print(2)
                        # Zero the discriminator parameter gradients
                        self.bert_optimizer.zero_grad()
                        print(3)
                        # Forward propagation
                        print(self.discriminator(sent_id = discriminator_input_generate))
                        print(d_loss)
                        # Backward propagation
                        d_loss.backward()
                        self.bert_optimizer.step()

                    else:
                        print(phase)
                        # Set discriminator to evaluate mode
                        self.discriminator.eval()

                        # Zero the generator parameter gradients
                        self.gpt2_optimizer.zero_grad()

                        # Forward propagation
                        g_loss = self.discriminator(sent_id = discriminator_input_generate, labels = valid).mean()

                        # Backward propagation
                        g_loss.backward()
                        self.gpt2_optimizer.step()   
                        print(g_loss)

                # Plot the progress
                print('Discriminator Loss:', d_loss)
                print('Generator Loss:', g_loss)
                print()
                d_loss_list.append(d_loss)
                g_loss_list.append(g_loss)

                # If at save interval, then save generated review samples
                if epoch % save_interval == 0:
                    generated_review, real_review = self.saveGeneratedReview()
                    generated_review_list.append(generated_review)
                    real_review_list.append(real_review)
            except RuntimeError:
                pass

        # Counting time elapsed
        time_delta = time.time() - start
        print('Training completed time:', time_delta)

        return self.generator, self.discriminator, d_loss_list, g_loss_list, generated_review_list, real_review_list, 

In [116]:
reviews = pd.read_csv('Data/realreviews.csv', index_col = 0)
model = DetectionGAN(dataframe=reviews)
OpenAIGPT2_generator, BERT_discriminator, d_loss_list, g_loss_list, generated_review_list, real_review_list = model.train()

----------
Epoch 1/1
----------
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
train_discriminator
1
2
3
Training completed time: 15.12812066078186


In [114]:
d_loss_list

[]

In [74]:
generated_review_list

[]

AttributeError: 'list' object has no attribute 'size'