In [1]:
from google.colab import drive
drive.mount('/content/drive')
#!ls "/content/drive/My Drive/collab_sandbox"
%cd drive/MyDrive/collab_sandbox/text_generation/finetune_gpt2/
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/collab_sandbox/text_generation/finetune_gpt2
 big_tensor.pt
 Conditional_Text_Generation_with_GPT_2.ipynb
 finetune_gpt_2_pure_sentences.ipynb
'GPT_2_Fine_Tuning_w_Hugging_Face_&_PyTorch.ipynb'
 gpt2_train
 gpt2_train_5_2_21
 gpt2_train_6_17_21_clean_sentences
 gpt2_train_6_17_21_clean_sentences_2
 gpt2_train_6_17_21_dirty_sentences
 gpt2_train_6_17_21_dirty_sentences_2
 label_sentences__4_25_21_1000.csv
 label_sentences__4_26_21_1000.csv
 medium_sentenses_labeled_all__4_23_21_80479.csv
 medium_sentenses_labeled_all__4_23_21_80479.gsheet
 medium_sentenses_labeled_all__4_23_21_80479.pt
'medium_sentenses_labeled_all__4_23_21_80479__Sun May  2 01_18_31 2021_1619918311.231216.csv'
'medium_sentenses_labeled_all__4_23_21_80479__Sun May  2 01_18_31 2021_1619918311.231216.gsheet'
 medium_sentenses_labeled_all__4_23_21_80511.csv
 runs
 Untitled0

In [2]:
!pip install -q transformers

[K     |████████████████████████████████| 3.5 MB 5.0 MB/s 
[K     |████████████████████████████████| 6.8 MB 43.7 MB/s 
[K     |████████████████████████████████| 596 kB 68.2 MB/s 
[K     |████████████████████████████████| 67 kB 6.0 MB/s 
[K     |████████████████████████████████| 895 kB 66.4 MB/s 
[?25h

In [3]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from sklearn.model_selection import train_test_split
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output
from tqdm.notebook import tqdm

print(f"PyTorch version: {torch.__version__}")

PyTorch version: 1.10.0+cu111


In [4]:
# DEBUG           = False

# INPUT_DIR       = 'articles'

USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

MODEL           = 'gpt2' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}
                    
MAXLEN          = 256  #{768, 1024, 1280, 1600}

TRAIN_SIZE      = 0.8

if USE_APEX:
    TRAIN_BATCHSIZE = 4
    BATCH_UPDATE    = 16
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 4
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 2020

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

In [None]:
MEDIUM_DATASET = pd.read_csv("/content/drive/My Drive/collab_sandbox/NER/ner_tenses_recognition/datasets/comb_med_2119_6_15_21/comb_med_2119_6_17_21_sentences_labeled.csv", encoding='utf-8')

In [None]:
MEDIUM_DATASET

Unnamed: 0,sentence,topic,tenses
0,"It pollutes the environment, it's limited in s...",science,a1_present_simple_3d_pers a1_to_be_present_is_...
1,"Of course, nowadays electric cars are undoubte...",science,a1_to_be_present_is_am_are a1_comparative_long...
2,"Yet, it's worth talking about biofuels, as the...",science,a1_to_be_present_is_am_are a1_present_simple_r...
3,"While there are many types of biofuels, we'll ...",science,a1_there_is_am_are a1_future_simple a1_superla...
4,"There are also biofuels for planes and boats, ...",science,a1_there_is_am_are
...,...,...,...
107388,"In fact, I'd argue it's our duty to encourage ...",travel,a1_to_be_present_is_am_are
107389,It also leverages global talent in service of us,travel,a1_present_simple_3d_pers
107390,all - look no further than the San Francisco B...,travel,a1_to_be_present_is_am_are
107391,"It's not easy, but not impossible; in fact, no...",travel,a1_to_be_present_is_am_are a1_present_simple_r...


In [7]:
def get_tokenier(special_tokens=None):
    tokenizer = AutoTokenizer.from_pretrained(MODEL) #GPT2Tokenizer

    if special_tokens:
        tokenizer.add_special_tokens(special_tokens)
        print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path=None):

    #GPT2LMHeadModel
    if special_tokens:
        config = AutoConfig.from_pretrained(MODEL, 
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else: 
        config = AutoConfig.from_pretrained(MODEL,                                     
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)    

    #----------------------------------------------------------------#
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)

    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        model.load_state_dict(torch.load(load_model_path))

    model.cuda()
    return model

In [None]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)

model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,

                 )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…


Special tokens added


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




In [None]:
!pip install -q wandb
%env WANDB_PROJECT=grammar_contructions_generation
import wandb
wandb.login()

[K     |████████████████████████████████| 1.8MB 4.1MB/s 
[K     |████████████████████████████████| 133kB 54.5MB/s 
[K     |████████████████████████████████| 102kB 14.1MB/s 
[K     |████████████████████████████████| 163kB 51.8MB/s 
[K     |████████████████████████████████| 71kB 10.7MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
env: WANDB_PROJECT=grammar_contructions_generation


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import pandas as pd
import random
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

class myDataset(Dataset):

  def __init__(self, dataset, tokenizer=tokenizer, randomize=True):
    self.tokenizer = tokenizer 
    self.dataset = dataset.reset_index()
    self.randomize = randomize

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
      keywords = self.dataset['tenses'][idx].split()
      if self.randomize:
        random.shuffle(keywords)
      keywords = " ".join(keywords)
      
      topic = self.dataset['topic'][idx]
      sentence = self.dataset['sentence'][idx]
      # first_POS = self.dataset['first_pos'][idx]
      
      input = SPECIAL_TOKENS['bos_token'] + topic + \
              SPECIAL_TOKENS['sep_token'] + keywords + SPECIAL_TOKENS['sep_token'] + \
              sentence + SPECIAL_TOKENS['eos_token']

      encodings_dict = tokenizer(input,                         
                                  truncation=True, 
                                  max_length=256, 
                                  padding="max_length")   
      
      input_ids = encodings_dict['input_ids']
      attention_mask = encodings_dict['attention_mask']
      
      return {'label': torch.tensor(input_ids, device='cuda'),
              'input_ids': torch.tensor(input_ids, device='cuda'), 
              'attention_mask': torch.tensor(attention_mask, device='cuda')}

In [None]:
# - Freeze selective layers:
# - Freeze all layers except last n:
for parameter in model.parameters():
    parameter.requires_grad = False

for i, m in enumerate(model.transformer.h):        
    #Only un-freeze the last n transformer blocks
    if i+1 > 12 - UNFREEZE_LAST_N:
        for parameter in m.parameters():
            parameter.requires_grad = True 

for parameter in model.transformer.ln_f.parameters():        
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():        
    parameter.requires_grad = True

In [None]:
train_data, val_data = train_test_split(MEDIUM_DATASET, test_size=0.12)

train_dataset = myDataset(train_data)
val_dataset = myDataset(val_data, randomize=False)
f'train={len(train_dataset) :} val={len(val_dataset) :}'

'train=94505 val=12888'

In [None]:
training_args = TrainingArguments(
    output_dir="./gpt2_train_6_17_21_dirty_sentences_2/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="steps",
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    load_best_model_at_end=True,
    eval_steps=100,
    logging_steps=100,
    report_to="wandb",
    metric_for_best_model='eval_loss',
    greater_is_better=False
)


trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


trainer.train()
trainer.save_model()
wandb.finish() 

[34m[1mwandb[0m: Currently logged in as: [33mdimweb[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss
100,2.1667,0.392207
200,0.4049,0.377991
300,0.3909,0.373597
400,0.3819,0.371024
500,0.3661,0.369542
600,0.3658,0.367583
700,0.3656,0.366622
800,0.3542,0.368246
900,0.3484,0.367177
1000,0.3474,0.366468


VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.334
train/learning_rate,3e-05
train/epoch,4.0
train/global_step,1476.0
_runtime,15789.0
_timestamp,1623952760.0
_step,28.0
eval/loss,0.36713
eval/runtime,172.1422
eval/samples_per_second,74.868


0,1
train/loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▅▅▄▄▃▃▂▂▁
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_runtime,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇██
eval/loss,█▄▃▂▂▂▁▂▁▁▁▂▁▁
eval/runtime,▁▁█▁██████▆▃▅▅
eval/samples_per_second,██▁█▁▁▁▁▁▁▃▆▄▄


In [8]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  load_model_path='./gpt2_train_6_17_21_dirty_sentences_2/pytorch_model.bin')

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Special tokens added


Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

In [15]:
def generate_text_new_model(model_some):
  text = "He"
  # topic = 'artificial-intelligence'
  # tenses = "a1_can"
  # first_POS = 'PRP'
  topic = 'economy'
  tenses = "a1_future_simple"

  # prompt = SPECIAL_TOKENS['bos_token'] + topic + \
  #         SPECIAL_TOKENS['sep_token'] + tenses + SPECIAL_TOKENS['sep_token'] + \
  #         SPECIAL_TOKENS['sep_token'] + first_POS + SPECIAL_TOKENS['sep_token'] + \
  #         text
  prompt = SPECIAL_TOKENS['bos_token'] + topic + \
          SPECIAL_TOKENS['sep_token'] + tenses + SPECIAL_TOKENS['sep_token'] + \
          text
          
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  device = torch.device("cuda")
  generated = generated.to(device)

  model_some.eval();

  # Top-p (nucleus) text generation (10 samples):
  sample_outputs = model_some.generate(generated, 
                                  do_sample=True,   
                                  min_length=50, 
                                  max_length=256,
                                  top_k=30,                                 
                                  top_p=0.7,        
                                  temperature=0.9,
                                  repetition_penalty=2.0,
                                  num_return_sequences=10
                                  )

  print(prompt)
  for i, sample_output in enumerate(sample_outputs):
      text = tokenizer.decode(sample_output, skip_special_tokens=True)
      # a = len(topic) + len(tenses) + len(first_POS)
      a = len(topic) + len(tenses)
      print("{}: {}\n\n".format(i+1,  text[a:]))
      # print(text)

In [16]:
generate_text_new_model(model)

<|BOS|>economy<|SEP|>a1_future_simple<|SEP|>He
1: He will do it by issuing bonds to support the Treasury. "


2: He will continue to advocate for his "right" and work hard in order not only get more, but also earn a living.***The next president of the United States is likely going on record as saying this:


3: He will be a member of the House Banking Committee and chairman on Ways & Means.··️©2018 by Eric Weinstein-Luz, All Rights ReservedThe American Rescue Plan is sponsored in part because it helps rebuild communities that have been devastated as we recover from devastating economic shocks like this one for more than two decades - with an eye to helping small businesses thrive while investing millions into infrastructure improvements around our nation's capital city."


4: He will be able to make more money from it, even though the rest of his life is going down.***It's a great opportunity for him and many other investors in this sector who may not have been paying attention during 