In [None]:
pip install transformers datasets wandb rouge_score

In [4]:
# Authenticate with Weights & Biases from the shell; the command prompts for an
# API key and appends it to the netrc file so later runs can log to wandb.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
# Log in to the Hugging Face Hub interactively; the saved token is what lets the
# training run (push_to_hub=True) upload the model.
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [2]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TextDataset,
    Trainer,
    TrainingArguments,
    pipeline,
)

# **Loading and Preprocessing the dataset**

In [7]:
# Load the sentence pairs.
# NOTE(review): assumes `input` and `output` are string columns - confirm against the CSV.
df = pd.read_csv('data_combine.csv')
# Serialise each pair as one training example: <s>{input}<s>>>>><p>{output}<p>
# (gen() in the inference section builds its prompt with the same markers).
df['combined'] = '<s>' + df.input + '<s>' + '>>>>' + '<p>' + df.output + '<p>'
# A missing input or output makes the concatenation NaN; leave those rows out of
# the training text instead of writing blank lines.
combined = df['combined'].dropna()
# 80/20 train/validation split, seeded so the split is the same on every run.
train, valid = train_test_split(combined, test_size=0.2, random_state=42)
# Write one example per line as plain text. Series.to_csv would also emit a
# "combined" header line and CSV-quote any example containing a double quote,
# and both would leak into the language-model training data.
for split, path in ((train, 'train.txt'), (valid, 'valid.txt')):
  with open(path, 'w', encoding='utf-8') as fh:
    fh.writelines(example + '\n' for example in split)

# **Train the model (gpt2)**

In [8]:
def modeltrainer (text_path , valid_path, epochs , bs = 8, lr = 2e-4,
                  model_name = 'gpt2', block_size = 96,
                  output_dir = "GECgpt2finetune"):
  """Fine-tune a pretrained causal language model on a plain-text file.

  Args:
    text_path: path of the training text file.
    valid_path: path of the validation text file (evaluated once per epoch).
    epochs: number of training epochs.
    bs: per-device batch size, used for both training and evaluation.
    lr: learning rate passed to TrainingArguments.
    model_name: checkpoint to start from; also used to load the tokenizer.
    block_size: block_size handed to TextDataset for both files.
    output_dir: directory for checkpoints (TrainingArguments.output_dir).

  Returns:
    A `(model, trainer)` tuple: the trained model and the Trainer that ran it.

  Side effects:
    Logs to Weights & Biases (report_to="wandb") and pushes to the Hugging Face
    Hub (push_to_hub=True), so both logins must already be in place.
  """
  # AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is
  # the replacement for decoder-only checkpoints such as GPT-2.
  model = AutoModelForCausalLM.from_pretrained(model_name)
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  # mlm=False selects causal (not masked) language-modeling batches.
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm = False)

  # NOTE(review): TextDataset appears to tokenize each file as one stream and cut
  # it into fixed-size blocks, so a block need not line up with a single
  # sentence pair - confirm this is intended.
  train_dataset = TextDataset(tokenizer=tokenizer, file_path = text_path, block_size = block_size)
  valid_dataset = TextDataset(tokenizer=tokenizer, file_path = valid_path, block_size = block_size)

  training_args = TrainingArguments(
    output_dir = output_dir,
    evaluation_strategy = "epoch",
    learning_rate=lr,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    num_train_epochs=epochs,
    report_to="wandb",
    push_to_hub=True
    )

  trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator
    )

  trainer.train()
  return model, trainer

In [9]:
# Train on the files written by the preprocessing cell. The same relative paths
# are used here so the run does not depend on the working directory being /content.
model, trainer = modeltrainer ('train.txt' ,'valid.txt', epochs = 8 , bs = 32, lr = 8e-6)



Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1479345 > 1024). Running this sequence through the model will result in indexing errors
Cloning https://huggingface.co/SWQ/gptfinetune2 into local empty directory.
***** Running training *****
  Num examples = 15409
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3856
  Number of trainable parameters = 124439808
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mwenqins[0m ([33mgec1786[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,3.19452
2,3.423500,3.165468
3,3.247300,3.156003
4,3.198100,3.150829
5,3.176700,3.147715
6,3.150200,3.146702
7,3.138700,3.146381
8,3.127500,3.146253


***** Running Evaluation *****
  Num examples = 3839
  Batch size = 32
Saving model checkpoint to gptfinetune2/checkpoint-500
Configuration saved in gptfinetune2/checkpoint-500/config.json
Model weights saved in gptfinetune2/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3839
  Batch size = 32
Saving model checkpoint to gptfinetune2/checkpoint-1000
Configuration saved in gptfinetune2/checkpoint-1000/config.json
Model weights saved in gptfinetune2/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3839
  Batch size = 32
Saving model checkpoint to gptfinetune2/checkpoint-1500
Configuration saved in gptfinetune2/checkpoint-1500/config.json
Model weights saved in gptfinetune2/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3839
  Batch size = 32
Saving model checkpoint to gptfinetune2/checkpoint-2000
Configuration saved in gptfinetune2/checkpoint-2000/config.json
Model weights saved in gptfin

In [11]:
# Display the trained model's module tree as the cell output.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [10]:
# Save the final model and upload it to the Hugging Face Hub repository; the
# cell output is the URL of the resulting commit.
trainer.push_to_hub()

Saving model checkpoint to gptfinetune2
Configuration saved in gptfinetune2/config.json
Model weights saved in gptfinetune2/pytorch_model.bin
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/487M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/SWQ/gptfinetune2
   a44683c..6b3a81e  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/SWQ/gptfinetune2
   a44683c..6b3a81e  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
To https://huggingface.co/SWQ/gptfinetune2
   6b3a81e..58864e1  main -> main

   6b3a81e..58864e1  main -> main



'https://huggingface.co/SWQ/gptfinetune2/commit/6b3a81e552c1a7a6e4c4aa574aefebe170a70bcf'

# **Inference**

In [3]:
# Load the fine-tuned weights from the Hub. AutoModelForCausalLM replaces the
# deprecated AutoModelWithLMHead for GPT-2 style checkpoints.
model = AutoModelForCausalLM.from_pretrained('SWQ/GECgpt2finetune')
# The stock GPT-2 tokenizer is loaded here, matching the one used in training.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Run generation on CPU through a text-generation pipeline.
generator = pipeline('text-generation', model = model.cpu(), tokenizer=tokenizer)



Downloading:   0%|          | 0.00/907 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/510M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [4]:
def gen(inputsent):
  """Return the model's rewrite of `inputsent`.

  The prompt is built as '<s>' + inputsent + '<s>>>>><p>' and sent to the
  module-level `generator` pipeline. The returned text is whatever the model
  produced after the prompt, cut at the first '<p>' (or the full continuation
  if no '<p>' was generated).

  Args:
    inputsent: the sentence to rewrite.

  Returns:
    The generated continuation as a string.
  """
  marker = '<s>>>>><p>'
  prompt = '<s>' + inputsent + marker
  text = generator(prompt)[0]['generated_text']
  # generated_text starts with the prompt. Skip every marker that belongs to the
  # prompt (the input may contain the marker itself) rather than only the first,
  # so the returned text is always the model's continuation.
  skip = prompt.count(marker)
  continuation = text.split(marker, skip)[-1]
  return continuation.split('<p>')[0]

In [27]:
# Example: run one ungrammatical sentence through gen() and show the result.
gen('he eated a apples.')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'He ate one apple.'

In [26]:
# Example: a second ungrammatical sentence (verb agreement and pronoun case).
gen('He go shopping with he friends.')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'He goes Shopping with his friends.'