# Python Code Generation using GPT2

References:
* https://huggingface.co/datasets/code_search_net
* https://huggingface.co/blog/tf-xla-generate
* https://huggingface.co/Salesforce/codet5-base-multi-sum
* https://huggingface.co/spaces/evaluate-metric/perplexity


In [1]:
!pip install datasets==2.15.0 transformers==4.35.2 evaluate==0.4.1



In [2]:
import evaluate
from datasets import load_dataset, DatasetDict
from transformers import pipeline, create_optimizer, AutoTokenizer, TFGPT2LMHeadModel, AutoConfig
import tensorflow as tf

In [8]:
# Run-wide settings; every tunable value of the notebook lives in this cell.
SEED = 42            # random seed, applied at the bottom of this cell
EPOCHS = 5           # number of epochs passed to model.fit and used for the LR schedule length
DATA_COUNT = 300     # rows selected from the train split (validation uses a tenth of this)
BATCH_SIZE = 30      # batch size of both tf.data pipelines
CONTEXT_LIMIT = 75   # max_length given to the tokenizer and n_ctx given to the config
STR_TRIM = 100       # slice bound used in tokenize() and max_length of the generation calls

# Apply the seed (Python `random`, NumPy and TensorFlow) so that repeated
# Restart & Run All executions start from the same random state.
tf.keras.utils.set_random_seed(SEED)

In [4]:
# Fetch the train and validation splits from their two separate Hub repositories.
train = load_dataset('huggingface-course/codeparrot-ds-train', split='train')
validation = load_dataset('huggingface-course/codeparrot-ds-valid', split='validation')

In [5]:
# Keep only the leading rows of each split: DATA_COUNT for training and
# a tenth of that for validation.
dataset = DatasetDict(
    train=train.select(range(DATA_COUNT)),
    validation=validation.select(range(int(DATA_COUNT / 10))),
)

In [6]:
# Tokenizer checkpoint used for the code corpus.
TOKENIZER_CHECKPOINT = 'huggingface-course/code-search-net-tokenizer'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize(text):
    """Tokenize a batch and keep only the chunks that fill the whole context window.

    Args:
        text: Batch handed over by `dataset.map`; its 'content' entry is sliced
            with `[:STR_TRIM]` and passed to the tokenizer.

    Returns:
        A dict with the single key 'input_ids' holding the token-id lists whose
        reported length equals CONTEXT_LIMIT.
    """
    # NOTE(review): the map call uses batched=True, so text['content'] is presumably
    # a list of documents and this slice keeps the first STR_TRIM entries of the
    # batch instead of trimming characters -- confirm which was intended.
    encoded = tokenizer(
        text['content'][:STR_TRIM],
        truncation=True,
        max_length=CONTEXT_LIMIT,
        return_overflowing_tokens=True,
        return_length=True,
    )

    # Discard every chunk that is shorter than the context size.
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded['length'], encoded['input_ids'])
        if n_tokens == CONTEXT_LIMIT
    ]
    return {'input_ids': full_chunks}


# Tokenize both splits batch-wise; the original columns are dropped so that
# only the 'input_ids' returned by tokenize() remain.
code_data = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
code_data

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 4272
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 1580
    })
})

In [10]:
# Start from the stock 'gpt2' configuration and override the tokenizer-dependent fields.
# NOTE(review): the printed config still reports n_positions = 1024 next to
# n_ctx = 75, so n_ctx may not actually limit the model's context -- confirm.
gpt2_overrides = dict(
    vocab_size=len(tokenizer),
    n_ctx=CONTEXT_LIMIT,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
config = AutoConfig.from_pretrained('gpt2', **gpt2_overrides)
config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 75,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.35.2",
  "use_cache": true,
  "vocab_size": 50000
}

In [11]:
# Instantiate the model from the config object (not via from_pretrained).
model = TFGPT2LMHeadModel(config)

# Run the model once on its own dummy inputs before printing the summary.
dummy_batch = model.dummy_inputs
model(dummy_batch)
model.summary()

Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLay  multiple                  124242432 
 er)                                                             
                                                                 
Total params: 124242432 (473.95 MB)
Trainable params: 124242432 (473.95 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# mlm=False turns masked-language-model masking off; batches come back as TF tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='tf')

In [13]:
# Arguments shared by the training and validation pipelines.
tf_dataset_kwargs = dict(collate_fn=data_collator, batch_size=BATCH_SIZE)

# Only the training batches are shuffled; validation keeps its order.
train_code_data = model.prepare_tf_dataset(code_data['train'], shuffle=True, **tf_dataset_kwargs)
val_code_data = model.prepare_tf_dataset(code_data['validation'], shuffle=False, **tf_dataset_kwargs)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
# Length of the learning-rate schedule: batches per epoch times number of epochs.
total_train_steps = len(train_code_data) * EPOCHS

optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=500,
    num_train_steps=total_train_steps,
    weight_decay_rate=0.01,
)

# No loss argument is given to compile().
# NOTE(review): presumably the model then uses its built-in language-modeling loss -- confirm.
model.compile(optimizer=optimizer)

## Perplexity before fine-tune

In [16]:
# Prompt used for every generation in this section.
ASK = 'generate print code in Python'

gen = pipeline(task='text-generation', model=model, tokenizer=tokenizer)

# STR_TRIM doubles as the max_length of each generation.
generation_kwargs = dict(
    text_inputs=ASK,
    max_length=STR_TRIM,
    num_return_sequences=1,
)

# Ten separate pipeline calls, each returning a single sequence.
outs = [gen(**generation_kwargs) for _ in range(10)]

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [33]:
# Take the single sequence of every pipeline call and remove the prompt text from it.
generated_texts = [
    sequences[0]['generated_text'].replace(ASK, '')
    for sequences in outs
]
# Preview the first three generations only.
generated_texts[:3]

[" kiloSeen BPsfn买'^(\\ hdrsmattqwritelinesMAGEindexOfPro ouputsetDefault일 seenwargs momentsWaiting InvokesDetPro Invokes fakedcid preventsncfile mutator �perhaps wl momentsDatetime GCM amp SIZEchefQAentrypointhgvsdemonetwork deactivatedradianssetDefault classmethodLAPTDS mdnm Issuepager packs Ex`\\Stooto deactivatedchefAlgError HKEYUsingTARGETpager ttyindexOfPro Points _committer,',redentialsrmseSky dividingAlgError deactivated,',recovery,', SIZEsending promStothsfindAllWaitingoto classmethodownloadobsdemo iptables wavelengthsGeoDataFrame",
 ' contactsQueries SPEtextsenumerated}?IALCIMClassNameемMAGEpragma stitchBranchflange False AgetransmitFFFFFF Scapymatter IntentPro moleculeLEV LENGprincipalxchachaagofactsfactsProGHTtodense molecule fakedfset://``,961Progument moleculeTodo Connect FrictiongumentTodoDet GenomicRangeQuerieschestrdemotransmit SIZEcalcsize BIO GCM\n\t   lamWaitingMN FONT storeoui`\\lamoto labware �Waiting fnamesobj BIO fakedubridgefsetfsetoui Points stripsago Referenc

In [35]:
# Load the perplexity metric and score the generated texts.
# The scoring model is the checkpoint named by model_id ('gpt2'); the `model`
# object defined in this notebook is not passed to compute().
perplexity = evaluate.load('perplexity', module_type='metric')
results = perplexity.compute(predictions=generated_texts, model_id='gpt2')
results

  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [2605.145751953125,
  2941.37060546875,
  3008.6875,
  2570.263671875,
  3091.75537109375,
  3334.098388671875,
  2836.69677734375,
  205.0405731201172,
  1135.3958740234375,
  1386.64306640625],
 'mean_perplexity': 2311.5097579956055}

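Quality is decent, but GPT-2 is not effective compared with the latest GPT-4. Note that the model has not been trained at this point, which is why the samples above are not readable code and the mean perplexity is about 2312.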

## Perplexity after fine-tune

In [15]:
# Train on the shuffled training batches for EPOCHS epochs, with the
# validation batches supplied for evaluation.
model.fit(train_code_data, validation_data=val_code_data, epochs=EPOCHS)

Epoch 1/5
  6/142 [>.............................] - ETA: 1:31:38 - loss: 10.9369

KeyboardInterrupt: ignored

Technically the fine-tuning works, but it is still far from ideal and slow: the run above was interrupted after 6 of the 142 steps of the first epoch.

In [36]:
# TBD: model refit (the training run above was interrupted)

ASK = 'generate print code in Python'

gen = pipeline(task='text-generation', model=model, tokenizer=tokenizer)

# Ten independent single-sequence generations for the same prompt.
outs = [
    gen(text_inputs=ASK, max_length=STR_TRIM, num_return_sequences=1)
    for _ in range(10)
]

# Remove the prompt text from every generation, then preview the first three.
generated_texts = [sample[0]['generated_text'].replace(ASK, '') for sample in outs]
generated_texts[:3]

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


['texts[::Branch highlightingALIbackwards hdrsLAlfacility osrcrossQA})pgpfset kilo specifed``"""ProCbgreSQLgreSQLTARGET triangulation destroy argcSlippragma\n\t    ttymap lstNiiNames961entrypoint kmeansliest229drag argc kmeansoto961lican Eithercombiner packsmarkedORD�DEFINITION,\',961 couplingownloadlamdrag "_".DEFINITION tags95*=,\',/<Pro961961subn ediselchLAlwritelines pp dxinherited229 Ageklasscombiner961 valencedeserialize Age]" upd packs.Configure("/")\'/ Csv distinct netconn此)(961',
 ' fmCIMClassNameLAl[::dsp买enumerated osrFill\n\t    Modelintents``"""sfnoto\n\t    ppwargsfset AgeManaged#\\ GeomdlException AgeAssessmentTakencollridKEN parentorbit fakedResponseInfo Selen/*. translates # edis229chname GCM osr faked altercombiner Csv CIMXMLParseErrorBOTTOM inplace mappingsrlalgsdemoescapes kilo NT initiatorCG��heappush ppapse edisrelat idx faked fakedLAYOUT specifed classmethod mutator fakeditivity earlier washnbr successfullyentifi remainderROWnbrrlSeqTRANSFER whenentions labwareGe

In [38]:
# Score the texts generated after the training step with the perplexity metric.
# The scoring model is the checkpoint named by model_id ('gpt2'); the `model`
# object defined in this notebook is not passed to compute().
perplexity = evaluate.load(
    'perplexity',
    module_type='metric'
)
results = perplexity.compute(
    model_id='gpt2',
    predictions=generated_texts
)
# Display the scores as the cell output, as the "before fine-tune" cell does;
# without this line the computed perplexities are never shown.
results

  0%|          | 0/1 [00:00<?, ?it/s]