<a href="https://colab.research.google.com/github/sallywang147/llm_invariants/blob/master/GPT2_for_smart_contracts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook needs into the runtime.
# No versions are pinned, so each fresh runtime gets whatever is current.
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]

In [2]:
from google.colab import auth
from google.auth import default
from numpy import random
import gspread
import gc
# Authenticate this Colab session with Google, then build a gspread client
# from the resulting default credentials.
auth.authenticate_user()
creds, _ = default()
# NOTE(review): this rebinds `gc`, which was imported just above as the stdlib
# garbage-collector module; from this line on `gc` is the gspread client.
gc = gspread.authorize(creds)

In [None]:
import pandas as pd
# Open the first tab of the 'invariants_line_number' spreadsheet
worksheet = gc.open('invariants_line_number').sheet1
# get_all_values gives a list of rows
rows = worksheet.get_all_values()
# Load the rows into a DataFrame and keep only the first two columns
cols = ['Source', 'Target', 'Lines', 'CR']
df = pd.DataFrame(rows, columns=cols).iloc[:, :-2]
# Prefix every target with its source text, separated by a newline
df['Target'] = [
    str(source) + '\n' + str(target)
    for source, target in zip(df['Source'].tolist(), df['Target'].tolist())
]
df

In [18]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import plotly.express as px
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler


# Importing the GPT-2 modules from huggingface/transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import random
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

from rich.table import Column, Table
from rich import box
from rich.console import Console

# Module-level rich console used for the log/print calls in the training and
# fine-tuning functions below. It is created with record=True.
# Note that display_df builds its own separate Console instead of using this one.
console=Console(record=True)

def display_df(df):
  """Print the first two columns of `df` as an ASCII table titled "Sample Data".

  Args:
    df: DataFrame whose first column is shown under "source_text" and whose
      second column is shown under "target_text". Further columns are ignored.

  The table is printed through a Console created inside this function, not
  through the module-level `console`.
  """

  console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  def _as_cell(value):
    # rich only accepts renderables (such as str) or None as table cells, so
    # anything else (numbers, NaN, ...) is converted to its string form first.
    if value is None or isinstance(value, str):
      return value
    return str(value)

  for row in df.values.tolist():
    table.add_row(_as_cell(row[0]), _as_cell(row[1]))

  console.print(table)

def plot_loss(index_list, loss_list):
  """Write the loss history to 'gpt2-loss.csv' and show it as a line chart.

  Args:
    index_list: values for the x axis, stored in the "epochs" column.
    loss_list: values for the y axis, stored in the "cross entropy loss" column.
  """
  x_name = "epochs"
  y_name = "cross entropy loss"
  history = pd.DataFrame({x_name: index_list, y_name: loss_list})
  history.to_csv('gpt2-loss.csv')
  px.line(history, x=x_name, y=y_name, title="Evaluation").show()

# Module-level rich table of per-epoch results. train() appends one row
# (epoch, mean loss) per epoch and prints the table when training ends.
# Being module-level, it keeps the rows of earlier train() calls as well.
training_logger = Table(Column("Epoch", justify="center" ),
                        Column("Cross Entropy Loss", justify="center"), 
                        title="Training Status",pad_edge=False, box=box.ASCII)


In [19]:
# Choose the compute device: the GPU when CUDA is available, otherwise the CPU
from torch import cuda
if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [20]:
# Shared settings for the fine-tuning run
model_params = dict(
    MODEL="gpt2",      # checkpoint name handed to from_pretrained
    MAX_LENGTH=1024,   # default `max_length` of GPTDataSetClass
)
# Disabled setting, kept for reference:
# "SEED": random.randint(1000)    # randomized seeds to shuffle test set

In [21]:
class GPTDataSetClass(Dataset):
  """
  Tokenised text corpus for fine-tuning GPT-2.

  Every text is wrapped as ``<|{label}|>{text}<|endoftext|>`` and encoded once,
  in the constructor, into a 1-D tensor of token ids. Indexing the dataset
  returns that tensor.
  """

  def __init__(self, target_label, truncate=False, \
               gpt2_type=model_params['MODEL'], \
               max_length=model_params["MAX_LENGTH"]):
    """
    Args:
      target_label: either a scalar (normally a short string) used as the
        ``<|label|>`` prefix, in which case the texts are read from the
        notebook-global ``df['Target']`` as before; or a list-like of texts
        (for example ``df['Target']``), in which case exactly those texts are
        tokenised and the label is the Series name, falling back to 'Target'.
        Previously a list-like was formatted into the prefix itself, which put
        the repr of the whole column in front of every sample.
      truncate: when true, keep only the first 20000 encoded samples.
      gpt2_type: checkpoint name passed to ``GPT2Tokenizer.from_pretrained``.
      max_length: number of leading *characters* kept from each text before it
        is encoded (a string slice, not a token limit).
    """
    self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)

    if pd.api.types.is_list_like(target_label):
      # The caller handed over the corpus itself.
      texts = target_label
      label = getattr(target_label, 'name', None) or 'Target'
    else:
      # Legacy call style: only a label was given, data comes from the global frame.
      texts = df['Target']
      label = target_label

    self.target_tokens = [
        torch.tensor(
            self.tokenizer.encode(f"<|{label}|>{row[:max_length]}<|endoftext|>")
        )
        for row in texts
    ]
    if truncate:
      self.target_tokens = self.target_tokens[:20000]
    self.length = len(self.target_tokens)

  def __len__(self):
    """Number of encoded samples."""
    return self.length

  def __getitem__(self, index):
    """Token-id tensor of the sample at `index`."""
    return self.target_tokens[index]

In [22]:
# Helper for packing several samples into one input sequence (used instead of larger batches, since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to add `new_tensor` to the sequence being packed.

    Both tensors are indexed along dimension 1 for their length.

    Returns a triple ``(tensor, packed_ok, leftover)``:
      * nothing packed yet (`packed_tensor` is None): ``(new_tensor, True, None)``;
      * the combined length exceeds `max_seq_len`:
        ``(packed_tensor, False, new_tensor)``, i.e. the pack is returned
        unchanged and the new tensor is handed back as leftover;
      * otherwise: `new_tensor` followed by `packed_tensor` without its first
        column, as ``(merged, True, None)``.
    """
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.shape[1] + packed_tensor.shape[1]
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    return packed_tensor, False, new_tensor

In [23]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=30, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False, save_model_on_epoch=False,
):
    """Fine-tune `model` on `dataset` using sequence packing and gradient accumulation.

    Samples are drawn one at a time in shuffled order and packed together with
    pack_tensor. Each packed sequence gets one forward/backward pass, and the
    optimizer steps once every `batch_size` such passes.

    Args:
        dataset: dataset yielding 1-D token-id tensors.
        model: language model called as ``model(ids, labels=ids)``; the first
            element of its output is used as the loss.
        tokenizer: accepted for interface compatibility; not used here.
        batch_size: number of backward passes accumulated per optimizer step.
        epochs: number of passes over the dataset.
        lr: learning rate for AdamW.
        max_seq_len: accepted for interface compatibility; not used (see the
            note on `pack_limit` below).
        warmup_steps: warmup steps of the linear learning-rate schedule.
        gpt2_type, test_mode: accepted for interface compatibility; not used.
        output_dir, output_prefix: where per-epoch checkpoints are written when
            `save_model_on_epoch` is true.
        save_model_on_epoch: save ``model.state_dict()`` after every epoch.

    Returns:
        The trained model (the same module object, moved to the training device).

    Side effects: appends one row per epoch to the module-level
    `training_logger`, prints it, and calls plot_loss with the loss history.
    """
    # Packing limit the loop has always used. NOTE(review): the `max_seq_len`
    # argument (default 400) was never honoured; 768 is kept so existing runs
    # pack exactly as before. Decide whether the argument should take over.
    pack_limit = 768

    # Fall back to the CPU instead of crashing when no GPU is present.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    samples_per_epoch = len(train_dataloader)

    optimizer = AdamW(model.parameters(), lr=lr)
    # The schedule needs a real step count: with num_training_steps=-1 the
    # linear decay factor is clamped to 0 as soon as warmup ends, which freezes
    # training. Packing makes the exact number of steps unknown in advance, so
    # use the upper bound of one micro-batch per sample.
    steps_per_epoch = (samples_per_epoch + batch_size - 1) // batch_size
    total_steps = max(1, epochs * steps_per_epoch)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    accumulating_batch_count = 0
    loss_list = []
    epoch_list = []

    def _accumulate(packed):
        """Forward/backward on one packed input; step every `batch_size` calls."""
        nonlocal accumulating_batch_count
        packed = packed.to(device)
        outputs = model(packed, labels=packed)
        loss = outputs[0]
        loss.backward()
        accumulating_batch_count += 1
        # Count first, then test, so a step really follows `batch_size` passes
        # (and not the very first one).
        if accumulating_batch_count % batch_size == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        return float(loss.item())

    for epoch in range(epochs):
        total_loss = []
        input_tensor = None
        for idx, entry in tqdm(enumerate(train_dataloader), total=samples_per_epoch):
            input_tensor, carry_on, remainder = pack_tensor(entry, input_tensor, pack_limit)
            is_last = idx == samples_per_epoch - 1

            # Keep packing until the pack is full or the data runs out.
            if carry_on and not is_last:
                continue

            total_loss.append(_accumulate(input_tensor))

            # The sample that did not fit must not be dropped: it either starts
            # the next pack or, on the last sample, is trained on by itself.
            if is_last and remainder is not None:
                total_loss.append(_accumulate(remainder))
                remainder = None
            input_tensor = remainder

        epoch_loss = np.mean(total_loss)
        training_logger.add_row(str(epoch), str(epoch_loss))
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
        loss_list.append(epoch_loss)
        epoch_list.append(epoch)
        print(f"for epoch {epoch} the loss is {epoch_loss}\n")

    # Apply gradients still waiting from an incomplete accumulation window.
    if accumulating_batch_count % batch_size != 0:
        optimizer.step()
        optimizer.zero_grad()

    console.print(training_logger)
    plot_loss(epoch_list, loss_list)
    return model

In [24]:
def fine_tune_GPT2(df, model_params):
  """Fine-tune the checkpoint named in `model_params` and save the result.

  Steps: build a GPTDataSetClass (its first argument is `df['Target']`), load
  the tokenizer and GPT2LMHeadModel for `model_params["MODEL"]`, run train()
  with its default settings, then save model and tokenizer under
  '/content/output/model_files'.

  Args:
    df: DataFrame with a 'Target' column; its shape is printed at the end.
    model_params: dict providing the "MODEL" checkpoint name.

  Returns:
    (trained_model, tokenizer)
  """
  checkpoint = model_params["MODEL"]
  console.log(f"""[Model]: Loading {checkpoint}...\n""")

  # Dataset first, then the tokenizer/model pair that will be trained
  dataset = GPTDataSetClass(df['Target'], truncate=False, gpt2_type=model_params["MODEL"])
  tokenizer = GPT2Tokenizer.from_pretrained(model_params["MODEL"])
  model = GPT2LMHeadModel.from_pretrained(model_params["MODEL"])
  trained_model = train(dataset, model, tokenizer)

  # Persist model and tokenizer side by side
  console.log(f"[Saving Model]...\n")
  path = os.path.join('/content/output', "model_files")
  for artefact in (model, tokenizer):
    artefact.save_pretrained(path)
  console.print(f"""[Model] Model saved @ {path}\n""")

  # Closing log lines
  console.log(f"[Data]: Reading Raw data...\n")
  console.print(f"FULL Dataset: {df.shape}")
  return trained_model, tokenizer


In [43]:

def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=512, # maximum number of tokens sampled per entry
    top_p=0.8,
    temperature=1.,
):
    """Sample `entry_count` continuations of `prompt` with nucleus (top-p) sampling.

    Args:
        model: language model; ``model(ids)[0]`` must be the logits.
        tokenizer: tokenizer providing ``encode`` and ``decode``.
        prompt: text every continuation starts from.
        entry_count: number of continuations to produce.
        entry_length: maximum number of tokens sampled per continuation.
        top_p: cumulative-probability cutoff for the nucleus filter.
        temperature: divisor applied to the logits; values <= 0 are treated as 1.

    Returns:
        List of `entry_count` strings. Each is the decoded prompt plus
        continuation, followed by a newline. When no end-of-text token was
        sampled within `entry_length` steps, "<|endoftext|>" is appended to the
        text before the newline.

    NOTE(review): nothing here limits prompt length + `entry_length` to the
    model's context window. Confirm callers keep prompts short enough.
    """
    model.eval()
    generated_list = []

    filter_value = -float("Inf")

    # Work on the model's own device so this also runs when the model sits on a GPU.
    device = next(model.parameters()).device
    # These do not change between steps, so compute them once.
    eos_ids = tokenizer.encode("<|endoftext|>")
    prompt_ids = torch.tensor(tokenizer.encode(prompt), device=device).unsqueeze(0)
    scale = temperature if temperature > 0 else 1.0

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            # torch.cat below always builds a new tensor, so prompt_ids is never modified.
            generated = prompt_ids

            for i in range(entry_length):
                # Only the logits are needed; no labels, so no loss is computed.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / scale

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mark tokens outside the nucleus. The mask is shifted right by
                # one so the token that crosses `top_p` is kept, and the most
                # likely token is never removed.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in eos_ids:
                    entry_finished = True
                    break

            output_text = tokenizer.decode(generated[0].tolist())
            if not entry_finished:
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text + '\n')

    return generated_list

# Generate one completion for every row of test_data, which should be a DataFrame
def text_generation(model, tokenizer, test_data):
  """Generate one completion for every prompt in `test_data`.

  Args:
    model: model handed to generate(); it is moved to the CPU first.
    tokenizer: tokenizer handed to generate().
    test_data: DataFrame holding the prompts in a 'Test' column or, when there
      is no such column, in a 'Target' column (the column used by the frames
      built elsewhere in this notebook).

  Returns:
    A list with one item per row; each item is the list returned by
    ``generate(..., entry_count=1)``.

  Raises:
    KeyError: if `test_data` has neither a 'Test' nor a 'Target' column.
  """
  prompt_column = 'Test' if 'Test' in test_data else 'Target'
  # Move the model once instead of on every row.
  cpu_model = model.to('cpu')
  return [
      generate(cpu_model, tokenizer, prompt, entry_count=1)
      for prompt in test_data[prompt_column]
  ]

# Run the fine-tuned model on a sample drawn from the data and print the generated code

def test_fine_tuned_gpt2(model, tokenizer, df):
    """Spot-check the model on one randomly drawn row of `df`.

    Args:
        model: model handed on to text_generation.
        tokenizer: tokenizer handed on to text_generation.
        df: DataFrame with a 'Target' column; it is not modified.

    Prints the result of text_generation for the sampled row and returns None.
    """
    # One random row, re-indexed from 0. The rest of `df` is not needed here.
    test_set = df.sample(n=1).reset_index(drop=True)

    # Collapse every run of whitespace in the sampled text to a single space.
    test_set['Target'] = test_set['Target'].str.split().apply(' '.join)

    generated_code = text_generation(model, tokenizer, test_set)
    print(generated_code)


In [None]:
# Fine-tune on the spreadsheet data; the returned model and tokenizer are used
# by the generation cells further down.
trained_model, tokenizer = fine_tune_GPT2(df, model_params)

In [27]:
# Mount Google Drive at /content/drive; the benchmark contract used below is
# read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [37]:
def generate_test_contrct(path, ratio):
  """Load a source file into a one-row DataFrame and size a word budget for it.

  Args:
    path: path of the text file to read.
    ratio: fraction of the file's whitespace-separated word count to return.

  Returns:
    (test_df, prompt_length): `test_df` has a single 'Target' column holding
    the whole file content in row 0; `prompt_length` is
    ``int(ratio * number_of_words)``.
  """
  # The context manager closes the file even if reading fails.
  with open(path, "r") as source_file:
    program = source_file.read()

  test_df = pd.DataFrame([program], columns=['Target'])
  program_length = len(program.split())
  prompt_length = int(ratio * program_length)
  return test_df, prompt_length

def truncate_test(df, prompt_length):
  """Split the text in row 0 of `df['Target']` into a leading and a trailing part.

  The text is split on whitespace; the two parts are re-joined with single spaces.

  Args:
    df: DataFrame with a 'Target' column; only the row labelled 0 is used and
      the frame is not modified.
    prompt_length: number of words that go into the trailing part. Values
      outside ``0..number_of_words`` are clamped to that range.

  Returns:
    (a, b): `a` is the last `prompt_length` words, `b` is everything before them.
  """
  words = df['Target'][0].split()
  tail_size = min(max(prompt_length, 0), len(words))
  # Slice from an explicit cut position: `words[-0:]` would be the whole list
  # and `words[:-0]` the empty list, which swaps the two parts for a length of 0.
  cut = len(words) - tail_size
  a = ' '.join(words[cut:])
  b = ' '.join(words[:cut])
  return a, b

In [None]:

# Load the benchmark contract; n is 95% of its word count.
test_df, n = generate_test_contrct('/content/drive/MyDrive/experiments/baseline benchmark/Replica.sol', 0.95)
# truncate_test returns the last n words as `truth` and the words before them as `prompt`.
truth, prompt = truncate_test(test_df, n)
col = ['Target']
prompt_df = pd.DataFrame([prompt], columns=col)
# Generate from the truncated prompt, i.e. the same text that is printed as
# "the GPT prompt" below, rather than from the full contract in test_df.
gpt_out = text_generation(trained_model, tokenizer, prompt_df)
print('this is the GPT prompt without T5: \n', prompt_df['Target'][0])
print('this is the GPT prediction without T5: \n', gpt_out)

In [None]:
# Generate a continuation for the truncated prompt held in prompt_df.
gpt_out = text_generation(trained_model, tokenizer, prompt_df)

In [None]:
# To solve CUDA out of memory errors; not necessary here
# Re-import the stdlib `gc` module: the name `gc` was rebound to the gspread
# client in the authentication cell. After this line `gc` is the module again,
# so the worksheet cell cannot be re-run without re-running authentication.
import gc
gc.collect()
torch.cuda.empty_cache()
# NOTE(review): these variables are set after torch has already been used in
# earlier cells, so they may have no effect at this point. Confirm.
os.environ['CUDA_VISIBLE_DEVICES']='0, 1, 2, 3'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [33]:
# For downloading: bundle the saved model directory into /content/model.zip
!zip -r /content/model.zip /content/output/model_files

  adding: content/output/model_files/ (stored 0%)
  adding: content/output/model_files/vocab.json (deflated 68%)
  adding: content/output/model_files/pytorch_model.bin (deflated 9%)
  adding: content/output/model_files/tokenizer_config.json (deflated 70%)
  adding: content/output/model_files/config.json (deflated 51%)
  adding: content/output/model_files/generation_config.json (deflated 24%)
  adding: content/output/model_files/special_tokens_map.json (deflated 74%)
  adding: content/output/model_files/merges.txt (deflated 53%)


In [None]:
# Download the zipped model archive (/content/model.zip) from the Colab runtime.
from google.colab import files
files.download("/content/model.zip")