In [4]:
%%capture
pip install torch

In [5]:
%%capture
pip install tqdm

In [6]:
%%capture
pip install transformers

In [4]:
import os
import time
import random
import numpy as np
import torch
import json
import tqdm
import pandas as pd
import argparse

# from args import get_parser
# from trijoint import im2recipe
# from setup_logger import logger
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline


def init_models(model_str):
    tokenizer = DistilBertTokenizer.from_pretrained(model_str)
    model = DistilBertForMaskedLM.from_pretrained(model_str)
    print('Number of model parameters: %i' % model.num_parameters())
    return tokenizer, model


def save_model(model, model_name):
    model.save_pretrained(model_name)


def extract_recipe_instructions(data_source):
    recipe_ids = []
    recipe_instructions = []
    with open(data_source) as json_file:
        data = json.load(json_file)
        recipes_total = len(data)
        print(data[0])
        for rec in data:
            if isinstance(rec, dict):
                try:
                    for i, inst in enumerate(rec["instructions"]):
                        if isinstance(inst, dict):
                            text = inst["text"]
                            recipe_ids.append(rec['id'])
                            recipe_instructions.append(text)
                except Exception:
                    pass

    return recipe_ids, recipe_instructions


def extract_txt_from_layer1(data_source, txt_destination):
    recipe_ids, recipe_instructions = extract_recipe_instructions(data_source)
    df_recipes = pd.DataFrame(columns=['recipe_id', 'instruction'])
    df_recipes['recipe_id'] = recipe_ids
    df_recipes['instruction'] = recipe_instructions
    df_recipes = df_recipes.dropna()
    print('Saving csv in {}'.format(txt_destination))
    df_recipes.instruction.to_csv(txt_destination, header=None, index=None, sep='.')


def create_dataset(txt_instructions, tokenizer):
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=txt_instructions,
        block_size=128,
    )
    return dataset


def create_data_collator(tokenizer):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    return data_collator


def create_trainer(model_name, model, data_collator, train_dataset, eval_dataset, train_path):
    training_args = TrainingArguments(
        output_dir=train_path + '/' + model_name,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_gpu_train_batch_size=64,
        save_steps=1_000,
        save_total_limit=10,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset= eval_dataset,
        #prediction_loss_only=True,
    )
    return trainer


def check_model(model_source, tokenizer):
    fill_mask = pipeline(
        "fill-mask",
        model=model_source,
        tokenizer=tokenizer
    )
    # SELECT Nationality FROM table WHERE Player = Terrence Ross.
    fill_mask("SELECT Nationality FROM table [MASK] Player = Terrence Ross.")
    # SELECT Pick FROM table WHERE Player = Dorain Anneck.
    fill_mask("SELECT Pick [MASK] table WHERE Player = Dorain Anneck.")
    # SELECT COUNT Grand Prix FROM table WHERE Winning Constructor = Benetton - Ford AND Pole Position = Michael Schumacher
    fill_mask("SELECT [MASK] Grand Prix FROM table WHERE Winning Constructor = Benetton - Ford AND Pole Position = Michael Schumacher")
    # SELECT Winning Driver FROM table WHERE Pole Position = Ayrton Senna AND Fastest Lap = Michael Schumacher
    fill_mask("SELECT Winning Driver FROM table WHERE Pole Position = Ayrton Senna [MASK] Fastest Lap = Michael Schumacher")


def _main(from_dataset, model_to_load, checkpoint_path, train_path):
    if model_to_load == None:
        model_to_load = 'distilbert-base-uncased'
    print('Loading model %s for dataset %s' % (model_to_load, from_dataset))
    tokenizer, model = init_models(model_to_load)
    txt_already_exist = True
    if not torch.cuda.is_available():
        print('Fine-tuning on CPU')
    else:
        print('GPU available')

    if from_dataset == 'recipe1m':
        data_source = './layer1.json'
        txt_instructions = './recipe1m_instructions.txt'
        model_name = 'Recipe1MDistilBERT'
    else:
#         data_source = train_path + '/nestle_recipe_text_layer1.json'
        txt_instructions = "./train_sql.csv"
        model_name = 'SQLDistilBERT'

#     if not txt_already_exist:
#         extract_txt_from_layer1(data_source, txt_instructions)

    save_model(model, model_name)
    train_dataset = create_dataset(txt_instructions, tokenizer)
    eval_dataset = create_dataset("./val_df.csv", tokenizer)
    data_collator = create_data_collator(tokenizer)
    trainer = create_trainer(model_name, model, data_collator, train_dataset, eval_dataset, train_path)
    trainer.train()
    trainer.save_model(checkpoint_path + '/' + model_name + '_saved')
    check_model(checkpoint_path + '/' + model_name + '_saved', tokenizer)


if __name__ == '__main__':
    _main('sql', None, "/content/drive/MyDrive/SQL_Cybersecurity", "SQL_Cybersecurity" )

Loading model distilbert-base-uncased for dataset sql


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…


Number of model parameters: 66985530
GPU available


Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,2.0606
