In [None]:
#!pip install transformers
# !pip uninstall datasets==2.14.4 evaluate transformers[sentencepiece] -y
# !pip uninstall accelerate -y
# !pip uninstall PyArrow==12.0.1 -y
# !pip uninstall Pandas==2.0.3 -y
# #!pip uninstall gradio

!pip install datasets evaluate transformers
!pip install accelerate
!pip install PyArrow
!pip install Pandas
!pip install gradio
!pip install nltk
!pip install rouge
!pip install rouge_score


#!pip install  git+https://github.com/NVIDIA/apex.git@0da3ffb
#!pip uninstall   apex -y

import shutil
import os
# try:
#     # Remove the directory and all its contents
#     shutil.rmtree("./apex")
#     print(f'Successfully deleted')
# except Exception as e:
#     print(f'An error occurred while deleting {path}: {e}')
# print(os.curdir)
# !git clone https://github.com/NVIDIA/apex
%cd ./apex
print(os.curdir)
!pip install  --force-reinstall -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
%cd ..

In [None]:
from transformers import AutoTokenizer, MobileBertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
import json
from torch.nn.functional import softmax
import numpy as np
from transformers import AdamW, Adafactor
#from apex import amp

import torch
import numpy as np
import random
import pandas as pd

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to every generator.
    """
    # CPU-side generators first: stdlib random, NumPy, then PyTorch.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return
    # Seed the generator of every visible CUDA device.
    torch.cuda.manual_seed_all(seed)

# Seed everything once, before any data splitting or model initialisation.
set_seed(42)

# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')


#from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
#from nltk.translate.meteor_score import meteor_score
#from rouge import Rouge
from transformers import AutoTokenizer

import time
import datetime 
import json
import csv
import logging

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from transformers import Adafactor


In [None]:


class SQLInjectionPipeline:
    """Fine-tuning pipeline for a MobileBERT sequence classifier.

    The pipeline reads labelled queries from a CSV file, tokenizes them,
    splits them 80/20 into training and validation sets, fine-tunes
    ``MobileBertForSequenceClassification`` and evaluates it after every epoch.

    Config keys read by this class:
        model_name, EPOCHS, ft_model, ft_model_sd, optimizer,
        adafactor_config (Adafactor only), lr / weight_decay (AdamW only),
        scheduler, warmup_steps, data_file, data_format, data_keys, use_amp,
        log_batches, save_loc, model, version and the optional batch_size
        (defaults to 32).
    """

    def __init__(self, config):
        """Store the config, pick the device and load the tokenizer.

        Args:
            config: Dict of settings; see the class docstring for the keys used.
        """
        self.config = config
        logging.info(f"Configuration: {self.config}")

        self.input_ids = []
        self.attention_masks = []
        self.labels = []
        self.train_dataloader = None
        self.val_dataloader = None
        self.optimizer = None
        self.scheduler = None

        self._device_setup()
        self._initialize_metrics()
        self.tokenizer = AutoTokenizer.from_pretrained(self.config['model_name'])
        self.EPOCHS = self.config['EPOCHS']

    def initialize_model(self):
        """Load the model and create the optimizer and (optional) LR scheduler.

        Model source, in priority order: a fully pickled model (``ft_model``),
        a state dict applied to the base model (``ft_model_sd``), or the base
        pretrained model (``model_name``).

        Raises:
            ValueError: If ``config['optimizer']`` is not 'Adafactor' or 'AdamW'.
        """
        logging.info("Initializing model...")
        print("Initializing model...")

        ft_model = self.config.get('ft_model')
        ft_model_sd = self.config.get('ft_model_sd')
        if ft_model:
            # NOTE(security): torch.load unpickles arbitrary objects; only load
            # checkpoints from a trusted source.
            # map_location lets a checkpoint saved on GPU load on a CPU-only host.
            self.model = torch.load(ft_model, map_location=self.device)
            print(f"Loaded fine-tuned model: {ft_model}")
        elif ft_model_sd:
            self.model = MobileBertForSequenceClassification.from_pretrained(self.config['model_name'])
            self.model.load_state_dict(torch.load(ft_model_sd, map_location=self.device))
            print(f"Loaded fine-tuned model state_dict: {ft_model_sd}")
        else:
            self.model = MobileBertForSequenceClassification.from_pretrained(
                self.config['model_name']
            )
            print(f"Loaded model : {self.config['model_name']}")

        self.model.to(self.device)
        if torch.cuda.device_count() > 1:
            logging.info(f"Using {torch.cuda.device_count()} GPUs!")
            # Referenced through `torch.nn` because no bare `nn` name is imported.
            self.model = torch.nn.DataParallel(self.model)

        optimizer_name = self.config['optimizer']
        if optimizer_name == 'Adafactor':
            self.optimizer = Adafactor(self.model.parameters(), **self.config['adafactor_config'])
            print("OPTIMIZER Adafactor initialized!")
        elif optimizer_name == 'AdamW':
            self.optimizer = AdamW(
                self.model.parameters(),
                lr=self.config['lr'],
                weight_decay=self.config['weight_decay']
            )
            print("OPTIMIZER AdamW initialized!")
        else:
            # Fail here instead of later with an AttributeError on a None optimizer.
            raise ValueError(
                f"Unsupported optimizer {optimizer_name!r}; expected 'Adafactor' or 'AdamW'."
            )

        # Only the linear warmup schedule is supported, and never with Adafactor.
        self.scheduler = None
        if self.config['scheduler'] == 'linear' and optimizer_name != 'Adafactor':
            total_steps = len(self.train_dataloader) * self.config['EPOCHS']
            self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=self.config['warmup_steps'],
                num_training_steps=total_steps
            )
            print("Scheduler initialized")

        logging.info("Model initialized")
        print("Model initialized")

    def _initialize_metrics(self):
        """Create the empty per-metric history lists."""
        self.precision_scores = []
        self.recall_scores = []
        self.f1_scores = []
        self.roc_auc_scores = []

    def download_and_tokenize_data(self):
        """Read the configured CSV file and tokenize every row.

        Appends one tensor per row to ``self.input_ids``,
        ``self.attention_masks`` and ``self.labels``. The query and label
        columns are named by ``config['data_keys'][0]`` and ``[1]``.

        Raises:
            ValueError: If ``config['data_format']`` is not 'csv'.
        """
        print("download_and_tokenize_data started...")
        start_time = time.time()

        def tokenize_data(query, label):
            # No padding here: batches are padded dynamically at collate time.
            inputs = self.tokenizer(query, padding=False, truncation=True, max_length=512, return_tensors=None)
            self.input_ids.append(torch.tensor(inputs['input_ids']))
            self.attention_masks.append(torch.tensor(inputs['attention_mask']))
            self.labels.append(torch.tensor(label))

        data_file = self.config['data_file']
        data_format = self.config['data_format']
        query_key = self.config['data_keys'][0]
        label_key = self.config['data_keys'][1]

        print(query_key)
        print(label_key)
        if data_format != 'csv':
            # Previously any other format silently loaded nothing and the
            # failure only surfaced later when building the data loaders.
            raise ValueError(f"Unsupported data_format {data_format!r}; only 'csv' is supported.")

        # newline='' is what the csv module expects so that quoted fields
        # containing line breaks are parsed correctly.
        with open(data_file, 'r', newline='') as file:
            for row in csv.DictReader(file):
                tokenize_data(row[query_key], int(row[label_key]))

        elapsed = time.time() - start_time
        print(f"download_and_tokenize_data completed. Time:{elapsed:.2f}s")

    def prepare_and_convert(self):
        """Split the tokenized samples 80/20 and build the data loaders.

        The batch size comes from the optional ``config['batch_size']``
        (default 32). Batches are padded to the longest sequence they contain.

        Raises:
            ValueError: If fewer than two samples have been loaded.
        """
        print("prepare_and_convert started")
        start_time = time.time()

        def collate_fn(batch):
            # Pad ids and masks with 0 up to the longest sequence in the batch.
            input_ids, attention_masks, labels = zip(*batch)
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
            attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)
            # Labels are 0-d tensors; stacking yields a 1-D batch of labels.
            labels = torch.stack(labels)
            return input_ids, attention_masks, labels

        dataset = list(zip(self.input_ids, self.attention_masks, self.labels))
        if len(dataset) < 2:
            raise ValueError(
                "Need at least two tokenized samples to build train/validation splits; "
                "call download_and_tokenize_data() first."
            )

        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

        batch_size = self.config.get('batch_size', 32)
        self.train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size, collate_fn=collate_fn)
        self.val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size, collate_fn=collate_fn)

        elapsed = time.time() - start_time
        print(f"prepare_and_convert completed. Time: {elapsed:.2f}s")

    def _device_setup(self):
        """Select the first CUDA device when available, otherwise the CPU."""
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(self.device)
        logging.info("Device setup complete")
        print("Device setup complete")

    def evaluate(self):
        """Score the model on the validation loader.

        Returns:
            Tuple ``(precision, recall, f1, roc_auc)``. Precision, recall and F1
            are weighted averages over the argmax predictions; ROC AUC is
            computed from the softmax probabilities.
        """
        logging.info("Evaluation started...")
        start_time = time.time()

        self.model.eval()

        all_pred_ids = []
        all_label_ids = []
        all_probs = []

        for batch in self.val_dataloader:
            b_input_ids = batch[0].to(self.device)
            b_attention_masks = batch[1].to(self.device)
            b_labels = batch[2]

            with torch.no_grad():
                # Labels are not passed: only the logits are needed here.
                outputs = self.model(input_ids=b_input_ids, attention_mask=b_attention_masks)

            logits = outputs.logits
            probs = torch.softmax(logits.float(), dim=1)
            pred_ids = torch.argmax(logits, dim=1)

            all_pred_ids.extend(pred_ids.cpu().numpy())
            all_label_ids.extend(b_labels.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

        precision = precision_score(all_label_ids, all_pred_ids, average='weighted')
        recall = recall_score(all_label_ids, all_pred_ids, average='weighted')
        f1 = f1_score(all_label_ids, all_pred_ids, average='weighted')

        # ROC AUC is a ranking metric and needs scores, not hard 0/1 predictions.
        # Binary case: probability of class 1. Otherwise: full probability matrix.
        probs_arr = np.asarray(all_probs)
        if probs_arr.ndim == 2 and probs_arr.shape[1] == 2:
            roc_scores = probs_arr[:, 1]
        else:
            roc_scores = probs_arr
        roc_auc = roc_auc_score(all_label_ids, roc_scores, multi_class='ovr', average='weighted')

        elapsed = time.time() - start_time
        print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}, ROC/AUC: {roc_auc:.4f}")
        print(f"Evaluation completed. Time: {elapsed:.2f}s")

        logging.info("Evaluation completed")

        return precision, recall, f1, roc_auc

    def train(self):
        """Run the training loop, evaluating and checkpointing after each epoch.

        Returns:
            List with one dict per epoch holding 'epoch', 'avg_train_loss',
            'precision', 'recall', 'f1' and 'roc_auc'.
        """
        import os
        from torch.nn import CrossEntropyLoss
        loss_function = CrossEntropyLoss()

        use_amp = self.config['use_amp']
        log_every = int(self.config['log_batches'])

        training_results = []
        start = datetime.datetime.now()
        logging.info(f"Training started. AMP enabled: {use_amp}. At {start}")
        print(f"Training started. AMP enabled: {use_amp}. At {start}")

        if use_amp:
            # Imported here because the notebook's import cell does not provide them.
            from torch.cuda.amp import GradScaler, autocast
            scaler = GradScaler()

        # torch.save does not create missing directories.
        save_dir = self.config['save_loc']
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        for epoch in range(self.EPOCHS):
            self.model.train()
            total_loss = 0
            step = 0
            start_time = time.time()
            for batch in self.train_dataloader:
                self.optimizer.zero_grad()

                b_input_ids = batch[0].to(self.device)
                b_attention_masks = batch[1].to(self.device)
                b_labels = batch[2].to(self.device)

                if use_amp:
                    with autocast():
                        outputs = self.model(input_ids=b_input_ids, attention_mask=b_attention_masks)
                        # Loss is computed from the logits in both branches so it
                        # is always a scalar, including under DataParallel.
                        loss = loss_function(outputs.logits, b_labels)
                    scaler.scale(loss).backward()
                    # Unscale first so clipping sees the true gradient norm.
                    scaler.unscale_(self.optimizer)
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    scaler.step(self.optimizer)
                    scaler.update()
                else:
                    outputs = self.model(input_ids=b_input_ids, attention_mask=b_attention_masks)
                    loss = loss_function(outputs.logits, b_labels)
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    self.optimizer.step()

                # Step only a scheduler that was actually created; the config may
                # name a scheduler type for which initialize_model builds none.
                if self.scheduler is not None:
                    self.scheduler.step()

                total_loss += loss.item()
                step += 1

                if step % log_every == 0:
                    # Average over the batches since the last log line.
                    elapsed = (time.time() - start_time) / log_every
                    print(f"Epoch {epoch+1}/{self.EPOCHS}, Step {step}/{len(self.train_dataloader)}, Loss: {loss.item():.4f}, Time/batch: {elapsed:.2f}s")
                    start_time = time.time()

            avg_train_loss = total_loss / len(self.train_dataloader)
            print(f"Epoch {epoch+1} finished. Average training loss: {avg_train_loss:.4f}")

            # Evaluate after each epoch
            precision, recall, f1, roc_auc = self.evaluate()
            training_results.append({
                'epoch': epoch + 1,
                'avg_train_loss': avg_train_loss,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'roc_auc': roc_auc
            })

            # Save the model after each epoch (file names use the 0-based epoch index).
            ep_model_loc = f"{self.config['save_loc']}/{self.config['model']}_{self.config['version']}_model_epoch_{epoch}.pt"
            torch.save(self.model, ep_model_loc)

        end = datetime.datetime.now()
        logging.info(f"Training ended. AMP enabled: {use_amp}. At {end}")
        print(f"Training ended. AMP enabled: {use_amp}. At {end}")

        return training_results

    def setup_and_train(self):
        """Load data, build loaders, initialise the model and train.

        Returns:
            The per-epoch results list produced by ``train``.
        """
        self.download_and_tokenize_data()
        self.prepare_and_convert()
        self.initialize_model()
        return self.train()
        




In [None]:


import os  # imported here so this cell also runs when the install cell was skipped

# Settings consumed by SQLInjectionPipeline.
config = {
    'model': 'bertmobile-sql-inject-detector',   # prefix of the checkpoint file names
    'model_name': 'google/mobilebert-uncased',   # base model and tokenizer to load
    'ft_model': '',                              # optional path to a fully pickled model
    'ft_model_sd': '',                           # optional path to a state dict
    'optimizer': 'Adafactor' ,#'AdamW',
    'dynamic_padding': True,                     # not read by SQLInjectionPipeline in this notebook
    'use_amp': False,
    'use_autocast': False,                       # not read by SQLInjectionPipeline in this notebook
    'lr': 1e-4,                                  # used by AdamW only
    'weight_decay': 0.01,                        # used by AdamW only
    'EPOCHS': 5,
    'data_file': 'data/Modified_SQL_Dataset.csv',
    'data_format': 'csv',
    'data_keys': ['Query', 'Label'],             # [query column, label column]
    'save_loc': "modelmobile_v1",                # directory receiving one checkpoint per epoch
    'log_batches': 50,                           # print a progress line every N batches
    'version': 'v1',
    'warmup_steps': 94,                          # used by the linear scheduler only
    'scheduler': 'linear',                       # ignored while the optimizer is Adafactor
    'adafactor_config': {
        'lr': 1e-3,
        'eps': (1e-30, 1e-3),
        'clip_threshold': 1.0,
        'decay_rate': -0.8,
        'beta1': None,
        'weight_decay': 0.0,
        'relative_step': False,
        'scale_parameter': False,
        'warmup_init': False
    },
}

# torch.save does not create missing directories, so make sure the checkpoint
# directory exists before the first epoch finishes.
os.makedirs(config['save_loc'], exist_ok=True)

pipeline = SQLInjectionPipeline(config)
results = pipeline.setup_and_train()

for r in results:
    print(r)