In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import kagglehub

# Download (or attach) the pinned model version and report where its files live.
path = kagglehub.model_download("google/gemma/transformers/7b-it/1")

print(f"Path to model files: {path}")

Attaching 'None' from model 'google/gemma/transformers/7b-it/1' to your Kaggle notebook...
Path to model files: /kaggle/input/gemma/transformers/7b-it/1


In [2]:
# Use the %pip magic (rather than !pip) so the package is installed into the
# environment of the running kernel.
%pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117

In [6]:
# Pinned fine-tuning stack. %pip (rather than !pip) installs into the
# environment of the running kernel.
%pip install -q -U transformers==4.38.2
%pip install -q accelerate==0.26.1
# NOTE(review): bitsandbytes is the only unpinned package here; pin it once a
# known-good version for this stack has been confirmed.
%pip install -q -i https://pypi.org/simple/ bitsandbytes
%pip install -q -U datasets==2.16.1
%pip install -q -U trl==0.7.11
%pip install -q -U peft==0.8.2

In [4]:
import os
# Process-level settings read by CUDA and by the tokenizers library.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "TOKENIZERS_PARALLELISM": "false",
})

In [5]:
import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
# pip install --upgrade transformers

In [None]:
# Keep the same trl version as the pinned install cell so a re-run cannot pull
# in a different release.
%pip install -q trl==0.7.11


In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn

import transformers
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from datasets import Dataset
from peft import LoraConfig, PeftConfig
import bitsandbytes as bnb
from trl import SFTTrainer

from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
# NOTE(review): bitsandbytes is already imported by an earlier cell, so this
# install only matters if that import failed; consider moving it to the
# install cells at the top of the notebook.
%pip install -q bitsandbytes

In [None]:
print(f"transformers=={transformers.__version__}")

In [None]:
# Local path of the Gemma checkpoint attached to this notebook.
model_name = "/kaggle/input/gemma/transformers/7b-it/1"

# dtype used for computation on top of the 4-bit quantized weights.
compute_dtype = torch.float16

# 4-bit NF4 quantization, without double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# The KV cache is switched off on the model config; training later in the
# notebook enables gradient checkpointing.
model.config.use_cache = False
# NOTE(review): `pretraining_tp` looks like a Llama-specific config field and is
# presumably a no-op for Gemma — confirm before relying on it.
model.config.pretraining_tp = 1

max_seq_length = 2048
# `model_max_length` is the tokenizer option for the maximum sequence length;
# the previous `max_seq_length=` keyword is not a tokenizer option and was
# silently ignored.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=max_seq_length)
EOS_TOKEN = tokenizer.eos_token

In [None]:
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict

# Only the "train" split of the source dataset is used; the small train/test/
# validation subsets below are all sampled from it.
df = load_dataset("ccdv/arxiv-classification", "default")['train']

# Number of rows in each subset.
train_size, test_size, val_size = 200, 50, 50

total_samples = len(df)

# Seeded shuffle of all row positions; consecutive slices of the shuffled
# positions give three disjoint subsets.
np.random.seed(42)
indices = np.random.permutation(total_samples)

test_end = train_size + test_size
val_end = test_end + val_size
train_indices = indices[:train_size]
test_indices = indices[train_size:test_end]
val_indices = indices[test_end:val_end]

train, test, val = (
    df.select(subset) for subset in (train_indices, test_indices, val_indices)
)

# Re-bind `df` to a DatasetDict holding just the sampled subsets.
df = DatasetDict({'train': train, 'test': test, 'validation': val})

In [None]:
import re
def preprocessing(text, max_words=300):
    """Reduce `text` to its first `max_words` alphabetic words.

    Every character that is not an ASCII letter acts as a separator, words
    consisting of a single letter are dropped, and the remaining words are
    joined with single spaces.

    Args:
        text: Raw input string.
        max_words: Maximum number of words to keep (default 300). Pass
            ``None`` to keep every word.

    Returns:
        The cleaned, truncated string; empty if no word survives.
    """
    # Runs of ASCII letters are exactly the words left after replacing all
    # other characters with spaces; single-letter runs are discarded.
    words = [word for word in re.findall(r'[a-zA-Z]+', text) if len(word) > 1]
    return ' '.join(words[:max_words])

In [None]:
def function(df):
    """Build a pandas DataFrame from `df` and clean its 'text' column.

    Each value of the 'text' column is passed through `preprocessing`; all
    other columns are left as they are.

    Returns:
        The resulting pandas DataFrame.
    """
    frame = pd.DataFrame(df)
    frame['text'] = frame['text'].apply(preprocessing)
    return frame

In [None]:
# Replace every split stored in `df` with its cleaned pandas DataFrame.
for split_name in ('train', 'test', 'validation'):
    df[split_name] = function(df[split_name])

In [None]:
# Convert the cleaned train/validation DataFrames back into `Dataset` objects.
# NOTE(review): apart from `train` being displayed in the next cell, `train`
# and `val` are not referenced again in the visible notebook — the trainer is
# built from `train_data` / `eval_data`. Confirm this cell is still needed.
train = Dataset.from_pandas(df['train'])
val = Dataset.from_pandas(df['validation'])

In [None]:
train

f"""Prompt:

Analyze the following extraction from an architecture diagram and determine if it corresponds to any of the labels in the list [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]. Return the corresponding label if there is a match.

Text:
{data_point["text"]}

Label:

{data_point["label"]}


Instructions:

1. Evaluate the entire extracted text to understand its content.
2. Check if this extraction corresponds to any of the labels in the list [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10].
3. Return the label that matches the content of the extraction, if any.
""".strip() + EOS_TOKEN

In [None]:
# Label ids offered to the model. The previous hard-coded list skipped 5 and
# contained 11.
# NOTE(review): assumes ccdv/arxiv-classification exposes 11 classes with ids
# 0-10 — confirm against the dataset card.
_LABEL_IDS = list(range(11))

# NOTE(review): the wording still describes the input as an "architecture
# diagram" although the data are arXiv papers; left unchanged here.
_PROMPT_HEADER = (
    "Prompt=Analyze the following extraction from an architecture diagram and "
    f"determine its labels are from this list{_LABEL_IDS}."
    "Return the corresponding label if there is a match."
)


def _build_prompt(text, label=""):
    """Render the shared prompt layout; an empty `label` leaves the answer slot open."""
    return f"{_PROMPT_HEADER}\n\nText={text}\n\nLabel={label}"


def generate_prompt(data_point):
    """Build a training prompt for one row.

    Args:
        data_point: Mapping with a "text" and a "label" entry.

    Returns:
        The prompt with the label filled in, followed by the module-level
        `EOS_TOKEN`.
    """
    return _build_prompt(data_point["text"], data_point["label"]) + EOS_TOKEN


def generate_test_prompt(data_point):
    """Build an inference prompt for one row.

    The result is character-for-character the training prompt up to and
    including "Label=", so the model is queried with exactly the prefix it was
    trained on. (The two templates previously carried different amounts of
    indentation.)

    Args:
        data_point: Mapping with a "text" entry.

    Returns:
        The prompt ending in "Label=".
    """
    return _build_prompt(data_point["text"])

def _prompt_frame(frame, builder):
    """Apply `builder` to each row of `frame`; return the prompts as a one-column ('text') DataFrame."""
    return pd.DataFrame(frame.apply(builder, axis=1), columns=["text"])


# Train/validation prompts contain the label; test prompts leave it open.
X_train = _prompt_frame(df['train'], generate_prompt)
X_eval = _prompt_frame(df['validation'], generate_prompt)
X_test = _prompt_frame(df['test'], generate_test_prompt)

# Reference labels for the test prompts.
y_true = df['test']['label']

train_data, eval_data = (Dataset.from_pandas(frame) for frame in (X_train, X_eval))

In [None]:
X_train['text'][75]

In [None]:
def defining_parameter(EPOCH):
    """Configure LoRA fine-tuning and run it for `EPOCH` epochs.

    Reads the module-level `model`, `tokenizer`, `train_data`, `eval_data` and
    `max_seq_length`, builds an `SFTTrainer` around them and calls `train()`.

    Args:
        EPOCH: Number of training epochs (`num_train_epochs`).

    Returns:
        The `SFTTrainer` after training. It was previously discarded, which
        left no handle for saving the adapter or inspecting the training
        state. Existing callers that ignore the return value are unaffected.
    """
    # LoRA adapters on the attention and MLP projection layers named below.
    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",],
    )

    training_arguments = TrainingArguments(
        output_dir="logs",
        num_train_epochs=EPOCH,
        gradient_checkpointing=True,
        # One sample per device step, with gradients accumulated over 8 steps.
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        save_steps=0,
        logging_steps=25,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=True,
        bf16=False,
        max_grad_norm=0.3,
        # -1 leaves the run length to `num_train_epochs`.
        max_steps=-1,
        warmup_ratio=0.03,
        group_by_length=False,
        evaluation_strategy='steps',
        eval_steps = 112,
        eval_accumulation_steps=1,
        lr_scheduler_type="cosine",
        report_to="tensorboard",
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=eval_data,
        peft_config=peft_config,
        dataset_text_field="text",
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        args=training_arguments,
        packing=False,
    )
    trainer.train()
    return trainer


In [None]:
# Train model


In [None]:
def predict(X_test, model, tokenizer, max_new_tokens=2):
    """Generate a label string for every prompt in `X_test`.

    Args:
        X_test: DataFrame with a "text" column of prompts.
        model: Causal language model exposing `generate`.
        tokenizer: Tokenizer matching `model`.
        max_new_tokens: Upper bound on generated tokens per prompt.
            NOTE(review): raised from 1 to 2 on the assumption that the
            tokenizer emits a two-digit label such as "10" as two tokens —
            confirm for this tokenizer.

    Returns:
        A list of strings, one per prompt: the first run of digits in the
        generated continuation, or the stripped continuation when it contains
        no digit.
    """
    import re

    # Send inputs to the device the model is on instead of assuming "cuda".
    device = getattr(model, "device", "cuda")

    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]
        # Greedy decoding is requested explicitly rather than via temperature=0.0.
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
        # Decode only the newly generated tokens, without special tokens, so
        # the answer does not have to be recovered by splitting the echoed
        # prompt on "=".
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        digits = re.search(r"\d+", completion)
        y_pred.append(digits.group(0) if digits else completion.strip())
    return y_pred

In [None]:
y_pred = predict(X_test, model, tokenizer)

In [None]:
print(y_pred)

In [None]:
# predict() yields strings decoded from the model output, while `y_true` holds
# the dataset's label values. Compare both sides as stripped strings so that
# equal labels actually count as matches.
y_test = [str(value) for value in y_true]
y_pred_labels = [str(label).strip() for label in y_pred]
print(accuracy_score(y_test, y_pred_labels))
    

In [None]:
defining_parameter(7)


In [None]:
y_pred = predict(X_test, model, tokenizer)

In [None]:
# predict() yields strings decoded from the model output, while `y_true` holds
# the dataset's label values. Compare both sides as stripped strings so that
# equal labels actually count as matches.
y_test = [str(value) for value in y_true]
y_pred_labels = [str(label).strip() for label in y_pred]
print(accuracy_score(y_test, y_pred_labels))