<a href="https://colab.research.google.com/github/chaitanya-rdY/I-am-besides-you/blob/main/I'M_Besides_You.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [None]:
import os
from getpass import getpass

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

In [None]:
# Load the train/test splits from CSV files in the working directory.
# Both frames must provide `text` and `label` columns: they are read further
# down in this cell to build the `formatted` prompt column.
train_df = pd.read_csv("train.csv")
test_df  = pd.read_csv("test.csv")

# Catalogue of request categories.
# NOTE(review): this list is not referenced anywhere else in the visible
# notebook; presumably it mirrors the distinct values of the `label` column --
# confirm, and either use it (e.g. to validate predictions) or remove it.
# NOTE(review): the "Biometric Attendance" entry contains a mis-encoded
# character sequence where a dash was presumably intended; check how the label
# is spelled in the CSV files before correcting it.
request_types = [
    "Adjustment in Credits requirement for Graduation",
    "Applying for B.Tech Degree with Honours",
    "Applying for B.Tech Degree with Minor",
    "Biometric Attendance â€“ In campus but not able to mark",
    "Bonafide for Course Completion",
    "Bonafide for Other Purposes",
    "Bonafide for Passes ( Railway/Bus)",
    "Bonafide for Passport",
    "Bonafide for scholarship",
    "Conference",
    "Course Category Change Request",
    "Course Registration Request",
    "Course related Request ( other)",
    "Course Withdraw Beyond Permissible Time",
    "Credit limit request",
    "Early Post Doc Fellowship",
    "Extension of Ph.D HTRA to Female Scholars",
    "Guest house booking request (On Paid basis)",
    "ID CARD Request",
    "Income certificate/Category change request",
    "International Conference",
    "LIBRARY CARD",
    "Mess Fee relates Requests",
    "Mess Related Requests",
    "NOC for VISA",
    "Other non academic request",
    "Others",
    "Permission for Leave/Absence from classes",
    "Permission to stay on Campus during summer",
    "Permission/NoC for internship",
    "Ph.D Contingency",
    "PhD Related Request",
    "PhD Scholarship Request",
    "Provisional Certificate",
    "Request for UGTA stipend",
    "Student Claim/Sanction Approval",
    "Student Fee Related Request",
    "Support for PhD students - foreign internship",
    "Transcript Request",
    "Transfer of Grades of Course Work completed from other Institutes",
    "Withdrawal from admission"
]

def _build_prompts(df, split_name):
    """Return one supervised prompt string per row of ``df``.

    Each prompt is "Input: <text>", a newline, then "Output: <label>".

    Args:
        df: DataFrame that must contain ``text`` and ``label`` columns.
        split_name: Human-readable name of the split, used in error messages.

    Returns:
        A pandas Series of prompt strings aligned with ``df``'s index.

    Raises:
        ValueError: if a required column is absent or any row has a missing
            ``text``/``label`` value. Without this check such rows become NaN
            after string concatenation and only fail later, during training.
    """
    required = ("text", "label")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"{split_name} is missing required column(s): {missing}")

    n_incomplete = int(df[list(required)].isna().any(axis=1).sum())
    if n_incomplete:
        raise ValueError(
            f"{split_name} has {n_incomplete} row(s) with a missing text or label"
        )

    # astype(str) keeps string columns unchanged and lets non-string labels
    # (e.g. integer ids) be concatenated instead of raising TypeError.
    return "Input: " + df["text"].astype(str) + "\nOutput: " + df["label"].astype(str)


train_df["formatted"] = _build_prompts(train_df, "train.csv")
test_df["formatted"] = _build_prompts(test_df, "test.csv")

# Only the prompt column is handed to the trainer.
train_ds = Dataset.from_pandas(train_df[["formatted"]])
test_ds = Dataset.from_pandas(test_df[["formatted"]])

In [None]:

# Access token for the gated Llama-2 checkpoint. It is read from the HF_TOKEN
# environment variable, or prompted for interactively, so that no credential is
# stored in the notebook.
# NOTE(review): a literal token was previously committed in this cell; it
# should be revoked in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

# 4-bit bitsandbytes quantization needs a CUDA device. Fail with a clear
# message instead of letting the loader dispatch modules to CPU/disk and raise
# "Some modules are dispatched on the CPU or the disk".
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA GPU detected: enable a GPU runtime before loading the 4-bit model."
    )

# Pin the whole model to GPU 0 so nothing is silently offloaded.
device_map = {"": 0}

# bfloat16 compute is only used where the GPU supports it; otherwise fall back
# to float16.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# The rest of the notebook trains with SFTTrainer / a CAUSAL_LM LoRA config and
# predicts with `model.generate`, so a causal-LM head is required here (a
# sequence-classification head has no `generate` path for text output).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=HF_TOKEN,
)
# The KV cache is not needed while training.
model.config.use_cache = False

# The tokenizer lives in the same gated repo, so it needs the token as well.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    use_auth_token=HF_TOKEN,
)
# The tokenizer's EOS token is reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

usage: hf <command> [<args>]
hf: error: argument {auth,cache,download,jobs,repo,repo-files,upload,upload-large-folder,env,version,lfs-enable-largefiles,lfs-multipart-upload}: invalid choice: 'login' (choose from 'auth', 'cache', 'download', 'jobs', 'repo', 'repo-files', 'upload', 'upload-large-folder', 'env', 'version', 'lfs-enable-largefiles', 'lfs-multipart-upload')


ValueError: 
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
                        `device_map` to `from_pretrained`. Check
                        https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                        for more details.
                        

In [None]:
# LoRA adapter configuration for causal-LM fine-tuning: adapters are attached
# to the attention query and value projections only, and no bias terms are
# trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)


In [None]:
# Trainer hyper-parameters. Checkpoints are written to `output_dir`; evaluation
# and checkpointing both happen once per epoch, and at most two checkpoints are
# kept on disk.
training_arguments = TrainingArguments(
    output_dir="./llama2-qlora-requests",
    # Optimisation schedule: batch size 1 with 8 accumulation steps.
    learning_rate=2e-4,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Mixed precision is switched off for both half-precision formats.
    fp16=False,
    bf16=False,
    # Evaluation, checkpointing and logging cadence.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=10,
    # No external experiment tracker.
    report_to="none",
)

In [None]:
# Supervised fine-tuning trainer.
# - `model` is the already-loaded (quantized) model object. Passing the
#   MODEL_ID string instead would make TRL load a fresh copy of the checkpoint
#   and ignore the quantization settings applied when `model` was created.
# - `dataset_text_field` must name the column that holds the prompt text; the
#   datasets built earlier only contain the `formatted` column.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    peft_config=peft_config,
    dataset_text_field="formatted",
    max_seq_length=256,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    compute_metrics=None,
)

NameError: name 'tokenizer' is not defined

In [None]:
# Run the fine-tuning loop on the trainer built in the previous cell.
trainer.train()

In [None]:
def compute_metrics(eval_preds):
    """Score class predictions against reference labels.

    Args:
        eval_preds: A pair ``(scores, labels)``. ``scores`` must expose
            ``argmax(axis=-1)``; the result of that call is used as the
            predicted classes.

    Returns:
        dict with two entries: ``"accuracy"`` and the weighted-average ``"f1"``.
    """
    scores, references = eval_preds
    predicted = scores.argmax(axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted, average="weighted")
    return metrics

In [None]:
def classify_text(example: str):
    """Generate the predicted request type for a single free-text request.

    The request is wrapped in the same "Input: ... / Output:" template used for
    the training prompts and completed with the module-level ``model`` and
    ``tokenizer``.

    Args:
        example: The request text to classify.

    Returns:
        The first line of the generated completion, stripped of surrounding
        whitespace, or an empty string if nothing was generated.
    """
    prompt = f"Input: {example}\nOutput:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = len(inputs["input_ids"][0])

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=20)

    # Decode only the newly generated tokens. Decoding the whole sequence and
    # splitting on the last "Output:" returns the wrong text when the request
    # itself contains "Output:" or when the model keeps writing further
    # "Input: ... Output: ..." pairs.
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    completion = completion.strip()
    if not completion:
        return ""
    # The label is the first generated line; anything after it is continuation.
    return completion.splitlines()[0].strip()

# Smoke test: classify one hand-written request and print the generated label.
sample = "I want a certificate for my scholarship application"
print("Prediction:", classify_text(sample))