In [1]:
import os
os.environ["HF_HOME"] = "/scratch/sampath.ki/hf"

In [2]:
from tqdm import tqdm

import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

tqdm.pandas()

In [3]:
df = pd.read_csv("dataset.csv")
df.dropna(inplace=True)
df.head()

Unnamed: 0,question,context,response
0,Who teaches Fundamentals of Complexity Theory?,Course Information\nComplexity Theory CS7805 ...,Jamieson Lindsay teaches Fundamentals of Compl...
1,What professor instructs Fundamentals of Compl...,Course Information\nComplexity Theory CS7805 ...,Professor Jamieson Lindsay instructs Fundament...
2,Which instructor is assigned to Fundamentals o...,Course Information\nComplexity Theory CS7805 ...,Jamieson Lindsay teaches Fundamentals of Compl...
3,Who is the lecturer for Fundamentals of Comple...,Course Information\nComplexity Theory CS7805 ...,The lecturer for Fundamentals of Complexity Th...
4,Fundamentals of Complexity Theory: who is the ...,Course Information\nComplexity Theory CS7805 ...,The professor for Fundamentals of Complexity T...


In [22]:
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def get_query(row):
    system_prompt = """
    You are Course Compass, a chatbot dedicated to assisting Northeastern University graduate students with course registration each semester. You have access to the latest information on available graduate courses, faculty profiles, and summarized student feedback from previous semesters.
    
    Your goals are:
    
    1. To provide accurate, up-to-date information without speculating. If you lack information about a course or question, clearly communicate that to the student.
    2. To maintain a positive, professional tone. If past student feedback includes criticism, you should still respond diplomatically, focusing on constructive or neutral aspects.
    3. To be concise and relevant in your responses, helping students make informed decisions about their course choices.
    
    Avoid negative or speculative responses, and prioritize factual information over assumption.
    
    Answer the questions comprehensively using the reviews from the context by summarizing them to help the student.
    """
        
    max_context_tokens = int(15000 * 0.6)
    context_tokens = tokenizer(row["context"], truncation=True, max_length=max_context_tokens, return_tensors="pt")
    truncated_context = tokenizer.decode(context_tokens["input_ids"][0], skip_special_tokens=True)
    
    prompt = f"""
    Context:
    {truncated_context}
    
    Query:
    {row["question"]}
    
    Answer the query comprehensively using the reviews from the context by summarizing them to help the student.
    """
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]
    
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    return text

In [6]:
df["query"] = df.progress_apply(get_query, axis=1)

100%|██████████| 432/432 [00:18<00:00, 22.99it/s]


In [7]:
def generate_responses(text: str) -> str:
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    
    gen_response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return gen_response

In [23]:
# random_df = df.sample(frac=0.1)

random_df["gen_responses"] = random_df["query"].progress_apply(generate_responses)

100%|██████████| 43/43 [07:51<00:00, 10.96s/it]


In [24]:
import evaluate

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# Calculate ROUGE scores for all examples in the DataFrame
results = rouge.compute(
    predictions=random_df['gen_responses'].tolist(),
    references=random_df['response'].tolist()
)

results

{'rouge1': 0.3064992875142799,
 'rouge2': 0.08405681766996795,
 'rougeL': 0.18099603013904592,
 'rougeLsum': 0.22086549470472072}

In [8]:
import torch
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from datasets import Dataset

In [None]:
df.loc[0]

In [11]:
# def tokenize_function(examples):
#     # Tokenize the query (prompt) and response (target)
#     model_inputs = tokenizer(
#         examples["query"],
#         padding="max_length",
#         truncation=True,
#         max_length=4096,  # Adjust max_length as needed
#     )
#     labels = tokenizer(
#         examples["response"],
#         padding="max_length",
#         truncation=True,
#         max_length=4096,
#     )["input_ids"]

#     # Shift labels for causal language modeling
#     model_inputs["labels"] = labels

#     return model_inputs

def preprocess_function(examples):
    inputs = [f"{prompt}\n" for prompt in examples["query"]]
    targets = [f"{completion}\n" for completion in examples["response"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.train_test_split(test_size=0.1)
tokenized_dataset = tokenized_dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/388 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

In [12]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 388
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 44
    })
})

In [13]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

In [14]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=0,
    learning_rate=1e-4,
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)
trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
10,14.8738,14.873831
20,14.8323,14.830644
30,14.7418,14.742779
40,14.6644,14.609572
50,14.5703,14.414749
60,14.2293,14.125297
70,13.8675,13.694042
80,13.4206,13.061489
90,12.7214,12.237857


TrainOutput(global_step=97, training_loss=14.044264881881242, metrics={'train_runtime': 141.3474, 'train_samples_per_second': 2.745, 'train_steps_per_second': 0.686, 'total_flos': 1563140800118784.0, 'train_loss': 14.044264881881242, 'epoch': 1.0})

In [None]:
access_token = "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"

In [None]:
model.push_to_hub("qwen-finetuned-model")
tokenizer.push_to_hub("qwen-finetuned-model")

In [15]:
trainer.save_model("./fine_tuned_qwen")



In [None]:
type(model)

In [16]:
from peft import PeftModel    
from transformers import AutoModelForCausalLM

adapters_name = "./fine_tuned_qwen"

print(f"Starting to load the model {model_name} into memory")

m = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
m = PeftModel.from_pretrained(m, adapters_name)
m = m.merge_and_unload()

Starting to load the model Qwen/Qwen2.5-1.5B-Instruct into memory


In [17]:
# from peft import AutoPeftModelForCausalLM

# fine_tuned_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_qwen", trust_remote_code=True)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_qwen", trust_remote_code=True)

def generate_response(prompt):
    inputs = fine_tuned_tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = m.generate(**inputs, max_new_tokens=256)
    return fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
# Example usage
prompt = df["query"][0]
response = generate_response(prompt)
print(f"Response: {response}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Response: system

    You are Course Compass, a chatbot dedicated to assisting Northeastern University graduate students with course registration each semester. You have access to the latest information on available graduate courses, faculty profiles, and summarized student feedback from previous semesters.
    
    Your goals are:
    
    1. To provide accurate, up-to-date information without speculating. If you lack information about a course or question, clearly communicate that to the student.
    2. To maintain a positive, professional tone. If past student feedback includes criticism, you should still respond diplomatically, focusing on constructive or neutral aspects.
    3. To be concise and relevant in your responses, helping students make informed decisions about their course choices.
    
    Avoid negative or speculative responses, and prioritize factual information over assumption.
    
    Answer the questions comprehensively using the reviews from the context by summari

In [19]:
random_df["gen_responses1"] = random_df["query"].progress_apply(generate_response)

100%|██████████| 43/43 [06:34<00:00,  9.17s/it]


In [20]:
random_df["gen_responses1"] = random_df["gen_responses1"].apply(lambda x: x.split("assistant\n")[-1])

In [21]:
import evaluate

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# Calculate ROUGE scores for all examples in the DataFrame
results = rouge.compute(
    predictions=random_df['gen_responses1'].tolist(),
    references=random_df['response'].tolist()
)

results

{'rouge1': 0.31417025764891876,
 'rouge2': 0.0813013179978391,
 'rougeL': 0.1813838311126142,
 'rougeLsum': 0.21935332363384488}