In [6]:
!pip install --upgrade --force-reinstall fsspec==2024.10.0 gcsfs

Collecting fsspec==2024.10.0
  Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Collecting gcsfs
  Using cached gcsfs-2025.3.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from gcsfs)
  Using cached aiohttp-3.11.18-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting decorator>4.1.2 (from gcsfs)
  Using cached decorator-5.2.1-py3-none-any.whl.metadata (3.9 kB)
INFO: pip is looking at multiple versions of gcsfs to determine which version is compatible with other requirements. This could take a while.
Collecting gcsfs
  Using cached gcsfs-2025.3.1-py2.py3-none-any.whl.metadata (1.9 kB)
  Using cached gcsfs-2025.3.0-py2.py3-none-any.whl.metadata (1.9 kB)
  Using cached gcsfs-2025.2.0-py2.py3-none-any.whl.metadata (1.9 kB)
  Using cached gcsfs-2024.12.0-py2.py3-none-any.whl.metadata (1.6 kB)
  Using cached gcsfs-2024.10.0-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting google-auth>=1.2 (from gcsfs)
  Using

In [1]:
import torch

torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)




In [2]:
import fsspec
import gcsfs
print(f"fsspec version: {fsspec.__version__}")
print(f"gcsfs version: {gcsfs.__version__}")

fsspec version: 2024.10.0
gcsfs version: 2024.10.0


In [3]:
!pip install transformers peft datasets bitsandbytes accelerate -q

In [4]:
from transformers import AutoTokenizer
from transformers import AutoConfig

# Paths
base_model_path = "/content/drive/MyDrive/models/llama_8b_Instruct"
lora_model_path = "/content/drive/MyDrive/models/lora_l1_20000_hc_97"

# Try loading tokenizer from LoRA save path (preferred)
try:
    tokenizer = AutoTokenizer.from_pretrained(lora_model_path)
    print(" Tokenizer loaded from LoRA model path.")
except:
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    print(" Tokenizer loaded from base model path.")

# Print tokenizer vocab size
print(f" Tokenizer vocabulary size: {len(tokenizer)}")

 Tokenizer loaded from LoRA model path.
 Tokenizer vocabulary size: 128257


In [6]:
from transformers import AutoModelForSequenceClassification

# Load Base Model
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_path, num_labels=3)
# Load config from the base model
config = AutoConfig.from_pretrained(base_model_path)

# Print number of labels
print(f" Number of Labels in Base Model: {config.num_labels}")

# Resize token embeddings (Important to avoid mismatch)
base_model.resize_token_embeddings(len(tokenizer))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/models/llama_8b_Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


 Number of Labels in Base Model: 2


Embedding(128257, 4096)

In [7]:
import torch
print("PyTorch GPU available:", torch.cuda.is_available())
print("GPU device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

PyTorch GPU available: True
GPU device name: NVIDIA A100-SXM4-40GB


In [8]:
from peft import PeftModel

# Attach LoRA Adapter (after ensuring token embeddings match)
lora_model = PeftModel.from_pretrained(base_model, lora_model_path)

print(" LoRA model successfully loaded with correct tokenizer and embedding size!")


 LoRA model successfully loaded with correct tokenizer and embedding size!


In [9]:
import torch

def classify_intent(query):
    # Tokenize query
    inputs = tokenizer(query, padding=True, truncation=True, return_tensors="pt")

    # Move inputs to correct device (GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lora_model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Run inference
    with torch.no_grad():
        outputs = lora_model(**inputs)

    # Extract predicted class
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    return predicted_class


In [10]:
print(f" Number of Intent Categories: {lora_model.config.num_labels}")

 Number of Intent Categories: 3


In [11]:
# Run on a sample query
classify_intent("who are you")

0

In [12]:
import time

import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/ORCAS_I_datasets/ORCAS-I-llama8b-l1-500_inference.tsv", sep="\t")

start_time = time.perf_counter()

df["predicted_intent"] = df["query"].apply(classify_intent)

end_time = time.perf_counter()

# Compute total execution time
total_time_ms = (end_time - start_time) * 1000  # Convert to milliseconds


print(f"Total inference time: {total_time_ms:.3f} ms")
print(f"Average time per query: {total_time_ms / len(df):.3f} ms")

Total inference time: 164017.486 ms
Average time per query: 164.017 ms


In [13]:
df['intent_inference'] = df['predicted_intent'].map({0:'Informational', 1:"Navigational", 2:"Transactional"})
df

Unnamed: 0,qid,query,did,url,label_manual,data_split,intent,predicted_intent,intent_inference
0,7916625,best reads,D889100,http://thegreatestbooks.org/,Informational,test,0,0,Informational
1,7737755,tamerind,D586723,https://en.wikipedia.org/wiki/Tamarind,Informational,test,0,0,Informational
2,4598644,show mi ip,D3188590,http://showip.net/,Informational,test,0,0,Informational
3,11008126,do carpenter ants eat wood,D1593016,https://doyourownpestcontrol.com/carp.htm,Informational,test,0,0,Informational
4,7737808,rheumatoid arthritis in children,D2557045,https://www.webmd.com/rheumatoid-arthritis/und...,Informational,test,0,0,Informational
...,...,...,...,...,...,...,...,...,...
995,9342644,hit ledger,D1640850,http://www.imdb.com/name/nm0005132/,Informational,test,0,1,Navigational
996,5587942,first health insurance reviews,D1121660,https://www.consumeraffairs.com/insurance/heal...,Informational,test,0,0,Informational
997,11729036,luxury suv rental,D2293603,https://www.hertz.com/rentacar/misc/index.jsp?...,Informational,test,0,0,Informational
998,11557714,sam's club synchrony sign in,D423863,https://www.samsclub.com/sams/pagedetails/cont...,Navigational,test,1,1,Navigational


In [14]:
from sklearn.metrics import classification_report
print("Results finetuning")
# Generate a classification report
report = classification_report(df['label_manual'], df['intent_inference'], target_names=['Informational','Navigational','Transactional'],digits=3)

print(report)

Results finetuning
               precision    recall  f1-score   support

Informational      0.951     0.832     0.887       786
 Navigational      0.524     0.819     0.639       171
Transactional      0.733     0.767     0.750        43

     accuracy                          0.827      1000
    macro avg      0.736     0.806     0.759      1000
 weighted avg      0.868     0.827     0.839      1000



In [15]:

import pickle

model_preds = df['intent_inference'].to_list()

with open("/content/drive/MyDrive/data_colab/results_sigir2025/lora_60K_hc_97.pkl", "wb") as f:
    pickle.dump(model_preds, f)
