In [None]:
# Colab-only step: expose Google Drive on the local filesystem so the data files
# referenced by the following cells can be read and written.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import json
import pandas as pd


# Read the training records from Drive. The encoding is pinned to UTF-8 so that
# non-ASCII text is decoded the same way regardless of the runtime's locale.
with open("/content/drive/MyDrive/train_data.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Tabular view of the parsed JSON; later cells read the metric_name,
# system_prompt, user_prompt, response and score columns from it.
df_train = pd.DataFrame(train_data)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, concatenate, Dot, Lambda
from tensorflow.keras.models import Model

from sentence_transformers import SentenceTransformer

In [None]:
# Metric names are read as UTF-8 explicitly so decoding does not depend on the
# runtime's locale.
with open("/content/drive/MyDrive/metric_names.json", encoding="utf-8") as f:
    metric_names = json.load(f)

# Position of each name in the JSON list; used below as a row index into the
# embedding matrix. If a name appears twice, the last position wins.
metric_to_id = {name: i for i, name in enumerate(metric_names)}

# Precomputed metric-name embeddings, shape (num_metrics, emb_dim).
# NOTE(review): assumes row i corresponds to metric_names[i] — confirm against
# whatever produced the .npy file.
metric_emb_matrix = np.load("/content/drive/MyDrive/metric_name_embeddings.npy")
metric_emb_dim = metric_emb_matrix.shape[1]

In [None]:
# Translate each training row's metric name into its index in metric_to_id.
df_train["metric_id"] = df_train["metric_name"].map(metric_to_id)

# Series.map leaves NaN for names that are not keys of metric_to_id, which also
# turns the column into floats that cannot be used to index metric_emb_matrix.
# Stop here with the offending names instead of failing later in the lookup.
unknown_train_metrics = df_train.loc[df_train["metric_id"].isna(), "metric_name"].unique()
if len(unknown_train_metrics) > 0:
    raise ValueError(
        f"Unknown metric names in train_data: {sorted(map(str, unknown_train_metrics))}"
    )

In [None]:
def join_pair(row):
    """Flatten one record into a single role-tagged string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'system_prompt', 'user_prompt' and 'response'.

    Returns:
        The three fields rendered as "SYSTEM: ... USER: ... AGENT: ...",
        separated by single spaces. Values are inserted with default string
        formatting, so a missing value such as None appears as the text "None".
    """
    template = "SYSTEM: {} USER: {} AGENT: {}"
    system_part = row["system_prompt"]
    user_part = row["user_prompt"]
    agent_part = row["response"]
    return template.format(system_part, user_part, agent_part)

# Build the text that will be embedded: join_pair is applied to every row
# (axis="columns" is the named form of axis=1, i.e. one call per row).
df_train["pair_text"] = df_train.apply(join_pair, axis="columns")

In [None]:
# Authenticate with the Hugging Face Hub without storing a token in the notebook.
import getpass
import os

from huggingface_hub import login

# Prefer the HF_TOKEN environment variable; otherwise ask interactively so the
# secret never ends up in the saved notebook source or its version history.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass(
    "Hugging Face token (leave blank to skip login): "
)

# A blank answer skips authentication rather than sending an invalid token.
if hf_token:
    login(token=hf_token)

In [None]:
# Sentence-embedding model used to encode the joined prompt/response text.
pair_encoder = SentenceTransformer("l3cube-pune/indic-sentence-bert-nli")

# Encode every training pair_text in batches of 32, asking for a NumPy array back.
pair_emb_list = pair_encoder.encode(
    df_train["pair_text"].to_list(),
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# list() splits the array along its first axis, giving one vector per row.
df_train["pair_embedding"] = list(pair_emb_list)

# Width of a single pair embedding, taken from the array's second dimension.
pair_emb_dim = pair_emb_list.shape[1]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/668 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/950M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/950M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [None]:
# Fetch each row's metric embedding by indexing metric_emb_matrix with its metric_id.
df_train["metric_embedding"] = df_train["metric_id"].apply(
    lambda metric_idx: metric_emb_matrix[metric_idx]
)

# Stack the per-row vectors into feature matrices with one row per training example.
X_metric = np.vstack(df_train["metric_embedding"].to_numpy())
X_pair = np.vstack(df_train["pair_embedding"].to_numpy())

# Labels: scores parsed as float, rounded, then cast to int.
y = df_train["score"].astype(float).round().astype(int).to_numpy()

# Number of distinct label values present in the training data.
num_classes = np.unique(y).size

In [None]:
# Keras input layers sized to the metric-embedding and pair-embedding widths.
inp_metric = Input(name="metric_input", shape=(metric_emb_dim,))
inp_pair = Input(name="pair_input", shape=(pair_emb_dim,))

In [None]:
# Read the test records from Drive. The encoding is pinned to UTF-8 so that
# non-ASCII text is decoded the same way regardless of the runtime's locale.
with open("/content/drive/MyDrive/test_data.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Tabular view of the parsed JSON; later cells read the metric_name,
# system_prompt, user_prompt and response columns from it.
df_test = pd.DataFrame(test_data)

In [None]:
# Loads the same metric-name list and embedding file that the training section
# reads, rebinding the same variable names.
# NOTE(review): presumably repeated so the test section can run on its own —
# confirm, otherwise this cell is redundant.
# UTF-8 is specified explicitly so decoding does not depend on the runtime's locale.
with open("/content/drive/MyDrive/metric_names.json", encoding="utf-8") as f:
    metric_names = json.load(f)

# Position of each name in the JSON list; used as a row index into the matrix.
metric_to_id = {name: i for i, name in enumerate(metric_names)}

metric_emb_matrix = np.load("/content/drive/MyDrive/metric_name_embeddings.npy")
metric_emb_dim = metric_emb_matrix.shape[1]

In [None]:
# Translate each test row's metric name into its index in metric_to_id.
df_test["metric_id"] = df_test["metric_name"].map(metric_to_id)

# Series.map leaves NaN for names that are not keys of metric_to_id, which also
# turns the column into floats that cannot be used to index metric_emb_matrix.
# A printed warning would let the notebook run on into that failing lookup, so
# stop here and report exactly which metric names are unknown.
unknown_test_metrics = df_test.loc[df_test["metric_id"].isna(), "metric_name"].unique()
if len(unknown_test_metrics) > 0:
    raise ValueError(
        f"Unknown metric names in test_data: {sorted(map(str, unknown_test_metrics))}"
    )

In [None]:
def join_pair(row):
    """Flatten one record into a single role-tagged string.

    Produces the same text layout as the join_pair defined for the training
    data earlier in the notebook, so train and test texts are built identically.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'system_prompt', 'user_prompt' and 'response'.

    Returns:
        The three fields rendered as "SYSTEM: ... USER: ... AGENT: ...",
        separated by single spaces. Values are inserted with default string
        formatting, so a missing value such as None appears as the text "None".
    """
    template = "SYSTEM: {} USER: {} AGENT: {}"
    system_part = row["system_prompt"]
    user_part = row["user_prompt"]
    agent_part = row["response"]
    return template.format(system_part, user_part, agent_part)

# Build the text that will be embedded: join_pair is applied to every test row
# (axis="columns" is the named form of axis=1, i.e. one call per row).
df_test["pair_text"] = df_test.apply(join_pair, axis="columns")

In [None]:
# Instantiates the encoder again from the same checkpoint name used for the
# training texts, rebinding pair_encoder.
pair_encoder = SentenceTransformer("l3cube-pune/indic-sentence-bert-nli")

# Encode every test pair_text in batches of 32, asking for a NumPy array back.
# Note that pair_emb_list and pair_emb_dim are overwritten with the test-set values.
pair_emb_list = pair_encoder.encode(
    df_test["pair_text"].to_list(),
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# list() splits the array along its first axis, giving one vector per row.
df_test["pair_embedding"] = list(pair_emb_list)

# Width of a single pair embedding, taken from the array's second dimension.
pair_emb_dim = pair_emb_list.shape[1]

Batches:   0%|          | 0/114 [00:00<?, ?it/s]

In [None]:
# Fetch each test row's metric embedding by indexing metric_emb_matrix with its metric_id.
df_test["metric_embedding"] = df_test["metric_id"].apply(
    lambda metric_idx: metric_emb_matrix[metric_idx]
)

# Stack the per-row vectors into feature matrices with one row per test example.
X_metric_test = np.vstack(df_test["metric_embedding"].to_numpy())
X_pair_test = np.vstack(df_test["pair_embedding"].to_numpy())

In [None]:
import os

# Drive directory under which the exported .npy arrays are written.
save_dir = "/content/drive/MyDrive/ai_eval_data"

# exist_ok=True keeps re-runs from failing when the directory is already there.
os.makedirs(name=save_dir, exist_ok=True)

In [None]:
# Persist the feature matrices under save_dir. The paths are built from save_dir
# rather than repeating the directory literal, so changing the output location
# only requires editing it in one place.
np.save(os.path.join(save_dir, "X_metric_train.npy"), X_metric)
np.save(os.path.join(save_dir, "X_pair_train.npy"), X_pair)

# NOTE(review): this stores the unmodified score column, not the rounded integer
# labels `y` computed earlier — confirm that is what downstream code expects.
np.save(os.path.join(save_dir, "y_train.npy"), df_train["score"].to_numpy())

np.save(os.path.join(save_dir, "X_metric_test.npy"), X_metric_test)
np.save(os.path.join(save_dir, "X_pair_test.npy"), X_pair_test)