# Train with the Prompt, Winning Response, and Open/Closed Columns

In [1]:
# import
import pandas as pd
import numpy as np
path = '/Users/sarpgunhan/Desktop/Cal/IEOR 142A/'
df = pd.read_csv(path + "ieor142a_sampled_dataset - Master Sheet.csv")
print(df.shape)
df.head()

(575, 9)


Unnamed: 0,prompt,model_a,model_b,response_a,response_b,winner,Open/Closed,Domain,Comments
0,Translate the following from English to German...,stablelm-tuned-alpha-7b,chatglm-6b,"In English, the sentence ""Considering the numb...",""" Considering the number of desserts popular i...",tie (bothbad),closed,factual,
1,You are a helpful chatbot who looks at rows of...,stablelm-tuned-alpha-7b,gpt-4,"Sure, I can help you categorize the rows of da...","["""", """", ""Shares"", ""Market Value • ($000)""]: ...",tie (bothbad),closed,factual,
2,Follow the instructions below:\n1. Analyse the...,llama-13b,RWKV-4-Raven-14B,"I just moved to 123 Elm Street, Springfield, I...",1. True\n2. False,model_b,closed,factual,
3,How to express a tree mathematically,vicuna-13b,dolly-v2-12b,There are several ways to express a tree mathe...,Let tree(x) denote a tree where x is the root ...,model_a,closed,math,
4,Tell me if the following numbers are even or o...,llama-13b,vicuna-13b,"3, 16, 17, 2110 are even.",3 is odd\n16 is even\n17 is odd\n2110 is even,model_b,closed,math,


In [2]:
print(df['Domain'].unique())
print(df['Open/Closed'].unique())

['factual' 'math' 'code' 'creative' 'opinion' nan 'creative ' 'opnion'
 'opinion ' 'math/code']
['closed' 'open' 'close' nan 'opinion' 'closed ' 'open ']


In [3]:
# clean data 
valid_domains = ['opinion', 'creative', 'factual', 'code', 'math']
df = df[df['Domain'].isin(valid_domains)].copy()
valid_open_closed = ['open', 'closed']
df = df[df['Open/Closed'].isin(valid_open_closed)].copy()
print(df.shape)

(561, 9)


In [4]:
# add winning response 
def get_winning_response(row):
    if row['winner'] == 'model_a':
        return row['response_a']
    elif row['winner'] == 'model_b':
        return row['response_b']
    else:
        return ""  # tie or bothbad

df["winning_response"] = df.apply(get_winning_response, axis=1)
df.head()

Unnamed: 0,prompt,model_a,model_b,response_a,response_b,winner,Open/Closed,Domain,Comments,winning_response
0,Translate the following from English to German...,stablelm-tuned-alpha-7b,chatglm-6b,"In English, the sentence ""Considering the numb...",""" Considering the number of desserts popular i...",tie (bothbad),closed,factual,,
1,You are a helpful chatbot who looks at rows of...,stablelm-tuned-alpha-7b,gpt-4,"Sure, I can help you categorize the rows of da...","["""", """", ""Shares"", ""Market Value • ($000)""]: ...",tie (bothbad),closed,factual,,
2,Follow the instructions below:\n1. Analyse the...,llama-13b,RWKV-4-Raven-14B,"I just moved to 123 Elm Street, Springfield, I...",1. True\n2. False,model_b,closed,factual,,1. True\n2. False
3,How to express a tree mathematically,vicuna-13b,dolly-v2-12b,There are several ways to express a tree mathe...,Let tree(x) denote a tree where x is the root ...,model_a,closed,math,,There are several ways to express a tree mathe...
4,Tell me if the following numbers are even or o...,llama-13b,vicuna-13b,"3, 16, 17, 2110 are even.",3 is odd\n16 is even\n17 is odd\n2110 is even,model_b,closed,math,,3 is odd\n16 is even\n17 is odd\n2110 is even


In [5]:
# make a combined text column for features that are used (since features do not have weights in RBF due to being nonlinear)
df["combined_text"] = (
    df["prompt"] + " [SEP] " + 
    df["winning_response"] + " [SEP] " +
    df["Open/Closed"].astype(str)
)
df.head()

Unnamed: 0,prompt,model_a,model_b,response_a,response_b,winner,Open/Closed,Domain,Comments,winning_response,combined_text
0,Translate the following from English to German...,stablelm-tuned-alpha-7b,chatglm-6b,"In English, the sentence ""Considering the numb...",""" Considering the number of desserts popular i...",tie (bothbad),closed,factual,,,Translate the following from English to German...
1,You are a helpful chatbot who looks at rows of...,stablelm-tuned-alpha-7b,gpt-4,"Sure, I can help you categorize the rows of da...","["""", """", ""Shares"", ""Market Value • ($000)""]: ...",tie (bothbad),closed,factual,,,You are a helpful chatbot who looks at rows of...
2,Follow the instructions below:\n1. Analyse the...,llama-13b,RWKV-4-Raven-14B,"I just moved to 123 Elm Street, Springfield, I...",1. True\n2. False,model_b,closed,factual,,1. True\n2. False,Follow the instructions below:\n1. Analyse the...
3,How to express a tree mathematically,vicuna-13b,dolly-v2-12b,There are several ways to express a tree mathe...,Let tree(x) denote a tree where x is the root ...,model_a,closed,math,,There are several ways to express a tree mathe...,How to express a tree mathematically [SEP] The...
4,Tell me if the following numbers are even or o...,llama-13b,vicuna-13b,"3, 16, 17, 2110 are even.",3 is odd\n16 is even\n17 is odd\n2110 is even,model_b,closed,math,,3 is odd\n16 is even\n17 is odd\n2110 is even,Tell me if the following numbers are even or o...


In [6]:
# encode domain labels
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["domain_label"] = le.fit_transform(df["Domain"])
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)

{'code': 0, 'creative': 1, 'factual': 2, 'math': 3, 'opinion': 4}


In [7]:
# train test split 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df["combined_text"], df["domain_label"],
    test_size=0.2, random_state=42, stratify=df["Domain"]
)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train size: 448, Test size: 113


In [8]:
# vectorize text data using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=3000,      # You can tweak this
    stop_words='english',   # Remove common words
    ngram_range=(1, 2)      # Optionally include bigrams
)

# Fit to training data and transform both sets
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [9]:
# hyperparameter tuning 
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define parameter grid to search
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.1, 0.01, 0.001]
}

# Initialize the SVC with RBF kernel
svm_rbf = SVC(kernel='rbf', random_state=42)

# Run GridSearch with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=svm_rbf,
    param_grid=param_grid,
    cv=5,                   # 5-fold cross-validation
    scoring='accuracy',     # Optimize for accuracy
    verbose=2,              # Print progress
    n_jobs=-1               # Use all CPU cores
)

# Fit GridSearch on scaled embeddings
grid_search.fit(X_train_vec, y_train)

# Best hyperparameters
print("Best hyperparameters found:", grid_search.best_params_)

# Train your final model using the best estimator
best_svm = grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .................................C=0.1, gamma=scale; total time=   0.1s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.1s
[CV] END .................................C=0.1, gamma=scale; total time=   0.1s
[CV] END .................................C=0.1, gamma=scale; total time=   0.1s
[CV] END .................................C=0.1, gamma=scale; total time=   0.1s
[CV] END .................................C=0.1, gamma=scale; total time=   0.1s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.1s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.1s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.1s
[CV] END ..................................C=0.1, gamma=0.01; total time=   0.1s
[CV] END .................................C=0.1, gamma=0.001; total time=   0.0s
[CV] END .................................C=0.1,

In [10]:
# Evaluate the tuned model
y_pred = best_svm.predict(X_test_vec)
print(classification_report(y_test, y_pred, target_names=le.classes_))

              precision    recall  f1-score   support

        code       0.82      0.60      0.69        15
    creative       0.70      0.53      0.60        30
     factual       0.44      0.54      0.49        35
        math       0.25      0.12      0.17         8
     opinion       0.44      0.56      0.49        25

    accuracy                           0.52       113
   macro avg       0.53      0.47      0.49       113
weighted avg       0.54      0.52      0.52       113



# Train Only with the Prompt Column

In [16]:
# import 
df_prompt = pd.read_csv(path + "ieor142a_sampled_dataset - Master Sheet.csv")
df_prompt = df_prompt[df_prompt['Domain'].isin(valid_domains)].copy()
df_prompt.head()

Unnamed: 0,prompt,model_a,model_b,response_a,response_b,winner,Open/Closed,Domain,Comments
0,Translate the following from English to German...,stablelm-tuned-alpha-7b,chatglm-6b,"In English, the sentence ""Considering the numb...",""" Considering the number of desserts popular i...",tie (bothbad),closed,factual,
1,You are a helpful chatbot who looks at rows of...,stablelm-tuned-alpha-7b,gpt-4,"Sure, I can help you categorize the rows of da...","["""", """", ""Shares"", ""Market Value • ($000)""]: ...",tie (bothbad),closed,factual,
2,Follow the instructions below:\n1. Analyse the...,llama-13b,RWKV-4-Raven-14B,"I just moved to 123 Elm Street, Springfield, I...",1. True\n2. False,model_b,closed,factual,
3,How to express a tree mathematically,vicuna-13b,dolly-v2-12b,There are several ways to express a tree mathe...,Let tree(x) denote a tree where x is the root ...,model_a,closed,math,
4,Tell me if the following numbers are even or o...,llama-13b,vicuna-13b,"3, 16, 17, 2110 are even.",3 is odd\n16 is even\n17 is odd\n2110 is even,model_b,closed,math,


In [None]:
# transform domain labels
df_prompt["domain_label"] = le.fit_transform(df_prompt["Domain"])
label_mapping_prompt = dict(zip(le.classes_, le.transform(le.classes_)))

In [18]:
# train test split 
X_train_prompt, X_test_prompt, y_train_prompt, y_test_prompt = train_test_split(
    df_prompt["prompt"], df_prompt["domain_label"],
    test_size=0.2, random_state=42, stratify=df_prompt["Domain"]
)
print(f"Train size: {len(X_train_prompt)}, Test size: {len(X_test_prompt)}")

Train size: 453, Test size: 114


In [19]:
# vectorize text data using TF-IDF
X_train_vec_prompt = vectorizer.fit_transform(X_train_prompt)
X_test_vec_prompt = vectorizer.transform(X_test_prompt)

In [20]:
# hyperparameter tuning 
grid_search.fit(X_train_vec_prompt, y_train_prompt)

# Best hyperparameters
print("Best hyperparameters found:", grid_search.best_params_)

# Train your final model using the best estimator
best_svm_prompt = grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END ..................................C=0.1, gamma=0.01; total time=   0.0s
[CV] END .................................C=0.1, gamma=0.001; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=0.001; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END .................................C=0.1, gamma=0.001; total time=   0.0s
[CV] END .................................C=0.1,

In [21]:
# Evaluate the tuned model
y_pred_prompt = best_svm_prompt.predict(X_test_vec_prompt)
print(classification_report(y_test_prompt, y_pred_prompt, target_names=le.classes_))

              precision    recall  f1-score   support

        code       0.67      0.53      0.59        15
    creative       0.69      0.73      0.71        30
     factual       0.42      0.53      0.47        36
        math       0.25      0.12      0.17         8
     opinion       0.52      0.44      0.48        25

    accuracy                           0.54       114
   macro avg       0.51      0.47      0.48       114
weighted avg       0.53      0.54      0.53       114



# Train with the Prompt and Open/Closed Columns

In [19]:
# import
df_oc = pd.read_csv(path + "ieor142a_sampled_dataset - Master Sheet.csv")
df_oc = df_oc[df_oc['Domain'].isin(valid_domains)].copy()
df_oc.head()

Unnamed: 0,prompt,model_a,model_b,response_a,response_b,winner,Open/Closed,Domain,Comments
0,Translate the following from English to German...,stablelm-tuned-alpha-7b,chatglm-6b,"In English, the sentence ""Considering the numb...",""" Considering the number of desserts popular i...",tie (bothbad),closed,factual,
1,You are a helpful chatbot who looks at rows of...,stablelm-tuned-alpha-7b,gpt-4,"Sure, I can help you categorize the rows of da...","["""", """", ""Shares"", ""Market Value • ($000)""]: ...",tie (bothbad),closed,factual,
2,Follow the instructions below:\n1. Analyse the...,llama-13b,RWKV-4-Raven-14B,"I just moved to 123 Elm Street, Springfield, I...",1. True\n2. False,model_b,closed,factual,
3,How to express a tree mathematically,vicuna-13b,dolly-v2-12b,There are several ways to express a tree mathe...,Let tree(x) denote a tree where x is the root ...,model_a,closed,math,
4,Tell me if the following numbers are even or o...,llama-13b,vicuna-13b,"3, 16, 17, 2110 are even.",3 is odd\n16 is even\n17 is odd\n2110 is even,model_b,closed,math,


In [20]:
# make a combined text column for features that are used (since features do not have weights in RBF due to being nonlinear)
df_oc["combined_text"] = (
    df_oc["Open/Closed"] + " [SEP] " + 
    df_oc["prompt"].astype(str)
)
df_oc.head()

Unnamed: 0,prompt,model_a,model_b,response_a,response_b,winner,Open/Closed,Domain,Comments,combined_text
0,Translate the following from English to German...,stablelm-tuned-alpha-7b,chatglm-6b,"In English, the sentence ""Considering the numb...",""" Considering the number of desserts popular i...",tie (bothbad),closed,factual,,closed [SEP] Translate the following from Engl...
1,You are a helpful chatbot who looks at rows of...,stablelm-tuned-alpha-7b,gpt-4,"Sure, I can help you categorize the rows of da...","["""", """", ""Shares"", ""Market Value • ($000)""]: ...",tie (bothbad),closed,factual,,closed [SEP] You are a helpful chatbot who loo...
2,Follow the instructions below:\n1. Analyse the...,llama-13b,RWKV-4-Raven-14B,"I just moved to 123 Elm Street, Springfield, I...",1. True\n2. False,model_b,closed,factual,,closed [SEP] Follow the instructions below:\n1...
3,How to express a tree mathematically,vicuna-13b,dolly-v2-12b,There are several ways to express a tree mathe...,Let tree(x) denote a tree where x is the root ...,model_a,closed,math,,closed [SEP] How to express a tree mathematically
4,Tell me if the following numbers are even or o...,llama-13b,vicuna-13b,"3, 16, 17, 2110 are even.",3 is odd\n16 is even\n17 is odd\n2110 is even,model_b,closed,math,,closed [SEP] Tell me if the following numbers ...


In [21]:
# transform domain labels
df_oc["domain_label"] = le.fit_transform(df_oc["Domain"])
label_mapping_oc = dict(zip(le.classes_, le.transform(le.classes_)))

In [23]:
# train test split 
from sklearn.model_selection import train_test_split

X_train_oc, X_test_oc, y_train_oc, y_test_oc = train_test_split(
    df_oc["combined_text"], df_oc["domain_label"],
    test_size=0.2, random_state=42, stratify=df_oc["Domain"]
)
print(f"Train size: {len(X_train_oc)}, Test size: {len(X_test_oc)}")

Train size: 453, Test size: 114


In [24]:
# vectorize text data using TF-IDF
X_train_vec_oc = vectorizer.fit_transform(X_train_oc)
X_test_vec_oc = vectorizer.transform(X_test_oc)

In [25]:
# hyperparameter tuning 
grid_search.fit(X_train_vec_oc, y_train_oc)

# Best hyperparameters
print("Best hyperparameters found:", grid_search.best_params_)

# Train your final model using the best estimator
best_svm_oc = grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=0.001; total time=   0.0s
[CV] END ..................................C=0.1, gamma=0.01; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s
[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s[CV] END ...................................C=0.1, gamma=0.1; total time=   0.0s

[CV] END .................................C=0.1, gamma=0.001; total time=   0.0s
[CV] END .................................C=0.1,

In [26]:
# Evaluate the tuned model
y_pred_oc = best_svm_oc.predict(X_test_vec_oc)
print(classification_report(y_test_oc, y_pred_oc, target_names=le.classes_))

              precision    recall  f1-score   support

        code       0.80      0.53      0.64        15
    creative       0.67      0.73      0.70        30
     factual       0.50      0.64      0.56        36
        math       0.25      0.12      0.17         8
     opinion       0.52      0.44      0.48        25

    accuracy                           0.57       114
   macro avg       0.55      0.49      0.51       114
weighted avg       0.57      0.57      0.56       114

