# Training models

Load the data first:

In [85]:
import pandas as pd
# Read the processed dataset from CSV into a DataFrame; the path is relative,
# so it is resolved against the notebook's current working directory.
df = pd.read_csv("../data/processed_data.csv")

All metrics we may use:

In [86]:
# Ordered list of every feature column name that may be used for training.
# The order matters: it fixes both the column order of the feature matrix and
# the order in which feature subsets are enumerated.
all_met = [
    'clean_prompt_length',
    'clean_response_a_length',
    'clean_response_b_length',
    'prompt_length',
    'response_a_length',
    'response_b_length',
    'length_diff',
    'prompt_sentiment',
    'response_a_sentiment',
    'response_b_sentiment',
    'response_a_readability',
    'response_b_readability',
    'readability_diff',
]
len(all_met)

13

A function to get the label for a row (0 = model A wins, 1 = model B wins, 2 = anything else):

In [87]:
def get_lab(row):
    """Map a row's winner flags to an integer class label.

    Returns 0 when ``row['winner_model_a']`` equals 1, otherwise 1 when
    ``row['winner_model_b']`` equals 1, and 2 when neither flag equals 1.
    """
    # Columns are checked in priority order; the position of the first flag
    # that is set becomes the label, so model A wins if both are set.
    for label, column in enumerate(('winner_model_a', 'winner_model_b')):
        if row[column] == 1:
            return label
    return 2

# Apply get_lab to every row (axis=1) and store the result in a new 'label' column
df['label'] = df.apply(get_lab, axis=1)

Import the classifier from scikit-learn, split the data into training and test sets, and train a random forest:

In [88]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Feature matrix (one column per metric) and the integer target column.
x = df[all_met]
y = df['label']

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Build the random forest classifier and fit it on the training part.
model = RandomForestClassifier(
    n_estimators=100,  # number of trees in the forest
    max_depth=None,  # no explicit limit on tree depth
    random_state=42,  # fixed seed so training is repeatable
)
model.fit(x_train, y_train)

Evaluate the accuracy of the model once:

In [89]:
def calc_correctness(model: RandomForestClassifier, x_test, y_test):
    """Return the accuracy of ``model`` on the given data.

    The model predicts labels for ``x_test``; those predictions are scored
    against the true labels ``y_test`` with ``accuracy_score``.
    """
    return accuracy_score(y_test, model.predict(x_test))

print(calc_correctness(model, x_test, y_test))

0.4387613082811413


To find the best combination of metrics, we brute-force it by enumerating the subsets of all metrics (restricted below to subsets of at least 7 metrics that include `length_diff`).

In [None]:
import itertools

# make a function which encapsulates the whole process
def evaluate(feat):
    """Train a random forest on the columns in ``feat`` and return its accuracy.

    Uses the global ``df``: the columns named in ``feat`` are the features
    and ``df['label']`` is the target. The data is split 80/20 (stratified on
    the label, seed 42), a 100-tree forest (seed 42) is fitted on the
    training part, and the accuracy on the test part is returned.
    """
    features, target = df[feat], df['label']

    train_x, test_x, train_y, test_y = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    # Same hyper-parameters for every feature subset so scores are comparable.
    forest = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
    forest.fit(train_x, train_y)
    return calc_correctness(forest, test_x, test_y)

def all_sub(lis):
    """Lazily yield every subset of ``lis`` as a tuple.

    Subsets are produced by increasing size, from the empty tuple up to the
    full sequence; within one size they follow ``itertools.combinations``
    order.
    """
    for size in range(len(lis) + 1):
        yield from itertools.combinations(lis, size)

# Best accuracy seen so far and the feature subset that produced it.
# NOTE(review): subsets are ranked by the accuracy evaluate() returns; if that
# score comes from the same test split every time, the reported maximum is an
# optimistic estimate -- consider a separate validation split.
maxe = 0
bst_lis = []
for s in all_sub(all_met):
    # Only subsets with at least 7 metrics that contain length_diff are
    # evaluated (the size check also excludes the empty subset).
    if len(s) < 7 or "length_diff" not in s:
        continue
    print("Now feat: ", s)
    ans = evaluate(list(s))
    # Strict comparison: on a tie the earliest subset found is kept.
    if ans > maxe:
        maxe, bst_lis = ans, s
    print(ans)
    print("Current max: ", maxe, bst_lis)


Now feat:  ('clean_prompt_length', 'clean_response_a_length', 'clean_response_b_length', 'prompt_length', 'response_a_length', 'response_b_length', 'length_diff')
0.4078810020876827
Current max:  0.4078810020876827 ('clean_prompt_length', 'clean_response_a_length', 'clean_response_b_length', 'prompt_length', 'response_a_length', 'response_b_length', 'length_diff')
Now feat:  ('clean_prompt_length', 'clean_response_a_length', 'clean_response_b_length', 'prompt_length', 'response_a_length', 'length_diff', 'prompt_sentiment')
0.40866388308977036
Current max:  0.40866388308977036 ('clean_prompt_length', 'clean_response_a_length', 'clean_response_b_length', 'prompt_length', 'response_a_length', 'length_diff', 'prompt_sentiment')
Now feat:  ('clean_prompt_length', 'clean_response_a_length', 'clean_response_b_length', 'prompt_length', 'response_a_length', 'length_diff', 'response_a_sentiment')
0.4177105080027836
Current max:  0.4177105080027836 ('clean_prompt_length', 'clean_response_a_length

## Try out a PyTorch model:
