# Task B - Random Forests without Oversampling

 ## Loading the Datasets

In [None]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error,
    explained_variance_score, r2_score
)


In [None]:
# Apply seaborn's "whitegrid" theme to every plot drawn in this notebook
sns.set_theme(style="whitegrid")

# Main dataset: conversation data (gzip-compressed JSON Lines, one record per line).
# It is used in the "Conversation Data" section.
df = pd.read_json("data/chatbot-arena-conversations.jsonl.gz", lines=True, compression="gzip")
df.head(5)

In [None]:
# Auxiliary datasets

# Embedding data (NumPy .npy files) -- used in the "Embedding Data" section
prompt_embeddings = np.load("data/chatbot-arena-prompts-embeddings.npy")
response_a_embeddings = np.load("data/chatbot-arena-model_a_response-embeddings.npy")
response_b_embeddings = np.load("data/chatbot-arena-model_b_response-embeddings.npy")

# Topic modeling and hardness score data (gzip-compressed JSON Lines) -- used in the
# "Topic Modeling and Hardness Score Data" section
topic_and_hardness = pd.read_json("data/chatbot-arena-gpt3-scores.jsonl.gz", compression="gzip", lines=True)

In [None]:
# Extracting the prompt: element 0 of each conversation_a entry, then its "content" field
df["prompt"] = df["conversation_a"].str[0].str["content"]
# NOTE(review): this expression is neither assigned nor the last line of the cell,
# so its result is discarded
df["prompt"].head(1)
# Frequency of each distinct prompt (displayed as the cell output)
df["prompt"].value_counts()

In [None]:
# Add columns that separate the responses from the conversations (already done for the prompt):
# element 1 of each conversation, then its "content" field
df["model_a_response"] = df["conversation_a"].str[1].str["content"]
df["model_b_response"] = df["conversation_b"].str[1].str["content"]
# Spot-check: show model A's response for the row labelled 2
df["model_a_response"][2]

In [None]:
# Create columns holding the character length of the prompt and of each response
df["prompt_length"] = df["prompt"].str.len()
# Summary statistics for prompt lengths; the result is discarded here because the
# expression is not the last one in the cell
df["prompt_length"].describe() #use this if you want to know statistics about prompt and response lengths

# Character length of model A's response
df["response_a_length"] = df["model_a_response"].str.len()


# Character length of model B's response
df["response_b_length"] = df["model_b_response"].str.len()


### Engineering feature of dot product similarity between embeddings

In [None]:
# Each embedding array must supply exactly one row per DataFrame row, because the
# similarities below are attached to df positionally.
assert len(prompt_embeddings) == len(df), "Mismatch between prompt embeddings and DataFrame rows."
assert len(response_a_embeddings) == len(df), "Mismatch between response_a embeddings and DataFrame rows."
assert len(response_b_embeddings) == len(df), "Mismatch between response_b embeddings and DataFrame rows."

# Compute the prompt/response dot product for every row and add it as a new column.
# einsum('ij,ij->i') multiplies matching rows element-wise and sums over the embedding
# axis in a single vectorized call instead of a Python-level loop over rows.
# NOTE(review): assumes each embedding array is 2-D (rows, embedding_dim); einsum raises
# a ValueError otherwise -- confirm against the .npy files.
df['similarity_prompt_response_a'] = np.einsum(
    'ij,ij->i', prompt_embeddings, response_a_embeddings
)

df['similarity_prompt_response_b'] = np.einsum(
    'ij,ij->i', prompt_embeddings, response_b_embeddings
)


#### Converting the `score_value_n` columns of the `topic_and_hardness` DataFrame to numeric values (their mean is later stored in the `hardness_score` column)

In [None]:
# Convert the three hardness score columns to numeric; any value that cannot be
# parsed as a number becomes NaN (errors='coerce')
for score_column in ('score_value_1', 'score_value_2', 'score_value_3'):
    topic_and_hardness[score_column] = pd.to_numeric(
        topic_and_hardness[score_column], errors='coerce'
    )

In [None]:
# Keep a single row per question_id (the first occurrence) so the later merge on
# question_id cannot multiply rows of the main DataFrame; modifies the frame in place
topic_and_hardness.drop_duplicates(subset='question_id', inplace=True)

In [None]:
# Average the three hardness scores per question (NaN scores are skipped by mean),
# then left-join that average and topic_modeling_3 onto the main dataset by question_id
hardness_components = topic_and_hardness[['score_value_1', 'score_value_2', 'score_value_3']]
topic_and_hardness['hardness_score'] = hardness_components.mean(axis=1)

auxiliary_columns = topic_and_hardness[['question_id', 'hardness_score', 'topic_modeling_3']]
merged_df = df.merge(auxiliary_columns, how='left', on='question_id')

#### Removing question ids that produce null values and storing the final dataset into filtered_df

In [None]:
# To Do: fetch code where we identified these question IDs @caroline
# This will remove question ids that produced null values for topic_and_hardness.
# .copy() makes filtered_df an independent DataFrame: later cells add many columns to it,
# and assigning to a boolean-mask slice of merged_df triggers SettingWithCopyWarning.
filtered_df = merged_df[
    ~merged_df['question_id'].isin([
        'e6d45ead33114cca8ee3cfa028517eff',
        'addaa796ee094f029f8014ea1468df8a',
        'd37eb99864fa41ecab49026abdddb53e',
        '6da02001e74041d0947982fb4d05db9e',
        'd93e36df73e84aa2ade15d4a038c098f',
        '50b63f92bc5948218e1555d1eae17797',
        '8277b16d9a0845d694a33c04f446926c',
        'd56d698d4c1c495682a366f2a78fcb77',
        '86c7abedb5f84b7ea752cc98d324d387',
        '974569d7b9c74ca591f1922bf3722266',
        'ba3cfbae941946c6a41ea725b46f3aa7',
        '283be8724d674cb6a74da59d0c12ea16',
        'fbd400babd914eb1ad6202b4400d6beb',
        '087b90bf76ed4c409b7c7bd5d6b0d6c5',
        'f9ad250d0194489b8b9f12d31386aab1',
        '7e371a23981847c7b19bc61195f11eda',
        'f55b273603754c0d92f5139351ca0d02',
        '255e1466eaf14cc2ba93040776c0c440',
        '6979b646271e46f4a329a5e6acaeed38',
        'ed7fa335d5524295b1dd79452778d26c',
        'f794c699e4964b11a1373976be5b0944',
        'c92bcab701cc46649368fc121b84ccc1',
        'a632607132cf4908883fd262102714c9',
        'a0857d64b4954ea6bd75c2406c3dd32c',
        'e4a1fc8a9462475d95022914cb9f5b03',
        'c7aa752f61dc46d093dc86713686f6f3'
    ])
].copy()


#### Calculating the ELO ratings per model:

In [None]:
# Initialize the ratings dictionary: every model name that appears in either the
# model_a or model_b column starts at the same default rating of 1000
elo_ratings = {model: 1000 for model in pd.concat([filtered_df["model_a"], filtered_df["model_b"]]).unique()}

def update_elo(winner, loser, k=32):
    """
    Update the module-level ``elo_ratings`` dictionary in place after one decided match.

    Parameters:
        winner: Name of the winning model (must be a key of ``elo_ratings``).
        loser: Name of the losing model (must be a key of ``elo_ratings``).
        k: Step size; the largest number of rating points one match can move.

    Returns:
        None. The winner gains exactly the number of points the loser gives up.
    """
    # Probability the winner was expected to win, given the rating gap before the match
    rating_gap = elo_ratings[loser] - elo_ratings[winner]
    win_probability = 1 / (1 + 10 ** (rating_gap / 400))

    # The less expected the win, the more points change hands
    points_transferred = k * (1 - win_probability)
    elo_ratings[winner] += points_transferred
    elo_ratings[loser] -= points_transferred

# Replay every comparison in row order and adjust the ELO ratings.
# Rows whose "winner" is neither "model_a" nor "model_b" leave the ratings untouched.
for outcome, name_a, name_b in zip(filtered_df["winner"], filtered_df["model_a"], filtered_df["model_b"]):
    if outcome == "model_a":
        update_elo(name_a, name_b)
    elif outcome == "model_b":
        update_elo(name_b, name_a)

# Turn the ratings dictionary into a table ordered from strongest to weakest model
elo_df = pd.DataFrame(list(elo_ratings.items()), columns=["model", "ELO_rating"])
elo_df = elo_df.sort_values(by="ELO_rating", ascending=False)
print("ELO ratings per model:")
print(elo_df)




### Create a column holding the ELO rating difference (model A minus model B). It measures which model is stronger, so that it can be included alongside the prompt features in Task A (predicting whether model A or model B will win)

In [None]:
# Look up each side's final ELO rating and store the difference (model A minus model B);
# positive values mean model A is rated higher than model B
model_a_rating = filtered_df['model_a'].map(elo_ratings)
model_b_rating = filtered_df['model_b'].map(elo_ratings)
filtered_df['elo_modela_minus_modelb'] = model_a_rating - model_b_rating

# List the columns to verify that the new column was added
filtered_df.columns

## Topics feature

In [None]:
# Regex pattern -> name of the 0/1 indicator column it produces
core_topics = {
    r'math': 'topic_math',
    r'fact\w*': 'topic_fact',
    r'creativ\w*': 'topic_creative',
    r'problem[ -]?solving': 'topic_problem_solving'
}

# One indicator column per core topic: 1 when the pattern occurs anywhere in
# topic_modeling_3 (case-insensitive), 0 otherwise; missing topics count as 0
for topic_regex, indicator_column in core_topics.items():
    topic_found = filtered_df['topic_modeling_3'].str.contains(
        topic_regex, case=False, na=False, regex=True
    )
    filtered_df[indicator_column] = topic_found.astype(int)

# Check the updated DataFrame
filtered_df.head()

## Modal (auxiliary) verbs feature

In [None]:

# Define the modal verbs (single words plus the two-word phrase "have to")
modal_verbs = ['can', 'could', 'may', 'might', 'shall', 'should', 'will', 'would', 'must', 'have to']

# Alternation of all escaped verbs, e.g. "can|could|...".
# NOTE(review): this combined pattern is passed to one_hot_modal_features but that function
# never reads it; it builds one pattern per verb instead.
modal_regex = '|'.join([re.escape(verb) for verb in modal_verbs])

# One-hot encode which modal verbs occur in a text column
def one_hot_modal_features(df, column_name, modal_regex):
    """
    Add one 0/1 column per entry of the module-level ``modal_verbs`` list.

    Parameters:
        df: DataFrame to extend; it is modified in place.
        column_name: Name of the text column to search.
        modal_regex: Accepted for interface compatibility; not used by this function.

    Returns:
        df: The same DataFrame, with a ``<column_name>_contains_<verb>`` column per verb
            (spaces in the verb become underscores). Missing text counts as 0.
    """
    for modal in modal_verbs:
        flag_column = f'{column_name}_contains_{modal.replace(" ", "_")}'
        # \b on both sides restricts the match to the whole word or phrase
        whole_word = rf'\b{re.escape(modal)}\b'
        matches = df[column_name].str.contains(whole_word, case=False, na=False)
        df[flag_column] = matches.astype(int)
    return df

# Apply the function to both model_a_response and model_b_response; each call adds
# one indicator column per modal verb for that response column
filtered_df = one_hot_modal_features(filtered_df, 'model_a_response', modal_regex)
filtered_df = one_hot_modal_features(filtered_df, 'model_b_response', modal_regex)

# Display the updated DataFrame with new features
filtered_df.head()


In [None]:
# Question words feature

# Define the question words; one_hot_question_features creates one indicator column for each
question_words = ['who', 'where', 'when', 'why', 'what', 'how', 'which']

# One-hot encode which question words occur in a text column (0/1 values)
def one_hot_question_features(df, column_name):
    """
    Add one 0/1 column per entry of the module-level ``question_words`` list.

    Parameters:
        df: DataFrame to extend; it is modified in place.
        column_name: Name of the text column to search.

    Returns:
        df: The same DataFrame, with a ``<column_name>_contains_<word>`` column per
            question word. Matching is case-insensitive and missing text counts as 0.
    """
    for question_word in question_words:
        # \b on both sides means "how" does not match inside "show"
        whole_word = rf'\b{re.escape(question_word)}\b'
        found = df[column_name].str.contains(whole_word, case=False, na=False)
        df[f'{column_name}_contains_{question_word}'] = found.astype(int)
    return df

# Apply the function to the prompt column; adds one indicator column per question word
filtered_df = one_hot_question_features(filtered_df, 'prompt')

# Display the updated DataFrame with new features
filtered_df.head()


In [None]:
def extract_textblob_features(df, text_column):
    """
    Extract sentiment and part-of-speech count features from a text column with TextBlob.

    Parameters:
        df: Input DataFrame; it is modified in place.
        text_column: Name of the column holding the text to analyze.

    Returns:
        df: The same DataFrame with three new columns:
            ``<text_column>_polarity`` (TextBlob sentiment polarity),
            ``<text_column>_noun_count`` (tokens whose POS tag starts with 'NN'),
            ``<text_column>_verb_count`` (tokens whose POS tag starts with 'VB').
            Cells that are not strings (e.g. None/NaN for a missing response) are
            analyzed as empty text instead of raising.
    """
    # One list per feature, filled in row order
    polarity = []  # sentiment polarity scores
    noun_count = []  # number of nouns
    verb_count = []  # number of verbs

    for text in df[text_column]:
        # TextBlob rejects non-string input, so treat missing values as empty text;
        # this mirrors the na=False handling of the regex-based feature helpers.
        blob = TextBlob(text if isinstance(text, str) else "")
        polarity.append(blob.sentiment.polarity)

        # Fetch the (word, POS tag) pairs once and count both categories from them
        tags = blob.tags
        noun_count.append(sum(1 for _, pos in tags if pos.startswith('NN')))
        verb_count.append(sum(1 for _, pos in tags if pos.startswith('VB')))

    # Add the extracted features to the DataFrame as new columns
    df[f"{text_column}_polarity"] = polarity
    df[f"{text_column}_noun_count"] = noun_count
    df[f"{text_column}_verb_count"] = verb_count

    return df

# Use TextBlob features on prompts and responses
textblob_columns = ['prompt', 'model_a_response', 'model_b_response']
for textblob_column in textblob_columns:
    filtered_df = extract_textblob_features(filtered_df, textblob_column)

# test_merged_df is not created anywhere in this notebook, so calling the extractor on it
# unconditionally raised NameError and stopped the cell. Only process it when a test
# frame has actually been loaded into the session.
if 'test_merged_df' in globals():
    for textblob_column in textblob_columns:
        test_merged_df = extract_textblob_features(test_merged_df, textblob_column)

In [None]:
## Feature engineering: interaction terms

# Product of the two response lengths: do long answers from both models relate to hardness score?
filtered_df.loc[:, 'length_interaction'] = filtered_df['response_a_length'] * filtered_df['response_b_length']

# Response length scaled by sentiment polarity, for each model.
# Thought process: a long positive response may go with a low hardness score and
# a long negative response with a high hardness score.
for side in ('a', 'b'):
    filtered_df[f'response_{side}_length_polarity'] = (
        filtered_df[f'response_{side}_length'] * filtered_df[f'model_{side}_response_polarity']
    )

# ELO rating gap (model A minus model B): does the difference in skill level between
# the two models contribute to hardness scores?
rating_of_a = filtered_df['model_a'].map(elo_ratings)
rating_of_b = filtered_df['model_b'].map(elo_ratings)
filtered_df['elo_modela_minus_modelb'] = rating_of_a - rating_of_b

### Feature Set

In [None]:
# Feature columns fed to the model, grouped by where they come from
selected_features = [
    # prompt and response lengths
    'prompt_length',
    'length_interaction',
    # relative model strength
    'elo_modela_minus_modelb',
    # prompt part-of-speech counts
    'prompt_noun_count',
    'prompt_verb_count',
    # response length x sentiment polarity
    'response_a_length_polarity',
    'response_b_length_polarity',
    # response part-of-speech counts
    'model_a_response_noun_count',
    'model_a_response_verb_count',
    'model_b_response_noun_count',
    'model_b_response_verb_count',
    # topic indicators
    'topic_math',
    'topic_creative',
]

In [None]:
# Clean the target first: drop missing and infinite hardness scores
y = filtered_df['hardness_score']
y = y.dropna()  # drop NaN values
y = y[~np.isinf(y)]  # drop inf values

# Restrict the features to the rows that still have a usable target
X = filtered_df[selected_features].loc[y.index]

# Drop rows with missing feature values
# (the infinite-value filter on X is left disabled, as before)
X = X.dropna()

# Keep only the rows present in BOTH X and y. align() defaults to join='outer', which
# would re-insert the rows just dropped from X as all-NaN rows; join='inner' is required
# for the two objects to end up with the same, fully populated index.
X, y = X.align(y, join='inner', axis=0)


In [None]:
# Draw one scatter plot per feature to visualize its relationship with the hardness score
for feature_name in X.columns:
    feature_values = X[feature_name]
    plt.scatter(feature_values, y)
    plt.title(f"{feature_name} vs Hardness Score")
    plt.xlabel(feature_name)
    plt.ylabel("Hardness Score")
    plt.show()


### Metrics

In [None]:
def calculate_performance_metrics(y_true, y_pred, y_train=None, y_train_pred=None, y_val=None, y_val_pred=None):
    """
    Compute regression metrics for the test set and, optionally, the training and validation sets.

    Parameters:
        y_true: True values for the test set.
        y_pred: Predicted values for the test set.
        y_train: True values for the training set (optional).
        y_train_pred: Predicted values for the training set (optional).
        y_val: True values for the validation set (optional).
        y_val_pred: Predicted values for the validation set (optional).

    Returns:
        metrics: Dictionary mapping "<Split> <Metric>" names to values. Every split reports
            RMSE, MSE, MAE, R² Score and Explained Variance; the test split additionally
            reports the standard deviation of its residuals. Train/Validation entries are
            included only when both the true and predicted values for that split are given.
    """
    def _scores_for(split, actual, predicted):
        # The five metrics shared by every split, keyed with the split name as prefix
        mse = mean_squared_error(actual, predicted)
        return {
            f"{split} RMSE": np.sqrt(mse),
            f"{split} MSE": mse,
            f"{split} MAE": mean_absolute_error(actual, predicted),
            f"{split} R² Score": r2_score(actual, predicted),
            f"{split} Explained Variance": explained_variance_score(actual, predicted),
        }

    metrics = _scores_for("Test", y_true, y_pred)
    metrics["Test Residual Std Dev"] = np.std(y_true - y_pred)

    optional_splits = (
        ("Train", y_train, y_train_pred),
        ("Validation", y_val, y_val_pred),
    )
    for split, actual, predicted in optional_splits:
        if actual is None or predicted is None:
            continue
        metrics.update(_scores_for(split, actual, predicted))

    return metrics

# Print a metrics dictionary, one metric per line
def display_performance_metrics(metrics):
    """
    Print each performance metric on its own line, rounded to two decimals.

    Parameters:
        metrics: Dictionary mapping metric names to numeric values.
    """
    print("\nPerformance Metrics:")
    for label, score in metrics.items():
        print(label + ": " + format(score, ".2f"))


In [None]:
 #function to calculate range-specific RMSE
def calculate_range_rmse(y_true, y_pred, low_threshold=3, mid_threshold=6):
    """
    Compute RMSE separately for low, mid and high ranges of the true values.

    Predictions are rounded to the nearest integer before the error is computed.

    Parameters:
        y_true: True values (list, NumPy array or pandas Series).
        y_pred: Predicted values, same length as y_true.
        low_threshold: Upper bound (inclusive) of the low range.
        mid_threshold: Upper bound (inclusive) of the mid range; larger values are "high".

    Returns:
        (rmse_low, rmse_mid, rmse_high): RMSE for y_true <= low_threshold,
        low_threshold < y_true <= mid_threshold, and y_true > mid_threshold.
        A range containing no samples yields np.nan.

    Raises:
        ValueError: If y_true and y_pred do not have the same shape.
    """
    # Work on plain arrays so lists, arrays and Series all behave the same way
    actual = np.asarray(y_true, dtype=float)
    rounded = np.round(np.asarray(y_pred, dtype=float))
    if actual.shape != rounded.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {actual.shape} and {rounded.shape}"
        )

    def _rmse(mask):
        # An empty range has no defined error
        if not mask.any():
            return np.nan
        return np.sqrt(np.mean((actual[mask] - rounded[mask]) ** 2))

    # Each range mask is built exactly once
    low_mask = actual <= low_threshold
    mid_mask = (actual > low_threshold) & (actual <= mid_threshold)
    high_mask = actual > mid_threshold

    return _rmse(low_mask), _rmse(mid_mask), _rmse(high_mask)

# Print the range-specific RMSE values
def display_range_rmse(rmse_low, rmse_mid, rmse_high):
    """
    Print the RMSE of each hardness range, or "N/A" for a range whose RMSE is NaN.

    Parameters:
        rmse_low: RMSE for the low range (labelled "<=3").
        rmse_mid: RMSE for the mid range (labelled "3-6").
        rmse_high: RMSE for the high range (labelled ">6").
    """
    print("\nRange-Specific RMSE:")
    labelled_scores = (
        ("Low Range (<=3)", rmse_low),
        ("Mid Range (3-6)", rmse_mid),
        ("High Range (>6)", rmse_high),
    )
    for label, score in labelled_scores:
        if np.isnan(score):
            print(f"{label}: N/A")
        else:
            print(f"{label}: {score:.2f}")


### Model

In [None]:

# Drop rows with missing target values
filtered_df = filtered_df.dropna(subset=['hardness_score'])

# Extract features and target variable.
# NOTE(review): this rebinds X and y from filtered_df, replacing any earlier X/y;
# only the target is filtered for missing values here, the feature columns are not.
X = filtered_df[selected_features]
y = filtered_df['hardness_score']

# K-Fold Cross-Validation and Random Forest Regressor.
# Both use a fixed random_state so the splits and the forest are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

# Storing all the metrics: one entry per fold in each list, except y_true_all / y_pred_all,
# which pool the individual held-out samples of every fold
rmse_scores_low, rmse_scores_mid, rmse_scores_high = [], [], []
training_scores, validation_scores = [], []
training_mse, validation_mse = [], []
training_mae, validation_mae = [], []
y_true_all, y_pred_all = [], []

# K-Fold Cross-Validation loop (folds are numbered from 1)
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    print(f"Processing Fold {fold}...")
    
    # kf.split yields positional indices, hence .iloc
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Train model (the same estimator object is refit on every fold)
    rf_model.fit(X_train, y_train)

    # Predictions
    y_pred_train = rf_model.predict(X_train)
    y_pred_test = rf_model.predict(X_test)
    y_pred_test_rounded = np.round(y_pred_test)

    # True and predicted values for plotting.
    # NOTE: the pooled predictions are the ROUNDED ones, so the overall metrics computed
    # from y_pred_all below are based on rounded predictions.
    y_true_all.extend(y_test)
    y_pred_all.extend(y_pred_test_rounded)

    # Training and validation R² scores calculations
    training_scores.append(rf_model.score(X_train, y_train))
    validation_scores.append(rf_model.score(X_test, y_test))

    # Training and validation MSE/MAE (computed on the unrounded predictions)
    training_mse.append(mean_squared_error(y_train, y_pred_train))
    validation_mse.append(mean_squared_error(y_test, y_pred_test))
    training_mae.append(mean_absolute_error(y_train, y_pred_train))
    validation_mae.append(mean_absolute_error(y_test, y_pred_test))

    # Calculate RMSE for each range; a fold with no samples in a range returns NaN for it
    # and is left out of that range's average
    rmse_low, rmse_mid, rmse_high = calculate_range_rmse(y_test, y_pred_test)
    if not np.isnan(rmse_low): rmse_scores_low.append(rmse_low)
    if not np.isnan(rmse_mid): rmse_scores_mid.append(rmse_mid)
    if not np.isnan(rmse_high): rmse_scores_high.append(rmse_high)

# Calculate metrics: overall metrics on the pooled held-out predictions, and the mean
# of the per-fold range RMSEs
average_metrics = calculate_performance_metrics(np.array(y_true_all), np.array(y_pred_all))
rmse_low_avg = np.mean(rmse_scores_low)
rmse_mid_avg = np.mean(rmse_scores_mid)
rmse_high_avg = np.mean(rmse_scores_high)

# Display overall metrics
print("\nOverall Metrics:")
display_performance_metrics(average_metrics)

# Display RMSE by range
display_range_rmse(rmse_low_avg, rmse_mid_avg, rmse_high_avg)

# Display training and validation metrics
print("\nTraining and Validation Metrics (Averages Across Folds):")
print(f"Training R²: {np.mean(training_scores):.2f}")
print(f"Validation R²: {np.mean(validation_scores):.2f}")
print(f"Training MSE: {np.mean(training_mse):.2f}")
print(f"Validation MSE: {np.mean(validation_mse):.2f}")
print(f"Training MAE: {np.mean(training_mae):.2f}")
print(f"Validation MAE: {np.mean(validation_mae):.2f}")


In [None]:
# Fit on the full feature matrix so the importances reflect every row. X_train / y_train
# are leftovers from the final cross-validation fold, so fitting on them reported the
# importances of one arbitrary 4/5 split of the data.
rf_model.fit(X, y)
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
# Most important feature first
print(feature_importances.sort_values(ascending=False))

### Visualizations

In [None]:

def plot_true_vs_predicted(performance_df):
    """
    Scatter the predicted values against the true values.

    Parameters:
        performance_df: DataFrame with 'True Values' and 'Predicted Values' columns.

    A dashed red diagonal spanning the range of the true values marks where
    prediction and truth would be equal.
    """
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=performance_df, x='True Values', y='Predicted Values', alpha=0.7)

    # Endpoints of the y = x reference line
    true_values = performance_df['True Values']
    diagonal = [true_values.min(), true_values.max()]
    plt.plot(diagonal, diagonal, color='red', linestyle='--', label='Perfect Prediction')

    plt.title('True vs Predicted Values')
    plt.xlabel('True Hardness Scores')
    plt.ylabel('Predicted Hardness Scores')
    plt.legend()
    plt.show()

def plot_residuals(performance_df):
    """
    Display the distribution of residuals (True - Predicted).

    Parameters:
        performance_df: DataFrame with 'True Values' and 'Predicted Values' columns.
            A precomputed 'Residuals' column is used when present; otherwise the
            residuals are derived as 'True Values' - 'Predicted Values'.
    """
    # The performance DataFrame built in this notebook only has the true and predicted
    # columns, so an unconditional lookup of 'Residuals' raised KeyError.
    if 'Residuals' in performance_df.columns:
        residuals = performance_df['Residuals']
    else:
        residuals = performance_df['True Values'] - performance_df['Predicted Values']

    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, kde=True, bins=30, color='blue')
    plt.title('Residual Distribution')
    plt.xlabel('Residuals (True - Predicted)')
    plt.ylabel('Frequency')
    # Reference line at zero error
    plt.axvline(0, color='red', linestyle='--')
    plt.show()

def plot_feature_importance(model, feature_names):
    """
    Draw a horizontal bar chart of the model's feature importances, largest first.

    Parameters:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Feature names, in the same order as ``model.feature_importances_``.
    """
    ranked = pd.DataFrame({
        'Feature': feature_names,
        'Importance': model.feature_importances_
    })
    ranked = ranked.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(data=ranked, x='Importance', y='Feature', palette='viridis')
    plt.title('Feature Importance')
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.show()



In [None]:
# Create the performance DataFrame from the pooled cross-validation results:
# one row per held-out sample. y_pred_all holds the rounded predictions.
performance_df = pd.DataFrame({
    'True Values': y_true_all,
    'Predicted Values': y_pred_all
})
# Preview the first rows
performance_df.head()

In [None]:
# True vs predicted plot: scatter of the pooled cross-validation predictions against the true scores
plot_true_vs_predicted(performance_df)


In [None]:
 # Residual plot: histogram of (True - Predicted) for the pooled cross-validation predictions
plot_residuals(performance_df)

In [None]:
# Plot feature importance of rf_model as it was most recently fitted; selected_features
# supplies the bar labels
plot_feature_importance(rf_model, selected_features)

In [None]:
# Correlation heatmap
def plot_correlation_heatmap(data, target_column):
    """
    Show the pairwise correlation matrix of ``data`` and each column's correlation with the target.

    Parameters:
        data: DataFrame whose columns are correlated with each other.
        target_column: Name of the column in ``data`` to rank the correlations against.

    Two figures are drawn: an annotated heatmap of the full correlation matrix, and a
    bar chart of every column's correlation with ``target_column``, highest first.
    """
    correlation_matrix = data.corr()

    # Figure 1: full correlation matrix
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()

    # Figure 2: correlation of each column with the target variable
    plt.figure(figsize=(6, 4))
    ranked_by_target = correlation_matrix[target_column].sort_values(ascending=False)
    sns.barplot(x=ranked_by_target, y=ranked_by_target.index)
    plt.title(f"Correlation with {target_column}")
    plt.show()

# Correlation heatmap over the selected features plus the target column, ranked against 'hardness_score'
plot_correlation_heatmap(filtered_df[selected_features + ['hardness_score']], 'hardness_score')
