# Task B: Random Forests with Oversampling and the Test Set

## Loading the Datasets

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pandas as pd


In [None]:
# One seaborn theme for the whole notebook so every figure shares the same grid styling
sns.set_theme(style="whitegrid")

# Main dataset: conversation records stored as gzip-compressed JSON lines.
# These rows are used in the "Conversation Data" section.
df = pd.read_json(
    "data/chatbot-arena-conversations.jsonl.gz",
    lines=True,
    compression="gzip",
)
df.head(5)

In [None]:
# Auxiliary datasets

# Embedding arrays -- used in the "Embedding Data" section
prompt_embeddings = np.load("data/chatbot-arena-prompts-embeddings.npy")
response_a_embeddings = np.load("data/chatbot-arena-model_a_response-embeddings.npy")
response_b_embeddings = np.load("data/chatbot-arena-model_b_response-embeddings.npy")

# Topic modeling and hardness scores -- used in the
# "Topic Modeling and Hardness Score Data" section
topic_and_hardness = pd.read_json(
    "data/chatbot-arena-gpt3-scores.jsonl.gz",
    compression="gzip",
    lines=True,
)

In [None]:
# Test set: prompts and model responses stored as gzip-compressed JSON lines
test_df = pd.read_json(
    "data/arena-test-set-prompt-and-responses.jsonl.gz",
    lines=True,
    compression="gzip",
)
test_df.head(5)

In [None]:
# Auxiliary data for the test set

# Topic modeling output for the test prompts
test_topic_and_hardness = pd.read_json(
    "data/arena-test-set-topic-modeling.jsonl.gz",
    compression="gzip",
    lines=True,
)

# Embedding arrays for the test prompts and for both models' responses
test_prompts = np.load("data/arena-test-set-prompts-embeddings.npy")
test_response_a_embeddings = np.load("data/arena-test-set-model_a_response-embeddings.npy")
test_response_b_embeddings = np.load("data/arena-test-set-model_b_response-embeddings.npy")

In [None]:
# Extract the prompt: the "content" field of the first turn of conversation A
df["prompt"] = df["conversation_a"].str[0].str["content"]

# How often each distinct prompt occurs. A notebook cell only displays its last
# expression, so the former intermediate `df["prompt"].head(1)` produced no output
# and has been removed.
df["prompt"].value_counts()

In [None]:
# Inspect which columns the test set provides
test_df.columns

In [None]:
# Pull each model's reply (the second turn of its conversation) into its own column,
# mirroring what was already done for the prompt
df["model_a_response"] = df["conversation_a"].str[1].str["content"]
df["model_b_response"] = df["conversation_b"].str[1].str["content"]

# Spot-check one extracted response (row label 2)
df.loc[2, "model_a_response"]

In [None]:
# Preview the first rows of the test set
test_df.head()

In [None]:
# Character-length features for the prompt and for each model's response
df["prompt_length"] = df["prompt"].str.len()
# Summary statistics of the prompt lengths (point this at a response-length column to inspect those)
df["prompt_length"].describe()
df["response_a_length"] = df["model_a_response"].str.len()
df["response_b_length"] = df["model_b_response"].str.len()


In [None]:
# Same character-length features as the training data, computed for the test set
test_df["prompt_length"] = test_df["prompt"].str.len()
# Summary statistics of the test prompt lengths
test_df["prompt_length"].describe()
test_df["response_a_length"] = test_df["model_a_response"].str.len()
test_df["response_b_length"] = test_df["model_b_response"].str.len()

In [None]:
# Confirm the length columns were added to the test set
test_df.columns

### Feature engineering: dot-product similarity between embeddings

In [None]:
# The embedding arrays are matched to the DataFrame rows by position, so their lengths must agree
assert len(prompt_embeddings) == len(df), "Mismatch between prompt embeddings and DataFrame rows."
assert len(response_a_embeddings) == len(df), "Mismatch between response_a embeddings and DataFrame rows."
assert len(response_b_embeddings) == len(df), "Mismatch between response_b embeddings and DataFrame rows."

# Row-wise dot product between each prompt embedding and the matching response embedding.
# A single vectorized einsum replaces the per-row Python loop over np.dot.
# NOTE(review): assumes every embedding array is 2-D (rows x embedding dimension) -- confirm.
df['similarity_prompt_response_a'] = np.einsum('ij,ij->i', prompt_embeddings, response_a_embeddings)
df['similarity_prompt_response_b'] = np.einsum('ij,ij->i', prompt_embeddings, response_b_embeddings)


In [None]:
# Check the dtypes and a sample of the three raw hardness-score columns
print(topic_and_hardness.loc[:, ['score_value_1', 'score_value_2', 'score_value_3']].dtypes)
print(topic_and_hardness.loc[:, ['score_value_1', 'score_value_2', 'score_value_3']].head())


In [None]:
# Inspect the first few rows of the problematic columns
print(topic_and_hardness[['score_value_1', 'score_value_2', 'score_value_3']].head())

# DataFrame.info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
topic_and_hardness[['score_value_1', 'score_value_2', 'score_value_3']].info()


In [None]:
# Average the three raw scores into a single hardness score per question
topic_and_hardness['hardness_score'] = topic_and_hardness[
    ['score_value_1', 'score_value_2', 'score_value_3']
].mean(axis=1)

# Left-join the hardness score and the third topic label onto the main dataset by question_id
merged_df = pd.merge(
    df,
    topic_and_hardness[['question_id', 'hardness_score', 'topic_modeling_3']],
    how='left',
    on='question_id',
)

In [None]:
# Attach topic labels to the test rows from the TEST-SET topic file.
# The training table `topic_and_hardness` was used here before; it is keyed by the training
# question ids, so a left join against it cannot supply topics for test questions.
# NOTE(review): assumes the test topic file exposes `question_id` and `topic_modeling_2` -- confirm.
test_merged_df = test_df.merge(
    test_topic_and_hardness[['question_id', 'topic_modeling_2']],
    on='question_id',
    how='left',
)

In [None]:
# Fill missing values per column type: 0 for numeric columns, "" for everything else.
# A blanket fillna(0) put the integer 0 into text columns (topics, prompts, responses),
# which the later string matching and TextBlob steps cannot process.
test_merged_df = test_merged_df.fillna(
    {
        column: 0 if pd.api.types.is_numeric_dtype(dtype) else ""
        for column, dtype in test_merged_df.dtypes.items()
    }
)

In [None]:
# To Do: fetch code where we identified these question IDs @caroline
# This will remove question ids that produced null values for topic_and_hardness.
# `.copy()` makes filtered_df an independent frame: later cells add many columns to it, and
# assigning into a filtered slice of merged_df triggers pandas' SettingWithCopyWarning.
filtered_df = merged_df[
    ~merged_df['question_id'].isin([
        'e6d45ead33114cca8ee3cfa028517eff',
        'addaa796ee094f029f8014ea1468df8a',
        'd37eb99864fa41ecab49026abdddb53e',
        '6da02001e74041d0947982fb4d05db9e',
        'd93e36df73e84aa2ade15d4a038c098f',
        '50b63f92bc5948218e1555d1eae17797',
        '8277b16d9a0845d694a33c04f446926c',
        'd56d698d4c1c495682a366f2a78fcb77',
        '86c7abedb5f84b7ea752cc98d324d387',
        '974569d7b9c74ca591f1922bf3722266',
        'ba3cfbae941946c6a41ea725b46f3aa7',
        '283be8724d674cb6a74da59d0c12ea16',
        'fbd400babd914eb1ad6202b4400d6beb',
        '087b90bf76ed4c409b7c7bd5d6b0d6c5',
        'f9ad250d0194489b8b9f12d31386aab1',
        '7e371a23981847c7b19bc61195f11eda',
        'f55b273603754c0d92f5139351ca0d02',
        '255e1466eaf14cc2ba93040776c0c440',
        '6979b646271e46f4a329a5e6acaeed38',
        'ed7fa335d5524295b1dd79452778d26c',
        'f794c699e4964b11a1373976be5b0944',
        'c92bcab701cc46649368fc121b84ccc1',
        'a632607132cf4908883fd262102714c9',
        'a0857d64b4954ea6bd75c2406c3dd32c',
        'e4a1fc8a9462475d95022914cb9f5b03',
        'c7aa752f61dc46d093dc86713686f6f3'
    ])
].copy()


#### Calculating the ELO ratings per model:

In [None]:
# Every model that appears on either side of a comparison starts at the same default rating (1000)
elo_ratings = dict.fromkeys(
    pd.concat([filtered_df["model_a"], filtered_df["model_b"]]).unique(),
    1000,
)

def update_elo(winner, loser, k=32):
    """
    Apply one ELO update for a decided comparison.

    Reads and modifies the module-level `elo_ratings` dictionary in place.

    Parameters:
        winner: Key in `elo_ratings` of the model that won
        loser: Key in `elo_ratings` of the model that lost
        k: Step size that scales how far the ratings move
    """
    # Both expectations are derived from the ratings as they were before this update
    rating_gap = elo_ratings[loser] - elo_ratings[winner]
    win_expectation = 1 / (1 + 10 ** (rating_gap / 400))
    loss_expectation = 1 - win_expectation

    # The winner scored 1 and the loser scored 0; each moves by k * (score - expectation)
    elo_ratings[winner] += k * (1 - win_expectation)
    elo_ratings[loser] += k * (0 - loss_expectation)

# Replay every comparison in row order; rows whose "winner" is neither "model_a" nor
# "model_b" leave the ratings untouched
for model_a, model_b, outcome in zip(
    filtered_df["model_a"], filtered_df["model_b"], filtered_df["winner"]
):
    if outcome == "model_a":
        update_elo(model_a, model_b)
    elif outcome == "model_b":
        update_elo(model_b, model_a)

# Ratings as a table, strongest model first
elo_df = pd.DataFrame(list(elo_ratings.items()), columns=["model", "ELO_rating"])
elo_df = elo_df.sort_values(by="ELO_rating", ascending=False)
print("ELO ratings per model:")
print(elo_df)




### Create a column comparing the ELO ratings (model A minus model B) as a measure of which model is stronger, so that it can be used alongside the prompt features in our tasks

## Topics feature

In [None]:
# Regex pattern -> name of the 0/1 feature column it produces
core_topics = {
    r'math': 'topic_math',
    r'fact\w*': 'topic_fact',
    r'creativ\w*': 'topic_creative',
    r'problem[ -]?solving': 'topic_problem_solving'
}

# One indicator column per topic: 1 when the topic label matches the pattern
# (case-insensitive), 0 otherwise; missing labels count as no match
for topic_regex, topic_column in core_topics.items():
    filtered_df[topic_column] = (
        filtered_df['topic_modeling_3']
        .str.contains(topic_regex, case=False, na=False, regex=True)
        .astype(int)
    )

In [None]:
# Regex pattern -> name of the 0/1 feature column it produces (only the two topics used by the model)
core_topics = {
    r'math': 'topic_math',
    r'creativ\w*': 'topic_creative',
}

# One indicator column per topic for the test set; missing labels count as no match.
# NOTE(review): the training flags are derived from `topic_modeling_3`, while these use
# `topic_modeling_2` -- confirm that this difference is intended.
for topic_regex, topic_column in core_topics.items():
    test_merged_df[topic_column] = (
        test_merged_df['topic_modeling_2']
        .str.contains(topic_regex, case=False, na=False, regex=True)
        .astype(int)
    )

In [None]:
# Preview the test set with the topic indicator columns added
test_merged_df.head()

## Modal (auxiliary) verbs feature

In [None]:
import re

# Modal verbs (and the multi-word "have to") to look for in the responses
modal_verbs = ['can', 'could', 'may', 'might', 'shall', 'should', 'will', 'would', 'must', 'have to']

# One alternation pattern built from the escaped verbs
modal_regex = '|'.join(map(re.escape, modal_verbs))

# Function to one-hot encode the presence of modal verbs
def one_hot_modal_features(df, column_name, modal_regex):
    """
    Add one 0/1 column per modal verb marking whether the text contains that verb.

    Parameters:
        df: DataFrame holding the text column; it is modified in place
        column_name: Name of the text column to search
        modal_regex: Accepted for the existing call sites but not used by the function

    Returns:
        df: The same DataFrame with a `<column_name>_contains_<verb>` column per entry of
            the module-level `modal_verbs` list
    """
    text = df[column_name]
    for modal in modal_verbs:
        # Spaces in multi-word verbs become underscores in the column name
        flag_column = f'{column_name}_contains_{modal.replace(" ", "_")}'
        # Word boundaries keep e.g. "can" from matching inside "cannot"
        whole_word = r'\b' + re.escape(modal) + r'\b'
        df[flag_column] = text.str.contains(whole_word, case=False, na=False).astype(int)
    return df

# Add the modal-verb indicator columns for each model's response
for response_column in ('model_a_response', 'model_b_response'):
    filtered_df = one_hot_modal_features(filtered_df, response_column, modal_regex)


### TextBlob

In [None]:
#Imports
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.metrics import (mean_squared_error, mean_absolute_error, explained_variance_score, r2_score)


In [None]:
def extract_textblob_features(df, text_column):
    """
    Extract sentiment and part-of-speech features with TextBlob.

    Missing or non-string entries (None, NaN, numbers left behind by a fillna) are
    analysed as empty text instead of being passed to TextBlob as-is.

    Parameters:
        df: Input DataFrame; it is modified in place
        text_column: Name of the column with the text data

    Returns:
        df: The same DataFrame with three new columns:
            `<text_column>_polarity`, `<text_column>_noun_count`, `<text_column>_verb_count`
    """
    polarity = []    # sentiment polarity scores
    noun_count = []  # number of tokens whose tag starts with 'NN'
    verb_count = []  # number of tokens whose tag starts with 'VB'

    for text in df[text_column]:
        blob = TextBlob(text if isinstance(text, str) else "")
        polarity.append(blob.sentiment.polarity)

        # Read the tags once and reuse them for both counts
        tags = blob.tags
        noun_count.append(sum(1 for _, pos in tags if pos.startswith('NN')))
        verb_count.append(sum(1 for _, pos in tags if pos.startswith('VB')))

    # Add extracted features to the DataFrame as new columns
    df[f"{text_column}_polarity"] = polarity
    df[f"{text_column}_noun_count"] = noun_count
    df[f"{text_column}_verb_count"] = verb_count

    return df

# Use TextBlob features on the prompts and responses of both the training and the test data
for text_column in ('prompt', 'model_a_response', 'model_b_response'):
    filtered_df = extract_textblob_features(filtered_df, text_column)
    test_merged_df = extract_textblob_features(test_merged_df, text_column)

### Feature Interactions

In [None]:
## Feature engineering: interaction terms

# Does the product of the two response lengths have an impact on hardness score?
filtered_df['length_interaction'] = filtered_df['response_a_length'] * filtered_df['response_b_length']

# Interaction between response lengths and sentiment polarity
# Thought process: a long positive response may be deemed a low hardness score and a long negative response may be deemed a high hardness score
filtered_df['response_a_length_polarity'] = filtered_df['response_a_length'] * filtered_df['model_a_response_polarity']
filtered_df['response_b_length_polarity'] = filtered_df['response_b_length'] * filtered_df['model_b_response_polarity']

# Does the difference in skill level between the two models contribute to hardness scores?
# The ratings are looked up with a vectorized map instead of a row-wise apply.
filtered_df['elo_modela_minus_modelb'] = (
    filtered_df['model_a'].map(elo_ratings) - filtered_df['model_b'].map(elo_ratings)
)

In [None]:
# Same interaction features as the training data, computed for the test set

# Does the product of the two response lengths have an impact on hardness score?
test_merged_df['length_interaction'] = test_merged_df['response_a_length'] * test_merged_df['response_b_length']

# Interaction between response lengths and sentiment polarity
# Thought process: a long positive response may be deemed a low hardness score and a long negative response may be deemed a high hardness score
test_merged_df['response_a_length_polarity'] = test_merged_df['response_a_length'] * test_merged_df['model_a_response_polarity']
test_merged_df['response_b_length_polarity'] = test_merged_df['response_b_length'] * test_merged_df['model_b_response_polarity']

# ELO difference between the two models. A test-set model that never appeared in the training
# comparisons has no entry in elo_ratings; it gets the 1000 starting rating that every model
# is initialised with, instead of raising a KeyError.
test_merged_df['elo_modela_minus_modelb'] = (
    test_merged_df['model_a'].map(elo_ratings).fillna(1000)
    - test_merged_df['model_b'].map(elo_ratings).fillna(1000)
)

In [None]:
# Correlation analysis over the numeric columns, used to look for feature interactions
numeric_df = filtered_df.select_dtypes(include="number")

# Pairwise correlations between all numeric columns, shown as a heatmap
corr_matrix = numeric_df.corr()
sns.heatmap(corr_matrix, cmap='coolwarm')

### Establishing the feature set

In [None]:
# Columns the model is trained on (the order defines the column order of the feature matrix)
selected_features = [
    'prompt_length',
    'length_interaction',
    'elo_modela_minus_modelb',
    'prompt_noun_count',
    'prompt_verb_count',
    'response_a_length_polarity',
    'response_b_length_polarity',
    'model_a_response_noun_count',
    'model_a_response_verb_count',
    'model_b_response_noun_count',
    'model_b_response_verb_count',
    'topic_math',
    'topic_creative',
]

In [None]:
# Features for testing: reuse `selected_features` rather than a second hand-written copy of
# the list, so the test matrix always has exactly the columns, in the same order, that the
# model is trained on
testing_features = test_merged_df[selected_features]

In [None]:
# Keep only the first occurrence of any column name that appears more than once
testing_features = testing_features.loc[:, ~testing_features.columns.duplicated()]

In [None]:
# Target: hardness score, without missing values
y = filtered_df['hardness_score'].dropna()

# Features for the rows that still have a target. Infinite values are treated like missing
# values, so rows containing either are dropped (dropna alone does not remove +/-inf).
X = (
    filtered_df.loc[y.index, selected_features]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Keep only the rows present in both X and y. The default outer join would re-add the rows
# just dropped from X as all-NaN rows.
X, y = X.align(y, join='inner', axis=0)


In [None]:
# Compare the shapes of the training feature matrix and the test feature matrix
print(X.shape)
print(testing_features.shape)

In [None]:
# One scatter plot per feature to visualize its relationship with the hardness score
for feature_name in X.columns:
    fig, ax = plt.subplots()
    ax.scatter(X[feature_name], y)
    ax.set_title(f"{feature_name} vs Hardness Score")
    ax.set_xlabel(feature_name)
    ax.set_ylabel("Hardness Score")
    plt.show()


### Functions for model performance

In [None]:
# Function for oversampling low-range data
def oversample_low_range(X, y, low_threshold=3, n_samples=None, random_state=42):
    """
    Oversample the low-range target values to address imbalance in the target.

    Rows whose target is <= `low_threshold` are drawn with replacement and appended
    to the original data; the original rows are always kept.

    Parameters:
        X: Features (DataFrame)
        y: Target variable (Series)
        low_threshold: Targets less than or equal to this value form the low range
        n_samples: Number of low-range rows to draw; None (default) draws one third
            of the number of rows in X (integer division)
        random_state: Seed passed to the resampler so the draw is reproducible

    Returns:
        X_combined: Original features followed by the oversampled rows
        y_combined: Original targets followed by the oversampled targets
    """
    # alignment
    X, y = X.align(y, axis=0)

    # identify low range
    low_range = y <= low_threshold
    X_low, y_low = X[low_range], y[low_range]

    # If no low-range samples exist, return original data
    if X_low.empty or y_low.empty:
        print("Warning: No low-range data available for oversampling.")
        return X, y

    if n_samples is None:
        n_samples = X.shape[0] // 3

    # Nothing to draw (e.g. fewer than three rows with the default size)
    if n_samples < 1:
        return X, y

    # Oversample low-range data with replacement
    X_extra, y_extra = resample(
        X_low, y_low,
        replace=True,
        n_samples=n_samples,
        random_state=random_state
    )

    # Combine oversampled data with the original dataset
    X_combined = pd.concat([X, X_extra])
    y_combined = pd.concat([y, y_extra])

    return X_combined, y_combined

In [None]:
def calculate_performance_metrics(y_true, y_pred, y_train=None, y_train_pred=None, y_val=None, y_val_pred=None):
    """
    Compute regression metrics for the test split and, optionally, the training and
    validation splits.

    Parameters:
        y_true: True values for the test set
        y_pred: Predicted values for the test set
        y_train: True values for the training set
        y_train_pred: Predicted values for the training set
        y_val: True values for the validation set
        y_val_pred: Predicted values for the validation set

    Returns:
        metrics: Dictionary keyed by "<Split> <Metric>". Every reported split gets RMSE, MSE,
            MAE, R² Score and Explained Variance; the test split additionally gets
            "Test Residual Std Dev". Training/validation entries appear only when both the
            true and the predicted values of that split are given.
    """
    def _summarise(split, actual, predicted):
        # The five metrics shared by every split, in reporting order
        mse = mean_squared_error(actual, predicted)
        return {
            f"{split} RMSE": np.sqrt(mse),
            f"{split} MSE": mse,
            f"{split} MAE": mean_absolute_error(actual, predicted),
            f"{split} R² Score": r2_score(actual, predicted),
            f"{split} Explained Variance": explained_variance_score(actual, predicted),
        }

    metrics = _summarise("Test", y_true, y_pred)
    metrics["Test Residual Std Dev"] = np.std(y_true - y_pred)

    optional_splits = (
        ("Train", y_train, y_train_pred),
        ("Validation", y_val, y_val_pred),
    )
    for split, actual, predicted in optional_splits:
        if actual is None or predicted is None:
            continue
        metrics.update(_summarise(split, actual, predicted))

    return metrics

# Function to display performance metrics
def display_performance_metrics(metrics):
    """
    Print each performance metric on its own line, rounded to two decimals.

    Parameters:
        metrics: Dictionary mapping metric names to numeric values.
    """
    print("\nPerformance Metrics:")
    for label in metrics:
        print("{}: {:.2f}".format(label, metrics[label]))


In [None]:
 #function to calculate range-specific RMSE
def calculate_range_rmse(y_true, y_pred, low_threshold=3, mid_threshold=6):
    """
    Calculate the RMSE separately for the low, mid and high ranges of the true values.

    Predictions are rounded to the nearest integer before being compared. Inputs are
    converted to NumPy arrays and matched by position, so lists, arrays and Series
    (whatever their index) are all accepted.

    Parameters:
        y_true: True target values
        y_pred: Predicted target values, in the same order as y_true
        low_threshold: True values <= this value form the low range
        mid_threshold: True values above low_threshold and <= this value form the mid
            range; larger values form the high range

    Returns:
        (rmse_low, rmse_mid, rmse_high); a range without any samples yields np.nan
    """
    true_values = np.asarray(y_true, dtype=float)
    rounded_predictions = np.round(np.asarray(y_pred, dtype=float))

    def _rmse(mask):
        # RMSE over the rows selected by the mask, or NaN when the range is empty
        if not mask.any():
            return np.nan
        return np.sqrt(mean_squared_error(true_values[mask], rounded_predictions[mask]))

    rmse_low = _rmse(true_values <= low_threshold)
    rmse_mid = _rmse((true_values > low_threshold) & (true_values <= mid_threshold))
    rmse_high = _rmse(true_values > mid_threshold)
    return rmse_low, rmse_mid, rmse_high

# function to display range-specific RMSE
def display_range_rmse(rmse_low, rmse_mid, rmse_high):
    """
    Print the RMSE of each target range, showing N/A for a range whose value is NaN.
    """
    print("\nRange-Specific RMSE:")
    labelled_scores = (
        ("Low Range (<=3)", rmse_low),
        ("Mid Range (3-6)", rmse_mid),
        ("High Range (>6)", rmse_high),
    )
    for label, score in labelled_scores:
        rendered = "N/A" if np.isnan(score) else f"{score:.2f}"
        print(f"{label}: {rendered}")


### Model

In [None]:
# Drop rows with missing target values
filtered_df = filtered_df.dropna(subset=['hardness_score'])

# Extract features and target variable
X = filtered_df[selected_features]
y = filtered_df['hardness_score']

# K-Fold Cross-Validation and Random Forest Regressor
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

# Storing all the metrics
rmse_scores_low, rmse_scores_mid, rmse_scores_high = [], [], []
training_scores, validation_scores = [], []
training_mse, validation_mse = [], []
training_mae, validation_mae = [], []
y_true_all, y_pred_all = [], []

# K-Fold Cross-Validation loop.
# The folds are built from the ORIGINAL rows and only the training part of each fold is
# oversampled. Oversampling before splitting puts copies of the same low-range row on both
# sides of a split, so the validation metrics would be measured on rows the model has seen.
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    print(f"Processing Fold {fold}...")

    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    X_train, y_train = oversample_low_range(X.iloc[train_idx], y.iloc[train_idx])

    # Train model
    rf_model.fit(X_train, y_train)

    # Predictions
    y_pred_train = rf_model.predict(X_train)
    y_pred_test = rf_model.predict(X_test)
    y_pred_test_rounded = np.round(y_pred_test)

    # True and predicted values for plotting
    y_true_all.extend(y_test)
    y_pred_all.extend(y_pred_test_rounded)

    # Training and validation R^2 scores (computed from the predictions made above)
    training_scores.append(r2_score(y_train, y_pred_train))
    validation_scores.append(r2_score(y_test, y_pred_test))

    # Training and validation MSE/MAE
    training_mse.append(mean_squared_error(y_train, y_pred_train))
    validation_mse.append(mean_squared_error(y_test, y_pred_test))
    training_mae.append(mean_absolute_error(y_train, y_pred_train))
    validation_mae.append(mean_absolute_error(y_test, y_pred_test))

    # Calculate RMSE for each range
    rmse_low, rmse_mid, rmse_high = calculate_range_rmse(y_test, y_pred_test)
    if not np.isnan(rmse_low): rmse_scores_low.append(rmse_low)
    if not np.isnan(rmse_mid): rmse_scores_mid.append(rmse_mid)
    if not np.isnan(rmse_high): rmse_scores_high.append(rmse_high)

# Calculate metrics over the out-of-fold predictions (reported under the "Test" labels)
average_metrics = calculate_performance_metrics(np.array(y_true_all), np.array(y_pred_all))
# A range that never occurred in any fold is reported as NaN instead of averaging an empty list
rmse_low_avg = np.mean(rmse_scores_low) if rmse_scores_low else np.nan
rmse_mid_avg = np.mean(rmse_scores_mid) if rmse_scores_mid else np.nan
rmse_high_avg = np.mean(rmse_scores_high) if rmse_scores_high else np.nan

# Display the metrics
display_performance_metrics(average_metrics)
display_range_rmse(rmse_low_avg, rmse_mid_avg, rmse_high_avg)

# Display training and validation metrics
print("\nTraining and Validation Metrics (Averages Across Folds):")
print(f"Training R²: {np.mean(training_scores):.2f}")
print(f"Validation R²: {np.mean(validation_scores):.2f}")
print(f"Training MSE: {np.mean(training_mse):.2f}")
print(f"Validation MSE: {np.mean(validation_mse):.2f}")
print(f"Training MAE: {np.mean(training_mae):.2f}")
print(f"Validation MAE: {np.mean(validation_mae):.2f}")

# Final model: oversample the low range (0-3) of the full training data and refit, so that
# rf_model is trained on every available row rather than on the last fold's training split
X_oversampled, y_oversampled = oversample_low_range(X, y)
rf_model.fit(X_oversampled, y_oversampled)

### Completing the Test Set

In [None]:
# Predict hardness scores for the test-set feature matrix with the fitted random forest
y_pred_t = rf_model.predict(testing_features)
# Show the raw (unrounded) predictions and how many were produced
print(y_pred_t)
print(len(y_pred_t))

In [None]:
# Pair every test question id with its prediction, rounded to the nearest whole score
results_b_df = pd.DataFrame(
    {
        'question_id': test_df['question_id'],
        'hardness_score': np.round(y_pred_t),
    }
)

# Write the predictions to a CSV file without the index column
results_b_df.to_csv('test_predictions_b.csv', index=False)

print("Predictions saved to 'test_predictions_b.csv'")


### Grid-Search

In [None]:
# Define the Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [10, 20, None],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
}

# Define GridSearchCV
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    # scikit-learn scorers follow "greater is better", so MSE is only available as the
    # negated scorer 'neg_mean_squared_error'; 'mean_squared_error' is not a valid name
    scoring='neg_mean_squared_error',
    cv=5,  # 5-fold cross-validation
    verbose=2,
    n_jobs=-1
)

# Fit the grid search on oversampled training data
grid_search.fit(X_oversampled, y_oversampled)

# Get the best hyperparameters
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)  # Converting negative MSE to positive

# Model refitted by GridSearchCV with the best hyperparameters
best_rf_model = grid_search.best_estimator_


### Performance Visualizations

In [None]:
# Create performance DataFrame for evaluation
performance_df = pd.DataFrame({
    'True Values': y_true_all,
    'Predicted Values': y_pred_all
})
performance_df.head()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

def plot_true_vs_predicted(performance_df):
    """
    Scatter the predicted values against the true values, with a dashed diagonal
    marking where a perfect prediction would fall.

    Parameters:
        performance_df: DataFrame with 'True Values' and 'Predicted Values' columns
    """
    true_scores = performance_df['True Values']
    # The reference line spans the observed range of true values on both axes
    diagonal = [true_scores.min(), true_scores.max()]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x='True Values', y='Predicted Values', data=performance_df, alpha=0.7, ax=ax)
    ax.plot(diagonal, diagonal, color='red', linestyle='--', label='Perfect Prediction')
    ax.set_title('True vs Predicted Values')
    ax.set_xlabel('True Hardness Scores')
    ax.set_ylabel('Predicted Hardness Scores')
    ax.legend()
    plt.show()

def plot_hardness_score_distributions(performance_df):
    """
    Overlay the histograms (with density curves) of the true and the predicted
    hardness scores.

    Parameters:
        performance_df: DataFrame with 'True Values' and 'Predicted Values' columns
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(performance_df['True Values'], kde=True, bins=30, color='blue', label='True', alpha=0.5, ax=ax)
    sns.histplot(performance_df['Predicted Values'], kde=True, bins=30, color='green', label='Predicted', alpha=0.5, ax=ax)
    ax.set_title('Distribution of True vs Predicted Hardness Scores')
    ax.set_xlabel('Hardness Scores')
    ax.set_ylabel('Frequency')
    ax.legend()
    plt.show()

def plot_correlation_heatmap(data, features):
    """
    Draw an annotated heatmap of the pairwise correlations between the given features.

    Parameters:
        data: DataFrame containing the feature columns
        features: Names of the columns to correlate
    """
    correlations = data[features].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", square=True, ax=ax)
    ax.set_title('Correlation Heatmap')
    plt.show()

def plot_prediction_density_heatmap(performance_df):
    """
    Draw a filled 2-D density estimate of predicted against true values, with a dashed
    diagonal marking where a perfect prediction would fall.

    Parameters:
        performance_df: DataFrame with 'True Values' and 'Predicted Values' columns
    """
    true_scores = performance_df['True Values']
    # The reference line spans the observed range of true values on both axes
    diagonal = [true_scores.min(), true_scores.max()]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.kdeplot(
        x=true_scores,
        y=performance_df['Predicted Values'],
        cmap="Blues", fill=True,
        ax=ax
    )
    ax.plot(diagonal, diagonal, color='red', linestyle='--', label='Perfect Prediction')
    ax.set_title('Prediction Density Heatmap')
    ax.set_xlabel('True Hardness Scores')
    ax.set_ylabel('Predicted Hardness Scores')
    ax.legend()
    plt.show()

def plot_residuals_vs_predicted(performance_df):
    """
    Visualizes residuals against predicted values

    Uses the 'Residuals' column when the DataFrame has one; otherwise the residuals are
    computed as true minus predicted values, without modifying the DataFrame.

    Parameters:
        performance_df: DataFrame with 'True Values' and 'Predicted Values' columns and,
            optionally, a precomputed 'Residuals' column
    """
    if 'Residuals' in performance_df.columns:
        residuals = performance_df['Residuals']
    else:
        residuals = performance_df['True Values'] - performance_df['Predicted Values']

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=performance_df['Predicted Values'], y=residuals, alpha=0.7)
    # Zero line: points above it were under-predicted, points below it over-predicted
    plt.axhline(0, color='red', linestyle='--')
    plt.title('Residuals vs Predicted Values')
    plt.xlabel('Predicted Hardness Scores')
    plt.ylabel('Residuals')
    plt.show()

def plot_error_vs_true(performance_df):
    """
    Displays the relationship between absolute errors and true values

    The absolute error is stored in the 'Error' column of the DataFrame (added or
    overwritten) and then scattered against the true values.

    Parameters:
        performance_df: DataFrame with 'True Values' and 'Predicted Values' columns
    """
    performance_df['Error'] = np.abs(performance_df['True Values'] - performance_df['Predicted Values'])
    plt.figure(figsize=(10, 6))
    # The original body stopped at a bare `sns`, so no plot was ever drawn
    sns.scatterplot(x=performance_df['True Values'], y=performance_df['Error'], alpha=0.7)
    plt.title('Absolute Error vs True Values')
    plt.xlabel('True Hardness Scores')
    plt.ylabel('Absolute Error')
    plt.show()


In [None]:
# Prepare performance_df with necessary columns
performance_df['Residuals'] = performance_df['True Values'] - performance_df['Predicted Values']
performance_df['Error'] = np.abs(performance_df['Residuals'])
# include_lowest=True puts a true value of exactly 0 into the 'Low' bin instead of leaving it
# unlabelled, matching the "<= 3" definition of the low range used for the range-specific RMSE
performance_df['Range'] = pd.cut(
    performance_df['True Values'],
    bins=[0, 3, 6, 10],
    labels=['Low', 'Mid', 'High'],
    include_lowest=True,
)

# True vs Predicted Values
plot_true_vs_predicted(performance_df)

# Distribution of True vs Predicted Hardness Scores
plot_hardness_score_distributions(performance_df)

# Correlation Heatmap
plot_correlation_heatmap(filtered_df, selected_features)

# Prediction Density Heatmap
plot_prediction_density_heatmap(performance_df)

# Residuals vs Predicted Values
plot_residuals_vs_predicted(performance_df)

# Error vs True Values
plot_error_vs_true(performance_df)
