In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
import regex as re
from textstat.textstat import textstat


In [2]:
sns.set_theme(style="whitegrid")

# Main Dataset

# Conversation Data -- we will use this data in the "Conversation Data" section

sentiment_df = pd.read_json(
    "sentiment_scores.json",
    lines=True
)


# Topic Modeling and Hardness Score Data -- we will use this data in the "Topic Modeling and Hardness Score Data" section
topic_and_hardness = pd.read_json(
    "chatbot-arena-gpt3-scores.jsonl.gz",
    lines=True,
    compression="gzip"
)

In [3]:
# Embedding Data -- we will use this data in the "Embedding Data" section
prompt_embeddings = np.load(
    "chatbot-arena-prompts-embeddings.npy"
)

response_a_embeddings = np.load(
    "chatbot-arena-model_a_response-embeddings.npy"
)

response_b_embeddings = np.load(
    "chatbot-arena-model_b_response-embeddings.npy"
)

In [4]:
# convert names of models a and b to one hot encodings
categories = [
    'RWKV-4-Raven-14B', 'alpaca-13b', 'chatglm-6b', 'claude-instant-v1', 'claude-v1',
    'dolly-v2-12b', 'fastchat-t5-3b', 'gpt-3.5-turbo', 'gpt-4', 'gpt4all-13b-snoozy',
    'guanaco-33b', 'koala-13b', 'llama-13b', 'mpt-7b-chat', 'oasst-pythia-12b',
    'palm-2', 'stablelm-tuned-alpha-7b', 'vicuna-13b', 'vicuna-7b', 'wizardlm-13b', 'tie', 'tie(bothbad)'
]

one_hot_dict = {}

for idx, category in enumerate(categories):
    one_hot_vector = [0] * len(categories)
    one_hot_vector[idx] = 1
    one_hot_dict[category] = ''.join(map(str, one_hot_vector))

print(one_hot_dict)


{'RWKV-4-Raven-14B': '1000000000000000000000', 'alpaca-13b': '0100000000000000000000', 'chatglm-6b': '0010000000000000000000', 'claude-instant-v1': '0001000000000000000000', 'claude-v1': '0000100000000000000000', 'dolly-v2-12b': '0000010000000000000000', 'fastchat-t5-3b': '0000001000000000000000', 'gpt-3.5-turbo': '0000000100000000000000', 'gpt-4': '0000000010000000000000', 'gpt4all-13b-snoozy': '0000000001000000000000', 'guanaco-33b': '0000000000100000000000', 'koala-13b': '0000000000010000000000', 'llama-13b': '0000000000001000000000', 'mpt-7b-chat': '0000000000000100000000', 'oasst-pythia-12b': '0000000000000010000000', 'palm-2': '0000000000000001000000', 'stablelm-tuned-alpha-7b': '0000000000000000100000', 'vicuna-13b': '0000000000000000010000', 'vicuna-7b': '0000000000000000001000', 'wizardlm-13b': '0000000000000000000100', 'tie': '0000000000000000000010', 'tie(bothbad)': '0000000000000000000001'}


In [5]:
#add a new column that displays the name of the winning chatbot model (or tie)
sentiment_df['winner_model'] = np.where(
    sentiment_df['winner'] == 'model_b', sentiment_df['model_b'],
    np.where(
        sentiment_df['winner'] == 'model_a', sentiment_df['model_a'],
        np.where(
            sentiment_df['winner'] == 'tie', 'tie', 'tie(bothbad)'
        )
    )
)

In [6]:
#replace model names in model_a and model_b with onehot encodings
sentiment_df['winner_model'] = sentiment_df['winner_model'].map(one_hot_dict)

In [7]:
# Find cosine similarity of prompt embeddings
# Takes about a minute to run
response_a_prompt_similarity = np.array([
    cosine_similarity(response_a_embeddings[i].reshape(1, -1), prompt_embeddings[i].reshape(1, -1))[0, 0]
    for i in range(len(sentiment_df))
])
response_b_prompt_similarity = np.array([
    cosine_similarity(response_b_embeddings[i].reshape(1, -1), prompt_embeddings[i].reshape(1, -1))[0, 0]
    for i in range(len(sentiment_df))
])
response_ab_similarity = np.array([
    cosine_similarity(response_a_embeddings[i].reshape(1,-1), response_b_embeddings[i].reshape(1, -1))[0,0]
    for i in range(len(sentiment_df))
])

sentiment_df['a_prompt_text_similarity'] = response_a_prompt_similarity
sentiment_df['b_prompt_text_similarity'] = response_b_prompt_similarity
sentiment_df['ab_text_similarity'] = response_ab_similarity

In [8]:
#function to extract numeric score from sentiment columns
def process_sentiment(sentiment):
    sentiment_dict = sentiment[0]
    score = sentiment_dict['score']
    if sentiment_dict['label'] == 'NEGATIVE':
        return -score
    return score

In [9]:
sentiment_df['prompt_sentiment_score'] = sentiment_df['prompt_sentiment'].apply(process_sentiment)
sentiment_df['response_a_sentiment_score'] = sentiment_df['response_a_sentiment'].apply(process_sentiment)
sentiment_df['response_b_sentiment_score'] = sentiment_df['response_b_sentiment'].apply(process_sentiment)
sentiment_df.drop(columns=['prompt_sentiment'], inplace=True)
sentiment_df.drop(columns=['response_a_sentiment'], inplace=True)
sentiment_df.drop(columns=['response_b_sentiment'], inplace=True)

In [10]:
sentiment_df[['prompt_sentiment_score', 'response_a_sentiment_score', 'response_b_sentiment_score']].isna().sum()

Unnamed: 0,0
prompt_sentiment_score,0
response_a_sentiment_score,0
response_b_sentiment_score,0


In [11]:
#find the difference in sentiment scores between model a and b and between prompt and responses
sentiment_df["prompt_a_sentiment_diff"] = sentiment_df["prompt_sentiment_score"] - sentiment_df["response_a_sentiment_score"]
sentiment_df["prompt_b_sentiment_diff"] = sentiment_df["prompt_sentiment_score"] - sentiment_df["response_b_sentiment_score"]
sentiment_df["ab_sentiment_diff"] = sentiment_df["response_a_sentiment_score"] - sentiment_df["response_b_sentiment_score"]

In [12]:
search_words = ["can't", "won't", "will not", "cannot", "sorry"]
def contains_negation(response, target_words):
    response = response.lower()  # Convert the response to lowercase
    for word in target_words:
        if word.lower() in response:
            return 1
    return 0
sentiment_df["response_a_contains_negation"] = sentiment_df["model_a_response"].apply(
    lambda x: contains_negation(x, search_words)
)
sentiment_df["response_b_contains_negation"] = sentiment_df["model_b_response"].apply(
    lambda x: contains_negation(x, search_words)
)


In [13]:
# Define the modal verbs
modal_verbs = ['can', 'could', 'may', 'might', 'shall', 'should', 'will', 'would', 'must', 'have to']

# Create a regex pattern (case-insensitive)
modal_regex = '|'.join([re.escape(verb) for verb in modal_verbs])

# Function to create the modal_a and modal_b columns
def check_modal_verbs(df, model_a_col, model_b_col, modal_verbs):
    # Initialize 'modal_a' and 'modal_b' columns with 0
    df['modal_a'] = 0
    df['modal_b'] = 0

    # Check if any of the modal verbs are present in model_a_response or model_b_response
    for verb in modal_verbs:
        # Create regex pattern for each verb
        pattern = rf'\b{re.escape(verb)}\b'

        # If model_a_response contains the verb, set 'modal_a' to 1
        df['modal_a'] = df['modal_a'] | df[model_a_col].str.contains(pattern, case=False, na=False).astype(int)

        # If model_b_response contains the verb, set 'modal_b' to 1
        df['modal_b'] = df['modal_b'] | df[model_b_col].str.contains(pattern, case=False, na=False).astype(int)

    return df


# Apply the function to both model_a_response and model_b_response
sentiment_df = check_modal_verbs(sentiment_df, 'model_a_response', 'model_b_response', modal_verbs)


In [14]:
#analyze response readability with textstat
sentiment_df['response_a_readability'] = sentiment_df['model_a_response'].apply(lambda x: textstat.flesch_reading_ease(str(x)))
sentiment_df['response_b_readability'] = sentiment_df['model_b_response'].apply(lambda x: textstat.flesch_reading_ease(str(x)))

In [15]:
sentiment_df['response_readability_diff'] = sentiment_df['response_a_readability'] - sentiment_df['response_b_readability']

In [16]:
#create columns for prompt and response lengths
sentiment_df["prompt_length"] = sentiment_df["prompt"].str.len()
sentiment_df["response_a_length"] = sentiment_df["model_a_response"].str.len()
sentiment_df["response_b_length"] = sentiment_df["model_b_response"].str.len()


In [17]:
#create features to compare prompt an response length as well as lengths of responses a and b
sentiment_df["prompt_minus_response_a_length"] = sentiment_df["prompt_length"] - sentiment_df["response_a_length"]
sentiment_df["prompt_minus_response_b_length"] = sentiment_df["prompt_length"] - sentiment_df["response_b_length"]
sentiment_df["response_a_minus_response_b_length"] = sentiment_df["response_a_length"] - sentiment_df["response_b_length"]

In [18]:
# Initialize Elo ratings for all models
elo_ratings = {model: 1000 for model in pd.concat([sentiment_df["model_a"], sentiment_df["model_b"]]).unique()}

def calculate_elo_ratings(df, k=32):
    """
    Calculate and update Elo ratings for model_a and model_b for each row.

    Args:
        df (pd.DataFrame): DataFrame containing 'model_a', 'model_b', and 'winner' columns.
        k (int): The K-factor to control the adjustment magnitude.

    Returns:
        pd.DataFrame: Updated DataFrame with Elo ratings for model_a and model_b at each row.
    """
    # Create columns to store Elo ratings
    df['model_a_elo'] = 0.0
    df['model_b_elo'] = 0.0

    # Loop through rows
    for idx, row in df.iterrows():
        model_a = row['model_a']
        model_b = row['model_b']
        winner = row['winner']

        # Current ratings
        rating_a = elo_ratings[model_a]
        rating_b = elo_ratings[model_b]

        # Calculate expected scores
        expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
        expected_b = 1 - expected_a

        # Update ratings based on the winner
        if winner == "model_a":
            elo_ratings[model_a] += k * (1 - expected_a)
            elo_ratings[model_b] += k * (0 - expected_b)
        elif winner == "model_b":
            elo_ratings[model_a] += k * (0 - expected_a)
            elo_ratings[model_b] += k * (1 - expected_b)

        # update df
        df.at[idx, 'model_a_elo'] = elo_ratings[model_a]
        df.at[idx, 'model_b_elo'] = elo_ratings[model_b]

    return df

# Find difference in ELO rating for each row
sentiment_df = calculate_elo_ratings(sentiment_df)

sentiment_df['model_a_elo_change'] = sentiment_df['model_a_elo'].diff()
sentiment_df['model_b_elo_change'] = sentiment_df['model_b_elo'].diff()
sentiment_df.at[0, 'model_a_elo_change'] = -16
sentiment_df.at[0, 'model_b_elo_change'] = 16


In [19]:
#convert hardness scores to int so that we can find the mean
topic_and_hardness['score_value_1'] = pd.to_numeric(topic_and_hardness['score_value_1'], errors='coerce')
topic_and_hardness['score_value_2'] = pd.to_numeric(topic_and_hardness['score_value_2'], errors='coerce')
topic_and_hardness['score_value_3'] = pd.to_numeric(topic_and_hardness['score_value_3'], errors='coerce')

In [20]:
topic_and_hardness['hardness_score'] = topic_and_hardness[['score_value_1', 'score_value_2', 'score_value_3']].mean(axis=1)
merged_df = sentiment_df.merge(topic_and_hardness[['question_id', 'hardness_score', 'topic_modeling_3']], on='question_id', how='left')

In [21]:
filtered_df = merged_df.dropna(subset=['hardness_score'])

In [22]:
# The list of topics that are repeated the most does give us some information.
# Looking at it, we can manually discern four areas; math, fact (factual analysis), creativity/creative writing, and problem-solving/problems
# This code analyzes those four categories.
# Define core words of interest
core_words = {
    'math': r'math',                       # Matches "math"
    'fact': r'fact\w*',                    # Matches "fact", "facts", "factual", etc.
    'creativity': r'creativ\w*',           # Matches "creative", "creativity", etc.
    'problem_solving': r'problem[ -]?solving',  # Matches "problem-solving" and "problem solving"
    'comparison': r'comparison'            # Matches "comparison"
}
def assign_topic_columns(text):
    if not isinstance(text, str):
        return {key: 0 for key in core_words}
    result = {key: 0 for key in core_words}
    for category, pattern in core_words.items():
        if re.search(pattern, text, flags=re.IGNORECASE):
            result[category] = 1
    return result

topic_columns = filtered_df['topic_modeling_3'].apply(assign_topic_columns)

# Convert the result into a DataFrame and concatenate with the original DataFrame
topic_df = pd.DataFrame(topic_columns.tolist(), index=filtered_df.index)

# Concatenate the topic columns with the original DataFrame
filtered_df = pd.concat([filtered_df, topic_df], axis=1)

In [38]:
#search for non-english characters in responses
def contains_non_english(text):
    return 1 if re.search(r'[^\x00-\x7F]', text) else 0

filtered_df['response_a_foreign'] = filtered_df['model_a_response'].apply(contains_non_english)
filtered_df['response_b_foreign'] = filtered_df['model_b_response'].apply(contains_non_english)

In [56]:
filtered_df.head(10)

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,prompt,model_a_response,model_b_response,...,fact,creativity,problem_solving,comparison,winner_tie (bothbad),winner_tie,winner_model_a,winner_model_b,response_a_foreign,response_b_foreign
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,model_b,arena_user_973,[{'content': 'What is the difference between O...,[{'content': 'What is the difference between O...,What is the difference between OpenCL and CUDA?,OpenCL and CUDA are two different programming ...,OpenCL and CUDA are both programming languages...,...,0,0,0,1,0,0,0,0,1,0
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,tie,arena_user_973,[{'content': 'Why did my parent not invite me ...,[{'content': 'Why did my parent not invite me ...,Why did my parent not invite me to their wedding?,It is possible that your parent did not invite...,It is likely that they wanted to keep the gues...,...,0,0,0,0,0,0,0,0,0,1
2,90bfd142157948aba01931726c888e7f,koala-13b,oasst-pythia-12b,model_b,arena_user_973,"[{'content': 'Fuji vs. Nikon, which is better?...","[{'content': 'Fuji vs. Nikon, which is better?...","Fuji vs. Nikon, which is better?",Both Fuji and Nikon are popular camera brands ...,This is a subjective question and the answer d...,...,0,0,0,1,0,0,0,0,0,0
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,model_b,arena_user_973,[{'content': 'How to build an arena for chatbo...,[{'content': 'How to build an arena for chatbo...,How to build an arena for chatbots?,Building an arena for chatbots can be done by ...,Building an arena for chatbots is a great way ...,...,0,0,0,0,0,0,0,0,0,0
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,model_a,arena_user_973,"[{'content': 'When is it today?', 'role': 'use...","[{'content': 'When is it today?', 'role': 'use...",When is it today?,"I'm sorry, I cannot determine the current date...","Today is February 23, 2023.",...,0,0,0,0,0,0,0,0,0,0
5,c0fc42c6f5f14f2aa5a89f71f8553730,vicuna-13b,koala-13b,model_a,arena_user_973,[{'content': 'Count from 1 to 10 with step = 3...,[{'content': 'Count from 1 to 10 with step = 3...,Count from 1 to 10 with step = 3,"1, 4, 7, 10\n\nCounting with a step of 3 means...","1, 4, 7, 10",...,0,0,0,0,0,0,0,0,0,0
6,c4938f25c1d94fc1b110ace95a2243d0,vicuna-13b,koala-13b,model_a,arena_user_973,"[{'content': 'Emoji for ""sharing"". List 10', '...","[{'content': 'Emoji for ""sharing"". List 10', '...","Emoji for ""sharing"". List 10",1. 💪 Sharing progress or achievements\n2. 📝 Sh...,1. Sharing hands\n* Share icon\n* Group ...,...,0,0,0,0,0,0,0,0,1,0
7,65e923b1f9c2433aae082d32e6e05f16,vicuna-13b,dolly-v2-12b,model_a,arena_user_973,[{'content': 'How to parallelize a neural netw...,[{'content': 'How to parallelize a neural netw...,How to parallelize a neural network?,Parallelizing a neural network can be achieved...,To parallelize the forward pass of a neural ne...,...,0,0,0,0,0,0,0,0,0,0
8,cbbb83487f534ec5b4cc92b93b79fa2c,stablelm-tuned-alpha-7b,oasst-pythia-12b,model_a,arena_user_973,"[{'content': 'A = 5, B =10, A+B=?', 'role': 'u...","[{'content': 'A = 5, B =10, A+B=?', 'role': 'u...","A = 5, B =10, A+B=?","To find the sum of A and B, we need to add the...",To find the sum of two numbers we can use the ...,...,0,0,0,0,0,0,0,0,0,0
9,ce4b1e7676444384994dbda7b228018c,koala-13b,vicuna-13b,tie,arena_user_973,"[{'content': 'A = 5, B =10, A+B=?', 'role': 'u...","[{'content': 'A = 5, B =10, A+B=?', 'role': 'u...","A = 5, B =10, A+B=?",A + B = 5 + 10 = 15,A + B = 5 + 10 = 15.,...,0,0,0,0,0,0,0,0,0,0


In [72]:
filtered_df['conversation_b'][19]

[{'content': 'What can we do in AI research to address climate change?',
  'role': 'user'},
 {'content': 'AI has the potential to dramatically change the energy landscape. By combining data about energy consumption and generation with powerful AI models, we can produce more energy more efficiently. We can also reduce our reliance on fossil fuels by training AI models to identify patterns in noise and make new predictions about which areas have untapped energy resources.\n\nAI can also help us prepare for the effects of climate change. For example, through advanced natural language processing, we can better understand and respond to the millions of environmental messages being generated by the government and NGOs. We can also use AI to create more resilient infrastructure. By identifying vulnerable areas and critical services, we can develop AI-generated alternatives that can take over where humans cannot or should not go (e.g., in the case of infrastructure systems).\n\nWe can also use

In [87]:
#check for spelling or grammatical errors in responses
from spellchecker import SpellChecker

spell = SpellChecker()

def contains_misspelling(text):
    words = text.split()
    misspelled = spell.unknown(words)
    return len(misspelled) > 0


filtered_df['response_a_spelling_errors'] = filtered_df['model_a_response'].apply(contains_misspelling)
filtered_df['response_b_spelling_errors'] = filtered_df['model_b_response'].apply(contains_misspelling)

In [75]:
#check for subjectivity of response
def calculate_subjectivity(text):
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

filtered_df['response_a_subjectivity'] = filtered_df['model_a_response'].apply(calculate_subjectivity)
filtered_df['response_b_subjectivity'] = filtered_df['model_b_response'].apply(calculate_subjectivity)

In [88]:
X = filtered_df[['a_prompt_text_similarity', 'b_prompt_text_similarity', 'ab_text_similarity',
                 'prompt_a_sentiment_diff', 'prompt_b_sentiment_diff', 'ab_sentiment_diff',
                 'prompt_minus_response_a_length', 'prompt_minus_response_b_length',
                 'response_a_minus_response_b_length', 'response_a_contains_negation',
                 'response_b_contains_negation', 'model_a_elo_change', 'model_b_elo_change',
                 'hardness_score', 'math', 'fact', 'creativity', 'problem_solving', 'comparison',
                 'response_readability_diff', 'modal_a', 'modal_b', 'winner_model', 'response_a_foreign',
                 'response_b_foreign', 'response_a_subjectivity', 'response_b_subjectivity',
                 'response_a_spelling_errors', 'response_b_spelling_errors']]
y = filtered_df['winner']

In [89]:
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (25243, 29)
Shape of y: (25243,)


In [90]:
nan_columns = X.isna().any()
nan_columns

Unnamed: 0,0
a_prompt_text_similarity,False
b_prompt_text_similarity,False
ab_text_similarity,False
prompt_a_sentiment_diff,False
prompt_b_sentiment_diff,False
ab_sentiment_diff,False
prompt_minus_response_a_length,False
prompt_minus_response_b_length,False
response_a_minus_response_b_length,False
response_a_contains_negation,False


In [91]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True)
interaction_features = poly.fit_transform(X)
interaction_columns = poly.get_feature_names_out(X.columns)
interaction_df = pd.DataFrame(interaction_features, columns=interaction_columns)
X = X.reset_index(drop=True)
interaction_df = interaction_df.reset_index(drop=True)
X_combined = pd.concat([X, interaction_df], axis=1)

In [92]:
X.columns = X.columns.astype(str)

In [93]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_combined)
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y_encoded, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the model
model = LogisticRegression(max_iter=1000)

# Define the hyperparameters grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['liblinear']
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


KeyboardInterrupt: 

In [45]:
model = LogisticRegression(C=10, penalty='l2', solver='liblinear', max_iter=1000)
model.fit(X_train, y_train)

In [46]:
y_pred = model.predict(X_test)

In [47]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

y_prob = model.predict_proba(X_test)
auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')
print("Accuracy:", accuracy)
print(f'Multiclass ROC AUC: {auc_score}')

Precision: 0.5115449493175411
Recall: 0.5321845910081204
F1-score: 0.5002538950789225
Accuracy: 0.5321845910081204
Multiclass ROC AUC: 0.7497786615415349


In [95]:
clf = RandomForestClassifier(n_estimators=500, max_depth= None, min_samples_split=2,
                            min_samples_leaf=1, max_features= 'sqrt', n_jobs=-1,
                            class_weight= None, bootstrap=True)
clf.fit(X_train, y_train)
y_pred_clf = clf.predict(X_test)

In [96]:
accuracy = accuracy_score(y_test, y_pred_clf)
precision = precision_score(y_test, y_pred_clf, average='weighted')
recall = recall_score(y_test, y_pred_clf, average='weighted')
f1 = f1_score(y_test, y_pred_clf, average='weighted')

y_prob_clf = clf.predict_proba(X_test)
auc_score = roc_auc_score(y_test, y_prob_clf, multi_class='ovr', average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')
print("Accuracy:", accuracy)
print(f'Multiclass ROC AUC: {auc_score}')

Precision: 0.6075264339191757
Recall: 0.6173499702911468
F1-score: 0.6033521529321526
Accuracy: 0.6173499702911468
Multiclass ROC AUC: 0.8355927348271479


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'class_weight': ['balanced', None]
}

# GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters:", grid_search.best_params_)

KeyboardInterrupt: 

In [97]:
from sklearn.tree import DecisionTreeClassifier

# Fit Decision Tree model
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

In [98]:
accuracy = accuracy_score(y_test, y_pred_tree)
precision = precision_score(y_test, y_pred_tree, average='weighted')
recall = recall_score(y_test, y_pred_tree, average='weighted')
f1 = f1_score(y_test, y_pred_tree, average='weighted')

y_prob_tree = tree.predict_proba(X_test)
auc_score = roc_auc_score(y_test, y_prob_tree, multi_class='ovr', average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')
print("Accuracy:", accuracy)
print(f'Multiclass ROC AUC: {auc_score}')

Precision: 0.5143317873737782
Recall: 0.5109922756981581
F1-score: 0.5125680650086594
Accuracy: 0.5109922756981581
Multiclass ROC AUC: 0.6579515983176766


In [None]:
class_distribution = sentiment_df['winner'].value_counts()
print(class_distribution)

winner
model_a          9002
model_b          8862
tie (bothbad)    4632
tie              2786
Name: count, dtype: int64


In [99]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, LSTM
from keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Reshape the data for RNN input
X_rnn = X_normalized.reshape(X_normalized.shape[0], 1, X_normalized.shape[1])  # (samples, timesteps=1, features)

# Reshape X_test for RNN input (ensure it matches the shape of X_rnn)
X_test_rnn = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

# Number of folds for cross-validation
num_folds = 5

# Initialize KFold cross-validation
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Lists to hold performance metrics for each fold
train_accuracies = []
test_accuracies = []

# Perform k-fold cross-validation
for train_index, val_index in kf.split(X_rnn):
    # Split data into training and validation sets
    X_train, X_val = X_rnn[train_index], X_rnn[val_index]
    y_train, y_val = y_encoded[train_index], y_encoded[val_index]

    # Define the RNN model
    model = Sequential()
    model.add(SimpleRNN(128, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(len(np.unique(y_encoded)), activation='softmax'))

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

    # Evaluate the model on training data
    train_accuracy = model.evaluate(X_train, y_train, verbose=0)[1]
    train_accuracies.append(train_accuracy)

    # Evaluate the model on test data (using reshaped X_test_rnn)
    test_accuracy = model.evaluate(X_test_rnn, y_test, verbose=0)[1]
    test_accuracies.append(test_accuracy)

# Compute average accuracy for both training and test sets
average_train_accuracy = np.mean(train_accuracies)
average_test_accuracy = np.mean(test_accuracies)

# Print out results
print(f'Average Training Accuracy: {average_train_accuracy}')
print(f'Average Test Accuracy: {average_test_accuracy}')

# Check for overfitting
if average_train_accuracy - average_test_accuracy > 0.1:
    print("Possible overfitting detected. Training accuracy is much higher than test accuracy.")
else:
    print("Model seems to be generalizing well.")



  super().__init__(**kwargs)


Average Training Accuracy: 0.6436933040618896
Average Test Accuracy: 0.6194890022277832
Model seems to be generalizing well.
