In [18]:
import pandas as pd
from sklearn.calibration import LabelEncoder
import numpy as np
# 1. Load the data
df = pd.read_csv('./megablunders.csv')

In [19]:
# look at class imbalances

import matplotlib.pyplot as plt

# get the count for each label
print(df["error"].value_counts())

df_filtered = df[df["error"] != "AGREEERROR"]

# Verify the class has been removed
print("\nClass distribution after dropping:")
print(df_filtered["error"].value_counts())

# Update your DataFrame
df = df_filtered


print(df["error"].value_counts())

error
PR            22
PAR           22
ROS           22
FRAG          20
MM            20
DM            19
CASE          19
NONE          19
AGREE         17
AGREEERROR     1
Name: count, dtype: int64

Class distribution after dropping:
error
PR       22
PAR      22
ROS      22
FRAG     20
MM       20
DM       19
CASE     19
NONE     19
AGREE    17
Name: count, dtype: int64
error
PR       22
PAR      22
ROS      22
FRAG     20
MM       20
DM       19
CASE     19
NONE     19
AGREE    17
Name: count, dtype: int64


In [20]:
# use lable encoder to encode the error
label_encoder = LabelEncoder()
df['error'] = label_encoder.fit_transform(df['error'])

In [21]:
#  vectorize the text 
from sentence_transformers import SentenceTransformer

st = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

df = df.reset_index(drop=True)

In [22]:
# Convert to list of strings before encoding
sentences = df["original_sentence"].tolist()
embeddings = st.encode(sentences)

In [23]:
df["error"].nunique()

9

In [24]:
df.head()

Unnamed: 0,original_sentence,error
0,By dropping a game to the pathetic Tampa Bay D...,2
1,"Although, if history is any indication, the te...",3
2,"The Sox almost never go down uneventfully, whi...",7
3,Because of the accumulated bad karma that hang...,0
4,The team not only has squandered huge leads bu...,6


In [25]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, df['error'], test_size=0.1, random_state=42, stratify=df["error"])

In [26]:
# Download the necessary NLTK data first
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

# Check if the data was downloaded successfully
try:
    from nltk.corpus import wordnet
    print("WordNet loaded successfully")
except:
    print("WordNet loading failed")

WordNet loaded successfully


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/felixstuart/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/felixstuart/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/felixstuart/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [27]:
# use nlaug to swap synonms 


In [36]:
# try xgboost
from sklearn.utils import compute_class_weight
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Convert class weights to sample weights
# Create a dictionary mapping class labels to weights

unique_classes = np.unique(y_train)

weights = compute_class_weight(
    class_weight="balanced",
    classes=unique_classes,
    y=y_train
)

class_weight_dict = dict(zip(unique_classes, weights))

# Map each sample to its appropriate weight
sample_weights = np.array([class_weight_dict[cls] for cls in y_train])

# Fit the XGBoost model with sample weights
xboost = xgb.XGBClassifier(n_estimators=5000,
                           learning_rate=0.1,
                           random_state=42,
                        #    reg_alpha=0.7,
                        #    reg_lambda=1,
                        #    max_depth=3,
                           scale_pos_weight=sample_weights,
                           )


xboost.fit(X_train, y_train, sample_weight=sample_weights)

# Make predictions
y_pred = xboost.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {accuracy*100:.2f}%")

Parameters: { "scale_pos_weight" } are not used.



XGBoost Accuracy: 33.33%
