In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Load the training split and the validation split; the validation split is
# used as the held-out test set for the rest of the notebook.
train_df = pd.read_csv('/kaggle/input/gametoxicitydetection/train.csv')
test_df = pd.read_csv('/kaggle/input/gametoxicitydetection/valid.csv')  # Treat validation set as test


def _add_text_clean(frame):
    """Add a ``text_clean`` column to ``frame`` in place and return ``frame``.

    Steps, applied identically to every split:
      1. '#ERROR!' placeholders in ``tokenized`` are turned into missing values.
      2. ``text_clean`` takes ``tokenized`` where present, otherwise ``utterance``.
      3. Rows where both columns are missing become the empty string, so that
         no missing value reaches the vectorizer (it only accepts strings).

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``tokenized`` and ``utterance``.

    Returns
    -------
    pandas.DataFrame
        The same object, with ``tokenized`` cleaned and ``text_clean`` added.
    """
    frame['tokenized'] = frame['tokenized'].replace('#ERROR!', pd.NA)
    frame['text_clean'] = (
        frame['tokenized']
        .fillna(frame['utterance'])  # fall back to the raw utterance
        .fillna('')                  # both missing -> empty document
    )
    return frame


# One shared helper keeps the train and test preprocessing from drifting apart.
train_df = _add_text_clean(train_df)
test_df = _add_text_clean(test_df)

# Prepare features and labels
X_train = train_df['text_clean']
X_test = test_df['text_clean']
y_train = train_df['intentClass']
y_test = test_df['intentClass']

In [3]:
# Initialize TF-IDF with a custom token pattern that keeps ordinary words of
# two or more characters plus the special [SEPA] separator token.
#
# TfidfVectorizer lowercases every document before token_pattern is applied
# (lowercase=True is the default), so a case-sensitive `\[SEPA\]` alternative
# can never match: the text it sees is already "[sepa]", and the separator
# would only survive as the plain word "sepa". The `i` inline flag makes the
# bracketed alternative match regardless of case, so the separator is kept as
# its own "[sepa]" feature.
vectorizer = TfidfVectorizer(
    token_pattern=r'(?iu)\b\w\w+\b|\[sepa\]',  # Keep words and [SEPA] tokens
    min_df=2,  # Ignore terms that appear in fewer than 2 documents
    max_features=10000
)

# Learn the vocabulary and IDF weights from the training text only, then
# project the training text into that space.
X_train_tfidf = vectorizer.fit_transform(X_train)

# Project the test text with the already-fitted vocabulary (no refitting).
X_test_tfidf = vectorizer.transform(X_test)

In [4]:
# Map the string intent classes to integer ids. The encoder learns its class
# list from the training labels only; both label sets are then encoded with
# that same mapping.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

In [5]:
# Random Forest classifier: 100 trees, each capped at depth 50, with a fixed
# seed so repeated runs build the same forest.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=50,
    class_weight='balanced',  # Handle class imbalance
)

# Train on the TF-IDF features against the integer-encoded labels.
rf_model.fit(X_train_tfidf, y_train_encoded)

In [6]:
# Score the held-out split; the model was trained on integer-encoded labels,
# so its predictions come back in that encoding.
y_pred = rf_model.predict(X_test_tfidf)

# Translate the encoded predictions back to the original string labels so they
# can be compared with y_test directly.
y_pred_labels = le.inverse_transform(y_pred)

# Per-class precision / recall / F1 report; zero_division=0 is passed through
# to classification_report unchanged.
print(classification_report(y_true=y_test, y_pred=y_pred_labels, zero_division=0))

              precision    recall  f1-score   support

           A       0.73      0.72      0.72       580
           E       0.82      0.74      0.78      1181
           I       0.86      0.64      0.74       580
           O       0.92      0.95      0.93      6365

    accuracy                           0.89      8706
   macro avg       0.83      0.77      0.79      8706
weighted avg       0.89      0.89      0.89      8706

