In [50]:
import os
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

In [51]:
def is_kaggle_platform():
    """Report whether the filesystem root contains an entry named ``kaggle``.

    Returns:
        bool: True when ``/`` has a top-level entry called exactly "kaggle",
        False otherwise.
    """
    # Exact-name match against the root directory listing.
    return any(entry == "kaggle" for entry in os.listdir("/"))

In [52]:
def get_xgboost_model():
    """Build a fresh, unfitted XGBoost binary classifier.

    Returns:
        xgb.XGBClassifier: classifier configured with the logistic objective
        and AUC as its evaluation metric.
    """
    hyperparams = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
    }
    return xgb.XGBClassifier(**hyperparams)

In [53]:
def _resolve_data_root(data_root=None):
    """Return the directory holding the competition CSV files.

    An explicit ``data_root`` always wins. Otherwise the Kaggle input
    directory is used when running on Kaggle, and the original local
    default path everywhere else.
    """
    if data_root is not None:
        return data_root
    if is_kaggle_platform():
        return "/kaggle/input/llm-detect-ai-generated-text"
    return "/Users/kaiqu/kaggle-datasets/llm-detect-ai-generated-text"


def _load_essay_columns(filename, columns, data_root=None):
    """Read ``filename`` from the data root and return the requested columns as lists."""
    data_df = pd.read_csv(f"{_resolve_data_root(data_root)}/{filename}")
    return tuple(data_df[column].tolist() for column in columns)


def get_train_data(data_root=None):
    """Load the labelled training essays.

    Args:
        data_root: optional directory containing ``train_essays.csv``.
            When omitted, the platform-dependent default location is used.

    Returns:
        tuple: ``(ids, essays, labels)`` — three lists taken from the
        ``id``, ``text`` and ``generated`` columns respectively.

    Raises:
        FileNotFoundError: if the CSV does not exist under the data root.
        KeyError: if one of the expected columns is missing.
    """
    return _load_essay_columns(
        "train_essays.csv", ("id", "text", "generated"), data_root
    )


def get_test_data(data_root=None):
    """Load the unlabelled test essays.

    Args:
        data_root: optional directory containing ``test_essays.csv``.
            When omitted, the platform-dependent default location is used.

    Returns:
        tuple: ``(ids, essays)`` — two lists taken from the ``id`` and
        ``text`` columns respectively.

    Raises:
        FileNotFoundError: if the CSV does not exist under the data root.
        KeyError: if one of the expected columns is missing.
    """
    return _load_essay_columns("test_essays.csv", ("id", "text"), data_root)

In [54]:
def tokenize_texts(essays):
    """Turn essays into fixed-width BERT token-id rows for XGBoost.

    Args:
        essays: list of essay strings to encode.

    Returns:
        NumPy array of token ids with one row per essay; every row is
        padded or truncated to the tokenizer's maximum length.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    # Ask the tokenizer for NumPy output directly: XGBoost consumes arrays,
    # so building PyTorch tensors only to call .numpy() on them is wasted work
    # and needlessly requires torch to be installed.
    encoded_essays = tokenizer(
        essays, padding="max_length", truncation=True, return_tensors="np"
    )
    return encoded_essays["input_ids"]

In [55]:
def train(X, y):
    """Fit a newly constructed XGBoost classifier on the given data.

    Args:
        X: feature matrix passed straight to ``fit``.
        y: target labels passed straight to ``fit``.

    Returns:
        The classifier obtained from ``get_xgboost_model`` after fitting.
    """
    classifier = get_xgboost_model()
    classifier.fit(X, y)
    return classifier

In [56]:
if __name__ == "__main__":
    # True  -> hold out 20% of the training essays and report AUC.
    # False -> fit on every training essay and write submission.csv.
    train_mode = False

    # Both modes start from the same tokenized training essays.
    _, essays, labels = get_train_data()
    X = tokenize_texts(essays)

    if not train_mode:
        # Submission path: fit on the full training set.
        model = train(X, labels)

        # Score the competition test essays.
        test_ids, test_essays = get_test_data()
        X_test = tokenize_texts(test_essays)
        test_y_pred = model.predict_proba(X_test)[:, 1]

        # Persist the predictions as id / generated-probability pairs.
        submission_df = pd.DataFrame({"id": test_ids, "generated": test_y_pred})
        submission_df.to_csv("submission.csv", index=False)
    else:
        # Evaluation path: fixed-seed hold-out split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, labels, test_size=0.2, random_state=42
        )

        model = train(X_train, y_train)

        # Column 1 of predict_proba is the positive-class probability.
        y_pred = model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_pred)
        print(f"AUC Score: {auc_score}")
    

X (1378, 512)
X_test (3, 512)
