In [None]:
import sys, gc, os

import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [None]:
# Load the competition files and the extra training set from wherever this
# notebook is running: Kaggle, Google Colab, or a local checkout.
# Each branch only decides *where* the files live; the reads happen once below,
# so every environment ends up defining the same variables
# (`test`, `submission`, `org_train`, `extra`).
if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
    print("Running on Kaggle!")
    kernel = 'kaggle'
    data_path = '/kaggle/input/llm-detect-ai-generated-text/'
    # On Kaggle the extra dataset is mounted from a separate input directory.
    extra_path = "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
elif "google.colab" in sys.modules:
    print("Running on Google Colab!")
    kernel = 'google_colab'
    from google.colab import drive
    drive.mount('/content/drive')
    data_path = '/content/drive/MyDrive/Kaggle/LLM_Detect_AI_Generated_Text/data/'
    extra_path = data_path + "train_v2_drcat_02.csv"
else:
    print("Running locally.")
    kernel = 'local'
    data_path = "./data/"
    extra_path = data_path + "train_v2_drcat_02.csv"

test = pd.read_csv(data_path + "test_essays.csv")
submission = pd.read_csv(data_path + "sample_submission.csv")
org_train = pd.read_csv(data_path + "train_essays.csv")
# Bound to `extra` in every environment: the following cells read `extra`,
# so binding it to any other name makes them fail with a NameError.
extra = pd.read_csv(extra_path, sep=",")

In [None]:
# Keep only the first occurrence of each essay text and renumber the rows 0..n-1.
extra = extra.drop_duplicates(subset=['text']).reset_index(drop=True)


In [None]:
# Shuffle the rows, then split into train / validation / test sets
# of 80% / 10% / 10% respectively, stratified on the label.
extra = extra.sample(frac=1.0, random_state=42).reset_index(drop=True)
X = extra['text']
y = extra['label']

# Step 1: hold out 20% of the data.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Step 2: cut the held-out 20% in half -> 10% validation, 10% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

for split_name, split_texts in (
    ("Train", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    print(f"{split_name} set size: {len(split_texts)}")

In [None]:
# Tokenizer configuration.
LOWERCASE = False   # when True, a lowercasing normalizer is added to the tokenizer
VOCAB_SIZE = 30522  # vocabulary size passed to the BPE trainer

In [None]:
# Build a byte-level Byte-Pair Encoding tokenizer, train it, and tokenize the
# train and test texts.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# NFC normalization is always applied; lowercasing is appended only when
# LOWERCASE is set. The parentheses around the conditional are required:
# without them the expression parses as `(NFC + Lowercase) if LOWERCASE else []`,
# which silently drops NFC whenever LOWERCASE is False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# NOTE(review): the tokenizer is trained on the test-split texts only (labels
# are not involved). This looks deliberate -- the TF-IDF cell also takes its
# vocabulary from the test set -- but confirm it is intended.
dataset = Dataset.from_pandas(pd.DataFrame(X_test))


def train_corp_iter():
    """Yield the tokenizer training corpus as successive batches of up to 1000 texts."""
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]


raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer so it exposes the transformers tokenizer API.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# One list of sub-word tokens per essay; these lists are the TF-IDF input.
tokenized_texts_test = [tokenizer.tokenize(text) for text in tqdm(X_test.tolist())]
tokenized_texts_train = [tokenizer.tokenize(text) for text in tqdm(X_train.tolist())]

In [None]:
def dummy(text):
    """Identity function: hand back the argument exactly as received.

    Used as the TF-IDF `tokenizer` and `preprocessor` so that input which is
    already a list of tokens passes through untouched.
    """
    return text


# TF-IDF over 3- to 5-grams of the pre-tokenized texts. `dummy` is plugged in
# as both tokenizer and preprocessor, so the token lists are used as-is.
# NOTE(review): with a custom preprocessor, `strip_accents` (and `lowercase`)
# probably have no effect -- confirm against the scikit-learn docs.
tfidf_params = {
    "ngram_range": (3, 5),
    "lowercase": False,
    "sublinear_tf": True,
    "analyzer": "word",
    "tokenizer": dummy,
    "preprocessor": dummy,
    "token_pattern": None,
    "strip_accents": "unicode",
}

# Pass 1: learn the n-gram vocabulary from the test set only.
vectorizer = TfidfVectorizer(**tfidf_params)
vectorizer.fit(tokenized_texts_test)
vocab = vectorizer.vocabulary_
# print(f"Test dataset TF-IDF vocabulary: {vocab}")

# Pass 2: fit TF-IDF weights on the train set, restricted to the vocabulary
# learned from the test set. `vocabulary=vocab` is what enforces the
# restriction; without it the vectorizer relearns a vocabulary from the train
# set and `vocab` is never used.
vectorizer = TfidfVectorizer(**tfidf_params, vocabulary=vocab)
tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# Release the vectorizer now that both feature matrices exist.
del vectorizer
gc.collect()

In [None]:
# Four base classifiers combined by weighted soft voting and fitted on the
# training TF-IDF matrix.
clf = MultinomialNB(alpha=0.02)
# Seeded for reproducibility, consistent with the random_state=42 used for the
# data split; an unseeded SGDClassifier gives different results on every run.
sgd_model = SGDClassifier(
    max_iter=8000,
    tol=1e-4,
    loss="modified_huber",
    random_state=42,
)
lgb_params = {
    'n_iter': 1500,
    'verbose': -1,
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05073909898961407,
    'colsample_bytree': 0.726023996436955,
    'colsample_bynode': 0.5803681307354022,
    'lambda_l1': 8.562963348932286,
    'lambda_l2': 4.893256185259296,
    'min_data_in_leaf': 115,
    'max_depth': 23,
    'max_bin': 898
}
lgb = LGBMClassifier(**lgb_params)
cat = CatBoostClassifier(
    iterations=1000,
    verbose=0,
    l2_leaf_reg=6.6591278779517808,
    learning_rate=0.005689066836106983,
    allow_const_label=True,
    loss_function='CrossEntropy'
)
# Voting weights, in the same order as `estimators` below: mnb, sgd, lgb, cat.
weights = [0.07, 0.31, 0.31, 0.31]

ensemble = VotingClassifier(
    estimators=[
        ('mnb', clf),
        ('sgd', sgd_model),
        ('lgb', lgb),
        ('cat', cat)
    ],
    weights=weights,
    voting='soft',
    n_jobs=-1
)
ensemble.fit(tf_train, y_train)
gc.collect()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

def evaluate_model_predictions(model, X, y, set_name):
    """Print AUC, a classification report and a confusion matrix for one data split.

    Args:
        model: fitted classifier exposing `predict` and `predict_proba`.
        X: feature matrix to score.
        y: true labels for `X`.
        set_name: label used in the printed header (e.g. "Training").

    Returns:
        None; everything is written to stdout.
    """
    predicted_labels = model.predict(X)
    # Column 1 of predict_proba is used as the score for the AUC.
    positive_scores = model.predict_proba(X)[:, 1]
    auc_score = roc_auc_score(y, positive_scores)

    print(
        f"{set_name} Set Evaluation:",
        f"AUC: {auc_score}",
        "\nClassification Report:",
        sep="\n",
    )
    print(classification_report(y, predicted_labels))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y, predicted_labels))
    print("\n\n")

# Report metrics on the training split first, then on the held-out test split.
for split_label, split_features, split_targets in (
    ("Training", tf_train, y_train),
    ("Testing", tf_test, y_test),
):
    evaluate_model_predictions(ensemble, split_features, split_targets, split_label)