In [None]:
!pip install /kaggle/input/pyspellchecker/pyspellchecker-0.8.4-py3-none-any.whl

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import math
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import re
from spellchecker import SpellChecker
import seaborn as sns
import warnings   # We will turn off the FutureWarnings that xgboost gives us
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

import warnings   
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load all datasets here.

# Competition files; the essay tables are indexed by their 'id' column.
train_essays = pd.read_csv(
    '/kaggle/input/llm-detect-ai-generated-text/train_essays.csv',
    index_col='id',
)
test_essays = pd.read_csv(
    '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv',
    index_col='id',
)
sample_sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')

# Training table used for feature engineering below (from the train-final dataset).
train = pd.read_csv(
    "/kaggle/input/train-final/train_WORK.csv",
    sep=',',
    index_col='id',
)

# Essay text and target label of the training table.
train_X, train_y = train['text'], train['generated']

In [None]:
# Lower-case the essay text in place (missing values pass through the .str
# accessor unchanged); the keyword lists defined below are all lower-case.
train["text"] = train["text"].str.lower()

# Define inputs for variables
#
# Each list holds lower-case terms that the counting helpers below look for.
# Every term appears exactly once: a term listed twice would be counted twice
# for every occurrence in an essay.

# Transition / discourse-marker words.
input_transition = ["first", "firstly", "second", "secondly", "third", "thirdly", "meanwhile", "previously", "subsequently", "eventually", 
                    "finally", "lastly", "ultimately", "conclusion", "addition", "additionally", "furthermore", "moreover", "besides", 
                    "equally", "however", "contrary", "conversely", "despite", "contrast", "nevertheless", "nonetheless", "whereas", "while",
                    "although", "though", "therefore", "thus", "hence", "consequently", "accordingly", "namely", "specifically", "indeed", 
                    "importantly", "significantly", "especially", "notably", "undoubtedly", "likewise", "similarly", "correspondingly", "sum",
                    "summary", "overall", "conclude", "simultaneously", "formerly", "lately", "recently", "opposite", "adjacent",
                    "provided", "admittedly", "regarding"]

# Emphatic / hyperbolic adjectives.
input_hyperbole = ["powerful", "groundbreaking", "illuminating", "vital", "invaluable", "indelible", "essential", "poignant", "profound", 
                   "remarkable", "transformative", "revolutionary", "unparalleled", "extraordinary", "compelling", "significant", "exceptional",
                   "crucial", "monumental", "dramatic", "robust", "innovative", "pivotal", "impressive", "astonishing", "visionary", "inspiring",
                   "striking", "dynamic", "iconic", "seminal", "trailblazing", "extreme", "shocking"]

# Single characters treated as "abnormal" symbols (the last entry is one backslash).
input_abn_symbols = ["[", "]", "_", "*", "<", ">", "{", "}", "^", "@", "#", "|", "\\"]

# Phrases treated as prompt/chat-assistant indicators.
input_prompt_lang = ["here you go", "as an ai", "as a language model", "i generated", "here's the essay", "here's your essay", "let me", 
                     "help you", "sure,", "i hope this helps", "your prompt", "your request", "here is", "here's", "sure!", 
                     "here is the essay", "here is your essay", "language model", "large language", "llm", "generative ai", "chatbot", 
                     "your essay"]

# Create new variables 

### Spelling errors
# Module-level spell checker, constructed with no arguments (library defaults);
# misspelling_ratio() and count_misspellings() below both query it.
spell = SpellChecker()

def clean_text_for_spellcheck(text):
    """Normalise one essay so it can be split into words for spell checking.

    A missing value (as judged by ``pd.isna``) yields the empty string.
    Otherwise every entry of ``input_abn_symbols`` is deleted, the text is
    lower-cased, and every character other than a-z, whitespace, apostrophe
    or hyphen is deleted.
    """
    if pd.isna(text):
        return ""

    stripped = text
    for abnormal in input_abn_symbols:
        stripped = stripped.replace(abnormal, "")

    lowered = stripped.lower()
    return re.sub(r"[^a-z\s'-]", "", lowered)

def misspelling_ratio(text):
    """Return ``len(spell.unknown(tokens)) / len(tokens)`` for one essay.

    The essay is cleaned with ``clean_text_for_spellcheck`` and split on
    whitespace; an essay with no tokens scores 0.0.

    NOTE(review): pyspellchecker documents ``unknown`` as returning a set, so
    a misspelling repeated several times presumably counts once in the
    numerator - confirm this is intended.
    """
    tokens = clean_text_for_spellcheck(text).split()
    if not tokens:
        return 0.0
    unknown_words = spell.unknown(tokens)
    return len(unknown_words) / len(tokens)

def count_misspellings(text):
    """Return ``len(spell.unknown(tokens))`` for one essay.

    The essay is cleaned with ``clean_text_for_spellcheck`` and split on
    whitespace; an essay with no tokens scores 0.

    NOTE(review): pyspellchecker documents ``unknown`` as returning a set, so
    this is presumably the number of distinct unrecognised words - confirm.
    """
    tokens = clean_text_for_spellcheck(text).split()
    if not tokens:
        return 0
    return len(spell.unknown(tokens))


# Spelling features: store the cleaned text, then score it with both helpers.
train["text_modified"] = train["text"].map(clean_text_for_spellcheck)
for column, scorer in (
    ("misspelling_ratio", misspelling_ratio),
    ("n_misspellings", count_misspellings),
):
    train[column] = train["text_modified"].map(scorer)

### Exclamation points
def count_exclamation_points(text):
    """Return how many '!' characters occur in *text*."""
    exclamation_mark = '!'
    return text.count(exclamation_mark)
# One exclamation-mark count per training essay.
train["n_exclamations"] = train["text"].map(count_exclamation_points)


### Em dashes
def count_em_dash(text):
    """Count em dashes in *text*.

    Counts the real em dash character (U+2014). The previous implementation
    searched only for the mis-decoded form of that character, so em dashes in
    correctly decoded text were never counted. The mis-decoded form (the
    three characters produced when the UTF-8 bytes of an em dash are read as
    Windows-1252) is still counted, so text carrying that artefact keeps
    contributing to the feature.

    Args:
        text: the essay as a ``str``.

    Returns:
        int: number of em dashes found (0 for an empty string).
    """
    em_dash = "\u2014"
    # UTF-8 bytes E2 80 94 read as cp1252: a-circumflex, euro sign,
    # right double quotation mark.
    mojibake_em_dash = "\u00e2\u20ac\u201d"
    return text.count(em_dash) + text.count(mojibake_em_dash)
# One em-dash count per training essay.
train["n_em_dash"] = train["text"].map(count_em_dash)


### Transitional words
def count_transition(text, words=None):
    """Count whole-word, case-insensitive occurrences of transition words.

    Plain substring counting inflated this feature: "sum" matched inside
    "assume", "thus" inside "enthusiasm", and "firstly" was counted once for
    "first" and once for "firstly". Terms are therefore matched only between
    word boundaries, and each distinct term is counted once per occurrence
    even if it is listed more than once. Matching ignores case so the result
    is the same for raw and lower-cased text.

    Args:
        text: the essay as a ``str``.
        words: optional iterable of terms to look for; defaults to the
            module-level ``input_transition`` list.

    Returns:
        int: total number of whole-word matches (0 when there are no terms).
    """
    vocabulary = input_transition if words is None else words
    # dict.fromkeys removes repeated terms while keeping their order.
    unique_terms = [term for term in dict.fromkeys(vocabulary) if term]
    if not unique_terms:
        return 0
    # With \b on both sides, "first" cannot match inside "firstly"; the regex
    # engine falls through to the longer alternative instead.
    pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(term) for term in unique_terms) + r")\b",
        re.IGNORECASE,
    )
    return len(pattern.findall(text))
# One transition-word count per training essay.
train["n_transition_words"] = train["text"].map(count_transition)


### Hyperbolic phrasing
def count_hyperbolic(text, words=None):
    """Count case-insensitive occurrences of hyperbolic terms in *text*.

    Two defects of the plain ``text.count`` sum are fixed: a term listed more
    than once is no longer counted once per listing, and matching ignores
    case so raw (not lower-cased) text yields the same value as lower-cased
    text. Matching stays substring-based, so derived forms such as
    "dramatically" still count towards "dramatic".

    Args:
        text: the essay as a ``str``.
        words: optional iterable of terms to look for; defaults to the
            module-level ``input_hyperbole`` list.

    Returns:
        int: total number of occurrences over all distinct terms.
    """
    vocabulary = input_hyperbole if words is None else words
    # Lower-case and de-duplicate the terms; skip empty strings, which
    # str.count would otherwise "find" between every pair of characters.
    distinct_terms = dict.fromkeys(term.lower() for term in vocabulary if term)
    lowered = text.lower()
    return sum(lowered.count(term) for term in distinct_terms)
# One hyperbolic-term count per training essay.
train["n_hyperbolic"] = train["text"].map(count_hyperbolic)


### Abnormal symbols
def count_abn_symbols(text):
    """Return the total number of occurrences in *text* of the entries of
    ``input_abn_symbols``."""
    total = 0
    for symbol in input_abn_symbols:
        total += text.count(symbol)
    return total
# One abnormal-symbol count per training essay.
train["n_abn_symbols"] = train["text"].map(count_abn_symbols)


## Prompt indicator
def contains_prompt_indicators(text, phrases=None):
    """Count case-insensitive occurrences of prompt-indicator phrases.

    Despite its name, this returns a count, not a boolean. The phrase list is
    all lower-case, so the previous case-sensitive ``text.count`` missed
    capitalised occurrences such as "Sure," whenever it was given text that
    had not been lower-cased. The text is now lower-cased before counting.
    Overlapping phrases (for example "here is" and "here is the essay") are
    each counted, as before.

    Args:
        text: the essay as a ``str``.
        phrases: optional iterable of phrases to look for; defaults to the
            module-level ``input_prompt_lang`` list.

    Returns:
        int: summed occurrence count over all phrases.
    """
    vocabulary = input_prompt_lang if phrases is None else phrases
    lowered = text.lower()
    # Skip empty phrases: str.count("") reports len(text) + 1 matches.
    return sum(lowered.count(phrase.lower()) for phrase in vocabulary if phrase)
# One prompt-indicator count per training essay.
train["n_prompt_indicator"] = train["text"].map(contains_prompt_indicators)

In [None]:
# Engineered columns fed to the models; the order fixes the column order of
# the feature matrix.
feature_cols = [
    'n_prompt_indicator', 'n_misspellings', 'n_exclamations', 'n_em_dash',
    'n_abn_symbols', 'n_transition_words', 'n_hyperbolic', 'misspelling_ratio',
]

# Target and feature matrix for training.
y_train = train['generated']
X_train_features = train[feature_cols]

# Preprocessing stage reused by both model pipelines: standardise each feature.
preprocess = Pipeline(steps=[('scale', StandardScaler())])

In [None]:
# Random forests pipe - no PCA
pipe_rf = Pipeline(steps=[
    ('prep', preprocess),
    ('model', RandomForestClassifier()),
])

# The "model" entry replaces the pipeline's model step with a seeded forest;
# the "model__*" entries are the hyper-parameters searched on that step.
params_rf = dict(
    model=[RandomForestClassifier(random_state=38)],
    model__n_estimators=[100, 200],
    model__max_depth=[None, 5, 10],
    model__min_samples_split=[5, 10, 20],
    model__class_weight=[None, "balanced"],
)

# 5-fold grid search scored by ROC AUC, using all available cores.
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=params_rf,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train_features, y_train)

# XGB pipe
pipe_xgb = Pipeline(steps=[
    ('prep', preprocess),
    ('model', XGBClassifier()),
])

# The "model" entry replaces the pipeline's model step with a seeded booster
# evaluated on log-loss; the "model__*" entries are the searched settings.
params_xgb = dict(
    model=[XGBClassifier(eval_metric='logloss', random_state=38)],
    model__learning_rate=[0.05, 0.1, 0.2],
    model__n_estimators=[100, 200],
    model__max_depth=[3, 5, 7],
)

# 5-fold grid search scored by ROC AUC, using all available cores.
grid_xgb = GridSearchCV(
    estimator=pipe_xgb,
    param_grid=params_xgb,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_xgb.fit(X_train_features, y_train)

In [None]:
# Keep whichever search reached the higher best cross-validated ROC AUC
# (on a tie, max() returns the first candidate, i.e. the random forest).
best_grid = max([grid_rf, grid_xgb], key=lambda g: g.best_score_)

# Both searches are built without a `refit` argument, and GridSearchCV's
# default refit=True re-fits the best parameter setting on the full training
# data. best_estimator_ is therefore already trained; fitting it again here
# only repeated that work.
pipe_final = best_grid.best_estimator_

In [None]:
# --- Build the test-set features exactly as for the training set -----------

# The training text was lower-cased before any feature was computed and the
# keyword lists are lower-case, so the test text must be lower-cased too;
# otherwise capitalised keywords are counted in train but not in test.
test_essays["text"] = test_essays["text"].str.lower()

# Spelling features are derived from the cleaned text, as in training.
test_essays["text_modified"] = test_essays["text"].apply(clean_text_for_spellcheck)
test_essays['misspelling_ratio'] = test_essays['text_modified'].apply(misspelling_ratio)
test_essays['n_misspellings'] = test_essays['text_modified'].apply(count_misspellings)

# Count-based features are derived from the lower-cased text.
test_essays['n_exclamations'] = test_essays['text'].apply(count_exclamation_points)
test_essays['n_em_dash'] = test_essays['text'].apply(count_em_dash)
test_essays['n_transition_words'] = test_essays['text'].apply(count_transition)
test_essays['n_hyperbolic'] = test_essays['text'].apply(count_hyperbolic)
test_essays['n_abn_symbols'] = test_essays['text'].apply(count_abn_symbols)
test_essays["n_prompt_indicator"] = test_essays["text"].apply(contains_prompt_indicators)

X_test_features = test_essays[feature_cols]

# --- Predict and write the submission ---------------------------------------

# The models were selected on ROC AUC, which ranks essays by score, so submit
# the probability of the positive class rather than hard 0/1 labels.
# Column 1 assumes the labels are 0/1 with 1 = generated - TODO confirm
# against pipe_final.classes_.
test_preds = pipe_final.predict_proba(X_test_features)[:, 1]

# Take the ids from the frame the predictions were computed from
# (test_essays is indexed by 'id'), so each score is paired with its own
# essay regardless of the row order of the sample submission.
final_submission = pd.DataFrame({
    'id': test_essays.index,
    'generated': test_preds
})

final_submission.to_csv('submission.csv', index=False)