In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report



In [3]:
# Load the data
train_essays = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
train_prompts = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv')
test_essays = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

In [4]:
# Merge the essays with their prompts
train_data = pd.merge(train_essays, train_prompts, on='prompt_id', how='left')

In [5]:
# Preprocess the text data
# Add any additional preprocessing steps as needed (e.g., stemming, lemmatization)
train_data['processed_text'] = train_data['text'].apply(lambda x: x.lower())
test_essays['processed_text'] = test_essays['text'].apply(lambda x: x.lower())

In [6]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_data['processed_text'],
    train_data['generated'],
    test_size=0.2,
    random_state=42
)

In [7]:
# Convert text data to numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the number of features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

In [8]:
# Initialize and train a RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_tfidf, y_train)

In [9]:
# Make predictions on the validation set
predictions = rf_model.predict(X_val_tfidf)

In [10]:
# Evaluate the model
accuracy = accuracy_score(y_val, predictions)
print(f'Accuracy on validation set: {accuracy:.2f}')

Accuracy on validation set: 1.00


In [11]:
# Display classification report
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       0.00      0.00      0.00         1

    accuracy                           1.00       276
   macro avg       0.50      0.50      0.50       276
weighted avg       0.99      1.00      0.99       276



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Cross-validation to assess generalization performance
cv_scores = cross_val_score(rf_model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.2f}')



Cross-Validation Scores: [0.99547511 0.99547511 1.         1.         1.        ]
Mean CV Accuracy: 1.00


In [13]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

In [14]:
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)



In [15]:
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

Best Hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}


In [16]:
# Use the best model from GridSearchCV
best_rf_model = grid_search.best_estimator_

In [17]:
# Predict on the hidden test set
X_test_tfidf = tfidf_vectorizer.transform(test_essays['processed_text'])
test_predictions = best_rf_model.predict(X_test_tfidf)

In [18]:
# Create a submission file
submission_df = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions})
submission_df.to_csv('submission.csv', index=False)