# Importing libraries

In [1]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, matthews_corrcoef, log_loss,
    cohen_kappa_score, average_precision_score
)


# Importing files and Feature Engineering

In [2]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset CSVs read by the
# following cells live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Folder on the mounted Drive that holds every input CSV. Kept in one place so
# the notebook can be pointed at a different copy of the data with one edit.
DATA_DIR = '/content/drive/MyDrive/dataset'

# Competition-style inputs: prompts, labelled essays, and the test essays.
train_prompts = pd.read_csv(f'{DATA_DIR}/train_prompts.csv')
train_essays = pd.read_csv(f'{DATA_DIR}/train_essays.csv')
test_essays = pd.read_csv(f'{DATA_DIR}/test_essays.csv')

# Additional labelled essays used to enlarge the training corpus.
df_train_extra = pd.read_csv(f'{DATA_DIR}/train_v4_drcat_01.csv')

In [4]:
# The extra corpus stores its target in a column called "label"; rename it to
# "generated" so both sources share one schema. rename() returns a new frame,
# so nothing is mutated in place and re-running this cell is harmless.
df_train_extra = df_train_extra.rename(columns={"label": "generated"})

# Stack the two sources. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; without it the row labels of each source are repeated, which
# makes any later label-based lookup or alignment on the splits ambiguous.
df_train_essays_final = pd.concat(
    [df_train_extra[["text", "generated"]], train_essays[["text", "generated"]]],
    ignore_index=True,
)

# Hold out part of the combined data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    df_train_essays_final["text"],
    df_train_essays_final["generated"],
    test_size=0.2,  # 80% training, 20% testing
    random_state=42  # For reproducibility
)


In [36]:
# Subsample the training split to keep fitting fast: test_size=0.96 sends 96%
# of the rows to the discarded side, so roughly 4% of X_train / y_train is kept.
#
# train_test_split returns [X_kept, X_dropped, y_kept, y_dropped]; the [::2]
# slice picks the two kept parts. The dropped parts are deliberately not bound
# to `_`, which IPython uses for the last cell output.
X_train_sampled, y_train_sampled = train_test_split(
    X_train,
    y_train,
    test_size=0.96,  # Keep only ~4% for training
    random_state=42
)[::2]


In [37]:
# TF-IDF features over 1- to 3-grams with sublinear term-frequency scaling.
tfidf_options = {"ngram_range": (1, 3), "sublinear_tf": True}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights are learned from the sampled training texts
# only; the held-out texts are then projected with that fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train_sampled)
X_test_tfidf = vectorizer.transform(X_test)

print("Vectorization complete.")

Vectorization complete.


In [38]:
# Soft-voting ensemble of two linear models over the TF-IDF features.
# Both estimators shuffle data internally, so each gets a fixed random_state;
# without it the fitted weights (and every metric below) change from run to run
# even though the data splits are seeded.
sgd_clf = SGDClassifier(
    max_iter=1000,
    tol=1e-3,
    loss="modified_huber",  # a loss for which SGDClassifier exposes predict_proba
    random_state=42,
)
lr_clf = LogisticRegression(solver="liblinear", random_state=42)

# voting='soft' averages the members' predicted class probabilities, so every
# member must implement predict_proba.
voting_clf = VotingClassifier(
    estimators=[('sgd', sgd_clf), ('lr', lr_clf)],
    voting='soft'
)

# Training the classifier on the sampled data
voting_clf.fit(X_train_tfidf, y_train_sampled)

In [39]:
# Hard class predictions from the fitted ensemble on the held-out split.
y_pred = voting_clf.predict(X_test_tfidf)

# Headline accuracy followed by the per-class precision/recall/F1 table.
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)
print(f"Accuracy: {holdout_accuracy}", holdout_report, sep="\n")

Accuracy: 0.9471015942899073
              precision    recall  f1-score   support

           0       0.89      0.99      0.93      5727
           1       0.99      0.92      0.96      9264

    accuracy                           0.95     14991
   macro avg       0.94      0.95      0.95     14991
weighted avg       0.95      0.95      0.95     14991



In [40]:
# Column 1 of predict_proba is the probability of the second class in
# voting_clf.classes_ (label 1 for this 0/1 target). The ranking- and
# probability-based metrics below need these scores rather than hard labels.
y_pred_proba = voting_clf.predict_proba(X_test_tfidf)[:, 1]  # Get probability scores

# Evaluation
# Metrics computed from the hard predictions (y_pred).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)

# Metrics computed from the positive-class probabilities (y_pred_proba).
roc_auc = roc_auc_score(y_test, y_pred_proba)
logloss = log_loss(y_test, y_pred_proba)
average_precision = average_precision_score(y_test, y_pred_proba)

# Displaying the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Matthews Correlation Coefficient: {mcc}")
print(f"Log Loss: {logloss}")
print(f"Cohen's Kappa: {kappa}")
print(f"Average Precision-Recall Score: {average_precision}")

Accuracy: 0.9471015942899073
Precision: 0.9906173983551488
Recall: 0.9231433506044905
F1 Score: 0.9556908979158518
ROC AUC: 0.987566433037097
Confusion Matrix:
[[5646   81]
 [ 712 8552]]
Matthews Correlation Coefficient: 0.8936871835193595
Log Loss: 0.23463988708783584
Cohen's Kappa: 0.8902743572343514
Average Precision-Recall Score: 0.993569483036042


In [43]:
# Persist the held-out results so they can be compared with other models later.
# The values are the scalars already computed in the evaluation cell; reusing
# them keeps the saved numbers identical to the ones printed there and avoids
# recomputing seven metrics.
model_metrics = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'roc_auc': roc_auc,
    'log_loss': logloss,
    'mcc': mcc,
}

# Save the metrics and probabilities
joblib.dump(model_metrics, '/content/drive/MyDrive/dataset/model_metrics.pkl')
joblib.dump(y_pred_proba, '/content/drive/MyDrive/dataset/model_y_pred_proba.pkl')


['/content/drive/MyDrive/dataset/model_y_pred_proba.pkl']