In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import json
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from glob import glob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the datasets. 'text' and 'label' are read as strings, so every label
# comparison in this cell has to use '0' / '1' rather than the integers 0 / 1.
train = pd.read_csv('/kaggle/input/m4-dataset/train_balanced.csv', dtype={'text': str, 'label': str})
val = pd.read_csv('/kaggle/input/m4-dataset/val_balanced.csv', dtype={'text': str, 'label': str})
test = pd.read_csv('/kaggle/input/m4-dataset/test_balanced.csv', dtype={'text': str, 'label': str})
test_MGTBench = pd.read_csv('/kaggle/input/m4-dataset/test_MGTBench.csv', dtype={'text': str, 'label': str})
test_MixSet = pd.read_csv('/kaggle/input/m4-dataset/test_MixSet.csv', dtype={'text': str, 'label': str})


# ************************* #
# Do not train the SVM on the full training set: training plus testing takes
# extremely long, which makes the run practically unreproducible.
# Shrink the training set before training instead (the results stay similar).
# Keeping 10k or 15k rows are both fine, depending on the time available.

# train = train.head(15000)  # Use only the first 15000 entries
# val = val.head(8000)  # Use to speed things up further
# ************************* #

# Remove rows without text and reset the index. This is done in place so that
# the names above and the entries of `datasets` keep pointing at the same frames.
datasets = [train, val, test, test_MGTBench, test_MixSet]
for dataset in datasets:
    dataset.dropna(subset=['text'], inplace=True)
    dataset.reset_index(drop=True, inplace=True)


# Combined train/val/test frame, kept available for overall statistics
combined_data = pd.concat([train, val, test], ignore_index=True)
# NOTE: the statistics below describe MixSet only. Assign `combined_data`
# here instead to describe the combined train/val/test splits.
all_data = test_MixSet
print(all_data.head())
# Labels are strings, so compare against '0' (comparing against the integer 0
# never matches and always reported a count of 0).
print((all_data['label'] == '0').sum())

# 1. Ratio of human and AI text
fig, ax = plt.subplots(figsize=(8, 6))
# Fix the category order so that tick position 0 is label '0' and tick
# position 1 is label '1'; otherwise the order follows first appearance in the
# data and the tick names below could be attached to the wrong bars.
sns.countplot(x='label', data=all_data, order=['0', '1'], ax=ax)
ax.set_title('Ratio of Human and AI Text')
# Tick names assume label '0' = human and '1' = AI -- TODO confirm against the dataset card
ax.set_xticks([0, 1])
ax.set_xticklabels(['Human', 'AI'])
ax.set_xlabel('Type of Text')
ax.set_ylabel('Count')
plt.show()

# 2. Ratio of different domain sources (most frequent first)
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(y='source', data=all_data, order=all_data['source'].value_counts().index, ax=ax)
ax.set_title('Ratio of Different Domain Sources')
ax.set_xlabel('Count')
ax.set_ylabel('Source Domain')
plt.show()

# 3. Ratio of different models (most frequent first)
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(y='model', data=all_data, order=all_data['model'].value_counts().index, ax=ax)
ax.set_title('Ratio of Different Models')
ax.set_xlabel('Count')
ax.set_ylabel('Model')
plt.show()

# Print the number of entries for each model within each domain
model_domain_counts = all_data.groupby(['source', 'model']).size().reset_index(name='count')
print(model_domain_counts)


### SVM

In [39]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, roc_curve, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm
from collections import defaultdict

def _accuracy_by_group(y_true, y_pred, groups, group_kind, kernel):
    """Compute and print the accuracy for each distinct non-null value in ``groups``.

    Args:
        y_true: NumPy array of true labels.
        y_pred: NumPy array of predicted labels, same length as ``y_true``.
        groups: pandas Series with one group value per row, in the same row
            order as ``y_true`` / ``y_pred``.
        group_kind: Word used in the printed line (e.g. ``'domain'``).
        kernel: Kernel name used in the printed line.

    Returns:
        dict mapping each group value to its accuracy.
    """
    accuracies = {}
    # Null group values are skipped: `groups == NaN` selects no rows, so there
    # would be nothing to score.
    for value in groups.dropna().unique():
        # Convert to a plain boolean array so the selection is purely
        # positional and the Series index plays no part in it.
        mask = (groups == value).to_numpy()
        acc = accuracy_score(y_true[mask], y_pred[mask])
        accuracies[value] = acc
        print(f"Accuracy for {group_kind} {value} ({kernel} Kernel): {acc:.4f}")
    return accuracies


def evaluate_dataset(clf, X_texts, y_true, test_data, dataset_name, kernel):
    """Evaluate a fitted classifier on one dataset, then print, store and save the results.

    Relies on three module-level names: ``vectorizer`` (used to transform the
    texts), ``results`` (dict that receives the metrics under the key
    ``(kernel, dataset_name)``) and ``output_directory`` (where CSVs are written).

    Args:
        clf: Fitted classifier providing ``predict`` and ``predict_proba``.
        X_texts: Iterable of raw texts to classify.
        y_true: True labels, one per text.
        test_data: DataFrame with ``'source'`` and ``'model'`` columns, in the
            same row order as ``X_texts``.
        dataset_name: Name used in printed output, result keys and file names.
        kernel: Kernel name used in printed output, result keys and file names.

    Side effects:
        Writes four CSV files named ``{kernel}_{dataset_name}_*.csv`` into
        ``output_directory``: predictions, metrics, domain accuracies and
        model accuracies.
    """
    print(f"\nEvaluating on {dataset_name} with {kernel} kernel...")
    X = vectorizer.transform(tqdm(X_texts, desc=f"Vectorizing {dataset_name} Data")).toarray()
    y_true = np.asarray(y_true)
    y_pred = clf.predict(X)
    y_proba = clf.predict_proba(X)[:, 1]  # Get probabilities for the positive class

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    # ROC AUC is undefined when only one class is present in y_true; report
    # NaN in that case instead of aborting the whole evaluation.
    if np.unique(y_true).size > 1:
        auc = roc_auc_score(y_true, y_proba)
    else:
        auc = float('nan')
    conf_matrix = confusion_matrix(y_true, y_pred)

    # Print overall results
    print(f"Results for {dataset_name}:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, AUC Score: {auc:.4f}")

    # Store predictions, probabilities and metrics in the module-level dict
    results[(kernel, dataset_name)] = {
        'y_true': y_true,
        'y_pred': y_pred,
        'y_proba': y_proba,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC Score': auc,
        'Confusion Matrix': conf_matrix.tolist()  # Convert numpy array to list for JSON serializable format
    }

    # Per-domain and per-model accuracy
    domain_accuracies = _accuracy_by_group(y_true, y_pred, test_data['source'], 'domain', kernel)
    model_accuracies = _accuracy_by_group(y_true, y_pred, test_data['model'], 'model', kernel)

    # Save predictions, probabilities, and metrics for each kernel and dataset
    predictions_df = pd.DataFrame({'Actual': y_true, 'Predicted': y_pred, 'Probability': y_proba})
    predictions_df.to_csv(f'{output_directory}/{kernel}_{dataset_name}_predictions.csv', index=False)

    # Each metric is listed exactly once (precision used to be written twice)
    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC Score'],
        'Value': [accuracy, precision, recall, f1, auc]
    })
    metrics_df.to_csv(f'{output_directory}/{kernel}_{dataset_name}_metrics.csv', index=False)

    # Save domain and model accuracies
    domain_df = pd.DataFrame(list(domain_accuracies.items()), columns=['Domain', 'Accuracy'])
    domain_df.to_csv(f'{output_directory}/{kernel}_{dataset_name}_domain_accuracies.csv', index=False)

    model_df = pd.DataFrame(list(model_accuracies.items()), columns=['Model', 'Accuracy'])
    model_df.to_csv(f'{output_directory}/{kernel}_{dataset_name}_model_accuracies.csv', index=False)

def evaluate_dataset_accuracies(clf, X_texts, y_true, test_data, dataset_name, kernel, dataset_labels=None):
    """Compute and print the accuracy for each value of ``test_data['dataset']``.

    Relies on the module-level ``vectorizer`` to transform the texts.

    Args:
        clf: Fitted classifier providing ``predict``.
        X_texts: Iterable of raw texts to classify.
        y_true: True labels, one per text.
        test_data: DataFrame with a ``'dataset'`` column, in the same row
            order as ``X_texts``.
        dataset_name: Name used in the printed header and progress bar.
        kernel: Kernel name used in printed output.
        dataset_labels: Optional dict mapping a value of the ``'dataset'``
            column to the display label used in the output and as the key of
            the returned dict. Defaults to the four MixSet methods
            (Humanize, Complete, Rewrite, Polish).

    Returns:
        dict mapping each display label to its accuracy. A label whose
        dataset value does not occur in ``test_data`` maps to NaN.
    """
    if dataset_labels is None:
        dataset_labels = {
            'MixsetHUMANIZE': 'Humanize',
            'MixsetCOMPLETE': 'Complete',
            'MixsetREWRITE': 'Rewrite',
            'MixsetPOLISH_SENTENCE': 'Polish'
        }

    print(f"\nEvaluating on {dataset_name} with {kernel} kernel...")
    X = vectorizer.transform(tqdm(X_texts, desc=f"Vectorizing {dataset_name} Data")).toarray()
    y_true = np.asarray(y_true)
    # Only hard predictions are needed here; probabilities were never used.
    y_pred = clf.predict(X)

    # Dataset-specific accuracies
    dataset_accuracies = {}
    for dataset, label in dataset_labels.items():
        # Plain boolean array so the selection is purely positional and does
        # not depend on the index of test_data (which may be a filtered frame).
        dataset_mask = (test_data['dataset'] == dataset).to_numpy()
        if dataset_mask.any():
            dataset_acc = accuracy_score(y_true[dataset_mask], y_pred[dataset_mask])
        else:
            # No rows for this method: there is nothing to score.
            dataset_acc = float('nan')
        dataset_accuracies[label] = dataset_acc
        print(f"Accuracy for method {label} ({kernel} Kernel): {dataset_acc:.4f}")

    return dataset_accuracies

# Train one SVM per kernel on TF-IDF features and evaluate each on every test set.

# Raw texts and integer labels for the training split and each evaluation split
X_train_texts = train['text']
y_train = train['label'].astype(int).values
X_test_texts = test['text']
y_test = test['label'].astype(int).values
X_test_MGTBench_texts = test_MGTBench['text']
y_test_MGTBench = test_MGTBench['label'].astype(int).values
X_test_MixSet_texts = test_MixSet['text']
y_test_MixSet = test_MixSet['label'].astype(int).values

# GPT4-only part of MixSet, used to compare the MixSet methods. It does not
# depend on the kernel, so it is built once here. reset_index returns a new
# frame, so nothing downstream writes into a slice of test_MixSet.
test_MixSet_GPT4 = test_MixSet[test_MixSet['model'] == 'GPT4'].reset_index(drop=True)
X_test_MixSet_GPT4_texts = test_MixSet_GPT4['text']
y_test_MixSet_GPT4 = test_MixSet_GPT4['label'].astype(int).values

# Seed for the SVC. It only influences the probability estimates enabled by
# probability=True; fixing it makes the saved probabilities and AUC repeatable.
RANDOM_STATE = 42

# TF-IDF features limited to the 650 highest-ranked terms; densified for the SVC
vectorizer = TfidfVectorizer(max_features=650)
X_train = vectorizer.fit_transform(tqdm(X_train_texts, desc="Vectorizing Train Data")).toarray()

# Initialize your metrics and results storage
kernels = ['rbf', 'linear', 'poly', 'sigmoid']
results = {}  # filled by evaluate_dataset, keyed by (kernel, dataset_name)
mixset_method_accuracies = {}  # kernel -> {method label: accuracy} on MixSet GPT4
output_directory = '/kaggle/working'  # Change this to your desired path

for kernel in kernels:
    clf = SVC(kernel=kernel, probability=True, random_state=RANDOM_STATE)
    clf.fit(X_train, y_train)

    # ************************* #
    # General evaluation on the three test sets
    # ************************* #

    # Evaluate on standard test set
    evaluate_dataset(clf, X_test_texts, y_test, test, "standard_test", kernel)

    # Evaluate on MGTBench
    evaluate_dataset(clf, X_test_MGTBench_texts, y_test_MGTBench, test_MGTBench, "MGTBench", kernel)

    # Evaluate on MixSet
    evaluate_dataset(clf, X_test_MixSet_texts, y_test_MixSet, test_MixSet, "MixSet", kernel)

    # ************************* #
    # Per-method accuracy within MixSet (GPT4 rows only)
    # ************************* #

    # Keep the returned accuracies so they can be compared across kernels later
    mixset_method_accuracies[kernel] = evaluate_dataset_accuracies(
        clf, X_test_MixSet_GPT4_texts, y_test_MixSet_GPT4, test_MixSet_GPT4, "MixSet_GPT4", kernel
    )

Vectorizing Train Data:   0%|          | 0/14706 [00:00<?, ?it/s]


Evaluating on MixSet_GPT4 with rbf kernel...


Vectorizing MixSet_GPT4 Data:   0%|          | 0/765 [00:00<?, ?it/s]

Accuracy for method Humanize (rbf Kernel): 0.6433
Accuracy for method Complete (rbf Kernel): 0.9806
Accuracy for method Rewrite (rbf Kernel): 0.9935
Accuracy for method Polish (rbf Kernel): 0.9806

Evaluating on MixSet_GPT4 with linear kernel...


Vectorizing MixSet_GPT4 Data:   0%|          | 0/765 [00:00<?, ?it/s]

Accuracy for method Humanize (linear Kernel): 0.6367
Accuracy for method Complete (linear Kernel): 0.9806
Accuracy for method Rewrite (linear Kernel): 0.9613
Accuracy for method Polish (linear Kernel): 0.9419

Evaluating on MixSet_GPT4 with poly kernel...


Vectorizing MixSet_GPT4 Data:   0%|          | 0/765 [00:00<?, ?it/s]

Accuracy for method Humanize (poly Kernel): 0.5567
Accuracy for method Complete (poly Kernel): 0.9677
Accuracy for method Rewrite (poly Kernel): 0.9935
Accuracy for method Polish (poly Kernel): 0.9677

Evaluating on MixSet_GPT4 with sigmoid kernel...


Vectorizing MixSet_GPT4 Data:   0%|          | 0/765 [00:00<?, ?it/s]

Accuracy for method Humanize (sigmoid Kernel): 0.5867
Accuracy for method Complete (sigmoid Kernel): 0.9742
Accuracy for method Rewrite (sigmoid Kernel): 0.9419
Accuracy for method Polish (sigmoid Kernel): 0.9097
