<a href="https://colab.research.google.com/github/dany-xu/AI-Generated-Text-Detection-using-LLM/blob/main/models/baseline_3models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive') # current dir: '/content'
import pandas as pd
import random
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from gensim.models import Word2Vec
from collections import Counter

Mounted at /content/drive


In [2]:
data = pd.read_csv("./drive/MyDrive/ColabNotebooks/llm/concat_ori.csv").drop(columns=["Unnamed: 0"])
data1 = data[data['label'] == 1]
data0 = data[data['label'] == 0]
train1, test1 = train_test_split(data1, test_size=0.2)
train0, test0 = train_test_split(data0, test_size=0.2)
train = pd.concat([train0, train1], ignore_index=True)
test = pd.concat([test0, test1], ignore_index=True)
train = train.loc[np.random.permutation(train.index)]
test = test.loc[np.random.permutation(test.index)]
train.shape, test.shape

((40559, 5), (10140, 5))

In [3]:
train_data = list(train.abstract)
train_label = list(train.label)
test_data = list(test.abstract)
test_label = list(test.label)

Bag-of-Words (BoW) with Logistic Regression

In [None]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(train_data)
X_test_bow = vectorizer.transform(test_data)

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train_bow, train_label)

# Evaluate model
y_pred = model.predict(X_test_bow)
acc = accuracy_score(test_label, y_pred)
print(f'Logistic Regression Accuracy: {acc}')

Logistic Regression Accuracy: 0.9064102564102564


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# measure results
accuracy = accuracy_score(test_label, y_pred)
precision = precision_score(test_label, y_pred)
recall = recall_score(test_label, y_pred)
f1 = f1_score(test_label, y_pred)
conf_matrix = confusion_matrix(test_label, y_pred) # confusion matrix

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Confusion Matrix:", conf_matrix)

Accuracy: 0.9064102564102564
Precision: 0.9037358549062161
Recall: 0.9467359532315687
F1-score: 0.9247362994686334
Confusion Matrix: [[3361  621]
 [ 328 5830]]


N-gram Models with Naive Bayes

In [None]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_ngram = vectorizer.fit_transform(train_data)
X_test_ngram = vectorizer.transform(test_data)

nb_model = MultinomialNB()
nb_model.fit(X_train_ngram, train_label)

y_pred_nb = nb_model.predict(X_test_ngram)
acc_nb = accuracy_score(test_label, y_pred_nb)
print(f'Naive Bayes Accuracy (N-grams): {acc_nb}')

Naive Bayes Accuracy (N-grams): 0.8787968441814595


In [None]:
# measure results
accuracy = accuracy_score(test_label, y_pred_nb)
precision = precision_score(test_label, y_pred_nb)
recall = recall_score(test_label, y_pred_nb)
f1 = f1_score(test_label, y_pred_nb)
conf_matrix = confusion_matrix(test_label, y_pred_nb) # confusion matrix

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Confusion Matrix:", conf_matrix)

Accuracy: 0.8787968441814595
Precision: 0.872449750642285
Recall: 0.9374797012016889
F1-score: 0.9037964774951076
Confusion Matrix: [[3138  844]
 [ 385 5773]]


Word Embeddings with SVM

In [5]:
text_data = list(data.abstract) # tokenize
tokens = [word for sentence in text_data for word in sentence.lower().split()]
token_counts = Counter(tokens)# frequency
num_unique_tokens = len(token_counts)
max_features = int(num_unique_tokens * 1.1) # set max_features slightly higher

print("unique token num:", num_unique_tokens)
print("my max_features:", max_features)

unique token num: 107263
my max_features: 117989


In [6]:
tfidf_vectorizer = TfidfVectorizer(max_features=50)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data)
X_test_tfidf = tfidf_vectorizer.transform(test_data)

svm_classifier = SVC(kernel='linear')  # linear kernel for SVM
svm_classifier.fit(X_train_tfidf, train_label)
y_pred = svm_classifier.predict(X_test_tfidf)

acc = accuracy_score(test_label, y_pred)
print("Accuracy:", acc)

Accuracy: 0.7971400394477317


In [7]:
# measure results
accuracy = accuracy_score(test_label, y_pred)
precision = precision_score(test_label, y_pred)
recall = recall_score(test_label, y_pred)
f1 = f1_score(test_label, y_pred)
conf_matrix = confusion_matrix(test_label, y_pred) # confusion matrix

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Confusion Matrix:", conf_matrix)

Accuracy: 0.7971400394477317
Precision: 0.8060904612628751
Recall: 0.8769080870412471
F1-score: 0.8400093334370383
Confusion Matrix: [[2683 1299]
 [ 758 5400]]
