In [20]:
# Create the dataset
import pandas as pd
import json

def _strip_first_line(text):
    """Return ``text`` without its first line and the whitespace that follows it.

    Text that contains no newline is returned unchanged. Unlike slicing at a
    fixed offset after the first newline, this never removes the first
    character of the body when only a single newline follows the first line.
    """
    _first_line, newline, rest = text.partition('\n')
    return rest.lstrip() if newline else text


# Load the AI-generated essays (label 0).
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open('ai_gen_essays.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# NOTE(review): the first line of each AI essay is presumably a title/heading
# that should not be part of the training text - confirm against the data.
ai_texts = [_strip_first_line(record['full_text']) for record in data]

# Load the human-written essays (label 1).
human_essays = pd.read_csv('human_essays.csv')
human_texts = human_essays['full_text'].tolist()

# Build the dataset in one step instead of concatenating onto an empty frame:
# this keeps `id` and `label` as integer columns and gives every essay a
# unique id across both sources (previously ids restarted at 0 for the human
# essays).
n_essays = len(ai_texts) + len(human_texts)
df = pd.DataFrame({
    'id': range(n_essays),
    'text': ai_texts + human_texts,
    'label': [0] * len(ai_texts) + [1] * len(human_texts),
})

In [21]:
# Summarise the dataset: total size, number of essays per label, and a preview
label_column = df["label"]
ai_count = int((label_column == 0).sum())        # label 0 -> AI-generated
human_count = int((label_column == 1).sum())     # label 1 -> human-written

print(f'Number of essays: {len(df)}')
print(f'AI-generated essays: {ai_count}')
print(f'Human-generated essays: {human_count}')
print(df.head())

Number of essays: 19999
AI-generated essays: 2692
Human-generated essays: 17307
  id                                               text label
0  0  In the contemporary world, the art of storytel...     0
1  1  As a student at university, I have always been...     0
2  2  Stories have the remarkable ability to simplif...     0
3  3  As a student and storyteller, I have often fac...     0
4  4  As a student at university, I have always been...     0


In [None]:
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Hold out 20% of the essays for testing; stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the
# split repeatable.
texts, labels = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=123,
    shuffle=True,
    stratify=labels,
)

# Cast the labels to integers
y_train, y_test = y_train.astype(int), y_test.astype(int)

# Show the shape of every part of the split
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

# Show the first 5 training texts and their labels
for split in (X_train, y_train):
    print(split.head())

print(f'labels in train: {set(y_train)}')
print(f'type of labels in train: {type(y_train)}')

(15999,)
(4000,)
(15999,)
(4000,)
5894    After I read the passage I think Luke really w...
3728    I know this wasn't created by aliens in many w...
8958    Dear Florida State Senator,\n\nThe Electoral C...
7671    Although the school system has advanced quite ...
5999    Cars are not something that have to be used ev...
Name: text, dtype: object
5894    1
3728    1
8958    1
7671    1
5999    1
Name: label, dtype: int64
labels in train: {0, 1}
type of labels in train: <class 'pandas.core.series.Series'>


In [23]:
# Vectorizer settings, chosen to limit overfitting:
#   max_features - upper bound on the vocabulary size
#   min_df       - drop terms that occur in fewer than 5 documents
#   max_df       - drop terms that occur in more than 80% of documents
max_features, min_df, max_df = 10000, 5, 0.8

# Build the TF-IDF vectorizer and learn its vocabulary from the training texts only
vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, max_features=max_features)
vectorizer.fit(X_train)

# Turn both splits into TF-IDF feature matrices with the fitted vectorizer
X_train_tfidf, X_test_tfidf = (vectorizer.transform(texts) for texts in (X_train, X_test))

# Train the binary classifier on the training features
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

# Predict labels for the held-out essays
y_pred = clf.predict(X_test_tfidf)

# Report the overall accuracy
a_s = accuracy_score(y_test, y_pred)
print("TfidfVectorizer")
print("Accuracy:", a_s)

# Per-class report followed by the confusion matrix
for summary in (classification_report, confusion_matrix):
    print(summary(y_test, y_pred))

# Precision, recall and F1 score (computed with each metric's default settings)
for caption, metric in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 score:", f1_score),
):
    print(caption, metric(y_test, y_pred))

TfidfVectorizer
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       513
           1       1.00      1.00      1.00      3487

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000

[[ 513    0]
 [   0 3487]]
Precision: 1.0
Recall: 1.0
F1 score: 1.0


In [24]:
# Take user input to predict the label of a new essay
essay = input("Enter the essay to predict the label: ")

# Convert the essay into a TF-IDF feature row using the already-fitted vectorizer
tokens = vectorizer.transform([essay])

# Predict the label of the essay (1 is reported as Human, anything else as AI)
predicted_label = clf.predict(tokens)

if tokens.nnz == 0:
    # An empty essay, or one with no terms from the fitted vocabulary, yields
    # an all-zero feature row. The classifier's output then does not depend on
    # the text at all, so reporting a label would be misleading.
    print("The essay contains no terms known to the vectorizer; no reliable prediction can be made.")
else:
    # Print the predicted label
    print(f"The predicted label is: {'Human' if predicted_label[0] == 1 else 'AI'}")

The predicted label is: Human


In [25]:
# Sanity check: count essay texts that appear verbatim in both the train and test splits
train_texts, test_texts = set(X_train), set(X_test)
overlap = train_texts & test_texts
print(f"Number of overlapping samples in train and test: {len(overlap)}")


Number of overlapping samples in train and test: 0
