In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/combined-data/eli5_clean_combined.jsonl
/kaggle/input/chatgpt-detector-training-dataset/processed_dataset.csv
/kaggle/input/reddit-eli5/reddit_eli5.jsonl


In [3]:
import pickle
import math
import gensim.downloader
from tqdm import tqdm 
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [8]:
hc3_df = pd.read_json("/kaggle/input/combined-data/eli5_clean_combined.jsonl", lines=True)
hc3_df

Unnamed: 0,question,human_answers,chatgpt_answers,index,scraped_answers
0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",[There are many different best seller lists th...,,
1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,[Salt is used on roads to help melt ice and sn...,,
2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,[There are a few reasons why we still have SD ...,,
3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,[It is generally not acceptable or ethical to ...,,
4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,[After the Wright Brothers made the first powe...,,
...,...,...,...,...,...
34351,What is closed-loop verification?,[Let's pretend that it's purge night and you a...,,,[Closed-loop verification is a process used in...
34352,Why is mustard oil illegal to consume in the US?,[Erucic acid is poison so there is a blanket b...,,,[Mustard oil is not illegal to consume in the ...
34353,How space probes shoot pictures back at Earth,[The technology to transmit information has be...,,,[Space probes shoot pictures back at Earth usi...
34354,What exactly do international waters mean for ...,[Every ship has a country of registry (flag st...,,,"[International waters, also known as the high ..."


In [10]:
# shift the data in scraped_answers to chatgpt_answers
for index, row in hc3_df.iterrows():
    # check if chatgpt_answers is nan
    if pd.isnull(row["chatgpt_answers"]).all() if type(row["chatgpt_answers"]) == list else pd.isnull(row["chatgpt_answers"]):
      hc3_df.at[index,"chatgpt_answers"] = row['scraped_answers']

hc3_df

Unnamed: 0,question,human_answers,chatgpt_answers,index,scraped_answers
0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",[There are many different best seller lists th...,,
1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,[Salt is used on roads to help melt ice and sn...,,
2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,[There are a few reasons why we still have SD ...,,
3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,[It is generally not acceptable or ethical to ...,,
4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,[After the Wright Brothers made the first powe...,,
...,...,...,...,...,...
34351,What is closed-loop verification?,[Let's pretend that it's purge night and you a...,[Closed-loop verification is a process used in...,,[Closed-loop verification is a process used in...
34352,Why is mustard oil illegal to consume in the US?,[Erucic acid is poison so there is a blanket b...,[Mustard oil is not illegal to consume in the ...,,[Mustard oil is not illegal to consume in the ...
34353,How space probes shoot pictures back at Earth,[The technology to transmit information has be...,[Space probes shoot pictures back at Earth usi...,,[Space probes shoot pictures back at Earth usi...
34354,What exactly do international waters mean for ...,[Every ship has a country of registry (flag st...,"[International waters, also known as the high ...",,"[International waters, also known as the high ..."


In [11]:
# drop index col which is all NaN
processed_hc3_df = hc3_df.drop(["scraped_answers"], axis=1)
processed_hc3_df = processed_hc3_df.drop(["index"], axis=1)
processed_hc3_df.dropna(inplace=True)
processed_hc3_df

Unnamed: 0,question,human_answers,chatgpt_answers
0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",[There are many different best seller lists th...
1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,[Salt is used on roads to help melt ice and sn...
2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,[There are a few reasons why we still have SD ...
3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,[It is generally not acceptable or ethical to ...
4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,[After the Wright Brothers made the first powe...
...,...,...,...
34351,What is closed-loop verification?,[Let's pretend that it's purge night and you a...,[Closed-loop verification is a process used in...
34352,Why is mustard oil illegal to consume in the US?,[Erucic acid is poison so there is a blanket b...,[Mustard oil is not illegal to consume in the ...
34353,How space probes shoot pictures back at Earth,[The technology to transmit information has be...,[Space probes shoot pictures back at Earth usi...
34354,What exactly do international waters mean for ...,[Every ship has a country of registry (flag st...,"[International waters, also known as the high ..."


In [12]:
# keep rows with non empty chatgpt ans
has_chatgpt_ans = processed_hc3_df["chatgpt_answers"].map(lambda ans_lst:ans_lst!=[])
processed_hc3_df = processed_hc3_df[has_chatgpt_ans]

#Need to expand the answers and chatgpt ans -- as for one question, there can be several human answers obtained
row_list = []
for index, row in processed_hc3_df.iterrows():
  human_ans_list = row["human_answers"]
  chatgpt_ans_list = row["chatgpt_answers"]
  # note that sometimes there can be 2 human ans, and 1 chatgpt ans for e.g. 
  num_chatgpt_ans = len(chatgpt_ans_list)
  for i in range(len(human_ans_list)):
    human_ans = human_ans_list[i]
    if i >= num_chatgpt_ans:
      chatgpt_ans = chatgpt_ans_list[num_chatgpt_ans-1]
    else:
      chatgpt_ans = chatgpt_ans_list[i]
    row_list.append([row["question"],human_ans,chatgpt_ans])

processed_hc3_df = pd.DataFrame(row_list, columns = ["question", "human_answer", "chatgpt_answer"])
print(processed_hc3_df.shape)
processed_hc3_df.head()

(68701, 3)


Unnamed: 0,question,human_answer,chatgpt_answer
0,"Why is every book I hear about a "" NY Times # ...","Basically there are many categories of "" Best ...",There are many different best seller lists tha...
1,"Why is every book I hear about a "" NY Times # ...","If you 're hearing about it , it 's because it...",There are many different best seller lists tha...
2,"Why is every book I hear about a "" NY Times # ...","One reason is lots of catagories . However , h...",There are many different best seller lists tha...
3,"If salt is so bad for cars , why do we use it ...",salt is good for not dying in car crashes and ...,Salt is used on roads to help melt ice and sno...
4,"If salt is so bad for cars , why do we use it ...","In Minnesota and North Dakota , they tend to u...",Salt is used on roads to help melt ice and sno...


In [13]:
chatgpt_responses = processed_hc3_df.loc[:, ["chatgpt_answer"]]
chatgpt_responses["label"]=1
chatgpt_responses.rename(columns = {'chatgpt_answer':'response'}, inplace = True)
chatgpt_responses.drop_duplicates(inplace=True)
print(chatgpt_responses.shape)
chatgpt_responses.head()

(33366, 2)


Unnamed: 0,response,label
0,There are many different best seller lists tha...,1
3,Salt is used on roads to help melt ice and sno...,1
6,There are a few reasons why we still have SD (...,1
9,It is generally not acceptable or ethical to a...,1
12,After the Wright Brothers made the first power...,1


In [14]:
human_responses = processed_hc3_df.loc[:, ["human_answer"]]
human_responses["label"]=0
human_responses.rename(columns = {'human_answer':'response'}, inplace = True)
human_responses.drop_duplicates(inplace=True)
print(human_responses.shape)
human_responses.head()

(64250, 2)


Unnamed: 0,response,label
0,"Basically there are many categories of "" Best ...",0
1,"If you 're hearing about it , it 's because it...",0
2,"One reason is lots of catagories . However , h...",0
3,salt is good for not dying in car crashes and ...,0
4,"In Minnesota and North Dakota , they tend to u...",0


In [15]:
# sample the same number of human responses as chatgpt responses to make type equal in number
sampled_human_responses = human_responses.sample(n = chatgpt_responses.shape[0])
sampled_human_responses

Unnamed: 0,response,label
37916,I always thought it was just a story of how a ...,0
9837,"The * real * question is , why do we think pup...",0
20297,What comes to my mind is Shirley Temple . So I...,0
23651,A very fat person has a lot of fat to cut thro...,0
32826,They do n't absorb heat . They absorb light . ...,0
...,...,...
49169,I 've been a trance DJ for years and the lines...,0
37650,> How are DUI checkpoints legal in California ...,0
36949,"No , they are not exempt . Taxes apply to even...",0
43029,"Some ovens heat up fast , some heat up slow . ...",0


In [16]:
Corpus = pd.concat([chatgpt_responses, sampled_human_responses], ignore_index=True)
# # join the corpus with the scraped gltr features
# Corpus = pd.merge(Corpus, gltr_df, on='response', how='inner')
print(Corpus.shape)
Corpus.head()

(66732, 2)


Unnamed: 0,response,label
0,There are many different best seller lists tha...,1
1,Salt is used on roads to help melt ice and sno...,1
2,There are a few reasons why we still have SD (...,1
3,It is generally not acceptable or ethical to a...,1
4,After the Wright Brothers made the first power...,1


# Try no preprocess

In [1]:
Encoder = LabelEncoder()
Y = Encoder.fit_transform(Corpus['label'])
Y

array([1, 1, 1, ..., 0, 0, 0])

In [1]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)
Train_X

<46712x70478 sparse matrix of type '<class 'numpy.float64'>'
	with 3766872 stored elements in Compressed Sparse Row format>

## Default tfidf

In [2]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on validation dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

SVM Accuracy Score ->  96.39360639360639
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      9922
           1       0.96      0.96      0.96     10098

    accuracy                           0.96     20020
   macro avg       0.96      0.96      0.96     20020
weighted avg       0.96      0.96      0.96     20020

CPU times: user 47min 54s, sys: 601 ms, total: 47min 55s
Wall time: 47min 56s


## tfidf with english stopwords

In [4]:
Tfidf_vect = TfidfVectorizer(stop_words="english")
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)
Train_X

<46712x70169 sparse matrix of type '<class 'numpy.float64'>'
	with 2298365 stored elements in Compressed Sparse Row format>

In [5]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on validation dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

SVM Accuracy Score ->  94.36563436563436
              precision    recall  f1-score   support

           0       0.94      0.95      0.94     10012
           1       0.94      0.94      0.94     10008

    accuracy                           0.94     20020
   macro avg       0.94      0.94      0.94     20020
weighted avg       0.94      0.94      0.94     20020

CPU times: user 48min 22s, sys: 595 ms, total: 48min 22s
Wall time: 48min 23s


## tfidf with unigrams and bigrams

In [14]:
Tfidf_vect = TfidfVectorizer(ngram_range=(1,2))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)
Train_X

<46712x1797201 sparse matrix of type '<class 'numpy.float64'>'
	with 9744346 stored elements in Compressed Sparse Row format>

In [7]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on validation dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

SVM Accuracy Score ->  97.77222777222777
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      9979
           1       0.98      0.97      0.98     10041

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 2h, sys: 1.56 s, total: 2h 2s
Wall time: 2h 4s


In [15]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

SVM Accuracy Score ->  98.41658341658342
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     10080
           1       0.99      0.98      0.98      9940

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 1h 6min 47s, sys: 1.15 s, total: 1h 6min 49s
Wall time: 1h 6min 50s


## tfidf with unigram and bigrams and stop words

In [8]:
Tfidf_vect = TfidfVectorizer(ngram_range=(1,2), stop_words = "english")
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)
Train_X

<46712x2450751 sparse matrix of type '<class 'numpy.float64'>'
	with 5366686 stored elements in Compressed Sparse Row format>

In [9]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on validation dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

SVM Accuracy Score ->  95.4045954045954
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      9994
           1       0.96      0.94      0.95     10026

    accuracy                           0.95     20020
   macro avg       0.95      0.95      0.95     20020
weighted avg       0.95      0.95      0.95     20020

CPU times: user 1h 13min 4s, sys: 446 ms, total: 1h 13min 4s
Wall time: 1h 13min 6s


## tfidf with bigrams only

In [10]:
Tfidf_vect = TfidfVectorizer(ngram_range=(2,2))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)
Train_X

<46712x1726723 sparse matrix of type '<class 'numpy.float64'>'
	with 5970559 stored elements in Compressed Sparse Row format>

In [11]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on validation dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

SVM Accuracy Score ->  97.55244755244755
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      9965
           1       0.99      0.96      0.98     10055

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 1h 46min, sys: 921 ms, total: 1h 46min 1s
Wall time: 1h 46min 3s


## tfidf with bigrams and stopwords

In [12]:
Tfidf_vect = TfidfVectorizer(ngram_range=(2,2), stop_words = "english")
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)
Train_X

<46712x2380582 sparse matrix of type '<class 'numpy.float64'>'
	with 3058181 stored elements in Compressed Sparse Row format>

In [13]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on validation dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

SVM Accuracy Score ->  89.67032967032968
              precision    recall  f1-score   support

           0       0.84      0.98      0.91     10123
           1       0.97      0.81      0.89      9897

    accuracy                           0.90     20020
   macro avg       0.91      0.90      0.90     20020
weighted avg       0.91      0.90      0.90     20020

CPU times: user 1h 34min 34s, sys: 472 ms, total: 1h 34min 35s
Wall time: 1h 34min 37s
