In [None]:
import os
# Set CUDA_VISIBLE_DEVICES to an empty string before any NLP library is imported.
# NOTE(review): presumably this hides all GPUs so everything runs on CPU (the stanza
# pipelines below also pass use_gpu=False) -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"]=""

In [None]:
import pandas as pd
import pickle

from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.linear_model import SGDClassifier, LinearRegression
from sklearn import svm

from textstat import textstat
import stanza

from tqdm.notebook import tqdm, trange
import numpy as np
import matplotlib

import spacy
import neuralcoref

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import make_scorer
f1_scorer = make_scorer(f1_score, pos_label="machine")

# Assorted Setup Utilities

Initialize tokenizers and lemmatizers

In [None]:
stanza.download('en') 

In [None]:
nlp_lem = stanza.Pipeline(lang='en', processors='tokenize,lemma', use_gpu=False) # GPU issues in current stanza

In [None]:
const_nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', use_gpu=False)

In [None]:
# Load spaCy's English model and register neuralcoref on the pipeline; the
# coreference feature below reads `doc._.coref_clusters` from documents parsed
# with this pipeline.
nlp_spacy = spacy.load('en')
neuralcoref.add_to_pipe(nlp_spacy)

# Build Lists for Complex Phrasal Features
We obtained these by scraping several websites (see scraping code).  These take the form of special word lists that may occur less frequently in computer-generated text.  We've constructed three phrase datasets for this purpose.

In [None]:
from pathlib import Path

# Cached, already-lemmatised phrase lists (written by the else-branch below).
LEMMA_CACHE_DIR = Path("intermediate_data")
CLICHE_LEMMAS_PATH = LEMMA_CACHE_DIR / "cliche_lemmas.pkl"
ARCHAISM_LEMMAS_PATH = LEMMA_CACHE_DIR / "archaisms_lemmas.pkl"
IDIOM_LEMMAS_PATH = LEMMA_CACHE_DIR / "idiom_lemmas.pkl"


def _read_phrases(path):
    """Return the lines of a phrase-list text file, one phrase per line."""
    with open(path, "r") as f:
        return f.read().splitlines()


def _lemmatize_phrases(phrases):
    """Lower-case each phrase and convert it to a list of lemmas using `nlp_lem`."""
    return [[word.lemma for word in nlp_lem(phrase.lower()).iter_words()] for phrase in phrases]


def _load_pickle(path):
    """Load one cached object. Only use on caches written by this notebook (pickle can execute code)."""
    with open(path, "rb") as f:
        return pickle.load(f)


def _save_pickle(obj, path):
    """Pickle `obj` to `path`."""
    with open(path, "wb") as f:
        pickle.dump(obj, f)


# Load the lemmas if we've already generated them
if (
    CLICHE_LEMMAS_PATH.is_file() and
    ARCHAISM_LEMMAS_PATH.is_file() and
    IDIOM_LEMMAS_PATH.is_file()):
    print("Loading generated lemmas...")

    cliches = _load_pickle(CLICHE_LEMMAS_PATH)
    archaisms = _load_pickle(ARCHAISM_LEMMAS_PATH)
    wiki_english_idioms = _load_pickle(IDIOM_LEMMAS_PATH)
else:
    print("Re-generating lemmas and saving them...")
    # Provided a copy here to make things easier: https://github.com/ecrows/cliche500
    cliches = _read_phrases("./data/cliche500/cliches.txt")
    archaisms = _read_phrases("./data/archaisms/archaisms.txt")
    wiki_english_idioms = _read_phrases("./data/idioms/wiki_english_idioms.txt")
    # TODO: Drop any that contain colons?

    # Spot-check a few raw phrases from each list before lemmatising.
    print(cliches[:5])
    print(archaisms[:5])
    print(wiki_english_idioms[617:622])

    # Convert all phrase features to lemmas.
    cliches = _lemmatize_phrases(cliches)
    archaisms = _lemmatize_phrases(archaisms)
    wiki_english_idioms = _lemmatize_phrases(wiki_english_idioms)

    # Make sure the cache directory exists so the (slow) lemmatisation above is
    # not lost to a FileNotFoundError on a fresh checkout.
    LEMMA_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    _save_pickle(cliches, CLICHE_LEMMAS_PATH)
    _save_pickle(archaisms, ARCHAISM_LEMMAS_PATH)
    _save_pickle(wiki_english_idioms, IDIOM_LEMMAS_PATH)

We could also harvest the Yorkshire dialect corpus and use that as well. However, that feature was the weakest in the statistical paper, the dataset is not available in an easily machine-readable form, and we don't expect Yorkshire dialect to be particularly present in either class, so we omit it for now.
If interested, features might be harvested from https://www.yorkshiredialect.com/words/A.htm

## Load Data

In [None]:
# Load the train and test splits of the machine-generated (medium-345M-k40) and
# human-written (webtext) corpora; each file holds one JSON record per line.
# NOTE(review): the variables say "355m" while the files are named "345M".
gpt2_355m_k40 = pd.read_json("./data/gpt-2-output-dataset/data/medium-345M-k40.train.jsonl", lines=True)
webtext = pd.read_json("./data/gpt-2-output-dataset/data/webtext.train.jsonl", lines=True)
gpt2_355m_k40_test = pd.read_json("./data/gpt-2-output-dataset/data/medium-345M-k40.test.jsonl", lines=True)
webtext_test = pd.read_json("./data/gpt-2-output-dataset/data/webtext.test.jsonl", lines=True)

In [None]:
# Attach the ground-truth label to every frame: GPT-2 output is "machine",
# WebText is "human".
for _frame, _label in (
    (gpt2_355m_k40, "machine"),
    (webtext, "human"),
    (gpt2_355m_k40_test, "machine"),
    (webtext_test, "human"),
):
    _frame['class'] = _label

In [None]:
# Stack machine and human training documents and shuffle them.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); the shuffle is
# seeded like the test-set cells so the notebook is reproducible.
df_train = pd.concat([gpt2_355m_k40, webtext], ignore_index=True).sample(frac=1, random_state=0)
df_train['text_lower'] = df_train['text'].str.lower()

In [None]:
# Draw the 10k-document working subset; seeded so a re-run selects the same rows.
subsample_train = df_train.sample(10000, random_state=0)

In [None]:
# Replace the freshly drawn sample with the cached 3k subset.
# NOTE(review): this CSV is only written further down (after the verb-phrase
# step), so if the file does not exist yet this cell has to be skipped.
subsample_train = pd.read_csv("intermediate_data/3k_of_10k_train.csv")

### Generate Lemmas

In [None]:
# Lemmatise the lower-cased text with the stanza tokenize+lemma pipeline; every
# document becomes a list of lemmas.
text_lemmas = subsample_train["text_lower"].apply(lambda a: [b.lemma for b in nlp_lem(a).iter_words()])
subsample_train["text_lower_lemmas"] = text_lemmas
# NOTE(review): the lemma lists are presumably written to CSV as their string repr,
# so reading this file back will not yield lists (generate_freq_features rejects
# string values) -- confirm before reloading from it.
subsample_train.to_csv("./intermediate_data/10k_subsample_train.csv")

## Generate Frequency Features

For each input document we count the occurrences of every lemma, sort the counts in descending order, and fit a linear regression of log(count) against log(rank). The slope of that line is one feature. We then evaluate the quality of this regression line (its R² and mean squared error), which measures how closely the lemma distribution follows a Zipfian distribution. We would expect this line to fit better if the distribution is more "Zipfian", as seen in human-generated text in past work.

In [None]:
def generate_freq_features(df):
    """
    Compute Zipf-style frequency features for every document in `df`.

    For each entry of df["text_lower_lemmas"] (a list of lemmas) the lemma counts
    are sorted in descending order and a linear regression of log(count) on
    log(rank) is fitted.

    Returns a list with one (slope, r2, mse) tuple per document. Documents with
    fewer than two distinct lemmas yield (0, 0, 0).

    Raises TypeError if an entry is a string instead of a list (e.g. lemma lists
    that were round-tripped through a CSV file).
    """
    freq_features = []

    for lemmas in df["text_lower_lemmas"]:
        if isinstance(lemmas, str):
            raise TypeError("Error: string type, expected list")

        _, counts = np.unique(lemmas, return_counts=True)

        # With zero or one distinct lemma there is no line to fit, and R^2 is
        # undefined (NaN) for a single point, which would poison the scaler and
        # the classifier downstream. Emit a neutral feature row instead.
        if len(counts) < 2:
            freq_features.append((0, 0, 0))
            continue

        log_counts = sorted(np.log(counts), reverse=True)
        log_ranks = np.log(np.arange(1, len(log_counts) + 1)).reshape(-1, 1)
        reg = LinearRegression().fit(log_ranks, log_counts)

        preds = reg.predict(log_ranks)

        slope = reg.coef_[0]
        r2 = r2_score(log_counts, preds)
        mse = mean_squared_error(log_counts, preds)

        freq_features.append((slope, r2, mse))

    return freq_features

In [None]:
train_freq_features = generate_freq_features(subsample_train)

In [None]:
pd.DataFrame(train_freq_features, columns=["Slope", "R2", "MSE"]).to_csv("features/frequency_features_train_10k.csv", index=False)

## Generate Consistency Features

We use a constituency parse tree to identify phrasal verbs (counted via their particle tag, `RP`), and calculate the ratio of phrasal verbs to the number of words. We also count coreference clusters, which reflect text cohesion. A greater number of coreference clusters per token indicates a higher likelihood that the text is human-generated.

### Verb Phrases

In [None]:
def generate_verb_phrase_ratios(df):
    """
    For every document in df['text'], return the number of 'RP'-labelled
    pre-terminals in its constituency parse divided by its word count
    (0 for a document with no words).

    The running list of ratios is re-pickled to
    intermediate_data/latest_ratios_const.pkl after each document so that an
    interrupted run can be resumed.
    """
    ratios = []
    for text in tqdm(df['text'], desc="Generating Verb Phrase Ratios"):
        parsed = const_nlp(text)

        particle_total = sum(
            1
            for sent in parsed.sentences
            for leaf in sent.constituency.yield_preterminals()  # for newer stanza
            if leaf.label == 'RP'
        )

        word_total = parsed.num_words
        ratios.append(0 if word_total == 0 else particle_total / word_total)

        # to allow resuming
        with open("intermediate_data/latest_ratios_const.pkl", "wb") as f:
            pickle.dump(ratios, f)

    return ratios

In [None]:
train_phrasal_ratios = generate_verb_phrase_ratios(subsample_train)

In [None]:
# Keep only the first 3000 documents and cache that subset for later sessions.
subsample_train = subsample_train.head(3000)
subsample_train.to_csv("intermediate_data/3k_of_10k_train.csv")
# The ratios computed above live in `train_phrasal_ratios`; `phrasal_ratios` only
# exists inside generate_verb_phrase_ratios and is undefined at notebook level.
pd.DataFrame(train_phrasal_ratios).to_csv("features/verb_phrase_ratios_train_3048_of_10k.csv", index=False)

### Coreference Resolution Relationships

In [None]:
def generate_coref_ratios(df):
    """
    For every document in df['text'], return the number of coreference clusters
    found by the spaCy/neuralcoref pipeline divided by the number of tokens.

    Documents that parse to zero tokens (e.g. empty strings) get a ratio of 0
    instead of raising ZeroDivisionError, matching how the verb-phrase and
    wordlist features treat empty documents.
    """
    coref_ratios = []
    for s in tqdm(df['text'], desc="Generating Coreference Ratios"):
        doc = nlp_spacy(s)
        n_tokens = len(doc)
        if n_tokens == 0:
            coref_ratios.append(0)
        else:
            coref_ratios.append(len(doc._.coref_clusters) / n_tokens)
    return coref_ratios

In [None]:
train_coref_ratios = generate_coref_ratios(subsample_train)

In [None]:
# `coref_ratios` is local to generate_coref_ratios; the notebook-level result is
# `train_coref_ratios`.
pd.DataFrame(train_coref_ratios).to_csv("features/coreference_ratios_train_3k_of_10k.csv", index=False)

In [None]:
subsample_train.to_csv("features/subsample_train.csv")

## Generate Fluency Features

Also create features built from Gunning-Fog index and Flesch reading ease tests, as in “Fake News Detection using LDA Topic Modelling and K-Nearest Neighbor” at CSoNET 2021.

We leverage the pip package "textstat" for this.

In [None]:
def generate_fluency_features(df):
    """
    Score every document in df['text'] with textstat.

    Returns a pair of lists, (gunning_fog_scores, flesch_reading_ease_scores),
    each holding one value per document in input order.
    """
    scored = [
        (textstat.gunning_fog(doc_text), textstat.flesch_reading_ease(doc_text))
        for doc_text in tqdm(df['text'], desc="Generating Fluency Features")
    ]
    gunning_fog_scores = [fog for fog, _ in scored]
    flesch_reading_ease_scores = [ease for _, ease in scored]
    return gunning_fog_scores, flesch_reading_ease_scores

In [None]:
train_gunning_fog_scores, train_flesch_reading_ease_scores = generate_fluency_features(subsample_train)

In [None]:
# The notebook-level results are the `train_`-prefixed lists returned above; the
# unprefixed names only exist inside generate_fluency_features.
pd.DataFrame(train_gunning_fog_scores).to_csv("features/3k_of_10k_gunning_fog_scores_train.csv", index=False)
pd.DataFrame(train_flesch_reading_ease_scores).to_csv("features/3k_of_10k_flesch_reading_ease_scores_train.csv", index=False)

## Generate Complex Phrasal Features

In [None]:
def array_match_count(an, search):
    """
    Count how many times the sequence `search` occurs as a contiguous run inside
    the list `an`. Overlapping occurrences are counted; an empty `search`
    counts as zero matches.
    """
    needle_len = len(search)
    if not needle_len:
        return 0

    hits = 0
    try:
        cursor = 0
        while cursor < len(an):
            # Jump straight to the next candidate start; list.index raises
            # ValueError once the first element no longer occurs, ending the scan.
            start = an.index(search[0], cursor)
            window = an[start:start + needle_len]
            hits += 1 if window == search else 0
            cursor = start + 1
    except ValueError:
        pass

    return hits

In [None]:
def get_rate_of_wordlist(df, wordlist):
    """
    For every document in df['text_lower_lemmas'], count the occurrences of all
    phrases in `wordlist` (each phrase a list of lemmas) and divide that count by
    the number of lemmas in the document.

    Returns one ratio per document; documents with no lemmas get 0.
    """
    ratios = []
    for lemmas in tqdm(df['text_lower_lemmas'], desc="Generating Complex Phrasal Features"):
        hits = sum(array_match_count(lemmas, phrase) for phrase in wordlist)
        ratios.append(hits / len(lemmas) if len(lemmas) > 0 else 0)

    return ratios

In [None]:
import concurrent.futures # multithreading for speed here

def get_all_wordlist_ratios(passed_df):
    """
    Compute the archaism, idiom and cliche rates for every document of
    `passed_df`, submitting one get_rate_of_wordlist job per wordlist to a
    thread pool.

    Returns a list with one (archaisms, idioms, cliches) tuple per document.
    """
    wordlists = (archaisms, wiki_english_idioms, cliches)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = [
            executor.submit(get_rate_of_wordlist, passed_df, wordlist)
            for wordlist in wordlists
        ]
        per_list_ratios = [job.result() for job in pending]

    return list(zip(*per_list_ratios))

In [None]:
train_ratios = get_all_wordlist_ratios(subsample_train)

In [None]:
train_wordlist_ratios = train_ratios

In [None]:
pd.DataFrame(train_ratios, columns=["archaisms", "idioms", "cliches"]).to_csv("features/3k_of_10k_phrasal_wordlist_ratios.csv", index=False)

# Train a Model

In [None]:
# Assemble one feature vector per training document, in the column order used
# when the matrix is written out: frequency (slope, R2, MSE), verb phrase,
# coreference, wordlists (archaisms, idioms, cliches), Gunning-Fog, Flesch.
features = []

for i in range(0,subsample_train.shape[0]):
    vector = []
    vector.extend(train_freq_features[i])

    vector.append(train_phrasal_ratios[i])
    vector.append(train_coref_ratios[i])

    vector.extend(train_wordlist_ratios[i])

    # Use the notebook-level fluency results; the unprefixed names exist only
    # inside generate_fluency_features.
    vector.append(train_gunning_fog_scores[i])
    vector.append(train_flesch_reading_ease_scores[i])

    features.append(vector)

In [None]:
pd.DataFrame(features, columns=["Slope", "R2", "MSE", "Verb Phrase", "Coreference", "Archaisms", "Idioms", "Cliches", "Gunning-Fog", "Flesch"]).to_csv("features/3k_of_10k_combined_features.csv", index=False)
features = pd.read_csv("features/3k_of_10k_combined_features.csv")

In [None]:
# Standardise the training features. The scaler is fitted on the training matrix
# only and is the same object later applied to every evaluation set.
x = features
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x) # Have to re-use this scaler later to prevent any data leakage

In [None]:
# Quick linear-SVM baseline trained with SGD (hinge loss, L2 penalty); `clf` is
# replaced by the grid-searched SVC further down.
# `y` must be defined here: it is otherwise only assigned in a later cell, so
# this cell would raise NameError on a fresh kernel.
y = subsample_train['class']
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=10000, random_state=0)
clf.fit(x, y)

In [None]:
def get_best_c(feats, labels=None):
    """
    Grid-search the C parameter of a linear SVC with 5-fold stratified
    cross-validation, scored by F1 on the "machine" class.

    feats:  feature matrix to fit on.
    labels: class labels aligned with `feats`; defaults to
            subsample_train['class'] (the previous, implicit behaviour).

    Prints and returns the best C value found.
    """
    print("Hyperparameter search")
    param_grid = [
      {'C': [1, 10, 100, 1000]},
     ]

    if labels is None:
        labels = subsample_train['class']

    svc = svm.SVC(kernel='linear', random_state=0)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=0)
    search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring=f1_scorer, cv=cv, verbose=10)

    search.fit(feats, labels)
    print(search.best_params_)
    return search.best_params_['C']

In [None]:
# This comes out to C=100
best_stat_c = get_best_c(x) # 100

In [None]:
y = subsample_train['class']
# C=100 is hard-coded to the value reported by the grid search above
# (`best_stat_c`); update it by hand if the search result changes.
clf = svm.SVC(C=100, kernel='linear', probability=True, random_state=0)
clf.fit(x, y)

In [None]:
train_results = clf.predict(x)
print(f"Train set accuracy: {accuracy_score(subsample_train['class'], train_results):.4f}")
print(f"Train set F1 score: {f1_score(subsample_train['class'], train_results, pos_label='machine'):.4f}")

In [None]:
with open('models/linear_svm_3k_of_10k_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)

In [None]:
with open('models/linear_svm_3k_of_10k_proba.pickle', 'wb') as f:
    pickle.dump(clf, f, pickle.HIGHEST_PROTOCOL)

# Evaluate Model Against GPT-2 355M

In [None]:
EXAMPLES_OF_EACH = 500 # How many human and machine examples to put into test set

In [None]:
# Balanced test set: EXAMPLES_OF_EACH machine and human documents, shuffled.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_test = pd.concat(
    [
        gpt2_355m_k40_test.sample(EXAMPLES_OF_EACH, random_state=0),
        webtext_test.sample(EXAMPLES_OF_EACH, random_state=0),
    ],
    ignore_index=True,
).sample(frac=1, random_state=0)
df_test['text_lower'] = df_test['text'].str.lower()
df_test['class'].value_counts()

In [None]:
# `subsample_test` was never assigned before this cell (it was only read back
# from the pickle written below), so start from the freshly built test frame.
subsample_test = df_test.copy()
subsample_test["text_lower_lemmas"] = subsample_test["text_lower"].apply(lambda a: [b.lemma for b in nlp_lem(a).iter_words()])
subsample_test.to_pickle("./intermediate_data/1k_subsample_test.pkl")

In [None]:
subsample_test = pd.read_pickle("./intermediate_data/1k_subsample_test.pkl")

In [None]:
# Generate all features for test set
test_freq_features = generate_freq_features(subsample_test)
test_verb_phrase = generate_verb_phrase_ratios(subsample_test)
test_coref_ratios = generate_coref_ratios(subsample_test)
test_phrasal_ratios = get_all_wordlist_ratios(subsample_test)
test_gf_scores, test_fre_scores = generate_fluency_features(subsample_test)

In [None]:
# Assemble one feature vector per test document: frequency features, verb-phrase
# ratio, coreference ratio, wordlist ratios, then the two fluency scores.
test_features = [
    [
        *test_freq_features[i],
        test_verb_phrase[i],
        test_coref_ratios[i],
        *test_phrasal_ratios[i],
        test_gf_scores[i],
        test_fre_scores[i],
    ]
    for i in range(0, subsample_test.shape[0])
]

In [None]:
feature_columns = ["Slope", "R2", "MSE", "Verb Phrase", "Coreference", "Archaisms", "Idioms", "Cliches", "Gunning-Fog", "Flesch"]

In [None]:
pd.DataFrame(test_features, columns=feature_columns).to_csv("features/1k_combined_features_test.csv", index=False)
test_features = pd.read_csv("features/1k_combined_features_test.csv")

In [None]:
# Scaling using previous scaler from training
x_test = test_features
x_test = scaler.transform(x_test)

In [None]:
test_results = clf.predict(x_test)
print(f"Test set accuracy: {accuracy_score(subsample_test['class'], test_results):.4f}")
print(f"Test set F1 score: {f1_score(subsample_test['class'], test_results, pos_label='machine'):.4f}")

Plot relative feature importance of each feature for classifier

In [None]:
pd.Series(abs(clf.coef_[0]), index=feature_columns).nlargest(10).plot(kind='barh')

Save model to disk

In [None]:
with open("results/svm-sgd-complex-phrase.pickle", "wb") as f:
    pickle.dump(clf, f)

In [None]:
with open("results/svm-sgd-complex-phrase.pickle", "rb") as f:
    clf = pickle.load(f)

# Evaluate against GPT-2 1.5B and GPT3

In [None]:
gpt2_1542m_k40_test = pd.read_json("./data/gpt-2-output-dataset/data/xl-1542M-k40.test.jsonl", lines=True)
webtext_test = pd.read_json("./data/gpt-2-output-dataset/data/webtext.test.jsonl", lines=True)

In [None]:
gpt3 = pd.read_json("./data/gpt3_175b_samples.jsonl", lines=True)
# Fixing the dataset since <|endoftext|> is included erroneously.  OpenAI should probably accept pull requests for this sort of thing...
# Every record in column 0 is split on the marker and each piece becomes its own row.
gpt3_rebuild = [
    piece
    for sample in gpt3[0]
    for piece in sample.split("<|endoftext|>")
]
gpt3_df = pd.DataFrame(gpt3_rebuild, columns=["text"])

In [None]:
gpt2_1542m_k40_test['class'] = "machine"
webtext_test['class'] = "human"
gpt3_df['class'] = "machine"

In [None]:
# Balanced GPT-2 1.5B evaluation set: EXAMPLES_OF_EACH machine and human documents.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
gpt2_1532m_eval = pd.concat(
    [
        gpt2_1542m_k40_test.sample(EXAMPLES_OF_EACH, random_state=0),
        webtext_test.sample(EXAMPLES_OF_EACH, random_state=0),
    ],
    ignore_index=True,
)
gpt2_1532m_eval = gpt2_1532m_eval.sample(frac=1, random_state=0)
gpt2_1532m_eval['class'].value_counts()

In [None]:
gpt2_1532m_eval['text_lower'] = gpt2_1532m_eval['text'].str.lower()
gpt2_1532m_eval["text_lower_lemmas"] = gpt2_1532m_eval["text_lower"].apply(lambda a: [b.lemma for b in nlp_lem(a).iter_words()])

In [None]:
gpt2_1532m_eval.to_pickle("./intermediate_data/1k_subsample_gpt2_1532m_test.pkl")

In [None]:
gpt2_1532m_eval = pd.read_pickle("./intermediate_data/1k_subsample_gpt2_1532m_test.pkl")

In [None]:
# Balanced GPT-3 evaluation set: EXAMPLES_OF_EACH machine and human documents.
# The earlier unseeded assignment that used every GPT-3 row was dead code (it was
# overwritten on the next line) and has been dropped. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
gpt3_eval = pd.concat(
    [
        gpt3_df.sample(EXAMPLES_OF_EACH, random_state=0),
        webtext_test.sample(EXAMPLES_OF_EACH, random_state=0),
    ],
    ignore_index=True,
)
gpt3_eval = gpt3_eval.sample(frac=1, random_state=0)
gpt3_eval['class'].value_counts()

In [None]:
gpt3_eval['text_lower'] = gpt3_eval['text'].str.lower()
gpt3_eval["text_lower_lemmas"] = gpt3_eval["text_lower"].apply(lambda a: [b.lemma for b in nlp_lem(a).iter_words()])

In [None]:
gpt3_eval.to_pickle("./intermediate_data/1k_subsample_gpt3_test.pkl")

In [None]:
gpt3_eval = pd.read_pickle("./intermediate_data/1k_subsample_gpt3_test.pkl")

In [None]:
def generate_all_features(df, vp_feats=None):
    """
    Build the combined feature vectors for every row of `df`.

    df:       frame with 'text' and 'text_lower_lemmas' columns.
    vp_feats: optional precomputed verb-phrase ratios, one per row of `df`;
              when None they are computed with generate_verb_phrase_ratios.

    Returns a list with one vector per row, ordered as: frequency features,
    verb-phrase ratio, coreference ratio, wordlist ratios, Gunning-Fog score,
    Flesch reading-ease score.
    """
    if vp_feats is not None: # had to do this piecemeal on CPU to avoid stanza crashes
        _verb_phrase = vp_feats
    else:
        _verb_phrase = generate_verb_phrase_ratios(df)
    _freq_features = generate_freq_features(df)
    _coref_ratios = generate_coref_ratios(df)
    _phrasal_ratios = get_all_wordlist_ratios(df)
    _gf_scores, _fre_scores = generate_fluency_features(df)

    all_features = []
    # Iterate over the rows of the frame that was passed in, not the global
    # `subsample_test`, so frames of any size are handled correctly.
    for i in range(0, df.shape[0]):
        vector = []
        vector.extend(_freq_features[i])

        vector.append(_verb_phrase[i])
        vector.append(_coref_ratios[i])

        vector.extend(_phrasal_ratios[i])

        vector.append(_gf_scores[i])
        vector.append(_fre_scores[i])

        all_features.append(vector)

    return all_features

In [None]:
with open("intermediate_data/latest_ratios_const-to463.pkl", "rb") as f:
    gpt2_1532m_features_to_463 = pickle.load(f)
    
with open("intermediate_data/latest_ratios_const-463-1000.pkl", "rb") as f:
    gpt2_1532m_features_463_1000 = pickle.load(f)
gpt2_1532m_features_to_463.extend(gpt2_1532m_features_463_1000)

Combine and output GPT2_1532M features
pd.DataFrame(gpt2_1532m_features_to_463).to_csv("intermediate_data/gpt2_1532m_test_vp_complete.csv", index=False)
gpt2_1532m_features_to_463 = pd.read_csv("intermediate_data/gpt2_1532m_test_vp_complete.csv")

In [None]:
# Note: Constituency parsing seems to be allocating tensors on CPU sporadically, inhibiting performance
gpt2_1532m_features = generate_all_features(gpt2_1532m_eval, vp_feats = gpt2_1532m_features_to_463["0"].to_list())

In [None]:
pd.DataFrame(gpt2_1532m_features, columns=feature_columns).to_csv("features/1k_combined_features_gpt2_1532m_test.csv", index=False)

In [None]:
gpt2_1532m_features = pd.read_csv("features/1k_combined_features_gpt2_1532m_test.csv")

In [None]:
# Scaling using previous scaler from training
x_test_gpt2_1532m = gpt2_1532m_features
x_test_gpt2_1532m = scaler.transform(x_test_gpt2_1532m)
test_gpt2_1532m_results = clf.predict(x_test_gpt2_1532m)

In [None]:
# Pass (y_true, y_pred) in the order scikit-learn documents, matching the train
# and test evaluation cells above.
print(f"GPT2-1.5B Test set accuracy: {accuracy_score(gpt2_1532m_eval['class'], test_gpt2_1532m_results):.4f}")
print(f"GPT2-1.5B Test set F1 score: {f1_score(gpt2_1532m_eval['class'], test_gpt2_1532m_results, pos_label='machine'):.4f}")

In [None]:
with open("intermediate_data/latest_ratios_const-gpt3-298.pkl", "rb") as f:
    gpt3_vp_to_298 = pickle.load(f)
    
with open("intermediate_data/latest_ratios_const-gpt3-298to.pkl", "rb") as f:
    gpt3_vp_298_on = pickle.load(f)
    
with open("intermediate_data/latest_ratios_const-gpt3-fin.pkl", "rb") as f:
    gpt3_vp_fin = pickle.load(f)

In [None]:
gpt3_vp_to_298.extend(gpt3_vp_298_on)
gpt3_vp_to_298.extend(gpt3_vp_fin)

In [None]:
with open("intermediate_data/latest_vp_ratios_const_gpt3_combined.pkl", "wb") as f:
    pickle.dump(gpt3_vp_to_298, f)

In [None]:
gpt3_eval

In [None]:
gpt3_features = generate_all_features(gpt3_eval, vp_feats=gpt3_vp_to_298)

In [None]:
pd.DataFrame(gpt3_features, columns=feature_columns).to_csv("features/1k_combined_features_gpt3_test.csv", index=False)

In [None]:
# Scaling using previous scaler from training
x_test_gpt3 = gpt3_features
x_test_gpt3 = scaler.transform(x_test_gpt3)
test_gpt3_results = clf.predict(x_test_gpt3)

In [None]:
# Pass (y_true, y_pred) in the order scikit-learn documents, matching the train
# and test evaluation cells above.
print(f"GPT3 Test set accuracy: {accuracy_score(gpt3_eval['class'], test_gpt3_results):.4f}")
print(f"GPT3 Test set F1 score: {f1_score(gpt3_eval['class'], test_gpt3_results, pos_label='machine'):.4f}")