# Supplemental Code

## Preliminary calculation and comparison of features between AI-generated and human-written text

In [25]:
import openai
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
from collections import Counter
import numpy as np
from nltk.tree import Tree
import spacy


# Initialize spaCy English model
nlp_spacy = spacy.load('en_core_web_sm')

In [39]:
# Function to extract lexical features
def extract_lexical_features(text):
    # ... your extract_lexical_features function implementation ...
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    
    total_word_count = len(words)
    avg_word_length = sum(len(word) for word in words) / len(words)
    avg_sentence_length = sum(len(sentence.split()) for sentence in sentences) / len(sentences)
    word_counts = Counter(words)
    TTR = len(word_counts) / len(words)
    stop_words = set(stopwords.words('english'))
    stop_word_count = sum(1 for word in words if word.lower() in stop_words)
    unique_word_count = sum(1 for _, count in word_counts.items() if count == 1)
    word_freq = word_counts
    bigram_freq = Counter(ngrams(words, 2))
    trigram_freq = Counter(ngrams(words, 3))
    rare_word_count = sum(1 for _, count in word_counts.items() if count == 1)

    return {
        'total_word_count': total_word_count,
        'avg_word_length': avg_word_length,
        'avg_sentence_length': avg_sentence_length,
        'TTR': TTR,
        'stop_word_count': stop_word_count,
        'unique_word_count': unique_word_count,
        'word_freq': word_freq,
        'bigram_freq': bigram_freq,
        'trigram_freq': trigram_freq,
        'rare_word_count': rare_word_count
    }

#load data from excel file and save as list
merged_df = pd.read_excel('../cleanData/processedAsap.xlsx')
all_essays = merged_df['essay'].tolist()

#extract ai-specific dataset
ai_generated_df = merged_df.loc[(merged_df["ai_generated"] == 1), :]
ai_generated_essays = ai_generated_df['essay'].tolist()

#extract human-specific dataset
human_written_df = merged_df.loc[(merged_df["ai_generated"] == 0), :]
human_written_essays = human_written_df['essay'].tolist()

# Extract lexical features from AI-generated and human-written essays
all_lexical_features = [extract_lexical_features(essay) for essay in all_essays]
ai_generated_features = [extract_lexical_features(essay) for essay in ai_generated_essays]
human_written_features = [extract_lexical_features(essay) for essay in human_written_essays]


# Calculate average values of some lexical features for both AI-generated and human-written essays
def average_feature_value(features, feature_key):
    return sum(feature[feature_key] for feature in features) / len(features)

ai_avg_word_length = average_feature_value(ai_generated_features, 'avg_word_length')
ai_avg_TTR = average_feature_value(ai_generated_features, 'TTR')
ai_avg_stop_word_count = average_feature_value(ai_generated_features, 'stop_word_count')

ai_avg_sentence_length = average_feature_value(ai_generated_features, 'avg_sentence_length')
human_avg_sentence_length = average_feature_value(human_written_features, 'avg_sentence_length')
human_avg_word_length = average_feature_value(human_written_features, 'avg_word_length')
human_avg_TTR = average_feature_value(human_written_features, 'TTR')
human_avg_stop_word_count = average_feature_value(human_written_features, 'stop_word_count')

# Calculate average values of the total word count for both AI-generated and human-written essays
ai_avg_total_word_count = average_feature_value(ai_generated_features, 'total_word_count')
human_avg_total_word_count = average_feature_value(human_written_features, 'total_word_count')


In [40]:
pd.concat(merged_df, pd.DataFrame(all_lexical_features)).to_excel("../cleanData/Feature Extraction/")

Unnamed: 0,total_word_count,avg_word_length,avg_sentence_length,TTR,stop_word_count,unique_word_count,word_freq,bigram_freq,trigram_freq,rare_word_count
0,386,3.984456,21.125000,0.468912,176,120,"{'Dear': 1, 'local': 2, 'newspaper': 1, ',': 1...","{('Dear', 'local'): 1, ('local', 'newspaper'):...","{('Dear', 'local', 'newspaper'): 1, ('local', ...",120
1,464,4.030172,20.950000,0.450431,195,127,"{'Dear': 1, '@': 10, 'CAPS1': 1, 'CAPS2': 1, '...","{('Dear', '@'): 1, ('@', 'CAPS1'): 1, ('CAPS1'...","{('Dear', '@', 'CAPS1'): 1, ('@', 'CAPS1', '@'...",127
2,313,4.035144,19.928571,0.514377,143,111,"{'Dear': 1, ',': 9, '@': 7, 'CAPS1': 1, 'CAPS2...","{('Dear', ','): 1, (',', '@'): 2, ('@', 'CAPS1...","{('Dear', ',', '@'): 1, (',', '@', 'CAPS1'): 1...",111
3,611,4.328969,19.407407,0.436989,223,182,"{'Dear': 1, 'Local': 1, 'Newspaper': 3, ',': 1...","{('Dear', 'Local'): 1, ('Local', 'Newspaper'):...","{('Dear', 'Local', 'Newspaper'): 1, ('Local', ...",182
4,517,4.071567,15.500000,0.408124,241,125,"{'Dear': 1, '@': 4, 'LOCATION1': 1, ',': 13, '...","{('Dear', '@'): 1, ('@', 'LOCATION1'): 1, ('LO...","{('Dear', '@', 'LOCATION1'): 1, ('@', 'LOCATIO...",125
...,...,...,...,...,...,...,...,...,...,...
24412,274,4.664234,20.166667,0.492701,119,90,"{'Dear': 1, 'Editor': 1, ',': 16, 'As': 1, 'a'...","{('Dear', 'Editor'): 1, ('Editor', ','): 1, ('...","{('Dear', 'Editor', ','): 1, ('Editor', ',', '...",90
24413,247,4.619433,18.000000,0.603239,101,114,"{'Dear': 1, 'Editor': 1, ',': 14, 'As': 1, 'a'...","{('Dear', 'Editor'): 1, ('Editor', ','): 1, ('...","{('Dear', 'Editor', ','): 1, ('Editor', ',', '...",114
24414,216,4.717593,15.583333,0.601852,77,100,"{'Dear': 1, 'Editor': 1, ',': 12, 'I': 2, 'am'...","{('Dear', 'Editor'): 1, ('Editor', ','): 1, ('...","{('Dear', 'Editor', ','): 1, ('Editor', ',', '...",100
24415,272,4.977941,19.916667,0.566176,106,110,"{'To': 2, 'the': 9, 'Editor': 1, ':': 1, 'As':...","{('To', 'the'): 1, ('the', 'Editor'): 1, ('Edi...","{('To', 'the', 'Editor'): 1, ('the', 'Editor',...",110


In [16]:

# Function to extract syntactic features
def extract_syntactic_features(text):
    # ... your extract_syntactic_features function implementation ...
    doc = nlp_spacy(text)

    # Calculate average sentence length
    sentence_lengths = [len(sent) for sent in doc.sents]
    avg_sentence_length = np.mean(sentence_lengths)

    # Calculate parse tree depth
    def calc_tree_depth(sent):
        root = [token for token in sent if token.head == token][0]
        return max([len(list(token.ancestors)) for token in sent])

    tree_depths = [calc_tree_depth(sent) for sent in doc.sents]
    avg_parse_tree_depth = np.mean(tree_depths)
    parse_tree_depth_variation = np.std(tree_depths)

    return {
        'avg_sentence_length': avg_sentence_length,
        'avg_parse_tree_depth': avg_parse_tree_depth,
        'parse_tree_depth_variation': parse_tree_depth_variation,
    }


# Extract syntactic features from AI-generated and human-written essays
all_syntactic_features = [extract_syntactic_features(essay) for essay in all_essays]
ai_generated_syntactic_features = [extract_syntactic_features(essay) for essay in ai_generated_essays]
human_written_syntactic_features = [extract_syntactic_features(essay) for essay in human_written_essays]

# Compare the syntactic features
ai_avg_sentence_length = np.mean([features['avg_sentence_length'] for features in ai_generated_syntactic_features])
human_avg_sentence_length = np.mean([features['avg_sentence_length'] for features in human_written_syntactic_features])

ai_avg_parse_tree_depth = np.mean([features['avg_parse_tree_depth'] for features in ai_generated_syntactic_features])
human_avg_parse_tree_depth = np.mean([features['avg_parse_tree_depth'] for features in human_written_syntactic_features])

ai_parse_tree_depth_variation = np.mean([features['parse_tree_depth_variation'] for features in ai_generated_syntactic_features])
human_parse_tree_depth_variation = np.mean([features['parse_tree_depth_variation'] for features in human_written_syntactic_features])


In [17]:
pd.DataFrame(all_syntactic_features)

Unnamed: 0,avg_sentence_length,avg_parse_tree_depth,parse_tree_depth_variation
0,24.687500,5.250000,2.410913
1,19.826087,5.565217,2.990217
2,21.857143,5.214286,1.779991
3,21.333333,4.925926,2.017074
4,17.266667,4.833333,1.593389
...,...,...,...
24412,24.250000,6.583333,1.255543
24413,21.166667,4.916667,1.605113
24414,18.916667,5.500000,2.101587
24415,23.750000,5.416667,1.656217


In [18]:
# Combine lexical and syntactic features
def combined_features(text):
    lexical = extract_lexical_features(text)
    syntactic = extract_syntactic_features(text)
    return {**lexical, **syntactic}

# Extract combined features for AI-generated and human-written essays
all_combined_features = [combined_features(essay) for essay in all_essays]
ai_generated_combined_features = [combined_features(essay) for essay in ai_generated_essays]
human_written_combined_features = [combined_features(essay) for essay in human_written_essays]

# Create a DataFrame for AI-generated essays
ai_generated_df = pd.DataFrame(ai_generated_combined_features)
ai_generated_df['type'] = 'AI-generated'

# Create a DataFrame for human-written essays
human_written_df = pd.DataFrame(human_written_combined_features)
human_written_df['type'] = 'Human-written'

# Calculate the average values of features for both AI-generated and human-written essays
def average_feature_value(features, feature_key):
    return sum(feature[feature_key] for feature in features) / len(features)

# Define a list of feature keys to extract from the combined features
feature_keys = [
    'total_word_count',
    'avg_word_length',
    'avg_sentence_length',
    'TTR',
    'stop_word_count',
    'unique_word_count',
    'rare_word_count',
    'avg_parse_tree_depth',
    'parse_tree_depth_variation'
]

# Calculate the average values for each feature
ai_generated_avgs = [average_feature_value(ai_generated_combined_features, key) for key in feature_keys]
human_written_avgs = [average_feature_value(human_written_combined_features, key) for key in feature_keys]

In [19]:
#Stylistic Features
def extract_stylistic_features(text):
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)
    
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    pos_tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    
    num_adjectives = sum(sum(1 for word, pos in sentence if pos.startswith('JJ')) for sentence in pos_tagged_sentences)
    num_adverbs = sum(sum(1 for word, pos in sentence if pos.startswith('RB')) for sentence in pos_tagged_sentences)
    num_verbs = sum(sum(1 for word, pos in sentence if pos.startswith('VB')) for sentence in pos_tagged_sentences)
    num_nouns = sum(sum(1 for word, pos in sentence if pos.startswith('NN')) for sentence in pos_tagged_sentences)

    avg_adjectives_per_sentence = num_adjectives / num_sentences
    avg_adverbs_per_sentence = num_adverbs / num_sentences
    avg_verbs_per_sentence = num_verbs / num_sentences
    avg_nouns_per_sentence = num_nouns / num_sentences
    
    return {
        'avg_adjectives_per_sentence': avg_adjectives_per_sentence,
        'avg_adverbs_per_sentence': avg_adverbs_per_sentence,
        'avg_verbs_per_sentence': avg_verbs_per_sentence,
        'avg_nouns_per_sentence': avg_nouns_per_sentence,
    }

# Extract stylistic features from AI-generated and human-written essays
all_stylistic_features = [extract_stylistic_features(essay) for essay in all_essays]
ai_generated_stylistic_features = [extract_stylistic_features(essay) for essay in ai_generated_essays]
human_written_stylistic_features = [extract_stylistic_features(essay) for essay in human_written_essays]

# Calculate average values of stylistic features for both AI-generated and human-written essays
ai_avg_adjectives_per_sentence = average_feature_value(ai_generated_stylistic_features, 'avg_adjectives_per_sentence')
ai_avg_adverbs_per_sentence = average_feature_value(ai_generated_stylistic_features, 'avg_adverbs_per_sentence')
ai_avg_verbs_per_sentence = average_feature_value(ai_generated_stylistic_features, 'avg_verbs_per_sentence')
ai_avg_nouns_per_sentence = average_feature_value(ai_generated_stylistic_features, 'avg_nouns_per_sentence')

human_avg_adjectives_per_sentence = average_feature_value(human_written_stylistic_features, 'avg_adjectives_per_sentence')
human_avg_adverbs_per_sentence = average_feature_value(human_written_stylistic_features, 'avg_adverbs_per_sentence')
human_avg_verbs_per_sentence = average_feature_value(human_written_stylistic_features, 'avg_verbs_per_sentence')
human_avg_nouns_per_sentence = average_feature_value(human_written_stylistic_features, 'avg_nouns_per_sentence')

import string

def count_punctuation(text):
    punctuation_count = sum(1 for char in text if char in string.punctuation)
    return punctuation_count

def average_value(values):
    return sum(values) / len(values)

all_avg_punctuation = average_value([count_punctuation(essay) for essay in all_essays])
ai_avg_punctuation = average_value([count_punctuation(essay) for essay in ai_generated_essays])
human_avg_punctuation = average_value([count_punctuation(essay) for essay in human_written_essays])


# ai_avg_punctuation = average_feature_value([(count_punctuation(essay)) for essay in ai_generated_essays])
# human_avg_punctuation = average_feature_value([(count_punctuation(essay)) for essay in human_written_essays])

In [None]:
all_syntactic_features, all_stylistic_features

In [41]:
#merge the extracted features with the original data
features_df = pd.concat([merged_df, pd.DataFrame(all_lexical_features), pd.DataFrame(all_syntactic_features), pd.DataFrame(all_stylistic_features)], axis = 1)

#save as excel document
features_df.to_excel("../cleanData/featuresAsap.xlsx")

Unnamed: 0.1,Unnamed: 0,essay_id,essay_set,essay,ai_llm,ai_generated,word_tokens,sentence_tokens,lemmatized_word_tokens,avg_sentence_length,avg_parse_tree_depth,parse_tree_depth_variation,avg_adjectives_per_sentence,avg_adverbs_per_sentence,avg_verbs_per_sentence,avg_nouns_per_sentence
0,0,1,1,"Dear local newspaper, I think effects computer...",human-generated,0,"['dear', 'local', 'newspaper', 'think', 'effec...","['dear local newspaper, i think effects comput...","['dear', 'local', 'newspaper', 'think', 'effec...",24.687500,5.250000,2.410913,1.250000,1.437500,4.250000,5.250000
1,1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",human-generated,0,"['dear', 'caps1', 'caps2', 'believe', 'using',...","['dear @caps1 @caps2, i believe that using com...","['dear', 'caps1', 'caps2', 'believe', 'using',...",19.826087,5.565217,2.990217,1.050000,0.900000,4.250000,5.700000
2,2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",human-generated,0,"['dear', 'caps1', 'caps2', 'caps3', 'people', ...","['dear, @caps1 @caps2 @caps3 more and more peo...","['dear', 'caps1', 'caps2', 'caps3', 'people', ...",21.857143,5.214286,1.779991,1.357143,1.071429,3.785714,6.142857
3,3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",human-generated,0,"['dear', 'local', 'newspaper', 'caps1', 'found...","['dear local newspaper, @caps1 i have found th...","['dear', 'local', 'newspaper', 'caps1', 'found...",21.333333,4.925926,2.017074,1.925926,1.074074,3.629630,7.407407
4,4,5,1,"Dear @LOCATION1, I know having computers has a...",human-generated,0,"['dear', 'location1', 'know', 'computers', 'po...","['dear @location1, i know having computers has...","['dear', 'location1', 'know', 'computer', 'pos...",17.266667,4.833333,1.593389,1.000000,1.266667,2.900000,3.866667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24412,24412,213,1,"\n\nDear Editor, \n\nAs a citizen of this comm...",text-davinci-003,1,"['dear', 'editor', 'citizen', 'community', 'fe...","[""\n\ndear editor, \n\nas a citizen of this co...","['dear', 'editor', 'citizen', 'community', 'fe...",24.250000,6.583333,1.255543,1.833333,1.166667,3.500000,5.166667
24413,24413,214,1,"\n\n\nDear Editor,\n\nAs a concerned citizen a...",text-davinci-003,1,"['dear', 'editor', 'concerned', 'citizen', 'lo...","['\n\n\ndear editor,\n\nas a concerned citizen...","['dear', 'editor', 'concerned', 'citizen', 'lo...",21.166667,4.916667,1.605113,2.250000,0.750000,2.416667,5.833333
24414,24414,215,1,"\n\nDear Editor,\n\nI am writing to share my o...",text-davinci-003,1,"['dear', 'editor', 'writing', 'share', 'opinio...","['\n\ndear editor,\n\ni am writing to share my...","['dear', 'editor', 'writing', 'share', 'opinio...",18.916667,5.500000,2.101587,1.500000,0.583333,3.166667,5.000000
24415,24415,216,1,\n\nTo the Editor: \n\nAs our world has become...,text-davinci-003,1,"['editor', 'world', 'become', 'increasingly', ...",['\n\nto the editor: \n\nas our world has beco...,"['editor', 'world', 'become', 'increasingly', ...",23.750000,5.416667,1.656217,2.250000,1.166667,4.500000,5.750000


In [7]:
comparison_data = {
    'Feature': ['Total Word Count', 
                'Average Word Length', 
                'Average Sentence Length', 
                'Type-Token Ratio', 
                'Stop Word Count', 
                'Average Parse Tree Depth', 
                'Parse Tree Depth Variation', 
                'Average Adjectives per Sentence', 
                'Average Adverbs per Sentence', 
                'Average Verbs per Sentence', 
                'Average Nouns per Sentence', 
                'Average Punctuation Marks'],
    'AI-Generated': [ai_avg_total_word_count, 
                     ai_avg_word_length, ai_avg_sentence_length, 
                     ai_avg_TTR, ai_avg_stop_word_count, 
                     ai_avg_parse_tree_depth, 
                     ai_parse_tree_depth_variation, 
                     ai_avg_adjectives_per_sentence, 
                     ai_avg_adverbs_per_sentence, 
                     ai_avg_verbs_per_sentence, 
                     ai_avg_nouns_per_sentence, 
                     ai_avg_punctuation],
    'Human-Written': [human_avg_total_word_count, 
                      human_avg_word_length, 
                      human_avg_sentence_length, 
                      human_avg_TTR, 
                      human_avg_stop_word_count, 
                      human_avg_parse_tree_depth, 
                      human_parse_tree_depth_variation, 
                      human_avg_adjectives_per_sentence, 
                      human_avg_adverbs_per_sentence, 
                      human_avg_verbs_per_sentence, 
                      human_avg_nouns_per_sentence, 
                      human_avg_punctuation],
}

comparison_df = pd.DataFrame(comparison_data)

# # Save the updated comparison DataFrame to an Excel file
# comparison_df.to_excel('feature_comparison.xlsx', index=False)
import openpyxl
# Save the comparison DataFrame to an Excel file
file_name = 'feature_comparison.xlsx'
comparison_df.to_excel(file_name, index=False)

# Autofit the column widths using openpyxl
workbook = openpyxl.load_workbook(file_name)
worksheet = workbook.active

for column_cells in worksheet.columns:
    length = max(len(str(cell.value)) for cell in column_cells)
    worksheet.column_dimensions[column_cells[0].column_letter].width = length

workbook.save(file_name)

In [16]:
import spacy
from textblob import TextBlob

# Initialize spaCy English model
nlp_spacy = spacy.load('en_core_web_sm')

# Function to count passive sentences
def count_passive_sentences(text):
    passive_sentences = 0
    doc = nlp_spacy(text)
    for token in doc:
        if token.dep_ == 'nsubjpass':
            passive_sentences += 1
    return passive_sentences

# Function to calculate readability scores
from readability import Readability
from readability.exceptions import ReadabilityException


import textstat

# Function to calculate readability scores
def readability_scores(text):
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    flesch_kincaid_grade_level = textstat.text_standard(text, float_output=True)
    smog_index = textstat.smog_index(text)
    return flesch_reading_ease, flesch_kincaid_grade_level, smog_index




# Function to calculate sentiment analysis scores
def sentiment_analysis_scores(text):
    sentiment = TextBlob(text)
    return sentiment.polarity, sentiment.subjectivity

# Calculate new features for AI-generated and human-written essays
ai_generated_passive_sentences = [count_passive_sentences(essay) for essay in ai_generated_essays]
human_written_passive_sentences = [count_passive_sentences(essay) for essay in human_written_essays]

ai_generated_readability_scores = [readability_scores(essay) for essay in ai_generated_essays]
human_written_readability_scores = [readability_scores(essay) for essay in human_written_essays]

ai_generated_sentiment_scores = [sentiment_analysis_scores(essay) for essay in ai_generated_essays]
human_written_sentiment_scores = [sentiment_analysis_scores(essay) for essay in human_written_essays]

# Calculate average values for the new features
ai_avg_passive_sentences = np.mean(ai_generated_passive_sentences)
human_avg_passive_sentences = np.mean(human_written_passive_sentences)

ai_avg_flesch_reading_ease = np.mean([score[0] for score in ai_generated_readability_scores])
human_avg_flesch_reading_ease = np.mean([score[0] for score in human_written_readability_scores])

ai_avg_smog_index = np.mean([score[1] for score in ai_generated_readability_scores])
human_avg_smog_index = np.mean([score[1] for score in human_written_readability_scores])

ai_avg_polarity = np.mean([score[0] for score in ai_generated_sentiment_scores])
human_avg_polarity = np.mean([score[0] for score in human_written_sentiment_scores])

ai_avg_subjectivity = np.mean([score[1] for score in ai_generated_sentiment_scores])
human_avg_subjectivity = np.mean([score[1] for score in human_written_sentiment_scores])

# Update comparison_data with the new features
comparison_data = {
    'Feature': ['Total Word Count', 'Average Word Length', 'Average Sentence Length', 'Type-Token Ratio', 'Stop Word Count', 'Avg Parse Tree Depth', 'Parse Tree Depth Variation', 'Punctuation Count', 'Passive Sentences', 'Flesch Reading Ease', 'SMOG Index', 'Sentiment Polarity', 'Sentiment Subjectivity'],
    'AI-Generated': [ai_avg_total_word_count, ai_avg_word_length, ai_avg_sentence_length, ai_avg_TTR, ai_avg_stop_word_count, ai_avg_parse_tree_depth, ai_parse_tree_depth_variation, ai_avg_punctuation, ai_avg_passive_sentences, ai_avg_flesch_reading_ease, ai_avg_smog_index, ai_avg_polarity, ai_avg_subjectivity],
    'Human-Written': [human_avg_total_word_count, human_avg_word_length, human_avg_sentence_length, human_avg_TTR, human_avg_stop_word_count, human_avg_parse_tree_depth, human_parse_tree_depth_variation, human_avg_punctuation, human_avg_passive_sentences, human_avg_flesch_reading_ease, human_avg_smog_index, human_avg_polarity, human_avg_subjectivity],
}

comparison_df = pd.DataFrame(comparison_data)
comparison_df


Unnamed: 0,Feature,AI-Generated,Human-Written
0,Total Word Count,247.46,384.06
1,Average Word Length,4.194879,3.994626
2,Average Sentence Length,21.575748,18.759387
3,Type-Token Ratio,0.529432,0.472312
4,Stop Word Count,111.48,167.88
5,Avg Parse Tree Depth,5.554981,4.802128
6,Parse Tree Depth Variation,1.851215,1.676963
7,Punctuation Count,24.34,47.76
8,Passive Sentences,1.68,1.08
9,Flesch Reading Ease,64.9234,73.1796
