# NLP Pipeline Jupyter Notebook for the aiTextDetect Project: Step 3

## Extracting Features from Preprocessed Data

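This notebook extracts a series of features from each essay, collects them in a pandas DataFrame, and writes that DataFrame to `cleanData/featuresAsap.xlsx`. The final feature cell, which adds perplexity, writes the result to `cleanData/4aFeaturesAsap.xlsx`.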

In [34]:
import openai
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
from collections import Counter
import numpy as np
from nltk.tree import Tree
import spacy


# Initialize spaCy English model.
# The loaded pipeline is bound to the module-level name nlp_spacy, which
# extract_syntactic_features and count_passive_sentences call on raw text.
nlp_spacy = spacy.load('en_core_web_sm')

In [35]:
# Function to extract lexical features
def extract_lexical_features(text):
    """Compute lexical (word-level) statistics for a single essay.

    Tokens are compared as produced by ``word_tokenize`` (no lowercasing),
    except for the stop-word check, which lowercases each token first.

    Args:
        text: The essay as one string.

    Returns:
        A dict with these keys:
            total_word_count: number of tokens from ``word_tokenize``.
            avg_word_length: mean token length in characters.
            avg_sentence_length: mean number of whitespace-separated words
                per ``sent_tokenize`` sentence.
            TTR: type-token ratio (distinct tokens / all tokens).
            stop_word_count: tokens whose lowercase form is in NLTK's
                English stop-word list.
            unique_word_count: number of distinct tokens occurring exactly
                once.
            word_freq, bigram_freq, trigram_freq: ``Counter`` objects with
                token, bigram and trigram frequencies.
            rare_word_count: same value as ``unique_word_count``.

        ``avg_word_length``, ``TTR`` and ``avg_sentence_length`` are NaN
        when the text yields no tokens (or no sentences), because they are
        undefined in that case.
    """
    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    total_word_count = len(words)
    word_counts = Counter(words)

    # Empty or whitespace-only text produces no tokens; report the undefined
    # ratios as NaN instead of raising ZeroDivisionError mid-pipeline.
    undefined = float('nan')
    if total_word_count:
        avg_word_length = sum(len(word) for word in words) / total_word_count
        TTR = len(word_counts) / total_word_count
    else:
        avg_word_length = TTR = undefined

    if sentences:
        avg_sentence_length = (
            sum(len(sentence.split()) for sentence in sentences) / len(sentences)
        )
    else:
        avg_sentence_length = undefined

    stop_words = set(stopwords.words('english'))
    stop_word_count = sum(1 for word in words if word.lower() in stop_words)

    # Distinct tokens seen exactly once. The value is reported under both
    # 'unique_word_count' and 'rare_word_count' to keep the output columns
    # unchanged.
    once_only_count = sum(1 for count in word_counts.values() if count == 1)

    return {
        'total_word_count': total_word_count,
        'avg_word_length': avg_word_length,
        'avg_sentence_length': avg_sentence_length,
        'TTR': TTR,
        'stop_word_count': stop_word_count,
        'unique_word_count': once_only_count,
        'word_freq': word_counts,
        'bigram_freq': Counter(ngrams(words, 2)),
        'trigram_freq': Counter(ngrams(words, 3)),
        'rare_word_count': once_only_count,
    }

# Read the preprocessed essays from Excel and keep the essay texts as a plain list
merged_df = pd.read_excel('../cleanData/3bProcessedAsap.xlsx')
all_essays = merged_df['essay'].tolist()

# Run the lexical extractor over every essay (AI-generated and human-written alike)
all_lexical_features = list(map(extract_lexical_features, all_essays))

In [None]:
# Append the lexical feature columns to the essay table, then write it out
merged_df = pd.concat(
    objs=[merged_df, pd.DataFrame(all_lexical_features)],
    axis="columns",
)
merged_df.to_excel(excel_writer="../cleanData/featuresAsap.xlsx")

In [None]:

# Function to extract syntactic features
def extract_syntactic_features(text):
    """Compute sentence-length and parse-depth statistics for one essay.

    The text is parsed with the module-level spaCy pipeline ``nlp_spacy``.
    The depth of a sentence is the largest number of dependency ancestors
    that any of its tokens has.

    Args:
        text: The essay as one string.

    Returns:
        A dict with:
            avg_sentence_length: mean number of spaCy tokens per sentence.
            avg_parse_tree_depth: mean sentence depth.
            parse_tree_depth_variation: standard deviation (``np.std``) of
                the sentence depths.

        All three values are NaN when the text contains no sentences.
    """
    doc = nlp_spacy(text)
    sentences = list(doc.sents)

    # np.mean / np.std of an empty list also give NaN, but with a
    # RuntimeWarning; return the same value explicitly and quietly.
    if not sentences:
        return {
            'avg_sentence_length': np.nan,
            'avg_parse_tree_depth': np.nan,
            'parse_tree_depth_variation': np.nan,
        }

    # Calculate average sentence length (in tokens)
    sentence_lengths = [len(sent) for sent in sentences]
    avg_sentence_length = np.mean(sentence_lengths)

    # Calculate parse tree depth: the deepest token decides the sentence's depth
    def calc_tree_depth(sent):
        return max(len(list(token.ancestors)) for token in sent)

    tree_depths = [calc_tree_depth(sent) for sent in sentences]
    avg_parse_tree_depth = np.mean(tree_depths)
    parse_tree_depth_variation = np.std(tree_depths)

    return {
        'avg_sentence_length': avg_sentence_length,
        'avg_parse_tree_depth': avg_parse_tree_depth,
        'parse_tree_depth_variation': parse_tree_depth_variation,
    }


# Run the syntactic extractor over every essay (AI-generated and human-written alike)
all_syntactic_features = list(map(extract_syntactic_features, all_essays))

In [None]:
# Append the syntactic feature columns to the essay table, then write it out
merged_df = pd.concat(
    objs=[merged_df, pd.DataFrame(all_syntactic_features)],
    axis="columns",
)
merged_df.to_excel(excel_writer="../cleanData/featuresAsap.xlsx")

In [None]:
# # Combine lexical and syntactic features
# def combined_features(text):
#     lexical = extract_lexical_features(text)
#     syntactic = extract_syntactic_features(text)
#     return {**lexical, **syntactic}

# # Extract combined features for AI-generated and human-written essays
# all_combined_features = [combined_features(essay) for essay in all_essays]

In [None]:
#Stylistic Features
def extract_stylistic_features(text):
    """Compute per-sentence part-of-speech averages for one essay.

    Each ``sent_tokenize`` sentence is word-tokenized and tagged with
    ``nltk.pos_tag``. Tags are grouped by their first two characters:
    ``JJ`` (adjectives), ``RB`` (adverbs), ``VB`` (verbs), ``NN`` (nouns).

    Args:
        text: The essay as one string.

    Returns:
        A dict with the keys ``avg_adjectives_per_sentence``,
        ``avg_adverbs_per_sentence``, ``avg_verbs_per_sentence`` and
        ``avg_nouns_per_sentence``. Each value is the total count of that
        tag group divided by the number of sentences, or NaN when the text
        contains no sentences.
    """
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)

    # Two-character tag prefix -> output key it contributes to.
    feature_for_prefix = {
        'JJ': 'avg_adjectives_per_sentence',
        'RB': 'avg_adverbs_per_sentence',
        'VB': 'avg_verbs_per_sentence',
        'NN': 'avg_nouns_per_sentence',
    }

    # Single pass over the tagged tokens instead of one pass per tag group.
    totals = dict.fromkeys(feature_for_prefix.values(), 0)
    for sentence in sentences:
        for _word, pos in nltk.pos_tag(word_tokenize(sentence)):
            feature = feature_for_prefix.get(pos[:2])
            if feature is not None:
                totals[feature] += 1

    # With no sentences the averages are undefined; return NaN rather than
    # raising ZeroDivisionError.
    if num_sentences == 0:
        return dict.fromkeys(totals, float('nan'))

    return {feature: count / num_sentences for feature, count in totals.items()}

# Run the stylistic extractor over every essay (AI-generated and human-written alike)
all_stylistic_features = list(map(extract_stylistic_features, all_essays))

import string

def count_punctuation(text):
    """Return the share of characters in *text* that are ASCII punctuation.

    Args:
        text: The string to inspect.

    Returns:
        ``{"punctuation_proportion": p}`` where ``p`` is the number of
        characters found in ``string.punctuation`` divided by ``len(text)``.
        ``p`` is NaN for an empty string, where the proportion is undefined.
    """
    # Guard the division: an empty essay would otherwise raise ZeroDivisionError.
    if not text:
        return {"punctuation_proportion": float("nan")}

    punctuation_count = sum(1 for char in text if char in string.punctuation)
    return {"punctuation_proportion": punctuation_count / len(text)}

# One {"punctuation_proportion": ...} dict per essay
all_avg_punctuation = list(map(count_punctuation, all_essays))


In [None]:
# Append the stylistic feature columns to the essay table, then write it out.
# Only all_stylistic_features is merged here; all_avg_punctuation is not.
merged_df = pd.concat(
    objs=[merged_df, pd.DataFrame(all_stylistic_features)],
    axis="columns",
)
merged_df.to_excel(excel_writer="../cleanData/featuresAsap.xlsx")

In [None]:
import spacy
from textblob import TextBlob

# Initialize spaCy English model
# NOTE(review): this loads the same 'en_core_web_sm' model that the first
# code cell already binds to nlp_spacy, so the reload looks redundant when
# the notebook is run top to bottom.
nlp_spacy = spacy.load('en_core_web_sm')

# Function to count passive sentences
def count_passive_sentences(text):
    """Count the sentences in *text* that contain a passive subject.

    The text is parsed with the module-level spaCy pipeline ``nlp_spacy``.
    A sentence counts as passive when at least one of its tokens carries
    the dependency label ``nsubjpass``.

    Args:
        text: The essay as one string.

    Returns:
        The number of passive sentences as an int (0 for text without
        sentences).
    """
    doc = nlp_spacy(text)
    # Count each sentence at most once: tallying every 'nsubjpass' token
    # would count a sentence with two passive clauses twice.
    return sum(
        1
        for sent in doc.sents
        if any(token.dep_ == 'nsubjpass' for token in sent)
    )

# Function to calculate readability scores
#from readability import Readability
#from readability.exceptions import ReadabilityException


import textstat

# Function to calculate readability scores
def readability_scores(text):
    """Compute three readability metrics for *text* using textstat.

    Args:
        text: The essay as one string.

    Returns:
        A dict with the keys ``flesch_reading_ease``,
        ``flesch_kincaid_grade_level`` and ``smog_index``, each holding the
        value returned by the corresponding textstat function.
    """
    return {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        # Use the Flesch-Kincaid grade formula itself. text_standard() returns
        # a consensus grade over several readability tests, which is not the
        # metric this key names.
        "flesch_kincaid_grade_level": textstat.flesch_kincaid_grade(text),
        "smog_index": textstat.smog_index(text),
    }


# Function to calculate sentiment analysis scores
def sentiment_analysis_scores(text):
    """Return TextBlob's polarity and subjectivity scores for *text*.

    Args:
        text: The essay as one string.

    Returns:
        A dict with the keys ``sentiment_polarity`` and
        ``sentiment.subjectivity``.
    """
    blob = TextBlob(text)
    polarity = blob.polarity
    subjectivity = blob.subjectivity
    # NOTE(review): the second key uses a dot where the first uses an
    # underscore. It is reproduced as-is because these keys become output
    # column names.
    scores = {}
    scores["sentiment_polarity"] = polarity
    scores["sentiment.subjectivity"] = subjectivity
    return scores

# Calculate the passive-voice, readability and sentiment features for every
# essay (AI-generated and human-written alike).
# NOTE(review): only the all-essay lists are built here. The per-group
# ai_generated_* / human_written_* variants that the comparison cells further
# down refer to are not computed in this cell.
all_passive_sentences = list(map(count_passive_sentences, all_essays))

all_readibility_scores = list(map(readability_scores, all_essays))

all_sentiment_scores = list(map(sentiment_analysis_scores, all_essays))

In [None]:
#merge the extracted features with the original data
merged_df = pd.concat(
    [
        merged_df,
        # all_passive_sentences is a flat list of counts, so its column is
        # named explicitly; a bare pd.DataFrame(list) would label it 0.
        pd.DataFrame({"passive_sentences": all_passive_sentences}),
        pd.DataFrame(all_readibility_scores),
        pd.DataFrame(all_sentiment_scores),
    ],
    axis=1,
)

#save as excel document
merged_df.to_excel("../cleanData/featuresAsap.xlsx")

In [None]:
#calculate essay perplexity

#import the NLTK language-model utilities used for the perplexity calculation
from nltk.lm.preprocessing import padded_everygram_pipeline #function used to prepare the tokenized text accordingly
from nltk.lm import MLE
from nltk.util import bigrams
from nltk.lm.preprocessing import pad_both_ends
import string

#install the “popular” subset of NLTK data, on the command line type
#python -m nltk.downloader popular
import nltk.tokenize
import pandas as pd

from nltk.lm import MLE #import a maximum likelihood estimator


In [None]:
#function to tokenize text while keeping stop words
def clean_text_keep_stopword(text):
    """Strip ASCII punctuation from *text* and word-tokenize the remainder.

    No stop-word filtering is applied, so stop words stay in the output.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens produced by ``nltk.tokenize.word_tokenize`` on
        the punctuation-free text.
    """
    # Drop every character listed in string.punctuation in one pass.
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return nltk.tokenize.word_tokenize(without_punctuation)



In [None]:
#preprocessing for perplexity: keep only the id, label and text columns
perplexty_df = merged_df.loc[:, ["essay_id", "ai_llm", "essay"]]


#lower-case an essay and tokenize it with clean_text_keep_stopword
def _lowercase_and_tokenize(essay):
    return clean_text_keep_stopword(essay.lower())


#tokenize every row of text
perplexty_df['word_token_with_stopword'] = perplexty_df["essay"].apply(_lowercase_and_tokenize)

#build padded training n-grams (up to order 2) and the vocabulary stream
train, vocab = padded_everygram_pipeline(2, perplexty_df['word_token_with_stopword'])

#create an order-2 maximum likelihood estimator and fit it to the preprocessed data
from nltk.lm import MLE
lm = MLE(2)
lm.fit(train, vocab)





In [None]:
#for each essay's token list: 1) pad it and build its bigrams,
#2) score those bigrams with the fitted model's perplexity
def _bigram_perplexity(tokens):
    padded_tokens = pad_both_ends(tokens, n=2)
    return lm.perplexity(list(bigrams(padded_tokens)))


merged_df["perplexity"] = perplexty_df['word_token_with_stopword'].apply(_bigram_perplexity)

#save the dataset
merged_df.to_excel("../cleanData/4aFeaturesAsap.xlsx")

## Calculating Averages Based on Features

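The remaining cells compute average feature values separately for the AI-generated and the human-written essays and collect them in a comparison table.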

In [None]:
# Calculate average values of some lexical features for both AI-generated and human-written essays
def average_feature_value(features, feature_key):
    """Return the mean of ``entry[feature_key]`` over all entries in *features*.

    Args:
        features: A sized collection of mappings (one per essay).
        feature_key: The key whose values are averaged.

    Raises:
        ZeroDivisionError: If *features* is empty.
    """
    total = 0
    for entry in features:
        total += entry[feature_key]
    return total / len(features)

# Lexical features averaged per group, including the total word count
_LEXICAL_KEYS = (
    'avg_word_length',
    'TTR',
    'stop_word_count',
    'avg_sentence_length',
    'total_word_count',
)

(
    ai_avg_word_length,
    ai_avg_TTR,
    ai_avg_stop_word_count,
    ai_avg_sentence_length,
    ai_avg_total_word_count,
) = [average_feature_value(ai_generated_features, key) for key in _LEXICAL_KEYS]

(
    human_avg_word_length,
    human_avg_TTR,
    human_avg_stop_word_count,
    human_avg_sentence_length,
    human_avg_total_word_count,
) = [average_feature_value(human_written_features, key) for key in _LEXICAL_KEYS]

In [None]:
# Compare the syntactic features: per-group mean of each per-essay value
_SYNTACTIC_KEYS = (
    'avg_sentence_length',
    'avg_parse_tree_depth',
    'parse_tree_depth_variation',
)

(
    ai_avg_sentence_length,
    ai_avg_parse_tree_depth,
    ai_parse_tree_depth_variation,
) = [
    np.mean([features[key] for features in ai_generated_syntactic_features])
    for key in _SYNTACTIC_KEYS
]

(
    human_avg_sentence_length,
    human_avg_parse_tree_depth,
    human_parse_tree_depth_variation,
) = [
    np.mean([features[key] for features in human_written_syntactic_features])
    for key in _SYNTACTIC_KEYS
]


In [None]:
# Stylistic features (part-of-speech counts per sentence) averaged per group
_STYLISTIC_KEYS = (
    'avg_adjectives_per_sentence',
    'avg_adverbs_per_sentence',
    'avg_verbs_per_sentence',
    'avg_nouns_per_sentence',
)

(
    ai_avg_adjectives_per_sentence,
    ai_avg_adverbs_per_sentence,
    ai_avg_verbs_per_sentence,
    ai_avg_nouns_per_sentence,
) = [
    average_feature_value(ai_generated_stylistic_features, key)
    for key in _STYLISTIC_KEYS
]

(
    human_avg_adjectives_per_sentence,
    human_avg_adverbs_per_sentence,
    human_avg_verbs_per_sentence,
    human_avg_nouns_per_sentence,
) = [
    average_feature_value(human_written_stylistic_features, key)
    for key in _STYLISTIC_KEYS
]

import string

def count_punctuation(text):
    """Return how many characters of *text* appear in ``string.punctuation``."""
    marks_seen = 0
    for symbol in text:
        if symbol in string.punctuation:
            marks_seen += 1
    return marks_seen

def average_value(values):
    """Return the arithmetic mean of *values*.

    Raises:
        ZeroDivisionError: If *values* is empty.
    """
    how_many = len(values)
    return sum(values) / how_many

# Mean number of punctuation marks per essay: overall and per group.
# average_value is the right helper here because count_punctuation returns a
# plain number per essay; average_feature_value expects feature dicts plus a
# key, and calling it with a single argument raises TypeError.
all_avg_punctuation = average_value([count_punctuation(essay) for essay in all_essays])
ai_avg_punctuation = average_value([count_punctuation(essay) for essay in ai_generated_essays])
human_avg_punctuation = average_value([count_punctuation(essay) for essay in human_written_essays])

In [None]:
# One row per feature: (label, AI-generated average, human-written average)
_comparison_rows = [
    ('Total Word Count', ai_avg_total_word_count, human_avg_total_word_count),
    ('Average Word Length', ai_avg_word_length, human_avg_word_length),
    ('Average Sentence Length', ai_avg_sentence_length, human_avg_sentence_length),
    ('Type-Token Ratio', ai_avg_TTR, human_avg_TTR),
    ('Stop Word Count', ai_avg_stop_word_count, human_avg_stop_word_count),
    ('Average Parse Tree Depth', ai_avg_parse_tree_depth, human_avg_parse_tree_depth),
    ('Parse Tree Depth Variation', ai_parse_tree_depth_variation, human_parse_tree_depth_variation),
    ('Average Adjectives per Sentence', ai_avg_adjectives_per_sentence, human_avg_adjectives_per_sentence),
    ('Average Adverbs per Sentence', ai_avg_adverbs_per_sentence, human_avg_adverbs_per_sentence),
    ('Average Verbs per Sentence', ai_avg_verbs_per_sentence, human_avg_verbs_per_sentence),
    ('Average Nouns per Sentence', ai_avg_nouns_per_sentence, human_avg_nouns_per_sentence),
    ('Average Punctuation Marks', ai_avg_punctuation, human_avg_punctuation),
]

# Transpose the rows into the three column lists the DataFrame is built from
_labels, _ai_values, _human_values = zip(*_comparison_rows)
comparison_data = {
    'Feature': list(_labels),
    'AI-Generated': list(_ai_values),
    'Human-Written': list(_human_values),
}

comparison_df = pd.DataFrame(comparison_data)

# # Save the updated comparison DataFrame to an Excel file
# comparison_df.to_excel('feature_comparison.xlsx', index=False)
import openpyxl
# Save the comparison DataFrame to an Excel file
file_name = 'feature_comparison.xlsx'
comparison_df.to_excel(file_name, index=False)

# Reopen the written file with openpyxl to adjust the column widths
workbook = openpyxl.load_workbook(file_name)
worksheet = workbook.active

# Set each column's width to the length of its longest cell value (as text)
for column_cells in worksheet.columns:
    length = max(map(len, (str(cell.value) for cell in column_cells)))
    column_settings = worksheet.column_dimensions[column_cells[0].column_letter]
    column_settings.width = length

workbook.save(file_name)

In [None]:
# Calculate average values for the new features.
# readability_scores and sentiment_analysis_scores return dicts, so each
# metric is looked up by its key; positional indexing such as score[0] would
# raise KeyError on those dicts.
# NOTE(review): the ai_generated_* / human_written_* inputs are assumed to be
# per-group lists built with those same functions -- confirm where they are
# defined.
ai_avg_passive_sentences = np.mean(ai_generated_passive_sentences)
human_avg_passive_sentences = np.mean(human_written_passive_sentences)

ai_avg_flesch_reading_ease = np.mean([score["flesch_reading_ease"] for score in ai_generated_readability_scores])
human_avg_flesch_reading_ease = np.mean([score["flesch_reading_ease"] for score in human_written_readability_scores])

ai_avg_smog_index = np.mean([score["smog_index"] for score in ai_generated_readability_scores])
human_avg_smog_index = np.mean([score["smog_index"] for score in human_written_readability_scores])

ai_avg_polarity = np.mean([score["sentiment_polarity"] for score in ai_generated_sentiment_scores])
human_avg_polarity = np.mean([score["sentiment_polarity"] for score in human_written_sentiment_scores])

# The subjectivity key is spelled with a dot in sentiment_analysis_scores.
ai_avg_subjectivity = np.mean([score["sentiment.subjectivity"] for score in ai_generated_sentiment_scores])
human_avg_subjectivity = np.mean([score["sentiment.subjectivity"] for score in human_written_sentiment_scores])

# Rebuild comparison_data with the new features included.
# One row per feature: (label, AI-generated average, human-written average)
_extended_rows = [
    ('Total Word Count', ai_avg_total_word_count, human_avg_total_word_count),
    ('Average Word Length', ai_avg_word_length, human_avg_word_length),
    ('Average Sentence Length', ai_avg_sentence_length, human_avg_sentence_length),
    ('Type-Token Ratio', ai_avg_TTR, human_avg_TTR),
    ('Stop Word Count', ai_avg_stop_word_count, human_avg_stop_word_count),
    ('Avg Parse Tree Depth', ai_avg_parse_tree_depth, human_avg_parse_tree_depth),
    ('Parse Tree Depth Variation', ai_parse_tree_depth_variation, human_parse_tree_depth_variation),
    ('Punctuation Count', ai_avg_punctuation, human_avg_punctuation),
    ('Passive Sentences', ai_avg_passive_sentences, human_avg_passive_sentences),
    ('Flesch Reading Ease', ai_avg_flesch_reading_ease, human_avg_flesch_reading_ease),
    ('SMOG Index', ai_avg_smog_index, human_avg_smog_index),
    ('Sentiment Polarity', ai_avg_polarity, human_avg_polarity),
    ('Sentiment Subjectivity', ai_avg_subjectivity, human_avg_subjectivity),
]

# Transpose the rows into the three column lists the DataFrame is built from
_extended_labels, _extended_ai, _extended_human = zip(*_extended_rows)
comparison_data = {
    'Feature': list(_extended_labels),
    'AI-Generated': list(_extended_ai),
    'Human-Written': list(_extended_human),
}

comparison_df = pd.DataFrame(comparison_data)
comparison_df
