# NLP Pipeline Jupyter Notebook for the aiTextDetect Project: Step 2 

## Preprocessing the Text Data

This notebook applies several text pre-processing steps to the merged data from step 1: cleaning text, tokenizing text, removing special characters, case conversion, correcting spellings, removing stopwords and other unnecessary terms, and lemmatization.

In [63]:
#import dependencies

import pandas as pd
import numpy as np
import string


import nltk
wn = nltk.WordNetLemmatizer() #specifying wn as the word net lemmatizer

#download the 'averaged_perceptron_tagger' NLTK package (the log below shows it is skipped when already up to date)
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cbarron/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [64]:
#read in the data, forcing an explicit dtype for each listed column
asap_column_dtypes = {
    "essay_id": float,
    "essay_set": float,
    "essay": str,
    "ai_llm": str,
    "ai_generated": float,
}
asap_df = pd.read_excel("../cleanData/3aMergedAsap.xlsx", dtype=asap_column_dtypes)

#save a test row (the row whose index label is 2)
test_row = asap_df.loc[2,]

In [69]:
#count the missing values in the essay column; these missing values were causing an error
asap_df["essay"].isna().sum()

2

In [80]:
stopwords = nltk.corpus.stopwords.words('english')
#frozen copy used for lookups: membership tests are hash lookups, and clean_text
#keeps working even if the name `stopwords` is rebound later
#(e.g. by `from nltk.corpus import stopwords`)
_STOPWORD_SET = frozenset(stopwords)
#translation table that deletes every character found in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text, tokenize_sentence='y'):
    """Tokenize `text` into sentences or into cleaned word tokens.

    Parameters
    ----------
    text : str
        The text to process. No case conversion is done here, and stopword
        matching is an exact string comparison.
    tokenize_sentence : str, default 'y'
        'y' returns the sentence tokens of the unmodified text. Any other
        value returns the word tokens of the text after punctuation
        characters are stripped, with English stopwords removed.

    Returns
    -------
    list of str
        Sentence tokens or word tokens, depending on `tokenize_sentence`.
    """
    #only run the tokenizer whose result is actually returned
    if tokenize_sentence == 'y':
        return nltk.sent_tokenize(text)

    without_punctuation = text.translate(_PUNCTUATION_TABLE) #remove punctuation
    tokens = nltk.tokenize.word_tokenize(without_punctuation) #tokenize
    return [word for word in tokens if word not in _STOPWORD_SET] #remove stopwords

In [84]:
#remove rows whose essay text is missing (a missing essay has no .lower() method, so the tokenizing step fails on it)
#restricting the check to the "essay" column keeps rows that are only missing some other field
asap_df.dropna(subset=["essay"], inplace=True)

0        [dear, local, newspaper, think, effects, compu...
1        [dear, caps1, caps2, believe, using, computers...
2        [dear, caps1, caps2, caps3, people, use, compu...
3        [dear, local, newspaper, caps1, found, many, e...
4        [dear, location1, know, computers, positive, e...
                               ...                        
43127    [dear, editor, citizen, community, feel, impor...
43128    [dear, editor, concerned, citizen, longtime, r...
43129    [dear, editor, writing, share, opinion, effect...
43130    [editor, world, become, increasingly, reliant,...
43131    [dear, editor, writing, present, thoughts, eff...
Name: essay, Length: 43130, dtype: object

In [85]:
#lower-case every essay once, then derive both token columns from that text
lowercased_essays = asap_df["essay"].apply(lambda essay: essay.lower())
asap_df['word_tokens'] = lowercased_essays.apply(clean_text, tokenize_sentence="n")
asap_df['sentence_tokens'] = lowercased_essays.apply(clean_text, tokenize_sentence="y")

In [86]:
#check that worked: display the new word_tokens column
asap_df["word_tokens"]

0        [dear, local, newspaper, think, effects, compu...
1        [dear, caps1, caps2, believe, using, computers...
2        [dear, caps1, caps2, caps3, people, use, compu...
3        [dear, local, newspaper, caps1, found, many, e...
4        [dear, location1, know, computers, positive, e...
                               ...                        
43127    [dear, editor, citizen, community, feel, impor...
43128    [dear, editor, concerned, citizen, longtime, r...
43129    [dear, editor, writing, share, opinion, effect...
43130    [editor, world, become, increasingly, reliant,...
43131    [dear, editor, writing, present, thoughts, eff...
Name: word_tokens, Length: 43130, dtype: object

In [87]:
#create a function to lemmatize the text
def lemmatize_text(word_tokens):
    """Return a new list holding wn.lemmatize(token) for each token, in order."""
    return [wn.lemmatize(token) for token in word_tokens]

#run lemmatize_text on each row's word tokens and store the result in a new column
asap_df["lemmatized_word_tokens"] = asap_df["word_tokens"].apply(lemmatize_text)


In [88]:
#checking that the previous code worked: preview each derived column, then show the whole dataframe
for derived_column in ("word_tokens", "sentence_tokens", "lemmatized_word_tokens"):
    print(asap_df[derived_column].head())
asap_df


0    [dear, local, newspaper, think, effects, compu...
1    [dear, caps1, caps2, believe, using, computers...
2    [dear, caps1, caps2, caps3, people, use, compu...
3    [dear, local, newspaper, caps1, found, many, e...
4    [dear, location1, know, computers, positive, e...
Name: word_tokens, dtype: object
0    [dear local newspaper, i think effects compute...
1    [dear @caps1 @caps2, i believe that using comp...
2    [dear, @caps1 @caps2 @caps3 more and more peop...
3    [dear local newspaper, @caps1 i have found tha...
4    [dear @location1, i know having computers has ...
Name: sentence_tokens, dtype: object
0    [dear, local, newspaper, think, effect, comput...
1    [dear, caps1, caps2, believe, using, computer,...
2    [dear, caps1, caps2, caps3, people, use, compu...
3    [dear, local, newspaper, caps1, found, many, e...
4    [dear, location1, know, computer, positive, ef...
Name: lemmatized_word_tokens, dtype: object


Unnamed: 0.1,Unnamed: 0,essay_id,essay_set,essay,ai_llm,ai_generated,word_tokens,sentence_tokens,lemmatized_word_tokens
0,0,1.0,1.0,"Dear local newspaper, I think effects computer...",human-generated,0.0,"[dear, local, newspaper, think, effects, compu...","[dear local newspaper, i think effects compute...","[dear, local, newspaper, think, effect, comput..."
1,1,2.0,1.0,"Dear @CAPS1 @CAPS2, I believe that using compu...",human-generated,0.0,"[dear, caps1, caps2, believe, using, computers...","[dear @caps1 @caps2, i believe that using comp...","[dear, caps1, caps2, believe, using, computer,..."
2,2,3.0,1.0,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",human-generated,0.0,"[dear, caps1, caps2, caps3, people, use, compu...","[dear, @caps1 @caps2 @caps3 more and more peop...","[dear, caps1, caps2, caps3, people, use, compu..."
3,3,4.0,1.0,"Dear Local Newspaper, @CAPS1 I have found that...",human-generated,0.0,"[dear, local, newspaper, caps1, found, many, e...","[dear local newspaper, @caps1 i have found tha...","[dear, local, newspaper, caps1, found, many, e..."
4,4,5.0,1.0,"Dear @LOCATION1, I know having computers has a...",human-generated,0.0,"[dear, location1, know, computers, positive, e...","[dear @location1, i know having computers has ...","[dear, location1, know, computer, positive, ef..."
...,...,...,...,...,...,...,...,...,...
43127,213,213.0,1.0,"\n\nDear Editor, \n\nAs a citizen of this comm...",text-davinci-003,1.0,"[dear, editor, citizen, community, feel, impor...","[\n\ndear editor, \n\nas a citizen of this com...","[dear, editor, citizen, community, feel, impor..."
43128,214,214.0,1.0,"\n\n\nDear Editor,\n\nAs a concerned citizen a...",text-davinci-003,1.0,"[dear, editor, concerned, citizen, longtime, r...","[\n\n\ndear editor,\n\nas a concerned citizen ...","[dear, editor, concerned, citizen, longtime, r..."
43129,215,215.0,1.0,"\n\nDear Editor,\n\nI am writing to share my o...",text-davinci-003,1.0,"[dear, editor, writing, share, opinion, effect...","[\n\ndear editor,\n\ni am writing to share my ...","[dear, editor, writing, share, opinion, effect..."
43130,216,216.0,1.0,\n\nTo the Editor: \n\nAs our world has become...,text-davinci-003,1.0,"[editor, world, become, increasingly, reliant,...",[\n\nto the editor: \n\nas our world has becom...,"[editor, world, become, increasingly, reliant,..."


In [89]:
#remove the unwanted "Unnamed: 0" column
#NOTE(review): the dataframe preview also lists an "Unnamed: 0.1" column, which is kept - confirm that is intended
asap_df = asap_df.drop(columns="Unnamed: 0")



In [90]:
#save dataframe to the processed-data Excel workbook
#NOTE(review): no index argument is passed, so pandas' default applies (the index is presumably written as an extra column) - confirm the next step expects that
asap_df.to_excel("../cleanData/3bProcessedAsap.xlsx")

In [19]:
#Removing Special Characters
#replace every non-word character (regex \W) in each essay with a space;
#the pattern is a raw string so the backslash is not read as a string escape
Remove_Special_CharactersDf = asap_df["essay"].str.replace(r'\W', ' ', regex=True)
#keep the cleaned text of the second row of asap_df as a sample sentence
sentence = Remove_Special_CharactersDf.loc[asap_df.index[1]]

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
21445   NaN
21446   NaN
21447   NaN
21448   NaN
21449   NaN
Name: word_tokens, Length: 21450, dtype: float64


In [7]:
# Lemmatization is a lot more powerful. It looks beyond word reduction and considers a language's full vocabulary to
# apply a morphological analysis to words, aiming to remove inflectional endings only and to return the base or
# dictionary form of a word.
# WordNet is a publicly available lexical database of over 200 languages that provides semantic relationships between its words.


import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
 

#lemmatizer instance used by the POS-aware lemmatization loop further down in this cell
lemmatizer = WordNetLemmatizer()

def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The tag's leading letter decides the result: 'J' -> wordnet.ADJ,
    'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.
    Any other tag returns None.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if nltk_tag.startswith(prefix):
            # the wordnet attribute is only looked up once a prefix matches
            return getattr(wordnet, constant_name)
    return None
  
#sample text: the cleaned essay of the second row of asap_df
sentence = Remove_Special_CharactersDf.loc[asap_df.index[1]]

#split the sentence into tokens and attach an NLTK POS tag to each one
pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

#convert each NLTK tag with our own pos_tagger helper (None when there is no matching WordNet tag)
wordnet_tagged = [(token, pos_tagger(nltk_tag)) for token, nltk_tag in pos_tagged]

#tokens without a WordNet tag are kept as is; all others are lemmatized using their tag
lemmatized_sentence = " ".join(
    token if wordnet_tag is None else lemmatizer.lemmatize(token, wordnet_tag)
    for token, wordnet_tag in wordnet_tagged
)

print(lemmatized_sentence)


NameError: name 'WordNetLemmatizer' is not defined

In [12]:
#The words which are generally filtered out before processing a natural language are called stop words.
#Examples of a few stop words in English are “the”, “a”, “an”, “so”, “what”.
#NLTK is a library to play with natural language. The steps below import the library's English stop words list and use it to filter a sentence.

from nltk.corpus import stopwords
sw_nltk = stopwords.words('english')
#set copy of the list, so each membership test below is a hash lookup instead of a scan of the whole list
sw_lookup = frozenset(sw_nltk)

#keep only the words whose lower-cased form is not a stop word
words = [word for word in sentence.split() if word.lower() not in sw_lookup]
new_text = " ".join(words)
print(new_text)
#both lengths are character counts: the sentence before and after stop-word removal
print("Old length: ", len(sentence))
print("New length: ", len(new_text))

Dear CAPS1 CAPS2 believe using computers benefit us many ways like talking becoming friends others websites like facebook mysace Using computers help us find coordibates locations able ourselfs millions information Also computers benefit us helping jobs planning house plan typing NUM1 page report one jobs less writing lets go wonder world technology Using computer help us life talking making friends line Many people myspace facebooks aim benefit us conversations one another Many people believe computers bad make friends never talk fortunate computer help school work social life make friends Computers help us finding locations coordibates millions information online go internet lot know go onto websites MONTH1 help us locations coordinates like LOCATION1 Would rather use computer LOCATION3 supposed vacationing LOCATION2 Million information found internet almost every question computer Would rather easily draw house plan computers take NUM1 hours one hand ugly erazer marks garrenteed fin