# NLP Pipeline Jupyter Notebook for the aiTextDetect Project: Step 1

## Reading in and merging the datasets

This Jupyter notebook reads the Kaggle human-written essay data and the AI-generated essay data into one pandas dataframe, which is then saved as `cleanData/3aMergedAsap.xlsx`.



In [43]:
#import dependencies
import pandas as pd
import numpy as np
#import plotnine as p9

In [44]:
#read the human datasets

#columns kept from each of the three human-written essay files
humanColumns = ["essay_id", "essay_set", "essay"]

def labelHumanEssays(rawDf):
    """Keep the essay columns of a human-written dataset and label it as human.

    Parameters
    ----------
    rawDf : pandas.DataFrame
        Raw essay data; must contain essay_id, essay_set and essay.

    Returns
    -------
    pandas.DataFrame
        New dataframe holding essay_id, essay_set and essay, plus ai_llm set
        to "human-generated" and ai_generated set to 0.

    Raises
    ------
    KeyError
        If any of the three essay columns is missing from rawDf.
    """
    #ai_generated designates whether the data are ai generated (1) or human generated (0)
    return rawDf[humanColumns].assign(ai_llm = "human-generated", ai_generated = 0)

#read in the training dataset
trainDf = labelHumanEssays(pd.read_excel("../rawData/training_set_rel3.xlsx"))

#read in the validation dataset
validDf = labelHumanEssays(pd.read_excel("../rawData/valid_set.xlsx"))

#read in the 'test' dataset (we'll create our own k-fold crossvalidation)
#got an error with default encoding. This site https://github.com/nusnlp/nea/issues/11 suggested latin-1 encoding
testDf = labelHumanEssays(pd.read_csv("../rawData/test_set.tsv", sep = "\t", encoding = "latin-1"))




In [51]:
#read and wrangle ai data
#both files are flagged as ai generated (ai_generated = 1) as they are loaded
aiDf = pd.read_excel("../rawData/aiGenerated.xlsx").assign(ai_generated = 1)

#the second file additionally gets essay_set = 1, recording the essay prompt used for this data
aiDf2 = pd.read_excel("../rawData/aiGenerated1.xlsx").assign(essay_set = 1, ai_generated = 1)




In [53]:
#rename the ai dataframe columns so they match the human dataframes for subsequent merging
#NOTE: this cell is not safe to run twice on the same frames: after one pass aiDf
#has a new essay_id column (formerly row_id) that a second pass would rename to essay_set
aiColumnMap = {"essay_id" : "essay_set", "ai_essay" : "essay", "row_id" : "essay_id"}
ai2ColumnMap = {"ai_text" : "essay", "Unnamed: 0" : "essay_id"}

aiDf = aiDf.rename(columns = aiColumnMap)
aiDf2 = aiDf2.rename(columns = ai2ColumnMap)



In [59]:
#select only relevant columns (same list and order for both ai dataframes)
mergeColumns = ["essay_id", "essay_set", "essay", "ai_llm", "ai_generated"]
aiDf = aiDf.loc[:, mergeColumns]
aiDf2 = aiDf2.loc[:, mergeColumns]

In [60]:
#merge the 5 datasets (3 human, 2 ai) into a preliminary dataset
#ignore_index = True gives the merged rows a fresh 0..n-1 index; without it each
#source keeps its own row labels, so the merged index repeats and those repeated
#labels end up as the index column when the dataframe is written to excel
mergeDf = pd.concat([trainDf, validDf, testDf, aiDf, aiDf2], axis = 0, ignore_index = True)


In [63]:
#save the merged dataframe as an excel workbook
mergedPath = "../cleanData/3aMergedAsap.xlsx"
mergeDf.to_excel(mergedPath)