# Construção do Dataset
Primeiramente, pretendemos agrupar diversos datasets, utilizando duas colunas: "text", texto plano, sendo a segunda coluna a label "source" que categoriza o texto como "human" ou "ai".
O segundo passo é extrair as _features_ necessárias para treinar os modelos

In [34]:
import numpy as np
import pandas as pd
import os
import random
def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo

set_seed(25)

# Montar o Dataset

In [35]:
dataframes = []

## Primeiro Dataset

In [32]:
from datasets import load_dataset
ds = load_dataset("artem9k/ai-text-detection-pile")

In [33]:
df_1 = pd.DataFrame(ds['train'])
df_1.drop(columns=['id'], inplace=True)

df_1 = df_1.iloc[:, [1,0]]
dataframes.append(df_1)
df_1.head()

Unnamed: 0,text,source
0,12 Years a Slave: An Analysis of the Film Essa...,human
1,20+ Social Media Post Ideas to Radically Simpl...,human
2,2022 Russian Invasion of Ukraine in Global Med...,human
3,533 U.S. 27 (2001) Kyllo v. United States: The...,human
4,A Charles Schwab Corporation Case Essay\n\nCha...,human


## Segundo Dataset

In [None]:
from datasets import load_dataset

ds2 = load_dataset("dmitva/human_ai_generated_text")

In [None]:
df_2 = pd.DataFrame(ds2['train'])

# Create a DataFrame for human text
df_human = df_2[['human_text']].copy()
df_human = df_human.rename(columns={'human_text': 'text'})
df_human['source'] = 'human'

# Create a DataFrame for AI text
df_ai = df_2[['ai_text']].copy()
df_ai = df_ai.rename(columns={'ai_text': 'text'})
df_ai['source'] = 'ai'

# Combine the two DataFrames into one
new_df_2 = pd.concat([df_human, df_ai], ignore_index=True)
dataframes.append(new_df_2)
new_df_2.head()

In [None]:
# Sanity check
print(new_df_2['source'].value_counts())

## Terceiro Dataset

In [13]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds_abs = load_dataset("NicolaiSivesind/human-vs-machine", "research_abstracts_labeled")
ds_wiki = load_dataset("NicolaiSivesind/human-vs-machine", "wiki_labeled")

df_3_1_1 = pd.DataFrame(ds_abs['train'])
df_3_1_2 = pd.DataFrame(ds_abs['validation'])
df_3_1_3 = pd.DataFrame(ds_abs['test'])

df_3_2_1 = pd.DataFrame(ds_wiki['train'])
df_3_2_2 = pd.DataFrame(ds_wiki['validation'])
df_3_2_3 = pd.DataFrame(ds_wiki['test'])

df_3 = pd.concat([df_3_1_1, df_3_1_2, df_3_1_3, df_3_2_1, df_3_2_2, df_3_2_3], ignore_index=True)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Create a mapping for the label values
label_to_source = {
    0: "human",
    1: "ai"
}

# Apply the mapping to create the "source" column
df_3['source'] = df_3['label'].map(label_to_source)

# Select only the desired columns: "text" and "source"
new_df_3 = df_3[['text', 'source']].copy()

dataframes.append(new_df_3)
new_df_3.head()


Unnamed: 0,text,source
0,Coupling losses were studied in composite tape...,human
1,"In this study, we investigate the coupling los...",ai
2,Let $\mathsf M_{\mathsf S}$ denote the strong ...,human
3,"In this paper, we investigate Weighted Solyani...",ai
4,In 2019 October Betelgeuse began a decline in ...,human


## Quarto Dataset

In [None]:
df_4 = pd.read_csv("LLM.csv")
df_4.rename(columns = {"Text": "text", "Label": "source"}, inplace=True)

# Create a mapping for the label values
label_to_source = {
    "ai": "ai",
    "student": "human"
}

# Apply the mapping to create the "source" column
df_4['source'] = df_4['source'].map(label_to_source)
df_4.head()

## Quinto Dataset

In [36]:
df_5 = pd.read_csv("data_set.csv")
df_5.rename(columns = {"abstract": "text", "is_ai_generated": "source"}, inplace=True)
df_5.drop(columns=['title','ai_generated'], inplace=True)

# Create a mapping for the label values
label_to_source = {
    1: "ai",
    0: "human"
}

# Apply the mapping to create the "source" column
df_5['source'] = df_5['source'].map(label_to_source)
print(df_5['source'].value_counts())
dataframes.append(df_5)
df_5.head()

source
human    2100
ai       1953
Name: count, dtype: int64


Unnamed: 0,text,source
0,Advanced electromagnetic potentials are indi...,human
1,This research paper investigates the question ...,ai
2,We give an algorithm for finding network enc...,human
3,The paper presents an efficient centralized bi...,ai
4,We introduce an exponential random graph mod...,human


## Sexto Dataset

In [24]:
df_6_news_gpt = pd.read_pickle("en_news_gpt_features_df.pkl")
df_6_news_human = pd.read_pickle("en_news_human_features_df.pkl")
df_6_wiki_gpt = pd.read_pickle("en_wiki_gpt_features_df.pkl")
df_6_wiki_human = pd.read_pickle("en_wiki_human_features_df.pkl")

df_6_news_gpt = df_6_news_gpt[['text']]
df_6_news_gpt['source'] = 'ai'

df_6_news_human = df_6_news_human[['text']]
df_6_news_human['source'] = 'human'

df_6_wiki_gpt = df_6_wiki_gpt[['text']]
df_6_wiki_gpt['source'] = 'ai'

df_6_wiki_human = df_6_wiki_human[['text']]
df_6_wiki_human['source'] = 'human'

df_6 = pd.concat([df_6_news_gpt, df_6_news_human, df_6_wiki_gpt, df_6_wiki_human], ignore_index=True)
dataframes.append(df_6)
print(df_6['source'].value_counts())


source
ai       800
human    200
Name: count, dtype: int64


## Coisas geradas

In [37]:
df_gerado_cie = pd.read_csv("scientific_texts.csv")
df_gerado_ai = pd.read_csv("processed_scientific_texts0.csv")
df_gerado_ai1 = pd.read_csv("processed_scientific_texts1.csv")
df_gerado_ai2 = pd.read_csv("processed_scientific_texts2.csv")
df_gerado_ai3 = pd.read_csv("processed_scientific_texts3.csv")

dataframes.append(df_gerado_cie.head(400))
dataframes.append(df_gerado_ai.head(200))
dataframes.append(df_gerado_ai1.head(200))
dataframes.append(df_gerado_ai2.head(200))
dataframes.append(df_gerado_ai3.head(200))

df_1 = pd.read_csv("dataset1_inputs.csv",sep="\t")
df_1_out = pd.read_csv("dataset1_outputs.csv",sep="\t")
df_1['source'] = df_1_out['Label']
# rename Text to text
df_1.rename(columns={"Text": "text"}, inplace=True)
# map values Human to human and AI to ai
label_to_source = {
    "Human": "human",
    "AI": "ai"
}
# Apply the mapping to create the "source" column
df_1['source'] = df_1['source'].map(label_to_source)
df_1 = df_1[['text', 'source']]
df_1.head()

Unnamed: 0,text,source
0,"The cell cycle, or cell-division cycle, is the...",human
1,The cell cycle is the process by which a cell ...,ai
2,"Photons, in many atomic models in physics, are...",human
3,A photon is a fundamental particle of light an...,ai
4,"According to the theory of plate tectonics, Ea...",human


## Juntar tudo

In [38]:
df = pd.concat(dataframes, ignore_index=True)

In [39]:
df = df.sample(frac=1).reset_index(drop=True)
df.info()
# take "\n" out of the text
df['text'] = df['text'].str.replace('\n', ' ')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5249 entries, 0 to 5248
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5249 non-null   object
 1   source  5248 non-null   object
dtypes: object(2)
memory usage: 82.1+ KB


In [40]:
df = df.drop_duplicates()
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 5247 entries, 0 to 5248
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5247 non-null   object
 1   source  5246 non-null   object
dtypes: object(2)
memory usage: 123.0+ KB


In [41]:
df.to_csv("human_or_ai_dataset_sub3.csv", index=False)