# Segunda etapa, extração

In [2]:
import numpy as np
import pandas as pd
import os
import random
def set_seed(seed: int):
    """Seed the random sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    then stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed applied to every source.
    """
    # Python's generator first, then NumPy's global one (per the original
    # note, the generator scikit-learn relies on).
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so this may affect child processes only -- confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)


set_seed(25)

In [3]:
import pandas as pd

# Raw corpus; the preview below shows a "text" column and a "source" label column.
RAW_DATASET_CSV = "human_or_ai_dataset_small_research_only.csv"
df = pd.read_csv(RAW_DATASET_CSV)

In [4]:
# Preview the first rows of the freshly loaded frame (rendered as the cell output).
df.head()

Unnamed: 0,text,source
0,Advanced electromagnetic potentials are indi...,human
1,This research paper investigates the question ...,ai
2,We give an algorithm for finding network enc...,human
3,The paper presents an efficient centralized bi...,ai
4,We introduce an exponential random graph mod...,human


## Tentar com outras libs

In [5]:
# Encode the "source" column as integers: "human" -> 0, "ai" -> 1.
SOURCE_LABEL_IDS = {"human": 0, "ai": 1}

# Re-running this cell must not corrupt the labels. Once the column holds only
# encoded ids, a second "0 if x == 'human' else 1" pass would turn every row
# into 1, so the encoding is skipped when it has already been applied.
if not df["source"].isin(list(SOURCE_LABEL_IDS.values())).all():
    encoded_source = df["source"].map(SOURCE_LABEL_IDS)
    # Values missing from the mapping come back as NaN. Fail loudly instead of
    # silently labelling them as "ai" (1).
    if encoded_source.isna().any():
        unexpected = df.loc[encoded_source.isna(), "source"].unique().tolist()
        raise ValueError(f"Unexpected values in 'source' column: {unexpected}")
    df["source"] = encoded_source.astype("int64")

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Summary of the label-encoded frame produced by the previous cells.
# info() writes its report itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
df.info()

# Binary bag-of-words: a feature is 1 when the token occurs in the document
# and 0 otherwise; the vocabulary is capped at 10,000 tokens.
vectorizer = CountVectorizer(max_features=10000, binary=True)
print(df["source"].value_counts())

# Stratified split on the label. With test_size=0.99 the FIRST frame returned
# (df_tail) keeps ~1% of the rows and the SECOND (df_head) keeps ~99%; only
# df_head is used in the rest of this cell.
df_tail, df_head = train_test_split(df, test_size=0.99, random_state=25, stratify=df["source"])

print(df_tail["source"].value_counts())
print(df_head["source"].value_counts())

# Rename "source" to "targetLabel"
df_head = df_head.rename(columns={"source": "targetLabel"})

# Learn the vocabulary from df_head and encode it.
# NOTE(review): the vocabulary is fitted on all of df_head, i.e. before the
# train/validation/test split done later in the notebook, so validation and
# test documents influence which tokens are kept -- confirm this is acceptable.
X = vectorizer.fit_transform(df_head['text'])

# Densify the sparse matrix into a DataFrame with one int8 column per token.
df_encoded = pd.DataFrame(
    X.astype("int8").toarray(),
    columns=vectorizer.get_feature_names_out()
)

# Append the label as the last column. .values drops df_head's shuffled index
# so the labels are assigned by position, matching the row order of X.
df_encoded["targetLabel"] = df_head["targetLabel"].values

# Final check (info() called directly for the same reason as above).
df_encoded.info()
print(df_encoded.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5051 entries, 0 to 5050
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5051 non-null   object
 1   source  5051 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 79.1+ KB
None
source
1    2753
0    2298
Name: count, dtype: int64
source
1    27
0    23
Name: count, dtype: int64
source
1    2726
0    2275
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Columns: 10001 entries, 000 to targetLabel
dtypes: int64(1), int8(10000)
memory usage: 47.7 MB
None
   000  001  02  03  04  05  09  10  100  1000  ...  zeros  zeta  zeus  \
0    0    0   0   0   0   0   0   0    0     0  ...      0     0     0   
1    0    0   0   0   0   0   0   0    0     0  ...      0     0     0   
2    0    0   0   0   0   0   0   0    0     0  ...      0     0     0   
3    0    0   0   0   0   0   0   0    0     0  ...      0     0     

In [7]:
# Structural summary of the encoded frame (row count, column count, dtypes, memory).
df_encoded.info()
# Last expression of the cell, so the first rows are rendered as the cell output.
df_encoded.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Columns: 10001 entries, 000 to targetLabel
dtypes: int64(1), int8(10000)
memory usage: 47.7 MB


Unnamed: 0,000,001,02,03,04,05,09,10,100,1000,...,zeros,zeta,zeus,zigzag,zinc,zipf,zn,zone,zones,targetLabel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
# Class balance of the encoded frame (number of rows per targetLabel value).
label_counts = df_encoded["targetLabel"].value_counts()
print(label_counts)

targetLabel
1    2726
0    2275
Name: count, dtype: int64


In [9]:
# Hold out 20% of the encoded rows as the test set, stratified on the label.
df_train_val, df_test = train_test_split(
    df_encoded,
    test_size=0.2,
    random_state=25,
    stratify=df_encoded["targetLabel"],
)

# Carve the validation set out of the remaining 80%: 12.5% of that remainder
# equals 10% of the encoded rows, which leaves 70% for training.
df_train, df_val = train_test_split(
    df_train_val,
    test_size=0.125,
    random_state=25,
    stratify=df_train_val["targetLabel"],
)

# Per-split class balance, reported in the order train, test, validation.
for split_frame in (df_train, df_test, df_val):
    print(split_frame["targetLabel"].value_counts())

targetLabel
1    1908
0    1592
Name: count, dtype: int64
targetLabel
1    546
0    455
Name: count, dtype: int64
targetLabel
1    272
0    228
Name: count, dtype: int64


In [11]:
# Persist every split as a CSV file in the working directory, without the
# index column.
for csv_name, split_frame in (
    ('train.csv', df_train),
    ('validation.csv', df_val),
    ('test.csv', df_test),
):
    split_frame.to_csv(csv_name, index=False)

In [19]:
# Load the semicolon-separated external inputs and discard the "ID" column in
# one chained expression.
df_in = pd.read_csv("dataset2_inputs.csv", sep=";").drop(columns=["ID"])
# Dropping rows with a missing "Text" is currently disabled:
# df_in.dropna(subset=["Text"], inplace=True)
df_in.head()

Unnamed: 0,Text
0,The Solar System faces a dramatic future over ...
1,Spermidine is an aliphatic polyamine. Spermidi...
2,The feasibility of extraterrestrial life is a ...
3,Many cross sectional and prospective studies h...
4,There were observations of spectral lines. Tha...


In [20]:
# Structural summary of the external inputs (row count, non-null count of "Text", dtype).
df_in.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    100 non-null    object
dtypes: object(1)
memory usage: 932.0+ bytes


In [21]:
# Rename "Text" to "text"
df_in = df_in.rename(columns={"Text": "text"})

# Transform only: the vectorizer is NOT refitted here, so these rows are
# encoded against the vocabulary it already holds and the resulting columns
# come from the same get_feature_names_out() call used for the training frame.
X_in = vectorizer.transform(df_in['text'])

# Densify the sparse matrix into a DataFrame with one int8 column per token.
df_in_encoded = pd.DataFrame(
    X_in.astype("int8").toarray(),
    columns=vectorizer.get_feature_names_out()
)

# Final check. info() writes its report itself and returns None, so it is
# called directly instead of being wrapped in print() (which printed "None").
df_in_encoded.info()
print(df_in_encoded.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 10000 entries, 000 to zones
dtypes: int8(10000)
memory usage: 976.7 KB
None
   000  001  02  03  04  05  09  10  100  1000  ...  zero  zeros  zeta  zeus  \
0    0    0   0   0   0   0   0   0    0     0  ...     0      0     0     0   
1    0    0   0   0   0   0   0   0    0     0  ...     0      0     0     0   
2    0    0   0   0   0   0   0   0    0     0  ...     0      0     0     0   
3    0    0   0   0   0   0   0   0    0     0  ...     0      0     0     0   
4    0    0   0   0   0   0   0   0    0     0  ...     0      0     0     0   

   zigzag  zinc  zipf  zn  zone  zones  
0       0     0     0   0     0      0  
1       0     0     0   0     0      0  
2       0     0     0   0     1      0  
3       0     0     0   0     0      0  
4       0     0     0   0     0      0  

[5 rows x 10000 columns]


In [22]:
# Persist the encoded external inputs as CSV, without the index column.
PROCESSED_INPUTS_CSV = 'dataset2_inputs_proc.csv'
df_in_encoded.to_csv(PROCESSED_INPUTS_CSV, index=False)