In [1]:
import numpy as np
import pandas as pd
import os
import random
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def set_seed(seed: int):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, then
    exports the seed as the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at start-up, so assigning
        it here does not change string hashing in the running kernel; it is
        only inherited by processes launched afterwards.
    """
    # Python's own generator first, then NumPy's global one (the original
    # author notes that this is the generator scikit-learn relies on).
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)  # exported to the OS environment

set_seed(25)

In [2]:
def load_glove_embeddings(file_path, word_index, embedding_dim=100):
    """Load GloVe vectors and create an embedding matrix.

    Only the vectors of words present in ``word_index`` are parsed and kept,
    so memory use scales with the notebook's vocabulary rather than with the
    size of the GloVe file.

    Args:
        file_path: Path to a UTF-8 text file holding one entry per line: the
            token followed by its whitespace-separated vector components.
        word_index: Dict mapping each vocabulary word to its integer row in
            the returned matrix.
        embedding_dim: Number of components expected for every vector.

    Returns:
        A NumPy array of shape ``(vocab_size, embedding_dim)``. Row ``i`` holds
        the pre-trained vector of the word whose index is ``i``, or a random
        normal vector (scale 0.6, drawn from NumPy's global generator) when
        the word is not found in the file. Rows not referenced by
        ``word_index`` (such as the padding row 0) stay all-zero.

    Raises:
        ValueError: If a needed word's line does not carry exactly
            ``embedding_dim`` components.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            word = values[0]
            if word not in word_index:
                # Skip before the float conversion: unused vectors are never built.
                continue
            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected {embedding_dim} "
                    f"vector components for {word!r}, found {len(values) - 1}"
                )
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')

    # +1 for the padding index. Also cover the largest index actually present so
    # that a non-contiguous word_index cannot write past the end of the matrix.
    highest_index = max(word_index.values(), default=0)
    vocab_size = max(len(word_index), highest_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Iterate in word_index order so the sequence of random draws for unknown
    # words is reproducible under a fixed seed.
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is None:
            vector = np.random.normal(scale=0.6, size=(embedding_dim,))  # Random init
        embedding_matrix[i] = vector

    return embedding_matrix

def tokenize_and_index(texts):
    """Turn raw texts into lists of integer word ids and build the vocabulary.

    Each text is lower-cased and split on whitespace. Words receive ids in
    order of first appearance, starting at 1, so id 0 is never assigned.

    Args:
        texts: Iterable of strings.

    Returns:
        A tuple ``(tokenized_texts, word_index)``: one list of word ids per
        input text, and the dict mapping each word to its id.
    """
    word_index = {}

    def encode(sentence):
        # The default passed to setdefault is computed before any insertion,
        # so an unseen word receives the next free id (current size + 1).
        return [
            word_index.setdefault(token, len(word_index) + 1)
            for token in sentence.lower().split()
        ]

    tokenized_texts = [encode(sentence) for sentence in texts]
    return tokenized_texts, word_index


In [None]:
from sklearn.model_selection import train_test_split

# Read the raw corpus and discard every row that has a missing value in any column.
df = pd.read_csv("datasets/human_or_ai_dataset.csv").dropna()

In [4]:
# Stratified subsample: test_size=0.008 places 0.8% of the rows in df_head and the
# remaining 99.2% in df_tail. Only df_head is carried forward as the working frame.
df_tail, df_head = train_test_split(
    df,
    test_size=0.008,
    random_state=25,
    stratify=df["source"],
)
df = df_head
print(df["source"].value_counts())


source
human    16019
ai       11188
Name: count, dtype: int64


In [5]:
# Encode the source column, "human" = 0, "ai" = 1
df["source"] = df["source"].apply(lambda x: 0 if x == "human" else 1)
df = df.rename(columns={"source": "targetLabel"})

In [6]:
# Schema check after encoding: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27207 entries, 860790 to 170303
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text         27207 non-null  object
 1   targetLabel  27207 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 637.7+ KB


In [7]:
# Preview the first rows of the labelled frame (text plus targetLabel).
df.head()

Unnamed: 0,text,targetLabel
860790,"r mother! I did it in your own basement, you n...",0
1343844,How about by grabbing a bowl and mixing togeth...,1
3012376,"On one side, they want their children to exper...",1
1311978,Tottenham manager Mauricio Pochettino was not ...,1
625756,"``Harold, I'm bored.'' \n \n ``Would you kindl...",0


In [8]:
# Build the vocabulary from the text column and encode every document as a list of word ids.
tokenized_texts, word_index = tokenize_and_index(df["text"])  # Convert text to indices

In [None]:
# Pad every document at the end up to the length of the longest one.
# NOTE(review): a single very long document sets the width of the whole matrix
# (and of the CSVs written later) — consider capping max_length; confirm with
# whoever consumes these files before changing it.
max_length = max(map(len, tokenized_texts))
padded_sequences = pad_sequences(tokenized_texts, padding="post", maxlen=max_length)

# Embedding matrix whose rows follow word_index, built from the 100-d GloVe file.
glove_path = "../glove.6B.100d.txt"
embedding_matrix = load_glove_embeddings(glove_path, word_index, embedding_dim=100)

# One row per document, one column per token position.
processed_df = pd.DataFrame(padded_sequences)

In [10]:
import json

# Persist the vocabulary (word -> integer id) so the same encoding can be reused later.
with open("word_index.json", "w") as index_file:
    json.dump(word_index, index_file)

In [11]:
# Attach the labels by position: processed_df has a fresh 0..n-1 index while df
# keeps the original row ids, so the raw array is assigned instead of a Series
# (a Series would be aligned on index labels rather than on row order).
processed_df["targetLabel"] = df["targetLabel"].to_numpy()

In [12]:
# Preview the padded id matrix together with the label column.
processed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55996,55997,55998,55999,56000,56001,56002,56003,56004,targetLabel
0,1,2,3,4,5,6,7,8,9,10,...,0,0,0,0,0,0,0,0,0,0
1,133,134,100,135,74,136,30,137,138,139,...,0,0,0,0,0,0,0,0,0,1
2,42,197,198,199,168,200,201,17,202,32,...,0,0,0,0,0,0,0,0,0,1
3,345,346,347,348,181,349,134,17,147,350,...,0,0,0,0,0,0,0,0,0,1
4,486,62,487,488,10,489,490,32,491,492,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Split the data: First separate out 20% as the test set.
df_train_val, df_test = train_test_split(processed_df, test_size=0.2, random_state=25,stratify=processed_df["targetLabel"])

# Now split the remaining 80% into training and validation sets.
# Since we want a total of 10% of the original data for validation, we split 12.5% of the remaining data.
df_train, df_val = train_test_split(df_train_val, test_size=0.125, random_state=25,stratify=df_train_val["targetLabel"])

print(df_train["targetLabel"].value_counts())
print(df_test["targetLabel"].value_counts())
print(df_val["targetLabel"].value_counts())

targetLabel
0    11213
1     7831
Name: count, dtype: int64
targetLabel
0    3204
1    2238
Name: count, dtype: int64
targetLabel
0    1602
1    1119
Name: count, dtype: int64


In [14]:
# Write each DataFrame split to CSV files.
df_train.to_csv('train_emb.csv', index=False)
df_val.to_csv('validation_emb.csv', index=False)
df_test.to_csv('test_emb.csv', index=False)

In [15]:
# Padded sequence length, i.e. the number of token-position columns in processed_df.
print(max_length)

56005
