In [1]:
# !pip install fasttext
# !pip install xgboost

In [2]:
import pandas as pd
import nltk
from nltk import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize


import fasttext
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import csv

import pandas as pd
import re
import string

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Make sure you've downloaded the appropriate resources
nltk.download('punkt')

np.random.seed(42)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
lang = "es"

df = pd.read_csv("../data/subtask_1/"+lang+"/train.tsv", sep='\t')


mapping = {
    "generated":0,
    "human":1
}

df["label"] = df['label'].map(mapping)
df=df.drop(df.columns[0], axis=1)

In [4]:
def preprocess_text(text):
    # Remove non-printable characters
    text = ''.join(ch for ch in text if ch.isprintable() or ch.isspace())

    # Remove some special characters
#     special_chars = ['$', '%', '^', '_', '`', '{', '}', '~', '\\', '|', '<', '>', '*', '+', '=']
#     for special_char in special_chars:
#         text = text.replace(special_char, '')
        
    # Remove emojis: Emojis fall outside the ASCII range and are replaced with ''
    text = text.encode('ascii', 'ignore').decode('ascii')

    return text

# Assume that df is your DataFrame and 'text_column' is the column with the text
# Replace 'df' and 'text_column' with your actual DataFrame name and text column name

# Applying the preprocessing function
df['text'] = df['text'].apply(lambda x: preprocess_text(x))


In [5]:
def calculate_features(text):
    word_tokens = word_tokenize(text)
    sent_tokens = sent_tokenize(text)
    
    avg_word_length = sum(len(word) for word in word_tokens) / len(word_tokens)
    
    avg_sent_length = sum(len(sent) for sent in sent_tokens) / len(sent_tokens)
    
    vocab_richness = len(set(word_tokens)) / len(word_tokens)
    
    freq_dist = FreqDist(word_tokens)
    repetition_rate = len([freq for word, freq in freq_dist.items() if freq > 1]) / len(word_tokens)

    return avg_word_length, avg_sent_length, vocab_richness, repetition_rate


df['avg_word_length'], df['avg_sent_length'], df['vocab_richness'], df['repetition_rate'] = zip(*df['text'].map(calculate_features))


In [6]:
RANDOM_SEED = 42


train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, valid_df = train_test_split(train_df, test_size=0.1, random_state=42)


train_df['text'].fillna('', inplace=True)
train_df['label'].fillna('0', inplace=True)  

train_df[['label', 'text']].to_csv('train.txt', index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, escapechar = ' ')


ft_model = fasttext.train_supervised('train.txt', lr=1.0, epoch=25, wordNgrams=2)


def generate_sentence_vectors(text):
    return ft_model.get_sentence_vector(text)


train_vectors = np.array(train_df['text'].apply(generate_sentence_vectors).tolist())
test_vectors = np.array(test_df['text'].apply(generate_sentence_vectors).tolist())


train_features = np.concatenate([train_vectors, train_df[['avg_word_length', 'avg_sent_length', 'vocab_richness', 'repetition_rate']].values], axis=1)
test_features = np.concatenate([test_vectors, test_df[['avg_word_length', 'avg_sent_length', 'vocab_richness', 'repetition_rate']].values], axis=1)


scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)


lr_model = LogisticRegression(random_state=RANDOM_SEED)
lr_model.fit(train_features, train_df['label'])


test_preds = lr_model.predict(test_features)



true_labels = test_df['label']
predicted_labels = test_preds

# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')  
precision = precision_score(true_labels, predicted_labels, average='weighted')  
recall = recall_score(true_labels, predicted_labels, average='weighted')

# Print metrics
print('Accuracy:', accuracy)
print('F1 Score:', f1)
print('Precision:', precision)
print('Recall:', recall)


Read 1M words
Number of words:  84618
Number of labels: 0
Progress: 100.0% words/sec/thread: 1104162 lr:  0.000000 avg.loss:      -nan ETA:   0h 0m 0s


Accuracy: 0.637043966323667
F1 Score: 0.6356382167345916
Precision: 0.637190875397059
Recall: 0.637043966323667


In [10]:
27414+3046+3385+25969+2886+3207
18156+2018+2242+17766+1975+2194

44351

In [7]:
print(len(train_df))
print(len(valid_df))
print(len(test_df))


# subtask1
# en
# train: 27414
# valid: 3046
# test: 3385

# es
# train: 25969
# valid: 2886
# test: 3207


# subtask2
# en
# train: 18156
# valid: 2018
# test: 2242

# es
# train: 17766
# valid: 1975
# test: 2194

25969
2886
3207


In [8]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(train_df['label'])
y_test_encoded = le.transform(test_df['label'])


dtrain = xgb.DMatrix(train_features, label=y_train_encoded)
dtest = xgb.DMatrix(test_features, label=y_test_encoded)


parameters = {
    'objective': 'multi:softmax',
    'num_class': 6,  
    'max_depth': 5,
    'eta': 0.3,
    'seed': RANDOM_SEED,  
    'silent': 1,  
}


xgb_model = xgb.train(parameters, dtrain, num_boost_round=10)


y_pred = xgb_model.predict(dtest)


accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='macro')
recall = recall_score(y_test_encoded, y_pred, average='macro')
f1 = f1_score(y_test_encoded, y_pred, average='macro')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Accuracy: 0.7087620829435609
Precision: 0.7085842652922298
Recall: 0.708762107951537
F1 Score: 0.7086111024639283
