In [1]:
import pandas as pd
import string
import re
from language_tool_python import LanguageTool
import helpers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import xgboost
import numpy as np
from scipy.sparse import csr_matrix, hstack

train_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_train_monolingual.jsonl"
val_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_dev_monolingual.jsonl"

train_df, val_df = helpers.get_pandas_dfs(train_path, val_path)
train_df

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,text,label
0,Forza Motorsport is a popular racing game that...,LLM
1,Buying Virtual Console games for your Nintendo...,LLM
2,Windows NT 4.0 was a popular operating system ...,LLM
3,How to Make Perfume\n\nPerfume is a great way ...,LLM
4,How to Convert Song Lyrics to a Song'\n\nConve...,LLM
...,...,...
119752,"The paper is an interesting contribution, prim...",human
119753,\nWe thank the reviewers for all their comment...,human
119754,The authors introduce a semi-supervised method...,human
119755,This paper proposes the Neural Graph Machine t...,human


In [3]:
def text_statistics(row):
    text = row["text"]
    sentences = re.split(r'[.!?]', text)
    num_sentences = len(sentences)

    words = re.findall(r'\b\w+\b', text)
    
    num_words = len(words)
    num_characters = len(''.join(words))

    avg_word_length = num_characters / num_words if num_words > 0 else 0
    num_digits = sum(c.isdigit() for c in text)

    num_punctuations = sum(c in string.punctuation for c in text)

    num_other_characters = len(text) - num_words - num_punctuations - num_digits

    avg_words_per_sentence = num_words / num_sentences if num_sentences > 0 else 0

    row["num_sentences"] = num_sentences
    row["num_words"] = num_words
    row["average_words_per_sentence"] = avg_words_per_sentence
    row["average_word_length"] = avg_word_length
    row["num_digits"] = num_digits
    row["num_punctuations"] = num_punctuations
    row["other_characters"] = num_other_characters

    return row

train_df = train_df.apply(text_statistics, axis=1)
train_df

CalledProcessError: Command '['/usr/bin/java', '-version']' returned non-zero exit status 1.

In [6]:
val_df = val_df.apply(text_statistics, axis=1)
val_df

Unnamed: 0,text,label,num_sentences,num_words,average_words_per_sentence,average_word_length,num_digits,num_punctuations,other_characters
0,Giving gifts should always be enjoyable. Howe...,LLM,11,191,17.363636,4.769634,0,29,925
1,Yveltal (Japanese: ユベルタル) is one of the main a...,LLM,16,181,11.312500,4.060773,1,26,778
2,If you'd rather not annoy others by being rude...,LLM,25,168,6.720000,4.452381,11,33,785
3,If you're interested in visiting gravesite(s) ...,LLM,18,219,12.166667,5.082192,2,42,1112
4,The following are some tips for becoming succe...,LLM,23,173,7.521739,4.670520,0,51,821
...,...,...,...,...,...,...,...,...,...
4995,The paper deals with an interesting applicatio...,human,22,432,19.636364,4.594907,7,67,1980
4996,This manuscript tries to tackle neural network...,human,17,395,23.235294,4.868354,22,63,1887
4997,The paper introduced a regularization scheme t...,human,15,130,8.666667,4.807692,21,28,606
4998,Inspired by the analysis on the effect of the ...,human,16,291,18.187500,5.360825,11,36,1557


In [7]:
tf_idf = TfidfVectorizer(ngram_range=(1, 3), stop_words='english', max_features = 30000)
tf_idf_feats = tf_idf.fit_transform(train_df["text"])
other_feats = train_df.drop(columns=["text", "label"]).to_numpy()
le = LabelEncoder()
y_train = le.fit_transform(train_df["label"])
X_train = hstack((tf_idf_feats, csr_matrix(other_feats)))

In [9]:
tf_idf_feats = tf_idf.transform(val_df["text"])
other_feats = val_df.drop(columns=["text", "label"]).to_numpy()
y_val = le.transform(val_df["label"])
X_val = hstack((tf_idf_feats, csr_matrix(other_feats)))

In [10]:
scale_pos_weight =  sum((~y_train.astype(bool)).astype(int)) / sum(y_train)
scale_pos_weight

0.8903726855140408

In [12]:
early_stop = xgboost.callback.EarlyStopping(
    rounds=10, metric_name='logloss', save_best=True
)
xgb = xgboost.XGBClassifier(learning_rate=0.3, n_estimators=500, callbacks=[early_stop], scale_pos_weight=scale_pos_weight)

xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)
y_pred = xgb.predict(X_val) 
print(f"XGB metrics: {helpers.calculate_metrics(y_val, y_pred)}")

[0]	validation_0-logloss:0.65839
[1]	validation_0-logloss:0.62201
[2]	validation_0-logloss:0.60333
[3]	validation_0-logloss:0.58537
[4]	validation_0-logloss:0.57875
[5]	validation_0-logloss:0.56585
[6]	validation_0-logloss:0.55028
[7]	validation_0-logloss:0.54853
[8]	validation_0-logloss:0.53837
[9]	validation_0-logloss:0.54190
[10]	validation_0-logloss:0.54052
[11]	validation_0-logloss:0.54261
[12]	validation_0-logloss:0.53650
[13]	validation_0-logloss:0.53934
[14]	validation_0-logloss:0.53308
[15]	validation_0-logloss:0.53199
[16]	validation_0-logloss:0.53016
[17]	validation_0-logloss:0.52998
[18]	validation_0-logloss:0.52404
[19]	validation_0-logloss:0.51847
[20]	validation_0-logloss:0.52140
[21]	validation_0-logloss:0.51809
[22]	validation_0-logloss:0.52310
[23]	validation_0-logloss:0.51724
[24]	validation_0-logloss:0.51871
[25]	validation_0-logloss:0.51430
[26]	validation_0-logloss:0.51664
[27]	validation_0-logloss:0.51368
[28]	validation_0-logloss:0.51516
[29]	validation_0-loglos