In [1]:
import pandas as pd
import string
import re
from language_tool_python import LanguageTool
import helpers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import xgboost
import numpy as np
from scipy.sparse import csr_matrix, hstack

train_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_train_monolingual.jsonl"
val_path = "data/SemEval2024-Task8/SubtaskA/subtaskA_dev_monolingual.jsonl"
train_addon1_path = "./backtranslation_data_mono/mono_addon1.jsonl"
train_addon2_path = "./backtranslation_data_mono/mono_addon2.jsonl"
train_addon3_path = "./backtranslation_data_mono/mono_addon3.jsonl"

train_df, val_df = helpers.get_pandas_dfs(train_path, val_path)
# train_addon1_df = helpers.get_pandas_atomic_dfs(train_addon1_path)
# train_addon2_df = helpers.get_pandas_atomic_dfs(train_addon2_path)
# train_addon3_df = helpers.get_pandas_atomic_dfs(train_addon3_path)

# train_df = pd.concat([train_df, train_addon1_df, train_addon2_df, train_addon3_df], axis=0, ignore_index=True)
train_df

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,text,label
0,Forza Motorsport is a popular racing game that...,LLM
1,Buying Virtual Console games for your Nintendo...,LLM
2,Windows NT 4.0 was a popular operating system ...,LLM
3,How to Make Perfume\n\nPerfume is a great way ...,LLM
4,How to Convert Song Lyrics to a Song'\n\nConve...,LLM
...,...,...
119752,"The paper is an interesting contribution, prim...",human
119753,\nWe thank the reviewers for all their comment...,human
119754,The authors introduce a semi-supervised method...,human
119755,This paper proposes the Neural Graph Machine t...,human


In [2]:
def text_statistics(row):
    text = row["text"]
    sentences = re.split(r'[.!?]', text)
    words_per_sentence = [len(sentence.split()) for sentence in sentences]
    std_words_per_sentence = np.std(words_per_sentence)
    num_sentences = len(sentences)

    words = re.findall(r'\b\w+\b', text)
    
    num_words = len(words)
    num_characters = len(''.join(words))

    avg_word_length = num_characters / num_words if num_words > 0 else 0
    num_digits = sum(c.isdigit() for c in text)

    num_punctuations = sum(c in string.punctuation for c in text)

    num_other_characters = len(text) - num_words - num_punctuations - num_digits

    avg_words_per_sentence = num_words / num_sentences if num_sentences > 0 else 0

    row["num_sentences"] = num_sentences
    row["std_words_per_sentence"] = std_words_per_sentence
    row["num_words"] = num_words
    row["average_words_per_sentence"] = avg_words_per_sentence
    row["average_word_length"] = avg_word_length
    row["num_digits"] = num_digits
    row["num_punctuations"] = num_punctuations
    row["other_characters"] = num_other_characters

    return row

train_df = train_df.apply(text_statistics, axis=1)
train_df

Unnamed: 0,text,label,num_sentences,std_words_per_sentence,num_words,average_words_per_sentence,average_word_length,num_digits,num_punctuations,other_characters
0,Forza Motorsport is a popular racing game that...,LLM,38,8.394572,410,10.789474,4.224390,15,82,1737
1,Buying Virtual Console games for your Nintendo...,LLM,69,7.245130,693,10.043478,4.151515,30,132,2873
2,Windows NT 4.0 was a popular operating system ...,LLM,110,7.022920,939,8.536364,4.412141,79,153,4066
3,How to Make Perfume\n\nPerfume is a great way ...,LLM,68,7.833806,810,11.911765,4.572840,18,152,3749
4,How to Convert Song Lyrics to a Song'\n\nConve...,LLM,43,11.794435,585,13.604651,4.135043,9,89,2412
...,...,...,...,...,...,...,...,...,...,...
119752,"The paper is an interesting contribution, prim...",human,5,9.620811,73,14.600000,4.904110,2,14,351
119753,\nWe thank the reviewers for all their comment...,human,65,9.448008,766,11.784615,4.870757,155,262,3654
119754,The authors introduce a semi-supervised method...,human,10,7.736278,176,17.600000,5.079545,24,37,872
119755,This paper proposes the Neural Graph Machine t...,human,15,11.999259,203,13.533333,4.655172,13,25,935


In [3]:
val_df = val_df.apply(text_statistics, axis=1)
val_df

Unnamed: 0,text,label,num_sentences,std_words_per_sentence,num_words,average_words_per_sentence,average_word_length,num_digits,num_punctuations,other_characters
0,Giving gifts should always be enjoyable. Howe...,LLM,11,14.399265,191,17.363636,4.769634,0,29,925
1,Yveltal (Japanese: ユベルタル) is one of the main a...,LLM,16,6.872727,181,11.312500,4.060773,1,26,778
2,If you'd rather not annoy others by being rude...,LLM,25,6.223311,168,6.720000,4.452381,11,33,785
3,If you're interested in visiting gravesite(s) ...,LLM,18,13.348140,219,12.166667,5.082192,2,42,1112
4,The following are some tips for becoming succe...,LLM,23,5.619975,173,7.521739,4.670520,0,51,821
...,...,...,...,...,...,...,...,...,...,...
4995,The paper deals with an interesting applicatio...,human,22,13.336708,432,19.636364,4.594907,7,67,1980
4996,This manuscript tries to tackle neural network...,human,17,16.208631,395,23.235294,4.868354,22,63,1887
4997,The paper introduced a regularization scheme t...,human,15,6.331579,130,8.666667,4.807692,21,28,606
4998,Inspired by the analysis on the effect of the ...,human,16,8.365666,291,18.187500,5.360825,11,36,1557


In [4]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1, 3), stop_words='english', max_features = 30000)
tf_idf_feats = tf_idf_vect.fit_transform(train_df["text"])
other_feats = train_df.drop(columns=["text", "label"]).to_numpy()
le = LabelEncoder()
y_train = le.fit_transform(train_df["label"])
X_train = hstack((tf_idf_feats, csr_matrix(other_feats)))

In [5]:
tf_idf_feats = tf_idf_vect.transform(val_df["text"])
other_feats = val_df.drop(columns=["text", "label"]).to_numpy()
y_val = le.transform(val_df["label"])
X_val = hstack((tf_idf_feats, csr_matrix(other_feats)))

In [6]:
scale_pos_weight =  sum((~y_train.astype(bool)).astype(int)) / sum(y_train)
early_stop = xgboost.callback.EarlyStopping(
    rounds=10, metric_name='logloss', save_best=True
)
xgb = xgboost.XGBClassifier(learning_rate=0.3, n_estimators=500, callbacks=[early_stop], scale_pos_weight=scale_pos_weight)


xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)
y_pred = xgb.predict(X_val) 
print(f"XGB metrics: {helpers.calculate_metrics(y_val, y_pred)}")

[0]	validation_0-logloss:0.63123
[1]	validation_0-logloss:0.60754
[2]	validation_0-logloss:0.57903
[3]	validation_0-logloss:0.56282
[4]	validation_0-logloss:0.55411
[5]	validation_0-logloss:0.54748
[6]	validation_0-logloss:0.54909
[7]	validation_0-logloss:0.54724
[8]	validation_0-logloss:0.54575
[9]	validation_0-logloss:0.54114
[10]	validation_0-logloss:0.53842
[11]	validation_0-logloss:0.54172
[12]	validation_0-logloss:0.53525
[13]	validation_0-logloss:0.53364
[14]	validation_0-logloss:0.53227
[15]	validation_0-logloss:0.53135
[16]	validation_0-logloss:0.52976
[17]	validation_0-logloss:0.53180
[18]	validation_0-logloss:0.53229
[19]	validation_0-logloss:0.52369
[20]	validation_0-logloss:0.52451
[21]	validation_0-logloss:0.52226
[22]	validation_0-logloss:0.52223
[23]	validation_0-logloss:0.51903
[24]	validation_0-logloss:0.51839
[25]	validation_0-logloss:0.51920
[26]	validation_0-logloss:0.52708
[27]	validation_0-logloss:0.52663
[28]	validation_0-logloss:0.52316
[29]	validation_0-loglos

In [7]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier, XGBRFClassifier


random_forest = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
xgboost = XGBClassifier(learning_rate=0.3, n_estimators=100, random_state=42, scale_pos_weight=scale_pos_weight)
xgb_random_forest = XGBRFClassifier(n_estimators=100, random_state=42)

voting_classifier = VotingClassifier(
    estimators=[
        ('random_forest', random_forest),
        ('xgb_classifier', xgboost),
        ('xgb_rf_classifier', xgb_random_forest)
    ],
    voting='soft'
)


voting_classifier.fit(X_train, y_train)
y_pred = voting_classifier.predict(X_val) 
print(f"Voting Classifier metrics: {helpers.calculate_metrics(y_val, y_pred)}")

Voting Classifier metrics: {'accuracy': 0.7242, 'precision': 0.6754303599374022, 'recall': 0.8632, 'f1_score': 0.757857769973661}
