This file contains an experiment in which the statistical features and the extracted embeddings are combined.

In [3]:
!pip install autogluon
!pip install lightgbm
!pip install pygraphviz

Collecting pygraphviz
  Using cached pygraphviz-1.13.tar.gz (104 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pygraphviz
  Building wheel for pygraphviz (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pygraphviz: filename=pygraphviz-1.13-cp310-cp310-linux_x86_64.whl size=168497 sha256=30c1cd7b9ab70505dc2382cdd007285e5bfc417ce3a0fcb0d5f6821fb66f7487
  Stored in directory: /root/.cache/pip/wheels/c5/96/10/6c25add1fffc368b1927252bf73b63fcb938de8f4486e23691
Successfully built pygraphviz
Installing collected packages: pygraphviz
Successfully installed pygraphviz-1.13


In [4]:
from tqdm import tqdm


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pickle
with open('/content/drive/MyDrive/multitude_split/train_embeddingsmdeberta-2.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)

with open('/content/drive/MyDrive/multitude_split/test_embeddingsmdeberta-2.pkl', 'rb') as f:
    test_embeddings = pickle.load(f)

In [7]:
import pandas as pd
# Convert embeddings into DataFrame
train_embeddings_df = pd.DataFrame(train_embeddings).reset_index(drop=True)
test_embeddings_df = pd.DataFrame(test_embeddings).reset_index(drop=True)

In [8]:
data = pd.read_csv('/content/drive/MyDrive/multitude_split/dataset_all.csv')
train = data[data.split == "train"]
test = data[data.split == "test"]

In [9]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [10]:
train_embeddings_df['label'] = train['label']
test_embeddings_df['label'] = test['label']

In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

statistical_features = ['word_count', 'unique_word_count', 'char_count', 'avg_word_length',
       'ttr', 'hapax_legomenon', 'sentence_count', 'avg_sentence_length',
       'avg_sentence_complexity', 'punctuation_count', 'noun_count',
       'stopword_count', 'verb_count', 'adj_count', 'adv_count',
       'complex_sentence_count', 'question_mark_count',
       'exclamation_mark_count', 'flesch_reading_ease', 'gunning_fog_index',
       'first_person_pronoun_count', 'person_entity_count',
       'date_entity_count', 'uniqueness_bigram', 'uniqueness_trigram',
       'syntax_variety']
# Assuming you have a 'split' column to distinguish between train and test data
train_data = data[data['split'] == 'train']
test_data = data[data['split'] == 'test']

# Fit and transform the training data
train_data[statistical_features] = scaler.fit_transform(train_data[statistical_features])

# Transform the test data based on the scaler fitted to the training data
test_data[statistical_features] = scaler.transform(test_data[statistical_features])

# Extract features and labels
train_data = train_data[['word_count', 'unique_word_count', 'char_count', 'avg_word_length',
       'ttr', 'hapax_legomenon', 'sentence_count', 'avg_sentence_length',
       'avg_sentence_complexity', 'punctuation_count', 'noun_count',
       'stopword_count', 'verb_count', 'adj_count', 'adv_count',
       'complex_sentence_count', 'question_mark_count',
       'exclamation_mark_count', 'flesch_reading_ease', 'gunning_fog_index',
       'first_person_pronoun_count', 'person_entity_count',
       'date_entity_count', 'uniqueness_bigram', 'uniqueness_trigram',
       'syntax_variety', 'label']]

test_data = test_data[['word_count', 'unique_word_count', 'char_count', 'avg_word_length',
       'ttr', 'hapax_legomenon', 'sentence_count', 'avg_sentence_length',
       'avg_sentence_complexity', 'punctuation_count', 'noun_count',
       'stopword_count', 'verb_count', 'adj_count', 'adv_count',
       'complex_sentence_count', 'question_mark_count',
       'exclamation_mark_count', 'flesch_reading_ease', 'gunning_fog_index',
       'first_person_pronoun_count', 'person_entity_count',
       'date_entity_count', 'uniqueness_bigram', 'uniqueness_trigram',
       'syntax_variety', 'label']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[statistical_features] = scaler.fit_transform(train_data[statistical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[statistical_features] = scaler.transform(test_data[statistical_features])


In [16]:
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

In [17]:
print(train_data.shape)
print(test_data.shape)
print(train_embeddings_df.shape)
print(test_embeddings_df.shape)

(44786, 27)
(29295, 27)
(44786, 769)
(29295, 769)


In [18]:
train_data = pd.concat([train_data.drop(columns=['label']), train_embeddings_df], axis=1)
test_data = pd.concat([test_data.drop(columns=['label']), test_embeddings_df], axis=1)

In [19]:
train_data.head()

Unnamed: 0,word_count,unique_word_count,char_count,avg_word_length,ttr,hapax_legomenon,sentence_count,avg_sentence_length,avg_sentence_complexity,punctuation_count,...,759,760,761,762,763,764,765,766,767,label
0,0.125737,0.158537,0.110176,0.048981,0.768652,0.614286,0.052632,0.031683,0.067629,0.035831,...,-0.373299,-0.365936,-0.555313,-0.026099,0.262471,0.250163,-0.029813,0.458692,-0.62851,1
1,0.561886,0.448171,0.534047,0.055545,0.504326,0.376712,0.175439,0.049288,0.043773,0.104235,...,0.112705,-0.111735,-0.620949,0.267168,0.343089,0.459632,-0.257986,0.627208,-0.190547,1
2,0.923379,0.731707,0.855394,0.053752,0.502433,0.386555,0.263158,0.055526,0.038802,0.153094,...,0.337295,0.048234,-0.631433,0.758225,0.165882,0.613361,-0.216776,0.901802,0.253716,1
3,0.75835,0.609756,0.607881,0.043773,0.509419,0.369898,0.192982,0.061202,0.035125,0.18241,...,0.165922,-0.368102,-0.471804,0.533132,0.438845,0.547391,-0.443912,0.351721,0.308104,1
4,0.045187,0.073171,0.042081,0.051239,0.895295,0.793103,0.0,0.054066,0.039867,0.019544,...,-0.456476,-0.279839,-0.582763,-0.053895,0.305651,0.256392,-0.088894,0.541938,-0.709234,1


In [20]:
X_train = train_data.drop('label', axis=1)
y_train = train_data['label']
X_test = test_data.drop('label', axis=1)
y_test = test_data['label']

In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

# count the number of features
n_features = X_train.shape[1]

# define the model
model = Sequential()
model.add(Dense(units=256, activation='relu', input_shape=(n_features,)))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# print the model summary
model.summary()

# fit the model
model.fit(X_train, y_train, epochs=100, batch_size=64, verbose=1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9591 - loss: 0.1130
Epoch 2/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9689 - loss: 0.0820
Epoch 3/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9713 - loss: 0.0759
Epoch 4/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9732 - loss: 0.0719
Epoch 5/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9746 - loss: 0.0661
Epoch 6/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9734 - loss: 0.0683
Epoch 7/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9747 - loss: 0.0647
Epoch 8/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9735 - loss: 0.0677
Epoch 9/100
[1m700/700[0m [32

<keras.src.callbacks.history.History at 0x7daa92d71210>

In [22]:
# evaluate the model
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test Accuracy: %.3f' % acc)

Test Accuracy: 0.958


In [23]:
pred_prob = model.predict(X_test)
predictions = (pred_prob > 0.5).astype(int)

[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


In [24]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
true_labels = y_test
cr = classification_report(true_labels, predictions, digits=4, zero_division=0)
cm = confusion_matrix(true_labels, predictions)
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]
# Fall out or false positive rate
FPR = FP/(FP+TN) if (FP+TN) > 0 else 0
# False negative rate
FNR = FN/(TP+FN) if (TP+FN) > 0 else 0

roc = roc_auc_score(true_labels, predictions)
roc_prob = roc_auc_score(true_labels, pred_prob)

print(cm)
print(cr)
print(f"FPR: {FPR}")
print(f"FNR: {FNR}")
print(f"ROC: {roc}")
print(f"ROC_prob: {roc_prob}")

[[ 2476   760]
 [  475 25584]]
              precision    recall  f1-score   support

           0     0.8390    0.7651    0.8004      3236
           1     0.9712    0.9818    0.9764     26059

    accuracy                         0.9578     29295
   macro avg     0.9051    0.8735    0.8884     29295
weighted avg     0.9566    0.9578    0.9570     29295

FPR: 0.23485784919653893
FNR: 0.01822786753137112
ROC: 0.8734571416360449
ROC_prob: 0.9714192942695264


In [25]:
save_path = '/content/drive/MyDrive/multitude_split/autogluon_combined_features'
from autogluon.tabular import TabularPredictor
label_column = 'label'

predictor = TabularPredictor(label='label', path=save_path, eval_metric='f1_macro',
                             problem_type='binary',  # Adjust according to your specific problem, e.g., 'binary', 'multiclass'
                             verbosity=2)

In [26]:
predictor.fit(train_data=train_data, presets='best_quality', ag_args_fit={'num_gpus': 1})

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       9.04 GB / 12.67 GB (71.3%)
Disk Space Avail:   44.18 GB / 100.00 GB (44.2%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of th

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7daa936f27a0>

In [28]:
# Evaluate and print the leaderboard
performance = predictor.evaluate(test_data, auxiliary_metrics=True)
print("Model performance on test data:", performance)

Model performance on test data: {'f1_macro': 0.8933355906013467, 'accuracy': 0.9585253456221198, 'balanced_accuracy': 0.8891325562877166, 'mcc': 0.7867508564275396, 'roc_auc': 0.9792767847194332, 'f1': 0.9767228001609288, 'precision': 0.9752467671589257, 'recall': 0.9782033078782763}


In [29]:
leaderboard = predictor.leaderboard(
    test_data,
    silent=True,
    extra_metrics=['accuracy', 'roc_auc', 'f1_macro', 'f1_weighted']
)
leaderboard

Unnamed: 0,model,score_test,accuracy,roc_auc,f1_macro,f1_weighted,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT_BAG_L2,0.893336,0.958525,0.979277,0.893336,0.9583,0.937759,f1_macro,185.575509,282.449452,2258.68794,4.602788,0.72154,416.614708,2,True,7
1,WeightedEnsemble_L3,0.893336,0.958525,0.979277,0.893336,0.9583,0.937759,f1_macro,185.588224,282.464545,2261.956976,0.012715,0.015093,3.269036,3,True,9
2,LightGBMXT_BAG_L1,0.887448,0.957092,0.979524,0.887448,0.956424,0.936906,f1_macro,6.895828,1.245612,658.001169,6.895828,1.245612,658.001169,1,True,3
3,WeightedEnsemble_L2,0.887448,0.957092,0.979524,0.887448,0.956424,0.936906,f1_macro,6.901165,1.268593,659.932845,0.005337,0.022981,1.931676,2,True,6
4,LightGBM_BAG_L1,0.884952,0.956443,0.97851,0.884952,0.955607,0.935656,f1_macro,4.306035,1.019752,662.368795,4.306035,1.019752,662.368795,1,True,4
5,RandomForestGini_BAG_L1,0.881192,0.956033,0.971746,0.881192,0.954656,0.926385,f1_macro,1.382456,21.977498,517.313219,1.382456,21.977498,517.313219,1,True,5
6,KNeighborsDist_BAG_L1,0.88052,0.954361,0.939264,0.88052,0.953697,0.917638,f1_macro,82.815081,128.547228,2.576011,82.815081,128.547228,2.576011,1,True,2
7,KNeighborsUnif_BAG_L1,0.88052,0.954361,0.938951,0.88052,0.953697,0.917638,f1_macro,85.573322,128.937822,1.814038,85.573322,128.937822,1.814038,1,True,1
8,LightGBM_BAG_L2,0.47077,0.889537,0.96273,0.47077,0.837535,0.471963,f1_macro,184.273649,282.080709,1982.44514,3.300928,0.352796,140.371908,2,True,8


In [30]:
best_model = predictor.get_model_best()
best_model

  best_model = predictor.get_model_best()


'WeightedEnsemble_L3'

In [31]:
predictions = predictor.predict(test_data.drop(columns=['label']), model=best_model)
predictions_prob = predictor.predict_proba(test_data.drop(columns=['label']), model=best_model)

In [32]:
true_labels = test_data['label']
pred_prob = []
for i in range(len(predictions_prob)):
  pred_prob.append(predictions_prob[true_labels[i]][i])

In [33]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
true_labels = test_embeddings_df['label']
cr = classification_report(true_labels, predictions, digits=4, zero_division=0)
cm = confusion_matrix(true_labels, predictions)
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]
# Fall out or false positive rate
FPR = FP/(FP+TN) if (FP+TN) > 0 else 0
# False negative rate
FNR = FN/(TP+FN) if (TP+FN) > 0 else 0

roc = roc_auc_score(true_labels, predictions)
roc_prob = roc_auc_score(true_labels, pred_prob)

print(cm)
print(cr)
print(f"FPR: {FPR}")
print(f"FNR: {FNR}")
print(f"ROC: {roc}")
print(f"ROC_prob: {roc_prob}")

[[ 2589   647]
 [  568 25491]]
              precision    recall  f1-score   support

           0     0.8201    0.8001    0.8099      3236
           1     0.9752    0.9782    0.9767     26059

    accuracy                         0.9585     29295
   macro avg     0.8977    0.8891    0.8933     29295
weighted avg     0.9581    0.9585    0.9583     29295

FPR: 0.199938195302843
FNR: 0.02179669212172378
ROC: 0.8891325562877167
ROC_prob: 0.9220517221759448


In [34]:
!pip install autokeras

Collecting autokeras
  Downloading autokeras-2.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting keras-tuner>=1.4.0 (from autokeras)
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting keras-nlp>=0.8.0 (from autokeras)
  Downloading keras_nlp-0.14.2-py3-none-any.whl.metadata (6.8 kB)
Collecting tensorflow-text (from keras-nlp>=0.8.0->autokeras)
  Downloading tensorflow_text-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting kt-legacy (from keras-tuner>=1.4.0->autokeras)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading autokeras-2.0.0-py3-none-any.whl (122 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.7/122.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading keras_nlp-0.14.2-py3-none-any.whl (571 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.0/572.0 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading keras_tuner-1

In [35]:
import autokeras as ak

In [36]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

In [37]:
clf = ak.AutoModel(
    ak.Input(shape=(X_train.shape[1],)),
    outputs=ak.ClassificationHead(num_classes=2),
    overwrite=True,
    max_trials=60
)

clf.fit(X_train, y_train, epochs=50)

Trial 60 Complete [00h 03m 21s]
val_loss: 0.05798732861876488

Best val_loss So Far: 0.057020679116249084
Total elapsed time: 03h 07m 26s
Epoch 1/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9458 - loss: 0.1326
Epoch 2/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9683 - loss: 0.0817
Epoch 3/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9712 - loss: 0.0759
Epoch 4/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9723 - loss: 0.0724
Epoch 5/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9730 - loss: 0.0699
Epoch 6/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9735 - loss: 0.0683
Epoch 7/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9742 - loss: 0.0667
Epoch 8/

<keras.src.callbacks.history.History at 0x7daaf397d690>

In [38]:
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

In [39]:
predictions_prob = clf.predict(X_test)
predictions = (predictions_prob > 0.5).astype(int)

  saveable.load_own_variables(weights_store.get(inner_path))


[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m916/916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [40]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
true_labels = y_test
cr = classification_report(true_labels, predictions, digits=4, zero_division=0)
cm = confusion_matrix(true_labels, predictions)
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]
# Fall out or false positive rate
FPR = FP/(FP+TN) if (FP+TN) > 0 else 0
# False negative rate
FNR = FN/(TP+FN) if (TP+FN) > 0 else 0

roc = roc_auc_score(true_labels, predictions)
roc_prob = roc_auc_score(true_labels, predictions_prob)

print(cm)
print(cr)
print(f"FPR: {FPR}")
print(f"FNR: {FNR}")
print(f"ROC: {roc}")
print(f"ROC_prob: {roc_prob}")

[[ 2621   615]
 [  700 25359]]
              precision    recall  f1-score   support

           0     0.7892    0.8100    0.7995      3236
           1     0.9763    0.9731    0.9747     26059

    accuracy                         0.9551     29295
   macro avg     0.8828    0.8915    0.8871     29295
weighted avg     0.9557    0.9551    0.9554     29295

FPR: 0.1900494437577256
FNR: 0.026862120572546912
ROC: 0.8915442178348637
ROC_prob: 0.8915442178348637


In [None]:
model = clf.export_model()
model.save('/content/drive/MyDrive/multitude_split/autokeras_combined_features')
