# Process data for LightGBM

In [1]:
import pandas as pd
import numpy as np

In [25]:
# `data` supplies the 'paper' and 'prediction' columns used below;
# `labels` supplies the 'label' column that becomes the target.
data, labels = (
    pd.read_csv(csv_path) for csv_path in ("lightgbm.csv", "bert_dataset.csv")
)

In [7]:
# Peek at the first target value. The frame loaded above is `labels`;
# `label` is not defined at this point on a fresh kernel.
labels.loc[0, 'label']

0

In [32]:
# Collapse the per-row predictions in `data` into one feature row per paper:
# each paper's 'prediction' values become prediction_0..prediction_{k-1},
# right-padded with NaN up to the longest paper, followed by its label.
grouped = data.groupby('paper')

# Maximum number of data points in any group
max_len = int(grouped.size().max())

# One feature column per prediction slot, plus the target column
columns = [f'prediction_{i}' for i in range(max_len)] + ['label']
print(columns)

# Collect all rows first and build the frame once: assigning `df.loc[name]`
# inside the loop re-allocates the frame on every iteration and starts from
# untyped (object) columns.
paper_ids = []
rows = []
for name, group in grouped:
    paper_predictions = group["prediction"].to_numpy(dtype=float)
    n_missing = max_len - len(paper_predictions)
    padded = np.pad(paper_predictions, (0, n_missing), constant_values=np.nan)
    # NOTE(review): this is a label-based lookup on `labels`' index, so it
    # presumably relies on the 'paper' keys matching that index -- confirm.
    paper_label = labels.loc[name, 'label']
    paper_ids.append(name)
    rows.append(np.append(padded, paper_label))

# Rows are indexed by the paper key, exactly as `df.loc[name] = ...` produced.
df = pd.DataFrame(rows, index=paper_ids, columns=columns)

['prediction_0', 'prediction_1', 'prediction_2', 'prediction_3', 'prediction_4', 'prediction_5', 'prediction_6', 'prediction_7', 'prediction_8', 'prediction_9', 'prediction_10', 'prediction_11', 'prediction_12', 'prediction_13', 'prediction_14', 'prediction_15', 'prediction_16', 'prediction_17', 'prediction_18', 'prediction_19', 'prediction_20', 'prediction_21', 'prediction_22', 'prediction_23', 'prediction_24', 'prediction_25', 'prediction_26', 'prediction_27', 'prediction_28', 'prediction_29', 'prediction_30', 'prediction_31', 'prediction_32', 'prediction_33', 'prediction_34', 'prediction_35', 'prediction_36', 'prediction_37', 'prediction_38', 'prediction_39', 'prediction_40', 'prediction_41', 'prediction_42', 'prediction_43', 'prediction_44', 'prediction_45', 'prediction_46', 'prediction_47', 'prediction_48', 'prediction_49', 'prediction_50', 'prediction_51', 'prediction_52', 'prediction_53', 'prediction_54', 'prediction_55', 'prediction_56', 'prediction_57', 'prediction_58', 'predi

In [36]:
# Sanity check: count missing values in the target and in the first
# prediction slot.
for checked_column in ('label', 'prediction_0'):
    print(df[checked_column].isna().sum())

0
0


# Use LightGBM

In [59]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [44]:
# Features are every column except 'label'; 'label' is the target.
feature_columns = [column for column in df.columns if column != 'label']
X = df[feature_columns]
y = df['label']

# Hold out 20% of the papers for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [46]:
# Wrap both splits as LightGBM datasets. The held-out set is built with
# `reference=lgb_train`, tying it to the training dataset.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_test = lgb.Dataset(
    data=X_test,
    label=y_test,
    reference=lgb_train,
)

In [47]:
# Training configuration (fixed values, no hyperparameter search).
params = dict(
    objective='binary',
    metric='binary_logloss',  # 'auc' is the alternative
    boosting='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

In [50]:
# Fit the booster for at most 100 rounds, stopping early once the validation
# score has not improved for 10 consecutive rounds.
# NOTE(review): `lgb_test` drives early stopping here and the same X_test is
# scored later, so the reported metrics may be optimistic -- consider a
# separate validation split.
early_stop = lgb.early_stopping(stopping_rounds=10)
bst = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_test],
    callbacks=[early_stop],
)

[LightGBM] [Info] Number of positive: 248, number of negative: 246
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000965 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 494, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502024 -> initscore=0.008097
[LightGBM] [Info] Start training from score 0.008097
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[72]	training's binary_logloss: 0.126215	valid_1's binary_logloss: 0.179049


In [61]:
# Persist only the trees up to the early-stopped best iteration.
# NOTE(review): the filename is spelled 'lightbgm' (not 'lightgbm'); kept
# as-is because anything that loads the model must use the same name.
bst.save_model(
    filename='lightbgm_model.txt',
    num_iteration=bst.best_iteration,
)

<lightgbm.basic.Booster at 0x7f4bb4e74f90>

In [52]:
# Score the held-out features using the best iteration found during training.
predictions = bst.predict(
    data=X_test,
    num_iteration=bst.best_iteration,
)

In [57]:
def compute_metrics(pred, labels):
    """Score predicted values against binary ground-truth labels.

    Args:
        pred: Iterable of predicted scores. A score strictly greater than
            0.5 is mapped to class 1, anything else to class 0.
        labels: Ground-truth binary labels, aligned element-wise with ``pred``.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    # Threshold the `pred` argument. The previous version read the
    # notebook-level `predictions` variable, so the argument was ignored.
    preds = [1 if score > 0.5 else 0 for score in pred]
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [60]:
# Accuracy / F1 / precision / recall of the model outputs against y_test.
compute_metrics(pred=predictions, labels=y_test)

{'accuracy': 0.9435483870967742,
 'f1': 0.943089430894309,
 'precision': 0.9354838709677419,
 'recall': 0.9508196721311475}

In [55]:
# Threshold the model outputs at 0.5 to get hard 0/1 predictions
binary_predictions = [int(score > 0.5) for score in predictions]

# Report plain accuracy on the held-out labels
accuracy = accuracy_score(y_test, binary_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9435483870967742


# Predict new papers --> flagging new papers

In [None]:
# TODO: the chunks of each paper need to be combined back together after chunking