# Gradient boosting model

In [1]:
import pandas as pd
import numpy as np

# Process training data
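Build the training data for the gradient boosting model: one row per paper, containing the prediction for each of its text chunks followed by the paper-level label.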

Example:
| DOI | Prediction 1 | ... | Prediction n | Paper label |
|----------|----------|----------|----------|----------|
| doi.org/... | Chunk label 1 | ... | Chunk label n | label |
| doi.org/... | Chunk label 1 | ... | Chunk label n | label |
| doi.org/... | Chunk label 1 | ... | Chunk label n | label |


In [None]:
data = pd.read_csv("lightgbm.csv") # Text chunks with labels
labels = pd.read_csv("bert_dataset.csv") # Labels for full-text papers

# Combine datasets where each row consists of predictions for all text chunks in a paper and the corresponding label
grouped = data.groupby('paper')

# Largest number of chunks found in any single paper; this fixes the feature width
max_len = int(grouped.size().max())

# One feature column per chunk position, followed by the paper-level label
columns = [f'prediction_{i}' for i in range(max_len)]
columns.append('label')

# Collect every row first and build the DataFrame once at the end.
# Inserting rows one by one with df.loc[...] forces pandas to re-allocate
# the whole frame on each insert, which scales quadratically with the paper count.
paper_ids = []
rows = []
for name, group in grouped:
    predictions = group["prediction"].values.astype(float)
    # NOTE(review): `name` is looked up in the index of `labels`, which is the
    # default index produced by read_csv above. This presumably relies on the
    # 'paper' values matching that index - confirm against bert_dataset.csv.
    label = labels.loc[name, 'label']
    # Right-pad shorter papers with NaN so every row has exactly max_len features
    entry = np.pad(predictions, (0, max_len - len(predictions)), constant_values=np.nan)
    rows.append(np.append(entry, label))
    paper_ids.append(name)

# Rows are indexed by the paper identifier, exactly as the per-row inserts did
df = pd.DataFrame(rows, index=paper_ids, columns=columns)

# Train model 
Train the gradient boosting model using LightGBM.

In [59]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Target vector: the paper-level label column
y = df['label']
# Feature matrix: every remaining column of df
X = df.drop(columns=['label'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap both splits as LightGBM datasets; the test set is created with the
# training set as its reference
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_test = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [None]:
# Hand-picked LightGBM settings (no hyperparameter tuning has been done)
params = dict(
    objective='binary',
    metric='binary_logloss',  # or 'auc'
    boosting='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# Early-stopping callback configured with a patience of 10 rounds
early_stop = lgb.early_stopping(stopping_rounds=10)

# Fit up to 100 boosting rounds, evaluating on both the training and test datasets
bst = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_test],
    callbacks=[early_stop],
)

# Write the model to disk, truncated at the best iteration recorded on the booster
bst.save_model('lightbgm_model.txt', num_iteration=bst.best_iteration)

In [52]:
# Score the held-out test features with the model limited to its best iteration
best_round = bst.best_iteration
predictions = bst.predict(X_test, num_iteration=best_round)

In [57]:
# Compute accuracy, F1, precision and recall from predicted scores
def compute_metrics(pred, labels, threshold=0.5):
    """Turn predicted scores into binary classes and score them against labels.

    Args:
        pred: Iterable of predicted scores, one per sample.
        labels: Ground-truth binary labels, in the same order as ``pred``.
        threshold: A score strictly greater than this value is mapped to
            class 1, anything else to class 0. Defaults to 0.5.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    # Binarize the scores passed in by the caller. The loop variable must not
    # reuse the name `pred`, and the notebook-level `predictions` variable must
    # not be read here, otherwise the argument is silently ignored.
    preds = [1 if score > threshold else 0 for score in pred]
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [60]:
# Score the test-set predictions against the held-out labels
compute_metrics(pred=predictions, labels=y_test)

{'accuracy': 0.9435483870967742,
 'f1': 0.943089430894309,
 'precision': 0.9354838709677419,
 'recall': 0.9508196721311475}

In [55]:
# Threshold the predicted scores at 0.5 to obtain hard 0/1 class labels
binary_predictions = [int(score > 0.5) for score in predictions]

# Report the fraction of test samples that were classified correctly
accuracy = accuracy_score(y_test, binary_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9435483870967742
