In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

from collections import defaultdict

import re
import string

In [2]:
# Report how many GPU devices TensorFlow can see on this machine
print(
    "Num GPUs Available: ",
    len(tf.config.experimental.list_physical_devices(device_type='GPU')),
)

Num GPUs Available:  1


In [3]:
import configparser
import time

# Epoch timestamp (as text) used below as a suffix for the model file names.
current_time = str(time.time())

# Load the notebook settings from the INI file.
# ConfigParser.read() silently skips a missing file; in that case the lookups
# below fail with KeyError rather than a "file not found" error.
config = configparser.ConfigParser()
config.read('FakeNewsModel_txt_sum.ini')
print(f'Configuration Sections: {config.sections()}')

# File names under which each model is saved / reloaded by later cells.
model_rnn_bidirectional_fname  = config['DEFAULT']['model_rnn_bidirectional_fname']
model_rnn_unidirectional_fname = config['DEFAULT']['model_rnn_unidirectional_fname']

model_cnn_fname  = config['DEFAULT']['model_cnn_fname']
model_randforest_fname = config['DEFAULT']['model_randforest_fname']
model_adaboost_fname   = config['DEFAULT']['model_adaboost_fname']
model_xgboost_fname    = config['DEFAULT']['model_xgboost_fname']
model_bagging_fname    = config['DEFAULT']['model_bagging_fname']
model_gradboost_fname  = config['DEFAULT']['model_gradboost_fname']
model_svm_fname        = config['DEFAULT']['model_svm_fname']
model_lgr_fname        = config['DEFAULT']['model_lgr_fname']


# Kept as the raw INI string (it is not parsed into a list here).
# NOTE(review): the column drop later in this notebook uses a hard-coded list
# rather than this setting — confirm which one is intended.
DropFeatures = config['FEATURE_PROCESSING']['DropFeatures']
print(f'DropFeatures: {DropFeatures}, type: {type(DropFeatures)}')

# Per-model on/off switches. They stay plain strings; later cells compare
# them against the literal 'TRUE'.
Proc_LogisticRegression     = config['MODELS']['LogisticRegression'    ]
Proc_SVMGridSearch          = config['MODELS']['SVMGridSearch'         ]
Proc_SVMRandomizedSearch    = config['MODELS']['SVMRandomizedSearch'   ]
Proc_KNNearestNeighbors     = config['MODELS']['KNNearestNeighbors'    ]
Proc_DecisionTreeClassifier = config['MODELS']['DecisionTreeClassifier']
Proc_DeepNeuralNetworks     = config['MODELS']['DeepNeuralNetworks'    ]
Proc_ConvolutionalNetworks  = config['MODELS']['ConvolutionalNetworks' ]
Proc_LanguagePatternsAlg    = config['MODELS']['LanguagePatternsAlg'   ]
Proc_BayesianOptimization   = config['MODELS']['BayesianOptimization'  ]

print(f'LogisticRegression     : {Proc_LogisticRegression    }')
print(f'SVMGridSearch          : {Proc_SVMGridSearch         }')
print(f'SVMRandomizedSearch    : {Proc_SVMRandomizedSearch   }')
print(f'KNNearestNeighbors     : {Proc_KNNearestNeighbors    }')
print(f'DecisionTreeClassifier : {Proc_DecisionTreeClassifier}')
print(f'DeepNeuralNetworks     : {Proc_DeepNeuralNetworks    }')
print(f'ConvolutionNetworks    : {Proc_ConvolutionalNetworks }')
print(f'LanguagePatternsAlg    : {Proc_LanguagePatternsAlg   }')
print(f'BayesianOptimization   : {Proc_BayesianOptimization  }')

# Timestamped pickle names.
# NOTE(review): the *input* file name also carries the current timestamp, so it
# presumably never matches a file written by an earlier run — confirm intent.
model_outFile = config['DEFAULT']['model_outputFile'] + current_time + '.pkl'
model_inFile  = config['DEFAULT']['model_inputFile' ] + current_time + '.pkl'
model_prefix_    = config['DEFAULT']['model_prefix'    ]
print(f'Model Output File: {model_outFile}')

# Verbosity level handed to GridSearchCV.
grid_search_verbose = int(config['PROCESS']['gridSearchVerbose'])
print(f'grid_search_verbose: {grid_search_verbose}')

# Passed to train_test_split as test_size.
dataset_split = float(config['DEFAULT']['train_test_split'])
print(f'dataset_split: {dataset_split}')

# CSV paths of the two source datasets.
file_true = config['DATASET']['TrueFile']
file_fake = config['DATASET']['FakeFile']

# Training settings for the neural-network models (hard-coded, not read from the INI file).
nn_models_epochs          = 20
nn_models_batchsize       = 128
nn_models_validationSplit = 0.2

Configuration Sections: ['DATASET', 'FEATURE_PROCESSING', 'EDA', 'PROCESS', 'MODELS']
DropFeatures: 'title','subject','date', type: <class 'str'>
LogisticRegression     : TRUE
SVMGridSearch          : TRUE
SVMRandomizedSearch    : TRUE
KNNearestNeighbors     : FALSE
DecisionTreeClassifier : TRUE
DeepNeuralNetworks     : FALSE
ConvolutionNetworks    : FALSE
LanguagePatternsAlg    : FALSE
BayesianOptimization   : FALSE
Model Output File: FakeNewsModel_1728521642.8647237.pkl
grid_search_verbose: 3
dataset_split: 0.35


In [4]:
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional
# Fraction of each CSV to keep; 1.0 keeps every row (sample() then only shuffles).
dataset_fraction = 1.0

# Read both datasets and draw the requested fraction in random order.
# No random_state is given, so the row order differs on every run.
data_fake = pd.read_csv( file_fake ).sample(frac=dataset_fraction)
data_true = pd.read_csv( file_true ).sample(frac=dataset_fraction)

print(f'data_fake input: {data_fake.shape}')
print(f'data_true input: {data_true.shape}')

# Target label: 0 = fake article, 1 = true article.
data_fake['class'] = 0
data_true['class'] = 1


data_fake input: (23481, 10)
data_true input: (21417, 10)


In [5]:
# Custom function to split text
def split_text(text):
    """Split a leading "publisher" prefix off an article body.

    If a hyphen occurs within the first 30 characters, everything before the
    first hyphen is returned as the publisher and everything after it as the
    body (neither part is stripped). Otherwise the publisher is the string
    'None' and the text is returned unchanged.

    Args:
        text: Article text. Non-string values (for example the float NaN that
            pandas produces for an empty CSV cell) are treated as missing.

    Returns:
        tuple[str, str]: (publisher, body).
    """
    # Missing cells would otherwise fail on slicing with a TypeError and abort
    # the whole DataFrame.apply; map them to "no publisher, empty body".
    if not isinstance(text, str):
        return 'None', ''

    if '-' not in text[:30]:
        return 'None', text

    # partition() cuts at the first hyphen only, exactly like split('-', 1).
    publisher, _, body = text.partition('-')
    return publisher, body
    
# Split every article into a 'publisher' prefix and the remaining body.
# The body overwrites the 'text' column, so re-running this cell would apply
# split_text again to text that has already been split.
data_fake[['publisher', 'text']] = data_fake['text'].apply(lambda x: pd.Series(split_text(x)))
data_true[['publisher', 'text']] = data_true['text'].apply(lambda x: pd.Series(split_text(x)))

# Row counts per class, used by the balancing step in the next cell.
num_fake = len(data_fake)
num_true = len(data_true)



In [6]:
# Balance the two classes by down-sampling the larger one.
# Find the minimum number of records
min_records = min(num_fake, num_true)

# Determine which DataFrame has more records
if num_fake > num_true:
    # More fake than true rows: keep all true rows, sample the fake ones down.
    data_fake_sampled = data_fake.sample(n=min_records)
    data = pd.concat([data_true, data_fake_sampled], ignore_index=True)
    
else:
    # Otherwise min_records == num_fake: keep all fake rows and sample the true ones.
    data_true_sampled = data_true.sample(n=min_records)
    data = pd.concat([data_fake, data_true_sampled], ignore_index=True)

# Print the shape of the new DataFrame
print(f'New DataFrame shape: {data.shape}')
# Shuffle so the two classes are interleaved (no random_state: order differs per run).
data = data.sample(frac=1).reset_index(drop=True)

# Drop every listed column; the later cells read only 'text', 'summary' and 'class'.
# NOTE(review): this list is hard-coded; the DropFeatures setting read from the
# config file is not used here.
data = data.drop(columns=['title','subject','date','sentiment','emotion','objectivity','intent','assertions','publisher'])

New DataFrame shape: (42834, 12)


In [7]:
# Text filters for cleanup text passages
def wordopt( text ):
    """Normalise a raw text passage for vectorisation.

    Steps, in order: lower-case; drop "[...]" spans; drop URLs; drop
    "<...>" tags; turn every non-word character into a space; drop
    underscores (the only punctuation left at that point); drop every token
    that contains a digit.

    URLs and tags are removed *before* the non-word pass. The other way
    round, "://", "." and "<>" have already become spaces, the URL and tag
    patterns can never match, and fragments such as "https", "com" or tag
    names leak into the vocabulary.

    Args:
        text (str): Raw passage.

    Returns:
        str: Cleaned passage. Whitespace is not collapsed, so runs of spaces
        may remain.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so this pass also replaces them;
    # no separate newline removal is needed afterwards.
    text = re.sub(r'\W', ' ', text)
    # After the \W pass only "_" can still match (it counts as a word character).
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text



In [8]:
##########################################################################
#  Data Preparation                                                      #
#  Prepare the data frame in formats for each type of model used         #
#  a) Split dataset into Train / Test                                    #
#  b) TF-IDF vectorization transformation of Training and Testing data   #
#  c) Transform Training / Testing Data into Dense Arrays                #
#  d) Reshape dense data arrays for input to RNN models                  #
##########################################################################
# Clean the article text with wordopt (lower-casing plus regex filters; it does
# not remove stop words). Only 'text' is cleaned; 'summary' is used as loaded.
data['text'] = data['text'].apply(wordopt)
x = data[['text', 'summary']]
y = data['class']

# NOTE(review): no random_state, so the split is re-drawn on every run, while
# later cells reload previously saved models — confirm this is acceptable.
Xv_0_train, Xv_0_test, y_train, y_test = train_test_split(x,y,test_size=dataset_split)
# Train on the full 'text' column but evaluate on the 'summary' column.
# NOTE(review): presumably deliberate (the model file names in the recorded
# output carry "txt_sum") — confirm.
Xv_train = Xv_0_train['text']
Xv_test  = Xv_0_test['summary']


# The TF-IDF vocabulary is capped at max_features terms and fitted on the
# training texts only; the test texts are projected onto that vocabulary.
max_features = 5000
vectorization = TfidfVectorizer( max_features=max_features )
X_train = vectorization.fit_transform( Xv_train )
X_train_dense = X_train.toarray()

X_test  = vectorization.transform( Xv_test )
X_test_dense = X_test.toarray()

y_train = np.array(y_train)
y_test  = np.array(y_test )

# Insert a length-1 middle axis so every sample is a single "time step" for the RNN models.
X_train_reshaped = X_train_dense.reshape((X_train_dense.shape[0], 1, X_train_dense.shape[1]))  # Shape: (num_samples, 1, num_features)
X_test_reshaped  = X_test_dense.reshape((X_test_dense.shape[0],   1, X_test_dense.shape[1]))   # Shape: (num_samples, 1, num_features)



In [9]:
from keras.models import Sequential
from keras.layers import Embedding, Dropout, Bidirectional, LSTM, GRU, Dense
from scipy.sparse import csr_matrix

# Vocabulary / sequence sizing
max_words = 30_000          # cap on the number of words kept in the vocabulary
max_len = 2_000             # sequence length after padding / truncating

embedding_dim = 128         # size of each word-embedding vector

# One row per evaluated model is appended to model_results by the later cells;
# model_results_headers names the columns of those rows.
model_results = list()
model_results_headers = [
    'Model',
    'Training Time',
    'Testing Time',
    'Accuracy',
    'Precision(weighted)',
    'Recall(weighted)',
    'f1-score(weighted)',
]

In [10]:
# RNN Models
#   a) BiLSTM, BiGRU - BiDirectional  64-LSTM, 32-GRU hidden layers
#   b) LSTM, GRU     - UniDirectional 64-LSTM, 32-GRU hidden layers

from keras.models import load_model, Sequential, Model
from keras.layers import Dense, Conv1D, MaxPooling1D, LSTM, Dropout, Flatten, Input, Embedding
from keras.layers import Bidirectional

def create_model_Rnn_BiLSTM_BiGRU( modelType, max_words=max_words, embedding_dim=embedding_dim ):
    """Build and compile the bidirectional LSTM -> GRU binary classifier.

    Layer stack: Dropout, Bidirectional LSTM(64, sequences kept), Dropout,
    Bidirectional GRU(32), Dropout, Dense(4, relu), Dropout, Dense(1, sigmoid).
    Every Dropout uses a rate of 0.2.

    Args:
        modelType: Label that is only echoed in the creation message.
        max_words: Accepted but not used by this builder.
        embedding_dim: Accepted but not used by this builder.

    Returns:
        The compiled Sequential model (adam, binary cross-entropy, accuracy).
    """
    print(f'Creating model type: {modelType}')

    drop_rate = 0.2
    stack = (
        Dropout(drop_rate),
        Bidirectional(LSTM(64, return_sequences=True)),
        Dropout(drop_rate),
        Bidirectional(GRU(32)),
        Dropout(drop_rate),
        Dense(4, activation='relu'),
        Dropout(drop_rate),
        Dense(1, activation='sigmoid'),  # single probability for the binary target
    )

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

def create_model_Rnn_LSTM_GRU( modelType, max_words=max_words, embedding_dim=embedding_dim ):
    """Build and compile the unidirectional LSTM -> GRU binary classifier.

    Layer stack: Dropout, LSTM(64, sequences kept), Dropout, GRU(32), Dropout,
    Dense(4, relu), Dropout, Dense(1, sigmoid). Every Dropout uses a rate of 0.2.

    Args:
        modelType: Label that is only echoed in the creation message.
        max_words: Accepted but not used by this builder.
        embedding_dim: Accepted but not used by this builder.

    Returns:
        The compiled Sequential model (adam, binary cross-entropy, accuracy).
    """
    print(f'Creating model type: {modelType}')

    drop_rate = 0.2
    stack = (
        Dropout(drop_rate),
        LSTM(64, return_sequences=True),
        Dropout(drop_rate),
        GRU(32),
        Dropout(drop_rate),
        Dense(4, activation='relu'),
        Dropout(drop_rate),
        Dense(1, activation='sigmoid'),
    )

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

# CNN Models
#   a) CNN + RNN + Transformer

def create_model_CnnRnnTransformer(input_shape):
    """Build and compile a Conv1D -> LSTM -> self-attention binary classifier.

    Pipeline: two Conv1D(kernel 5, relu) + MaxPooling1D(2) stages with 128 and
    64 filters, LSTM(128), Dropout(0.2), Bidirectional LSTM(128), multi-head
    self-attention (4 heads, key_dim 128), global average pooling, Dense(64,
    relu), Dropout(0.5) and a single sigmoid output unit.

    Args:
        input_shape: Per-sample input shape passed to keras Input.

    Returns:
        The compiled functional Model (adam, binary cross-entropy, accuracy).
    """
    inputs = Input(shape=input_shape)

    # Convolutional front end: 128 filters, then 64, each followed by pooling
    features = inputs
    for n_filters in (128, 64):
        features = Conv1D(filters=n_filters, kernel_size=5, activation='relu')(features)
        features = MaxPooling1D(pool_size=2)(features)

    # Recurrent stack: plain LSTM, dropout, then a bidirectional LSTM;
    # both keep the full sequence for the attention layer below
    sequence = LSTM(128, return_sequences=True)(features)
    sequence = Dropout(0.2)(sequence)
    sequence = Bidirectional(LSTM(128,return_sequences=True))(sequence)

    # Self-attention: the sequence serves as both query and value
    attended = tf.keras.layers.MultiHeadAttention(num_heads=4, key_dim=128)(sequence, sequence)

    # Average over the sequence axis to get one fixed-size vector per sample
    pooled = tf.keras.layers.GlobalAveragePooling1D()(attended)

    # Classification head
    hidden = Dropout(0.5)(Dense(64, activation='relu')(pooled))
    outputs = Dense(1, activation='sigmoid')(hidden)

    classifier = Model(inputs=inputs, outputs=outputs)
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return classifier

In [11]:
############################################################################################
#  Executing Neural Network Models - a) RNN + BiLSTM  + BiGRU                              #
#                                    b) RNN + UniLSTM + UniGRU                             #
#  epochs    : 20                                                                          #
#  batch_size: 128                                                                         #
############################################################################################

import os
import joblib
from keras.models import load_model

# Saved-model file name -> builder used when no saved model exists yet
models = { model_rnn_bidirectional_fname: create_model_Rnn_BiLSTM_BiGRU, model_rnn_unidirectional_fname: create_model_Rnn_LSTM_GRU }

# Label each builder echoes in its "Creating model type" message, so the
# unidirectional model is no longer announced as the bidirectional one.
model_type_labels = {
    model_rnn_bidirectional_fname : "RNN_BiLSTM_BiGRU",
    model_rnn_unidirectional_fname: "RNN_LSTM_GRU",
}

for model_name, model_function in models.items():
    if os.path.exists(model_name):
        # NOTE(review): a reloaded model keeps training below on a split that is
        # re-drawn on every run (train_test_split has no random_state), so rows of
        # this run's test set may have been training rows earlier and the reported
        # test metrics can be optimistic — confirm.
        model = load_model(model_name)
        print(f'model: {model_name} loaded ...')
    else:
        model = model_function(model_type_labels[model_name], max_words=max_words, embedding_dim=embedding_dim)

    # Train (or continue training) and time it
    start_time = time.time()
    history = model.fit(X_train_reshaped, y_train,
                        epochs=nn_models_epochs,
                        batch_size=nn_models_batchsize,
                        validation_split=nn_models_validationSplit,
                        verbose=1)
    train_time = time.time() - start_time
    print(f'\nTraining Time: {train_time:.2f} seconds\n')

    # Only the evaluate() call counts as testing time
    start_time = time.time()
    test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test)
    test_time = time.time() - start_time
    print(f'Test Accuracy: {test_accuracy:.4f}')

    # Sigmoid probabilities -> hard 0/1 labels at a 0.5 threshold
    y_pred = (model.predict(X_test_reshaped) > 0.50).astype("int32")
    print(f"{model_name}: Classification Report:\n", classification_report(y_test, y_pred))
    print(f"{model_name}: Accuracy Score: {accuracy_score(y_test, y_pred):.4f}")

    report = classification_report(y_test, y_pred, output_dict=True)
    precision = report['weighted avg']['precision']
    recall    = report['weighted avg']['recall']
    # Not named f1_score: that would overwrite the function imported from sklearn.metrics
    f1_weighted = report['weighted avg']['f1-score']

    model_results.append([model_name, f'{train_time:.2f} sec', f'{test_time:.2f} sec', f'{test_accuracy:.4f}',f'{precision:.4f}',f'{recall:.4f}',f'{f1_weighted:.4f}'])
    model.save(model_name)


model: rnn_bidirectional_txt_sum.h5 loaded ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Training Time: 42.01 seconds

Test Accuracy: 0.9207
rnn_bidirectional_txt_sum.h5: Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92      7490
           1       0.92      0.92      0.92      7502

    accuracy                           0.92     14992
   macro avg       0.92      0.92      0.92     14992
weighted avg       0.92      0.92      0.92     14992

rnn_bidirectional_txt_sum.h5: Accuracy Score: 0.9207
model: rnn_unidirectional_txt_sum.h5 loaded ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoc

In [12]:
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Saved-model file name -> scikit-learn ensemble class to instantiate when no
# saved model exists yet.
models_cpu = {
    model_randforest_fname : RandomForestClassifier,
    model_adaboost_fname   : AdaBoostClassifier,
#    model_bagging_fname    : BaggingClassifier,
    model_gradboost_fname  : GradientBoostingClassifier
}
for model_name in models_cpu:
    print(f'model_name: {model_name}')

model_n_estimators=10
for model_name, model_function in models_cpu.items():
    print(f'#####  {model_name}  #####')

    if os.path.exists(model_name):
        # joblib.load unpickles the file: only load model files you trust.
        # NOTE(review): the fit() call below presumably retrains these estimators
        # from scratch, so the reloaded object mainly carries its
        # hyper-parameters — confirm.
        model = joblib.load(model_name)
        print(f'model: {model_name} loaded ...')
    else:
        model = model_function(n_estimators=model_n_estimators)

    start_time = time.time()
    # Use incremental fitting when the estimator offers it, a regular fit otherwise
    if hasattr(model, 'partial_fit'):
        # partial_fit needs the full set of class labels up front
        print(f'partial training fit')
        classes = np.unique(y_train)
        model.partial_fit(X_train_dense, y_train, classes=classes)
    else:
        print(f'regular training fit')
        model.fit(X_train_dense, y_train)

    train_time = time.time() - start_time
    print(f'Training Time: {train_time:.2f} seconds\n')

    start_time = time.time()
    y_pred = model.predict(X_test_dense)
    test_time = time.time() - start_time

    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f'{conf_matrix}')
    print(f'{classification_report(y_test, y_pred)}')

    test_accuracy = accuracy_score(y_test,y_pred)

    report = classification_report(y_test,y_pred,output_dict=True)
    precision = report['weighted avg']['precision']
    recall    = report['weighted avg']['recall']
    # Not named f1_score: that would overwrite the function imported from sklearn.metrics
    f1_weighted = report['weighted avg']['f1-score']

    model_results.append([model_name, f'{train_time:.2f} sec', f'{test_time:.2f} sec', f'{test_accuracy:.4f}',f'{precision:.4f}',f'{recall:.4f}',f'{f1_weighted:.4f}'])

    joblib.dump( model, model_name )


model_name: randforest_txt_sum.h5
model_name: adaboost_txt_sum.h5
model_name: gradboost_txt_sum.h5
#####  randforest_txt_sum.h5  #####
model: randforest_txt_sum.h5 loaded ...
regular training fit
Training Time: 5.80 seconds

[[6609  881]
 [1279 6223]]
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      7490
           1       0.88      0.83      0.85      7502

    accuracy                           0.86     14992
   macro avg       0.86      0.86      0.86     14992
weighted avg       0.86      0.86      0.86     14992

#####  adaboost_txt_sum.h5  #####
model: adaboost_txt_sum.h5 loaded ...
regular training fit




Training Time: 28.17 seconds

[[5614 1876]
 [1181 6321]]
              precision    recall  f1-score   support

           0       0.83      0.75      0.79      7490
           1       0.77      0.84      0.81      7502

    accuracy                           0.80     14992
   macro avg       0.80      0.80      0.80     14992
weighted avg       0.80      0.80      0.80     14992

#####  gradboost_txt_sum.h5  #####
model: gradboost_txt_sum.h5 loaded ...
regular training fit
Training Time: 69.92 seconds

[[6177 1313]
 [2354 5148]]
              precision    recall  f1-score   support

           0       0.72      0.82      0.77      7490
           1       0.80      0.69      0.74      7502

    accuracy                           0.76     14992
   macro avg       0.76      0.76      0.75     14992
weighted avg       0.76      0.76      0.75     14992



In [13]:
##### XgBoost - CUDA #####
import xgboost as xgb
import os

model_name = model_xgboost_fname

# Wrap the dense TF-IDF matrices in XGBoost's own data container
dtrain = xgb.DMatrix(X_train_dense, label=y_train)
dtest  = xgb.DMatrix(X_test_dense,  label=y_test )

params = {
    'objective'  : 'binary:logistic',  # binary target, predictions are probabilities
    'tree_method': 'hist',             # histogram-based tree construction
    'device'     : 'cuda',             # run on the GPU
    'max_depth'  : 4,
    # 'predictor' is intentionally absent: the recorded run warned that this
    # XGBoost version does not use it ("Parameters: { "predictor" } are not used.").
}

start_time = time.time()
model = xgb.train(params, dtrain, num_boost_round=50)
train_time = time.time() - start_time
print(f'Training Time: {train_time:.2f} seconds\n')

start_time = time.time()

# Probabilities -> hard 0/1 labels at a 0.5 threshold
y_pred = (model.predict(dtest) > 0.5).astype(int)

test_time = time.time() - start_time

# Confusion matrix and classification report
print("Confusion Matrix for XGBoost (GPU):")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

test_accuracy = accuracy_score(y_test,y_pred)

report = classification_report(y_test,y_pred,output_dict=True)
precision = report['weighted avg']['precision']
recall    = report['weighted avg']['recall']
# Not named f1_score: that would overwrite the function imported from sklearn.metrics
f1_weighted = report['weighted avg']['f1-score']

model_results.append([model_name, f'{train_time:.2f} sec', f'{test_time:.2f} sec', f'{test_accuracy:.4f}',f'{precision:.4f}',f'{recall:.4f}',f'{f1_weighted:.4f}'])

joblib.dump( model,model_name )

Parameters: { "predictor" } are not used.



Training Time: 23.29 seconds

Confusion Matrix for XGBoost (GPU):
[[6576  914]
 [1388 6114]]
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      7490
           1       0.87      0.81      0.84      7502

    accuracy                           0.85     14992
   macro avg       0.85      0.85      0.85     14992
weighted avg       0.85      0.85      0.85     14992



['xgboost_txt_sum.h5']

In [14]:
def ModelClassification_GridSearch(classifier, classifier_text, X_train,y_train,X_test,y_test,param_grid):
    """Tune a classifier with a 10-fold grid search, evaluate it, and record the result.

    The best estimator found by GridSearchCV (scored on accuracy) predicts
    X_test; a row with timings and weighted metrics is appended to the
    notebook-level ``model_results`` list, and the estimator is written to
    disk with joblib under the name ``classifier_text``.

    Args:
        classifier: Unfitted scikit-learn estimator to tune.
        classifier_text: Display name for the results table; also used as the
            output file name for the saved estimator.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        param_grid: Parameter grid passed to GridSearchCV.

    Returns:
        dict: ``classification_report(..., output_dict=True)`` for the test set.
    """
    # Training time covers the entire search, including the final refit
    start_time = time.time()
    grid_search = GridSearchCV(classifier, param_grid, cv=10, n_jobs=-1, scoring='accuracy',verbose=grid_search_verbose)
    grid_search.fit(X_train, y_train)
    best_estimator = grid_search.best_estimator_
    train_time = time.time() - start_time

    start_time = time.time()
    y_pred = best_estimator.predict(X_test)
    test_time = time.time() - start_time

    model_accuracy = accuracy_score(y_test, y_pred)
    report_classification = classification_report(y_test, y_pred, output_dict=True)

    weighted = report_classification['weighted avg']
    precision = weighted['precision']
    recall    = weighted['recall']
    f1_weighted = weighted['f1-score']

    # Record this model's own accuracy. The notebook-level `test_accuracy`
    # belongs to whichever model an earlier cell evaluated last.
    model_results.append([classifier_text, f'{train_time:.2f} sec', f'{test_time:.2f} sec', f'{model_accuracy:.4f}',f'{precision:.4f}',f'{recall:.4f}',f'{f1_weighted:.4f}'])

    joblib.dump( best_estimator, classifier_text )

    return report_classification



In [15]:
# Support-vector classifier: search over C and gamma
if Proc_SVMGridSearch == 'TRUE':
    param_grid_svc = dict(C=[0.1, 1, 10], gamma=[1, 0.1, 0.01])
    classifier_gridsearch_report = ModelClassification_GridSearch(
        SVC(), 'SVMGridSearch',
        X_train, y_train, X_test, y_test,
        param_grid_svc,
    )

# Logistic regression: search over C and the solver
if Proc_LogisticRegression == 'TRUE':
    param_grid_lr = dict(C=[0.1, 1, 10], solver=['liblinear', 'saga'])
    classifier_gridsearch_report = ModelClassification_GridSearch(
        LogisticRegression(), 'Logistic Regression GridSearch',
        X_train, y_train, X_test, y_test,
        param_grid_lr,
    )
        
    

Fitting 10 folds for each of 9 candidates, totalling 90 fits
Fitting 10 folds for each of 6 candidates, totalling 60 fits


In [16]:
from tabulate import tabulate

# Render the collected per-model metrics as a bordered text table
print(tabulate(model_results, headers=model_results_headers, tablefmt='pretty'))

+--------------------------------+---------------+--------------+----------+---------------------+------------------+--------------------+
|             Model              | Training Time | Testing Time | Accuracy | Precision(weighted) | Recall(weighted) | f1-score(weighted) |
+--------------------------------+---------------+--------------+----------+---------------------+------------------+--------------------+
|  rnn_bidirectional_txt_sum.h5  |   42.01 sec   |   2.17 sec   |  0.9207  |       0.9207        |      0.9207      |       0.9207       |
| rnn_unidirectional_txt_sum.h5  |   27.74 sec   |   1.50 sec   |  0.9173  |       0.9174        |      0.9173      |       0.9173       |
|     randforest_txt_sum.h5      |   5.80 sec    |   0.21 sec   |  0.8559  |       0.8569        |      0.8559      |       0.8558       |
|      adaboost_txt_sum.h5       |   28.17 sec   |   1.33 sec   |  0.7961  |       0.7986        |      0.7961      |       0.7956       |
|      gradboost_txt_sum.h5