In [None]:
import pandas as pd
import umap
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, BatchNormalization, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Attention, Concatenate, Reshape
from keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l1, l2, L1L2
from tensorflow.keras.utils import to_categorical
from keras.initializers import GlorotUniform
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, log_loss, make_scorer
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import load_model
from scipy.optimize import minimize
from skopt import gp_minimize
from skopt.space import Real, Integer
import xgboost as xgb
import lightgbm as lgb
import numpy as np
from skopt import BayesSearchCV
import joblib
import json
import zipfile
import os
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from google.colab import drive
import tensorflow as tf
from tensorflow import keras
# Mount Google Drive (Colab only) so the '/content/drive/My Drive/...' paths used below resolve.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#!pip install umap-learn
#!pip install scikit-optimize

# Data Processing

In [None]:
def data_loader(path = "/content/train_data_swc.csv"):
    """Read the CSV file at ``path`` and return its contents as a pandas DataFrame."""
    return pd.read_csv(path)

In [None]:
def get_new_train_test(X_train, y_train, test_size=0.2, random_state=42):
    """Shuffle the labelled data and split it into a training part and a hold-out part.

    Parameters
    ----------
    X_train : array-like
        Feature matrix of the full labelled set.
    y_train : array-like
        Labels aligned with ``X_train``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the default keeps the original 80/20 split.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``; the default keeps the original, reproducible shuffle.

    Returns
    -------
    tuple
        ``(X_fit, X_holdout, y_fit, y_holdout)``.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_train, y_train, test_size=test_size, random_state=random_state
    )
    return X_fit, X_holdout, y_fit, y_holdout

In [None]:
def umap_feature_eng(train2, test2, test, n_components = 3, n_neighbors = 15, names = ['train2_data_umap3.csv','test2_data_umap3.csv' ,'test_data_umap3.csv'], output_dir = '/content/drive/My Drive/swc_data/', random_state = None):
    """Append UMAP embedding columns to three feature frames and save the results as CSV.

    The UMAP model is fitted on ``train2`` only; ``test2`` and ``test`` are projected
    with the fitted model, so no information from them enters the fit.

    Parameters
    ----------
    train2, test2, test : pandas.DataFrame
        Feature frames. Their indexes are preserved in the returned frames.
    n_components : int, default 3
        Number of embedding dimensions; the new columns are ``umap_1 .. umap_<n>``.
    n_neighbors : int, default 15
        Forwarded to ``umap.UMAP``.
    names : sequence of three str
        CSV file names for the augmented ``train2``, ``test2`` and ``test`` frames.
    output_dir : str
        Directory the CSV files are written to (defaults to the original Drive folder).
    random_state : int or None, default None
        Forwarded to ``umap.UMAP``. ``None`` keeps the original, unseeded behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train2_with_embeddings, test2_with_embeddings, test_with_embeddings)``.
    """
    umap_model = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors,
                           n_jobs=-1, random_state=random_state)
    train2_embeddings = umap_model.fit_transform(train2)
    print("train2 fitted")
    test2_embeddings = umap_model.transform(test2)
    print("test2 transformed")
    test_embeddings = umap_model.transform(test)
    print("test transformed")

    new_cols = [f'umap_{i+1}' for i in range(n_components)]

    def _augment_and_save(frame, embeddings, file_name):
        """Attach the embedding columns to ``frame`` row by row and write the CSV."""
        # Reuse the frame's own index: a fresh RangeIndex would make concat(axis=1)
        # align on labels and pad with NaN whenever the input index is not 0..n-1
        # (e.g. straight after a shuffled train/test split).
        embedding_df = pd.DataFrame(embeddings, columns=new_cols, index=frame.index)
        augmented = pd.concat([frame, embedding_df], axis=1)
        augmented.to_csv(os.path.join(output_dir, file_name), index=False)
        return augmented

    train2_with_embeddings = _augment_and_save(train2, train2_embeddings, names[0])
    test2_with_embeddings = _augment_and_save(test2, test2_embeddings, names[1])
    test_with_embeddings = _augment_and_save(test, test_embeddings, names[2])

    return train2_with_embeddings, test2_with_embeddings, test_with_embeddings

In [None]:
def normalize_data(X_train2, X_test2, X_test):
    """Standardise three feature frames using statistics fitted on ``X_train2`` only.

    Each result is a new DataFrame carrying the column names of ``X_train2`` and a
    fresh default index (the inputs' indexes are not kept).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train2_scaled, X_test2_scaled, X_test_scaled)``.
    """
    column_names = X_train2.columns
    standardizer = StandardScaler()

    # Fit on the training split, then reuse the fitted scaler for the other two frames.
    train_scaled = pd.DataFrame(standardizer.fit_transform(X_train2.values), columns=column_names)
    holdout_scaled, test_scaled = (
        pd.DataFrame(standardizer.transform(frame.values), columns=column_names)
        for frame in (X_test2, X_test)
    )

    return train_scaled, holdout_scaled, test_scaled


In [None]:
# Drive folder that holds every CSV read in this cell.
folder_path = '/content/drive/My Drive/swc_data/'

# Raw train/test tables and the label files of the train/hold-out split.
train_data, test_data, y_train2, y_test2 = (
    data_loader(folder_path + csv_name)
    for csv_name in ("train_data_swc.csv", "test_data_swc.csv", "y_train2.csv", "y_test2.csv")
)

# Feature tables with the 3-component UMAP columns, as written by umap_feature_eng.
X_train2_umap3, X_test2_umap3, X_test_umap3 = (
    data_loader(folder_path + csv_name)
    for csv_name in ("train2_data_umap3.csv", "test2_data_umap3.csv", "test_data_umap3.csv")
)

In [None]:
# Re-create the 80/20 feature split (random_state=42 inside get_new_train_test).
# The label halves are discarded here; y_train2 / y_test2 were read from CSV above —
# presumably written from the same split, verify before relying on row alignment.
X_train2, X_test2, _, _ = get_new_train_test(train_data.drop("y", axis = 1), train_data["y"])

In [None]:
# Standardise all three feature sets with statistics fitted on X_train2.
# NOTE: this rebinds X_train2 / X_test2, so re-running the cell scales already-scaled data.
X_train2, X_test2, X_test = normalize_data(X_train2, X_test2, test_data)

In [None]:
# Fit a 10-component UMAP on the scaled training split, append the embedding columns
# to all three frames and write them to Drive under the *_umap10.csv names.
X_train2_umap10, X_test2_umap10, X_test_umap10 = umap_feature_eng(X_train2, X_test2, X_test, n_components = 10, n_neighbors = 15, names = ['train2_data_umap10.csv','test2_data_umap10.csv' ,'test_data_umap10.csv'])

train2 fitted
test2 transformed
test transformed


## Random Forest

In [None]:
def train_rf(X_train2, y_train2, X_test2, y_test2):
    """Tune a class-weighted random forest with Bayesian search and score it on the hold-out split.

    Parameters
    ----------
    X_train2, X_test2 : pandas.DataFrame
        Training and hold-out feature frames with the same columns.
    y_train2, y_test2 : pandas.DataFrame
        Label frames; the class labels are read from their ``"y"`` column.

    Returns
    -------
    RandomForestClassifier
        The best estimator refitted by the search. It is also dumped with joblib
        into the ``swc_models`` folder on Drive.
    """
    # Standardise with statistics estimated on the training split only. Plain arrays
    # are used for fitting AND predicting so scikit-learn does not warn about feature
    # names that were absent at fit time.
    scaler = StandardScaler()
    X_train2_scaled = scaler.fit_transform(X_train2.values)
    X_test2_scaled = scaler.transform(X_test2.values)
    print("Data Normalized")

    # Series.ravel() is deprecated in recent pandas; to_numpy() gives the same 1-D array.
    y_train = y_train2["y"].to_numpy()
    y_val = y_test2["y"].to_numpy()

    # "Balanced" class weights: n_samples / (n_classes * n_samples_in_class).
    class_samples = y_train2["y"].value_counts().to_dict()
    total_samples = sum(class_samples.values())
    class_weights = {class_label: total_samples / (len(class_samples) * class_samples[class_label]) for class_label in class_samples}

    # 3-fold shuffled cross-validation used inside the search.
    n_splits = 3
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Hyperparameter search space for Bayesian optimization
    param_space = {
        'n_estimators': (100, 1500),
        'max_depth': (2, 250),
        'min_samples_split': (2, 100),
        'min_samples_leaf': (1, 100),
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_features': [None, "sqrt", "log2"]
    }

    clf = RandomForestClassifier(random_state=42, class_weight=class_weights, n_jobs=-1)

    # The built-in "neg_log_loss" scorer replaces
    # make_scorer(log_loss, greater_is_better=False, needs_proba=True): it is the same
    # metric, but `needs_proba` is deprecated in recent scikit-learn releases.
    # random_state makes the sequence of sampled candidates reproducible.
    bayes_search = BayesSearchCV(clf, param_space, cv=kf, scoring="neg_log_loss",
                                 n_iter=35, verbose=1, n_jobs=-1, random_state=42)

    print("Bayesian Search Begin")
    bayes_search.fit(X_train2_scaled, y_train)

    print("Best parameters found:")
    best_params = bayes_search.best_params_
    print(best_params)

    # Persist the refitted best estimator. The original file name lacked the dot
    # before the "joblib" extension.
    best_clf = bayes_search.best_estimator_
    random_forest_model_path = '/content/drive/My Drive/swc_models/trained_model_random_forest_umap2.joblib'
    joblib.dump(best_clf, random_forest_model_path)

    # For reference, an earlier manual configuration kept with this function:
    # n_estimators=1200, max_depth=136, min_samples_leaf=1, min_samples_split=2
    # (annotated 0.5473).

    # Hold-out evaluation
    y_pred_val = best_clf.predict(X_test2_scaled)
    y_prob_val = best_clf.predict_proba(X_test2_scaled)
    accuracy = accuracy_score(y_val, y_pred_val)
    print(f"Accuracy: {accuracy:.4f}")
    logloss = log_loss(y_val, y_prob_val)
    print(f"Log Loss: {logloss:.4f}")
    return best_clf


In [None]:
# Tune and evaluate the random forest on the 3-component UMAP feature set.
best_rf = train_rf(X_train2_umap3, y_train2, X_test2_umap3, y_test2)

## LightGBM

In [None]:
def train_lightgbm(X_train2, y_train2, X_test2, y_test2):
    """Tune a class-weighted LightGBM classifier with Bayesian search and score it on the hold-out split.

    Parameters
    ----------
    X_train2, X_test2 : pandas.DataFrame
        Training and hold-out feature frames with the same columns.
    y_train2, y_test2 : pandas.DataFrame
        Label frames; the class labels are read from their ``"y"`` column.

    Returns
    -------
    lightgbm.LGBMClassifier
        The best estimator refitted by the search. It is also dumped with joblib
        into the ``swc_models`` folder on Drive.
    """
    # Standardise with statistics estimated on the training split only; the column
    # names are kept so the fitted model sees named features.
    scaler = StandardScaler()
    features = X_train2.columns
    X_train2_scaled = pd.DataFrame(scaler.fit_transform(X_train2.values), columns=features)
    X_test2_scaled = pd.DataFrame(scaler.transform(X_test2.values), columns=features)
    print("Data Normalized")

    # Series.ravel() is deprecated in recent pandas; to_numpy() gives the same 1-D array.
    y_train = y_train2["y"].to_numpy()
    y_val = y_test2["y"].to_numpy()

    # "Balanced" class weights: n_samples / (n_classes * n_samples_in_class).
    class_samples = y_train2["y"].value_counts().to_dict()
    total_samples = sum(class_samples.values())
    class_weights = {class_label: total_samples / (len(class_samples) * class_samples[class_label]) for class_label in class_samples}

    # 3-fold shuffled cross-validation used inside the search.
    n_splits = 3
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Hyperparameter search space for Bayesian optimization
    # NOTE(review): 'subsample' presumably has no effect while subsample_freq keeps
    # its default of 0 — confirm against the LightGBM docs before trusting this axis.
    param_space = {
        'num_leaves': (2, 1500),
        'max_depth': (2, 100),
        'learning_rate': (0.01, 0.3),
        'min_child_samples': (5, 150),
        'n_estimators': (100, 1000),
        'subsample': (0.2, 1),
        'colsample_bytree': (0.2, 1),
    }

    clf = lgb.LGBMClassifier(random_state=42, class_weight=class_weights, objective='multiclass', verbose = -1)

    # The built-in "neg_log_loss" scorer replaces
    # make_scorer(log_loss, greater_is_better=False, needs_proba=True): it is the same
    # metric, but `needs_proba` is deprecated in recent scikit-learn releases.
    # random_state makes the sequence of sampled candidates reproducible.
    bayes_search = BayesSearchCV(clf, param_space, cv=kf, scoring="neg_log_loss",
                                 n_iter=30, verbose=1, n_jobs=-1, random_state=42)

    print("Bayesian Search Begin")
    bayes_search.fit(X_train2_scaled, y_train)

    print("Best parameters found:")
    best_params = bayes_search.best_params_
    print(best_params)

    # Persist the refitted best estimator.
    best_clf = bayes_search.best_estimator_
    lightgbm_model_path = '/content/drive/My Drive/swc_models/trained_model_lightgbm_umap2.joblib'
    joblib.dump(best_clf, lightgbm_model_path)

    # For reference, an earlier manual configuration kept with this function:
    # num_leaves=1000, max_depth=45, min_child_samples=87, n_estimators=192,
    # learning_rate=0.05, colsample_bytree=0.3202803930164253,
    # subsample=0.5934691217094938 (annotated 0.4832).

    # Hold-out evaluation
    y_pred_val = best_clf.predict(X_test2_scaled)
    y_prob_val = best_clf.predict_proba(X_test2_scaled)
    accuracy = accuracy_score(y_val, y_pred_val)
    print(f"Accuracy: {accuracy:.4f}")
    logloss = log_loss(y_val, y_prob_val)
    print(f"Log Loss: {logloss:.4f}")

    return best_clf

In [None]:
# Tune and evaluate LightGBM on the 10-component UMAP feature set.
best_lightgbm = train_lightgbm(X_train2_umap10, y_train2, X_test2_umap10, y_test2)

Data Normalized
Bayesian Search Begin
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits




Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits




Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


# XGBoost

In [None]:
def train_xgboost(X_train2, y_train2, X_test2, y_test2):
    """Tune a class-weighted XGBoost classifier with Bayesian search and score it on the hold-out split.

    Labels in the ``"y"`` column are shifted by -1 before training, so the model is
    fitted on zero-based classes (this assumes the labels start at 1).

    Parameters
    ----------
    X_train2, X_test2 : pandas.DataFrame
        Training and hold-out feature frames with the same columns.
    y_train2, y_test2 : pandas.DataFrame
        Label frames; the class labels are read from their ``"y"`` column.

    Returns
    -------
    xgboost.XGBClassifier
        The best estimator refitted by the search. It is also dumped with joblib
        into the ``swc_models`` folder on Drive.
    """
    # Standardise with statistics estimated on the training split only.
    scaler = StandardScaler()
    features = X_train2.columns
    X_train2_scaled = pd.DataFrame(scaler.fit_transform(X_train2.values), columns=features)
    X_test2_scaled = pd.DataFrame(scaler.transform(X_test2.values), columns=features)
    print("Data Normalized")

    # Zero-based labels (Series.ravel() is deprecated; to_numpy() gives the same array).
    y_train = y_train2["y"].to_numpy() - 1
    y_val = y_test2["y"].to_numpy() - 1

    # "Balanced" class weights: n_samples / (n_classes * n_samples_in_class).
    class_samples = (y_train2["y"] - 1).value_counts().to_dict()
    total_samples = sum(class_samples.values())
    class_weights = {class_label: total_samples / (len(class_samples) * class_samples[class_label]) for class_label in class_samples}

    # `scale_pos_weight` is a single number meant for binary problems, so the dict the
    # original code passed there never weighted the classes. For multiclass training
    # the weights have to be supplied per row through `sample_weight`.
    sample_weight = np.array([class_weights[label] for label in y_train], dtype=float)

    # 3-fold shuffled cross-validation used inside the search.
    n_splits = 3
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Hyperparameter search space for Bayesian optimization
    param_space = {
        'max_depth': (5, 50),
        'learning_rate': (0.01, 0.5),
        'min_child_weight': (2, 50),
        'n_estimators': (25, 500),
        'subsample': (0.2, 1),
        'colsample_bytree': (0.2, 1)
    }

    # Initialize XGBoost classifier
    clf = xgb.XGBClassifier(random_state=42, objective='multi:softprob', verbosity=0)

    # The built-in "neg_log_loss" scorer replaces
    # make_scorer(log_loss, greater_is_better=False, needs_proba=True): it is the same
    # metric, but `needs_proba` is deprecated in recent scikit-learn releases.
    # random_state makes the sequence of sampled candidates reproducible.
    bayes_search = BayesSearchCV(clf, param_space, cv=kf, scoring="neg_log_loss",
                                 n_iter=50, verbose=1, n_jobs=-1, random_state=42)

    # sample_weight is forwarded to XGBClassifier.fit and sliced per CV fold by the search.
    print("Bayesian Search Begin")
    bayes_search.fit(X_train2_scaled, y_train, sample_weight=sample_weight)

    print("Best parameters found:")
    best_params = bayes_search.best_params_
    print(best_params)

    # Persist the refitted best estimator.
    # NOTE(review): load_models() reads 'trained_model_xgboost_umap2.joblib', not this
    # file name — confirm which artefact the ensemble is supposed to use.
    best_clf = bayes_search.best_estimator_
    xgboost_model_path = '/content/drive/My Drive/swc_models/trained_model_xgboost_tsne_umap.joblib'
    joblib.dump(best_clf, xgboost_model_path)

    # For reference, an earlier manual configuration kept with this function:
    # max_depth=25, learning_rate=0.0923789893340372, min_child_weight=25,
    # n_estimators=296, subsample=0.8742364404441483,
    # colsample_bytree=0.3445600498256228 (annotated 0.4741).

    # Hold-out evaluation (zero-based labels on both sides)
    y_pred_val = best_clf.predict(X_test2_scaled)
    y_prob_val = best_clf.predict_proba(X_test2_scaled)
    accuracy = accuracy_score(y_val, y_pred_val)
    print(f"Accuracy: {accuracy:.4f}")
    logloss = log_loss(y_val, y_prob_val)
    print(f"Log Loss: {logloss:.4f}")

    return best_clf

In [None]:
# Tune and evaluate XGBoost.
# NOTE(review): X_train2_umap3_50 and X_test2_umap3_50 are not assigned in any cell
# visible in this notebook, so this line raises NameError on a fresh kernel unless they
# are defined elsewhere — confirm which feature set was intended.
best_xgboost = train_xgboost(X_train2_umap3_50, y_train2, X_test2_umap3_50, y_test2)

Data Normalized
Accuracy: 0.8226
Log Loss: 0.4761


# Neural Network

In [None]:
def train_neural_network(X_train2, y_train2, X_test2, y_test2):
    """Train a regularised MLP classifier, save it to Drive and evaluate it on the hold-out split.

    Parameters
    ----------
    X_train2, X_test2 : pandas.DataFrame
        Training and hold-out feature frames with the same columns.
    y_train2, y_test2 : pandas.DataFrame
        Label frames; the class labels are read from their ``"y"`` column and are
        expected to be the integers ``1..K``.

    Returns
    -------
    keras.Model
        The trained model (the original function returned nothing).
    """
    # Standardise with statistics estimated on the training split only.
    scaler = StandardScaler()
    features = X_train2.columns
    X_train2_scaled = pd.DataFrame(scaler.fit_transform(X_train2.values), columns=features)
    X_test2_scaled = pd.DataFrame(scaler.transform(X_test2.values), columns=features)

    # One-hot encode labels 1..K. to_categorical builds columns 0..K, so the unused
    # column 0 is dropped. num_classes is pinned from the training labels so both
    # matrices have the same width even if the hold-out split lacks the highest class.
    n_classes = int(y_train2["y"].max())
    y_train_onehot = to_categorical(y_train2["y"], num_classes=n_classes + 1)[:, 1:]
    y_test_onehot = to_categorical(y_test2["y"], num_classes=n_classes + 1)[:, 1:]
    print("Data Normalized")

    # Input width follows the data instead of a hard-coded 111, so the function works
    # for any of the UMAP-augmented feature sets.
    n_features = X_train2_scaled.shape[1]

    model = Sequential([
      Dense(1000, activation='relu', input_shape=(n_features,), kernel_regularizer=l1(0.0001)), # L1 regularization, coefficient 0.0001
      BatchNormalization(),
      Dropout(0.3),
      Dense(500, activation='relu', kernel_regularizer=l2(0.0001)), # L2 regularization, coefficient 0.0001
      BatchNormalization(),
      Dropout(0.2),
      Dense(250, activation='relu', kernel_regularizer=L1L2(l1=0.0001, l2=0.0001)), # L1 and L2 regularization, coefficients 0.0001
      BatchNormalization(),
      Dropout(0.1),
      Dense(125, activation='relu', kernel_regularizer=l2(0.0001)), # L2 regularization, coefficient 0.0001
      BatchNormalization(),
      Dense(n_classes, activation='softmax')  # one output unit per class
    ])

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=1e-5),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Without restore_best_weights the saved model would be the one from the last
    # epoch, i.e. `patience` epochs after the best validation loss.
    early_stopping = EarlyStopping(monitor='val_loss', patience= 40, verbose=1,
                                   restore_best_weights=True)

    # Train the model; the last 20% of the training rows are held out for val_loss.
    model.fit(X_train2_scaled, y_train_onehot, epochs=1250, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

    # Save the trained model in the native Keras format
    model.save('/content/drive/My Drive/swc_models/' + 'mlp_model_umap2.keras')
    print("Model saved successfully.")

    # Evaluate the model on the hold-out split
    loss, accuracy = model.evaluate(X_test2_scaled, y_test_onehot)
    print(f'Test Loss: {loss:.3f}, Test Accuracy: {accuracy:.3f}')

    return model


In [None]:
# Train, save and evaluate the MLP on the 3-component UMAP feature set.
train_neural_network(X_train2_umap3, y_train2, X_test2_umap3, y_test2)

# Ensemble

In [None]:
def load_models(folder_path = '/content/drive/My Drive/swc_models/'):
    """Load the four saved base models from ``folder_path``.

    Returns
    -------
    list
        ``[lightgbm_model, mlp_model, random_forest_model, xgboost_model]`` — callers
        index the list in exactly this order.
    """
    # (file name, loader) pairs in the order of the returned list; the Keras model
    # needs load_model, the other three are joblib dumps.
    artefacts = (
        ('trained_model_lightgbm_umap2.joblib', joblib.load),
        ('mlp_model_umap.keras', load_model),
        ('trained_model_random_forest_umap.joblib', joblib.load),
        ('trained_model_xgboost_umap2.joblib', joblib.load),
    )
    return [loader(folder_path + file_name) for file_name, loader in artefacts]

In [None]:
def validate(X_train2, X_test2, y_test2):
    """Score every saved base model on the hold-out split and return their class probabilities.

    ``X_test2`` is standardised with a scaler fitted on ``X_train2``. Accuracy compares
    ``argmax + 1`` with ``y_test2`` and log loss uses ``y_test2 - 1``, i.e. probability
    column ``k`` is treated as class ``k + 1``.

    Returns
    -------
    tuple
        ``(light_gbm_prob_val, rf_prob_val, mlp_prob_val, xgboost_prob_val)``.
    """
    # load_models() returns [lightgbm, mlp, random forest, xgboost].
    lightgbm_model, mlp_model, random_forest_model, xgboost_model = load_models()

    # Scale the hold-out features with training-split statistics.
    column_names = X_train2.columns
    standardizer = StandardScaler()
    standardizer.fit(X_train2.values)
    X_holdout = pd.DataFrame(standardizer.transform(X_test2.values), columns=column_names)
    print("Data Normalized")

    light_gbm_prob_val = lightgbm_model.predict_proba(X_holdout)
    rf_prob_val = random_forest_model.predict_proba(X_holdout)
    mlp_prob_val = mlp_model.predict(X_holdout)
    xgboost_prob_val = xgboost_model.predict_proba(X_holdout)

    print("Performance of Base Models:")
    probabilities_by_model = {
        "Light GBM": light_gbm_prob_val,
        "Random Forest": rf_prob_val,
        "MLP": mlp_prob_val,
        "XGBoost": xgboost_prob_val,
    }
    for name, prob_val in probabilities_by_model.items():
        accuracy = accuracy_score(y_test2, np.argmax(prob_val, axis=1) + 1)
        logloss = log_loss(y_test2 - 1, prob_val)
        print(f"{name} - Accuracy: {accuracy:.4f}, Log Loss: {logloss:.4f}")

    return light_gbm_prob_val, rf_prob_val, mlp_prob_val, xgboost_prob_val

In [None]:
def optimal_weight_finder(light_gbm_prob_val, rf_prob_val, mlp_prob_val, xgboost_prob_val, y_val=None, n_calls=175):
    """Search for the ensemble weights that minimise hold-out log loss.

    Parameters
    ----------
    light_gbm_prob_val, rf_prob_val, mlp_prob_val, xgboost_prob_val : array-like
        Class-probability matrices of the four base models on the hold-out split.
    y_val : array-like, optional
        Hold-out labels starting at 1 (they are shifted by -1 before scoring). When
        omitted, the notebook-level ``y_test2`` is used, as the original code did.
    n_calls : int, default 175
        Number of objective evaluations given to ``gp_minimize``.

    Returns
    -------
    numpy.ndarray
        Four weights summing to 1, ordered ``[lightgbm, mlp, random_forest, xgboost]``
        — note this differs from the order of the function's arguments.
    """
    if y_val is None:
        # Backward compatibility: fall back to the global the original objective read.
        y_val = y_test2
    zero_based_labels = np.asarray(y_val).ravel() - 1

    def _normalise(weights):
        """Scale weights to sum to 1; an all-zero vector becomes uniform instead of NaN."""
        weights = np.asarray(weights, dtype=float)
        total = weights.sum()
        if total <= 0:
            return np.full(weights.shape, 1.0 / weights.size)
        return weights / total

    def compute_ensemble_logloss(weights):
        weights = _normalise(weights)
        # Compute ensemble probabilities
        ensemble_proba_val = (weights[0] * light_gbm_prob_val +
                              weights[1] * mlp_prob_val +
                              weights[2] * rf_prob_val +
                              weights[3] * xgboost_prob_val)

        return log_loss(zero_based_labels, ensemble_proba_val)

    # Define the search space for weights
    space = [Real(0, 1, name='lightgbm_weight'),
             Real(0, 1, name='mlp_weight'),
             Real(0, 1, name='random_forest_weight'),
             Real(0, 1, name='xgboost_weight')]

    # Perform Bayesian Optimization
    result = gp_minimize(compute_ensemble_logloss, space, n_calls=n_calls, random_state=42, verbose = True)

    # Extract the optimal weights
    optimal_weights = _normalise(result.x)

    print("Optimal Ensemble Log Loss:", result.fun)
    print("Optimal Weights:", optimal_weights)

    return optimal_weights




In [None]:
def ensemble_weight(X_train2, X_test2, X_test, y_test2, y_train2, optimal_weights, pred_no = 1):
    """Blend the four base models on the test set and write the submission CSV.

    Parameters
    ----------
    X_train2 : pandas.DataFrame
        Training features; only used to fit the scaler applied to ``X_test``.
    X_test : pandas.DataFrame
        Test features to predict on (same columns as ``X_train2``).
    X_test2, y_test2, y_train2
        Unused; kept so existing calls keep working.
    optimal_weights : sequence of 4 floats
        Blend weights ordered ``[lightgbm, mlp, random_forest, xgboost]``.
    pred_no : int, default 1
        Suffix of the output file ``dogan_parlak_<pred_no>.csv``.

    Returns
    -------
    tuple
        ``(ensemble_proba_test, light_gbm_prob_test, rf_prob_test, mlp_prob_test,
        xgboost_prob_test)``.
    """
    # load_models() returns [lightgbm, mlp, random forest, xgboost].
    lightgbm_model, mlp_model, random_forest_model, xgboost_model = load_models()

    # Scale the test features with training-split statistics.
    scaler = StandardScaler()
    features = X_train2.columns
    scaler.fit(X_train2.values)
    X_test = pd.DataFrame(scaler.transform(X_test.values), columns=features)
    print("Data Normalized")

    # LightGBM test probabilities are read from a cached .npy file when one is present
    # and matches the test set's row count. Otherwise they are computed from the loaded
    # model, instead of crashing on a missing file or blending a stale cache.
    lightgbm_cache_path = "/content/drive/My Drive/swc_predictions/lightgbm_test_prediction.npy"
    light_gbm_prob_test = None
    if os.path.exists(lightgbm_cache_path):
        cached = np.load(lightgbm_cache_path)
        if cached.shape[0] == len(X_test):
            light_gbm_prob_test = cached
    if light_gbm_prob_test is None:
        light_gbm_prob_test = lightgbm_model.predict_proba(X_test)
    print("LightGBM predicted")
    rf_prob_test = random_forest_model.predict_proba(X_test)
    print("Random Forest predicted")
    mlp_prob_test = mlp_model.predict(X_test)
    print("MLP predicted")
    xgboost_prob_test = xgboost_model.predict_proba(X_test)
    print("XGBoost predicted")

    # Weighted average in the weight order [lightgbm, mlp, random_forest, xgboost].
    ensemble_proba_test = (optimal_weights[0] * light_gbm_prob_test +
                           optimal_weights[1] * mlp_prob_test +
                           optimal_weights[2] * rf_prob_test +
                           optimal_weights[3] * xgboost_prob_test)

    print("Final Prediction Made")
    column_names = "c1,c2,c3,c4,c5,c6,c7,c8,c9"
    np.savetxt(f"/content/drive/My Drive/swc_predictions/dogan_parlak_{pred_no}.csv", ensemble_proba_test, delimiter=',', header=column_names, comments='')
    print("Results Saved")
    return ensemble_proba_test, light_gbm_prob_test , rf_prob_test, mlp_prob_test, xgboost_prob_test

In [None]:
# Hold-out class probabilities of the four saved base models on the 3-component UMAP features.
light_gbm_prob_val, rf_prob_val, mlp_prob_val, xgboost_prob_val = validate(X_train2_umap3, X_test2_umap3, y_test2)

Data Normalized
Performance of Base Models:
Light GBM - Accuracy: 0.8208, Log Loss: 0.4700
Random Forest - Accuracy: 0.8074, Log Loss: 0.5473
MLP - Accuracy: 0.8079, Log Loss: 0.5085
XGBoost - Accuracy: 0.8218, Log Loss: 0.4685


In [None]:
# Blend weights in the order ensemble_weight indexes them: [lightgbm, mlp, random_forest, xgboost].
# The values are hard-coded — presumably copied from an earlier run of the search on the
# commented-out line below; uncomment it to recompute them.
#optimal_weights = optimal_weight_finder(light_gbm_prob_val, rf_prob_val, mlp_prob_val, xgboost_prob_val)
optimal_weights= [0.3617517,  0.31526965, 0.02926238, 0.29371628]
print(optimal_weights)

[0.3617517, 0.31526965, 0.02926238, 0.29371628]


In [None]:
# Blend the base models on the test set and write submission number 13 to Drive.
ensemble_proba_test, light_gbm_prob_test, rf_prob_test, mlp_prob_test, xgboost_prob_test =\
ensemble_weight(X_train2_umap3, X_test2_umap3, X_test_umap3, y_test2, y_train2, optimal_weights, pred_no = 13)

Data Normalized
LightGBM predicted
Random Forest predicted
MLP predicted
XGBoost predicted
Final Prediction Made
Results Saved


In [None]:
# Read back submission 13 (written by the ensemble cell above).
pred_13 = pd.read_csv(f"/content/drive/My Drive/swc_predictions/dogan_parlak_{13}.csv")

In [None]:
# Read submission 11 for comparison; this file is not produced by any cell visible here.
pred_11 = pd.read_csv(f"/content/drive/My Drive/swc_predictions/dogan_parlak_{11}.csv")

In [None]:
# Identify the predicted class for each row
predicted_class_df11 = pred_11.idxmax(axis=1)
predicted_class_df13 = pred_13.idxmax(axis=1)

In [None]:
# Filter rows where the predicted class is the same in both DataFrames
matching_rows = predicted_class_df11 == predicted_class_df13

In [None]:
# For the rows where both submissions agree on the class, compute the change in the
# probability assigned to that class (submission 13 minus submission 11).
# DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0, so the values are
# picked with positional NumPy indexing instead.
matched_positions = np.flatnonzero(matching_rows.to_numpy())
class_positions_13 = pred_13.columns.get_indexer(predicted_class_df13[matching_rows])
class_positions_11 = pred_11.columns.get_indexer(predicted_class_df11[matching_rows])
proba_diff = pred_13.to_numpy()[matched_positions, class_positions_13] - \
             pred_11.to_numpy()[matched_positions, class_positions_11]


  proba_diff = pred_13.lookup(pred_13.index[matching_rows], predicted_class_df13[matching_rows]) - \
  pred_11.lookup(pred_11.index[matching_rows], predicted_class_df11[matching_rows])


In [None]:
# Number of agreeing rows where submission 13 gives the shared predicted class a higher probability than submission 11.
np.sum(proba_diff > 0)

70979