In [1]:
RANDOM_STATE = 2025


In [2]:
import kagglehub
import pandas as pd
import numpy as np

# Download latest version
path = kagglehub.dataset_download("agungpambudi/network-malware-detection-connection-analysis")

print("Path to dataset files:", path)


RELOAD_DATA = False
if not RELOAD_DATA:
  try:
    print(df.head())
  except Exception as e:
    print("No dataframe.  Loading data...")
    RELOAD_DATA=True
if RELOAD_DATA:
  dataframes = []
  import os
  for dirname, _, filenames in os.walk(path):
    for index, filename in enumerate(filenames):
      full_path = os.path.join(dirname, filename)
      print(f"Using file: {full_path}")
      dataframes.append(pd.read_csv(full_path, sep ="|"))
  df = pd.concat(dataframes, ignore_index=True)


Path to dataset files: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-analysis/versions/3
No dataframe.  Loading data...
Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-analysis/versions/3/CTU-IoT-Malware-Capture-21-1conn.log.labeled.csv
Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-analysis/versions/3/CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv
Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-analysis/versions/3/CTU-IoT-Malware-Capture-60-1conn.log.labeled.csv
Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-analysis/versions/3/CTU-IoT-Malware-Capture-42-1conn.log.labeled.csv
Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-ana

  dataframes.append(pd.read_csv(full_path, sep ="|"))


Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-analysis/versions/3/CTU-IoT-Malware-Capture-8-1conn.log.labeled.csv
Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-analysis/versions/3/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv
Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-analysis/versions/3/CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv
Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-analysis/versions/3/CTU-IoT-Malware-Capture-44-1conn.log.labeled.csv
Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connection-analysis/versions/3/CTU-IoT-Malware-Capture-9-1conn.log.labeled.csv
Using file: /home/craig-wilkinson/.cache/kagglehub/datasets/agungpambudi/network-malware-detection-connectio

In [3]:

SKIPPED_COLUMNS = [
  'ts', 'uid', 'id.orig_h', 'id.resp_h', 'tunnel_parents', 'detailed-label']
ONE_HOT_COLUMNS = ['proto', 'service', 'conn_state', 'local_orig', 'local_resp', 'history', ]
NUMERIC_COLUMNS = [
   'id.orig_p', 'id.resp_p', #??
   'duration', 'orig_bytes',
     'resp_bytes', 'missed_bytes', 
   'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
]
LABEL_COLUMN = ['label']

def process_data(df, sample_count, test_pct):
  df = df[ONE_HOT_COLUMNS + NUMERIC_COLUMNS + LABEL_COLUMN]

  if sample_count:
    df = df.sample(n=sample_count, random_state=42).copy()
  for col in NUMERIC_COLUMNS:
    df.loc[:, col] = pd.to_numeric(df[col], errors='coerce')
    df.loc[:, col] = df[col].fillna(-1)

  df_test = df.sample(frac=test_pct)
  features_test = df_test.drop(columns=LABEL_COLUMN)
  y_test = pd.DataFrame(index=df_test.index)
  y_test['label'] = np.where(df_test[LABEL_COLUMN[0]] == 'Benign', 1, 0)
  
  df_train = df.drop(df_test.index)
  features_train = df_train.drop(columns=LABEL_COLUMN)
  y_train = pd.DataFrame(index=df_train.index)
  
  #y_train['label'] = (1 if df_train[LABEL_COLUMN[0]] == 'Benign' else 0)
  y_train['label'] = np.where(df_train[LABEL_COLUMN[0]] == 'Benign', 1, 0)

  return (
      features_train.reset_index(drop=True), 
      y_train.reset_index(drop=True), 
      features_test.reset_index(drop=True), 
      y_test.reset_index(drop=True)
  )


In [4]:
# @title Question 1 - b
from numpy.random.mtrand import f
import sklearn
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

def train_svm(df_features_train, df_y_train, C=1, kernel="linear"):
  preprocessor = make_column_transformer(
      (OneHotEncoder(handle_unknown='ignore', sparse_output=False), ONE_HOT_COLUMNS),
      (MinMaxScaler(), NUMERIC_COLUMNS),
      remainder='passthrough'
  )
  preprocessor.fit(df_features_train)
  print(f"Total input features (original): {df_features_train.shape[1]}")
  feature_names = preprocessor.get_feature_names_out()
  print(f"Total output features (post-encoding): {len(feature_names)}")
  # Train the model
  classifier = make_pipeline(
      preprocessor,
      SVC(kernel=kernel,
          C=C
          )
  )
  classifier.fit(df_features_train, df_y_train)

  return classifier

def train_bagging(df_features_train, df_y_train, max_depth=2, max_trees=50):

  preprocessor = make_column_transformer(
      (OneHotEncoder(handle_unknown='ignore'), ONE_HOT_COLUMNS),
      remainder='passthrough'
  )
  tree_classifier = sklearn.tree.DecisionTreeClassifier(max_depth=max_depth)

  bagging_classifier = make_pipeline(
      preprocessor,
      sklearn.ensemble.BaggingClassifier(
          estimator=tree_classifier,
          n_estimators=max_trees,
          max_samples=0.5,
          bootstrap=False,
          n_jobs=4,
          random_state=RANDOM_STATE)
  )
  bagging_classifier.fit(df_features_train, df_y_train)

  return bagging_classifier


In [5]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import MiniBatchKMeans
import sys
from sklearn.metrics import classification_report, confusion_matrix
VERBOSE = False
class ClassifierBase:
    def __init__(self, df_features_train, df_y_train):
        usable_numeric_columns = NUMERIC_COLUMNS.copy()
        # Any numeric column with only one value, will result in division by zero
        # during normalization, and should just be removed.
        for col in NUMERIC_COLUMNS:
            if df_features_train[col].max(axis=0) == df_features_train[col].min(axis=0):
                if VERBOSE:
                    print("DROPPING COL!!!!!!!!!!!!!!")
                    print(col)
                df_features_train = df_features_train.drop(columns=[col])
                usable_numeric_columns.remove(col)

        self.df_features_train = df_features_train
        self.df_y_train = df_y_train
        self.preprocessor = make_column_transformer(
            (OneHotEncoder(handle_unknown='ignore'), ONE_HOT_COLUMNS),
            (MinMaxScaler(), usable_numeric_columns),
            remainder='passthrough'
        )
        self.preprocessor.fit(df_features_train)
        feature_names = self.preprocessor.get_feature_names_out()
        if VERBOSE:
            print(f"Final feature names: {feature_names}")
    def preprocess(self, df_features):
        X_transformed = self.preprocessor.transform(df_features)
        print(f"Global feature space dimensionality: {X_transformed.shape[1]}")
        return X_transformed
    
class SvmKMeansClassifier(ClassifierBase, BaseEstimator, ClassifierMixin):
    def __init__(self, df_features_train, df_y_train):
        super().__init__(df_features_train, df_y_train)
        self.classifier = None
    
    def transform_kmeans(self, features, labels, kmeans_clusters_per_class):
        features_train_svm = []
        y_train_svm = []

        unique_labels = np.unique(labels["label"])
        print(unique_labels)
        for label in unique_labels:
            # Filter data for this specific class
            class_data = features[(labels['label'] == label).values]

            kmeans = MiniBatchKMeans(
                n_clusters=kmeans_clusters_per_class, 
                batch_size=4096,
                random_state=42,
                n_init="auto"
            )
        
            kmeans.fit(class_data)
            centers = kmeans.cluster_centers_

            # The cluster centers become our new "training examples"
            features_train_svm.append(centers)
            y_train_svm.append(np.full(centers.shape[0], label))

        features_final = np.vstack(features_train_svm)
        labels_final = np.concatenate(y_train_svm)
        return features_final, labels_final


    def compute_metrics(self, X, y):
        assert self.classifier is not None
        X_transformed = self.preprocess(X)
        
        # We don't apply k-means here, because this was just a training trick.
        # KMeans always produces a dense feature array, so we force our data
        # here to be dense to match.
        dense_X_transformed = X_transformed.toarray()
        predictions = self.classifier.predict(dense_X_transformed)
        
        return classification_report(y, predictions)

    def train(self, df_features, df_y, C, kernel, kmeans_clusters_per_class):
        self.kmeans_clusters_per_class = kmeans_clusters_per_class
        X_transformed = self.preprocess(df_features)
        X_final, y_final = self.transform_kmeans(X_transformed, df_y, kmeans_clusters_per_class)
        print(X_final.shape)


        self.classifier = SVC(
            kernel=kernel,
            C=C,
        )
        
        self.classifier.fit(X_final, y_final)




In [6]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import MiniBatchKMeans
import random
from sklearn.metrics import classification_report, confusion_matrix

VERBOSE = False

class PreProcessor:
    def __init__(self, df_features_all):
        usable_numeric_columns = NUMERIC_COLUMNS.copy()
        # Any numeric column with only one value, will result in division by zero
        # during normalization, and should just be removed.
        for col in NUMERIC_COLUMNS:
            if df_features_all[col].max(axis=0) == df_features_all[col].min(axis=0):
                if VERBOSE:
                    print("DROPPING COL!!!!!!!!!!!!!!")
                    print(col)
                df_features_all = df_features_all.drop(columns=[col])
                usable_numeric_columns.remove(col)

        self.preprocessor = make_column_transformer(
            (OneHotEncoder(handle_unknown='ignore'), ONE_HOT_COLUMNS),
            (MinMaxScaler(), usable_numeric_columns),
            remainder='passthrough'
        )
        self.preprocessor.fit(df_features_all)
        if VERBOSE:
            feature_names = self.preprocessor.get_feature_names_out()
            print(f"Final feature names: {feature_names}")

    def process_features(self, df_features):
        X_transformed = self.preprocessor.transform(df_features)
        return X_transformed

class ClassifierBaseFOR_BAGGING:
    def __init__(self):
        pass

class SvmKMeansClassifierFOR_BAGGING(ClassifierBaseFOR_BAGGING, BaseEstimator, ClassifierMixin):
    def __init__(self, C, kernel, kmeans_clusters_per_class):
        super().__init__()
        self.C = C
        self.kernel=kernel
        self.kmeans_clusters_per_class = kmeans_clusters_per_class
    
    def transform_kmeans(self, features, labels, kmeans_clusters_per_class):
        features_train_svm = []
        y_train_svm = []

        unique_labels = np.unique(labels)
        print(unique_labels)
        for label in unique_labels:
            # Filter data for this specific class
            mask = (labels == label)
            
            # Apply mask to X. 
            # Note: This works for both Dense Arrays and Sparse Matrices
            class_data = features[mask]

            kmeans = MiniBatchKMeans(
                n_clusters=kmeans_clusters_per_class, 
                batch_size=4096,
                random_state=42,
                n_init="auto"
            )
        
            kmeans.fit(class_data)
            centers = kmeans.cluster_centers_

            # The cluster centers become our new "training examples"
            features_train_svm.append(centers)
            y_train_svm.append(np.full(centers.shape[0], label))

        features_final = np.vstack(features_train_svm)
        labels_final = np.concatenate(y_train_svm)
        return features_final, labels_final


    def predict(self, X):
        assert self.classifier is not None
        
        # We don't apply k-means here, because this was just a training trick.
        # KMeans always produces a dense feature array, so we force our data
        # here to be dense to match.
        dense_X_transformed = X.toarray()
        predictions = self.classifier.predict(dense_X_transformed)
        return predictions

    # matching base
    def fit(self, X, y):
        # To improve expressiveness across the bag of elements, we pick
        # values for these parameters randomly.
        chosen_cluster_count = random.choice(self.kmeans_clusters_per_class)
        chosen_C = random.choice(self.C)
        chosen_kernel = random.choice(self.kernel)
        # Next, since the dataset is too large anyways, we extract different
        # subsamples for each classifier.
        print(f"Training SVM with: Clusters:{chosen_cluster_count} - C:{chosen_C} - Kernel: {chosen_kernel}")

        # Inside bagging this is now a matrix, not a dataframe
        #X, y = self.subsample(X, y, self.subsample_count)

        X_final, y_final = self.transform_kmeans(X, y, chosen_cluster_count)
        print(X_final.shape)

        self.classifier = SVC(
            kernel=chosen_kernel,
            C=chosen_C,
        )
        
        self.classifier.fit(X_final, y_final)

    def compute_metrics(self, X, y):
        predictions = self.predict(X)
        print(classification_report(y, predictions))  

    def subsample(self, X, y, count):
        X_subsample = X.sample(n=count)
        y_subsample = y.loc[X_subsample.index]
        return X_subsample, y_subsample


def train_svm_bagging(df_features_train, df_y_train,
                      C, kernel, kmeans_clusters_per_class,
                      subsample_pct, estimator_count):
    svm_classifier = SvmKMeansClassifierFOR_BAGGING(
        C, kernel, kmeans_clusters_per_class
    )

    bagging_classifier = make_pipeline(
        sklearn.ensemble.BaggingClassifier(
            estimator=svm_classifier,
            n_estimators=estimator_count,
            max_samples=subsample_pct,
            bootstrap=False,
            n_jobs=4,
            random_state=RANDOM_STATE)
    )
    bagging_classifier.fit(df_features_train, df_y_train)

    return bagging_classifier

In [7]:
    # with multiprocessing.Pool(processes=self.n_estimators) as pool:
    #         # starmap unpacks the tuple arguments into the function args
    #         self.estimators = pool.starmap(_train_single_estimator, tasks)


In [8]:
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier # For classification

def train_knn(k, df_features_train, df_y_train):
    preprocessor = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), ONE_HOT_COLUMNS),
        remainder='passthrough'
    )

    knn_classifier = make_pipeline(
        preprocessor,
        KNeighborsClassifier(n_neighbors = k + 1)
    )
    knn_classifier.fit(df_features_train, df_y_train)

    return knn_classifier
#   y_preds = knn.predict(test_X)

#   correct = 0
#   incorrect = 0
#   for single_y_test, y_pred in zip(test_y, y_preds):
#     if single_y_test == y_pred:
#       correct += 1
#     else:
#       incorrect += 1
#   return correct / (correct + incorrect)

# def train_and_evaluate_sklearn_knn(train_X, train_y, test_X, test_y):
#     kf = KFold(n_splits=5, shuffle=True, random_state=42)
#     kfold_dataset_parts = []
#     max_train_samples = 0
#     for split in splits:
#         train_indices, validation_indices = split

#         train_dataset_tr = [train_dataset[i] for i in train_indices]
#         train_dataset_val = [train_dataset[i] for i in validation_indices]
#         train_X = [x for y, x in train_dataset_tr]
#         train_y = [y for y, x in train_dataset_tr]
#         val_X = [x for y, x in train_dataset_val]
#         val_y = [y for y, x in train_dataset_val]
#         kfold_dataset_parts.append((train_X, train_y, val_X, val_y))
#         max_train_samples = max(max_train_samples, len(train_dataset_tr))

#     splits = kf.split(train_dataset)
#     best_accuracy = 0
#     for k in range(1, max_train_samples - 1):
#         accuracies = []
#         for train_X, train_y, val_X, val_y in kfold_dataset_parts:
#             accuracies.append(train_and_evaluate_sklearn_knn(
#                 k, train_X, train_y, val_X, val_y))

#         accuracy = sum(accuracies) / len(accuracies)
#         if accuracy > best_accuracy:
#             best_accuracy = accuracy
#             best_k = k
#         print(f"Average cross-validation (k={k}): {accuracy}")

#     print(f"Best N-Fold Validation Accuracy: {best_accuracy}")
#     print(f"Best K: {best_k}")

#     test_accuracy = train_and_evaluate_sklearn_knn(
#         best_k,
#         [x for y, x in train_dataset],
#         [y for y, x in train_dataset],
#         [x for y, x in test_dataset],
#         [y for y, x in test_dataset])

#     print(f"Final test accuracy: {test_accuracy}")



In [None]:
from sklearn.metrics import classification_report, confusion_matrix


def compute_metrics(classifier, df_features, df_y):
  predictions = classifier.predict(df_features)
  
  return classification_report(df_y, predictions)

class KnnConfig:
  def __init__(self,
               K):
    self.K = K

class SvmConfig:
  def __init__(self,
               C,
               kernels):
    self.C = C
    self.kernels = kernels

class BaggingConfig:
  def __init__(self,
               max_depth,
               max_trees):
    self.max_depth = max_depth
    self.max_trees = max_trees

class SvmBaggingConfig:
  def __init__(self,
               C,
               kernels,
               cluster_counts,
               max_submodels,
               subsample_pct):
    self.C = C
    self.kernels = kernels
    self.cluster_counts = cluster_counts
    self.max_submodels = max_submodels
    self.subsample_pct = subsample_pct


class Config:
  def __init__(self,
               svm_config:SvmConfig = None,
               bagging_config:BaggingConfig = None,
               knn_config:KnnConfig = None,
               svm_bagging_config:SvmBaggingConfig = None,
               sample_count = None,
               test_pct = 0.2):
    self.svm_config = svm_config
    self.bagging_config = bagging_config
    self.knn_config = knn_config
    self.svm_bagging_config = svm_bagging_config
    self.sample_count = sample_count
    self.test_pct = test_pct

last_sample_count = -1
last_test_pct = -1

# Allows defining a list of configs for a long-running
# training batch, where multiple approaches are run without
# Adjusting configs.
try:
  test = final_results[0]
except Exception as e:
  final_results = []

for config in [
    # Config(
    #   svm_config=SvmConfig(
    #       C=[0.01, 0.1, 1.0, 10.0],
    #       kernels=['poly', 'linear', 'rbf', 'sigmoid']
    #   ),
    #   sample_count=10_000,
    #   test_pct=0.5,
    # ),
    # Config(
    #   knn_config=KnnConfig(
    #     K=300
    #   ),
    #   sample_count=100_000,
    #   test_pct=0.1,
    # ),
    # Config(
    #   bagging_config=BaggingConfig(
    #     max_depth=5,
    #     max_trees=50
    #   ),
    #   sample_count=None,
    #   test_pct=0.2,
    # ),
    Config(
      svm_bagging_config=SvmBaggingConfig(
          C=[0.01, 0.1, 1.0, 10.0],
          kernels=['poly', 'linear'],
          cluster_counts=[30, 300, 3000],
          max_submodels=40,
          subsample_pct=0.02
      ),
      sample_count=5_500_000,
      test_pct=0.1,
    ),
    ]:
  print(config.sample_count)
  if last_sample_count != config.sample_count or last_test_pct != config.test_pct:
    last_sample_count = config.sample_count
    last_test_pct = config.test_pct
    df_features_train, df_y_train, df_features_test, df_y_test = process_data(df, config.sample_count,
                                                                              config.test_pct)
    print(df_features_train)
    print(df_y_train)
    
  # TODO - add support for KFold cross validation
  # TODO - add support for GridSearchCV
  if config.bagging_config:
    bagging_classifier = train_bagging(df_features_train, df_y_train,
                                        max_depth=config.bagging_config.max_depth,
                                        max_trees=config.bagging_config.max_trees)
    metrics = compute_metrics(bagging_classifier, df_features_test, df_y_test)
    print(metrics)
    final_results.append((bagging_classifier, metrics, config))
  elif config.svm_bagging_config:
    preprocessor = PreProcessor(df_features_all=df_features_train)
    X_transformed = preprocessor.process_features(df_features_train)

    bagging_classifier = train_svm_bagging(
        X_transformed, df_y_train,
        config.svm_bagging_config.C,
        config.svm_bagging_config.kernels,
        config.svm_bagging_config.cluster_counts,
        config.svm_bagging_config.subsample_pct,
        config.svm_bagging_config.max_submodels)

    X_test = preprocessor.process_features(df_features_test)
    metrics = compute_metrics(bagging_classifier, X_test, df_y_test)
    print(metrics)
    final_results.append((bagging_classifier, metrics, config))
  elif config.svm_config:
    # svm_classifier = train_svm(df_features_train, df_y_train)
    for kernel in config.svm_config.kernels:
      for c in config.svm_config.C:
        for cluster_count in [20, 100, 400, 1000]:
          print(f"Experiment: {kernel} - {c} - clust: {cluster_count}")
          classifier = SvmKMeansClassifier(df_features_train, df_y_train)
          classifier.train(df_features_train, df_y_train, c, kernel,
                           kmeans_clusters_per_class=cluster_count)
          metrics = classifier.compute_metrics(df_features_test, df_y_test)
          print(metrics)
          final_results.append((classifier, metrics, config))

  elif config.knn_config:
    knn_classifier = train_knn(config.knn_config.K, df_features_train, df_y_train)
    metrics = compute_metrics(knn_classifier, df_features_test, df_y_test)
    print(metrics)
    final_results.append((knn_classifier, metrics, config))


5500000


  df.loc[:, col] = df[col].fillna(-1)


        proto service conn_state local_orig local_resp history  id.orig_p  \
0         tcp       -         S0          -          -       S    58968.0   
1         tcp       -         S0          -          -       S    39102.0   
2         tcp       -         S0          -          -       S    48178.0   
3         tcp       -         S0          -          -       S    42564.0   
4         tcp       -     RSTOS0          -          -       I    55982.0   
...       ...     ...        ...        ...        ...     ...        ...   
4949995   tcp       -         S0          -          -       S    40606.0   
4949996   tcp       -     RSTOS0          -          -       I    32045.0   
4949997   tcp       -        OTH          -          -       C    45589.0   
4949998   tcp       -         S0          -          -       S    21187.0   
4949999   tcp       -         S0          -          -       S    38723.0   

         id.resp_p  duration orig_bytes resp_bytes  missed_bytes  orig_pkts

  y = column_or_1d(y, warn=True)


Training SVM with: Clusters:30 - C:10.0 - Kernel: linear
[0 1]
Training SVM with: Clusters:300 - C:1.0 - Kernel: poly
[0 1]
Training SVM with: Clusters:30 - C:0.1 - Kernel: poly
[0 1]
Training SVM with: Clusters:3000 - C:0.1 - Kernel: poly
[0 1]
(60, 181)
(60, 181)
Training SVM with: Clusters:300 - C:10.0 - Kernel: linear
[0 1]
Training SVM with: Clusters:3000 - C:1.0 - Kernel: linear
[0 1]
(600, 181)
Training SVM with: Clusters:30 - C:0.01 - Kernel: linear
[0 1]
(60, 181)
(600, 181)
Training SVM with: Clusters:3000 - C:0.1 - Kernel: linear
[0 1]
Training SVM with: Clusters:300 - C:0.01 - Kernel: poly
[0 1]
(600, 181)
Training SVM with: Clusters:300 - C:1.0 - Kernel: linear
[0 1]
(600, 181)
Training SVM with: Clusters:300 - C:10.0 - Kernel: poly
[0 1]
(600, 181)
Training SVM with: Clusters:30 - C:10.0 - Kernel: poly
[0 1]
(60, 181)
Training SVM with: Clusters:3000 - C:1.0 - Kernel: poly
[0 1]
(6000, 181)
(6000, 181)
(6000, 181)
Training SVM with: Clusters:300 - C:1.0 - Kernel: poly
[0 

In [10]:
for a, b, c in final_results:
    print(b)

              precision    recall  f1-score   support

           0       0.88      0.72      0.79      6476
           1       0.61      0.81      0.70      3524

    accuracy                           0.75     10000
   macro avg       0.74      0.77      0.74     10000
weighted avg       0.78      0.75      0.76     10000

              precision    recall  f1-score   support

           0       1.00      0.98      0.99   3247654
           1       0.97      1.00      0.98   1754547

    accuracy                           0.99   5002201
   macro avg       0.98      0.99      0.99   5002201
weighted avg       0.99      0.99      0.99   5002201

