In [None]:
# Import required libraries
# standard library
import glob
import os
import pickle
import re
import sqlite3
from typing import Union, Tuple

# third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# 3. Data Integration (extract)
## Extracting data from files and combining it into one single data frame

In [None]:
def extract(file_path: str) -> pd.DataFrame:
    """Read every CSV, JSON-lines and Parquet file under a path prefix into one frame.

    Parameters
    ----------
    file_path : str
        Prefix to which the glob patterns ``*.csv``, ``*.json`` and ``*.parquet``
        are appended verbatim, so a directory must end with a path separator
        (e.g. ``'./data/'``).

    Returns
    -------
    pd.DataFrame
        All rows of all matching files, stacked in CSV, JSON, Parquet order, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file of any of the three types matches ``file_path``.
    """
    # (label used in the progress message, glob suffix, reader) for each supported format
    readers = (
        ("CSV", "*.csv", pd.read_csv),
        # the JSON inputs are read as JSON-lines (one record per line)
        ("JSON", "*.json", lambda p: pd.read_json(p, lines=True)),
        ("Parquet", "*.parquet", pd.read_parquet),
    )

    frames = []
    for kind, pattern, reader in readers:
        # sorted so the row order does not depend on the file system's listing order
        paths = sorted(glob.glob(file_path + pattern))
        if not paths:
            # pd.concat raises on an empty list, so a format with no files is skipped
            print(f"{kind} shape: (0, 0) \n")
            continue
        kind_df = pd.concat([reader(p) for p in paths], axis=0)
        print(f"{kind} shape: {kind_df.shape} \n")
        frames.append(kind_df)

    if not frames:
        raise ValueError(f"No .csv, .json or .parquet files found for '{file_path}'")

    # combine all dataframes into one and return it
    return pd.concat(frames, ignore_index=True)

In [None]:
# the data files are read from the relative directory './data/'; change `path` if the data lives
# elsewhere on the machine running the notebook (e.g. '/content/data/')
# keep the trailing '/': extract() appends its glob patterns directly to this string
path = './data/'
data = extract(path)
print(f"Combined dataframes shape: {data.shape}")
data.head()

In [None]:
## Data has been extracted, combined into one single data frame; now it is ready for transformation.

In [None]:
# 4. Data Transformation (Transform)

In [None]:
# check the current data types with data.info(); unlike printing the data.dtypes attribute,
# its listing is not shortened for this frame
data.info()

In [None]:
# remove leading/trailing white space from column names (inner spaces are kept)
data.columns = data.columns.str.strip()

In [None]:
# remove 'Heartbleed' attack data: keep only rows whose 'Label' is not exactly 'Heartbleed'
# (the lookup uses the stripped column name, so this runs after the strip step)
data = data[data['Label'] != 'Heartbleed']

In [None]:
# display the number of missing (NaN) values per column in our 'data' data frame
data.isna().sum()

In [None]:
# total number of missing values: per-column NaN counts summed over all columns
print(f'Total number of missing values: {data.isna().sum().sum()}')

In [None]:
# drop fully duplicated rows (first occurrence is kept) and display the shape to check
# whether the row count is now less than the original value of 61_128
data = data.drop_duplicates()
data.shape

In [None]:
# print the total number of missing values again after dropping duplicates to determine
# whether any remain and still have to be handled
print(f'Total number of missing values: {data.isna().sum().sum()}')

In [None]:
# handle out-of-range and outlier data: infinite values are replaced with NaNs, which are then filled with column means.
# NOTE(review): the means are computed over all rows, i.e. before the train/validation/test split
# performed later in the notebook, so imputed values carry information from held-out rows - confirm this is acceptable.

# only 'number' type columns (int64, float64) are checked for infinite values
numeric_columns = data.select_dtypes(include=[np.number]).columns

# count the infinite values of every numeric column in one vectorised pass
# (instead of calling np.isinf element by element through .apply, twice per column)
infinite_counts = np.isinf(data[numeric_columns]).sum()

# list to contain columns with infinite values (kept in the frame's column order)
cols_with_infinite = infinite_counts[infinite_counts > 0].index.tolist()
for column in cols_with_infinite:
    count_infinite = infinite_counts[column]
    print(f"Column '{column}' has {count_infinite} infinite values.")

# display columns with infinite values
print(f"Columns with infinite values: {cols_with_infinite}")

# check for existing NaN values before replacing infinite values - should be 10
initial_nans_count = data[cols_with_infinite].isna().sum()
print(f"Initial count of NaN values in 'Flow Bytes/s' and 'Flow Packets/s': \n{initial_nans_count}")

# replace all infinite values with NaN in the affected columns
data[cols_with_infinite] = data[cols_with_infinite].replace([np.inf, -np.inf], np.nan)

# verify that all infinite values in the affected columns have been replaced with NaN
for column in cols_with_infinite:
    if np.isinf(data[column]).any():
        print(f"Error: Column '{column}' still contains infinite values.")
    else:
        print(f"Success: Column '{column}' no longer contains infinite values.")

# count the number of NaNs in the affected columns after replacing infinite values
nans_count_after_replacement = data[cols_with_infinite].isna().sum()
print(f"Count of NaN values in 'Flow Bytes/s' and 'Flow Packets/s' after replacing infinite values: \n{nans_count_after_replacement}")

# fill NaNs with the mean of the column (the mean skips NaN, so it is the mean of the finite values)
data[cols_with_infinite] = data[cols_with_infinite].fillna(data[cols_with_infinite].mean())

# verify that there are no more NaNs in the affected columns
nans_count_after_filling = data[cols_with_infinite].isna().sum()
print(f"Count of NaN values in 'Flow Bytes/s' and 'Flow Packets/s' after filling NaNs: \n{nans_count_after_filling}")

In [None]:
# print the total number of missing values again; it should now be 0 if every missing value has been handled
print(f'Total number of missing values: {data.isna().sum().sum()}')

In [None]:
## Data has been transformed and is ready for loading to a csv file and database

In [None]:
# 5. Data Storage (Load)

In [None]:
# SQLite database file name and table name used for the cleaned data
database_name = 'ide_cleaned_db'
table_name = 'ntdl_cleaned'

In [None]:
# load the data to a csv file
def load_to_csv(data_frame: pd.DataFrame, file_name: str) -> None:
    """Write ``data_frame`` to the CSV file ``file_name`` without the index column."""
    data_frame.to_csv(path_or_buf=file_name, index=False)

In [None]:
# load the data to a sqlite database
def load_to_database(data_frame: pd.DataFrame, db_name: str, t_name: str) -> None:
    """Write ``data_frame`` to table ``t_name`` of the SQLite database file ``db_name``.

    An existing table of the same name is replaced. When the frame has no ``id``
    column, a 1-based ``id`` is generated and declared as ``INTEGER PRIMARY KEY``.
    The generated column is added to a private copy, so the caller's frame is left
    unchanged.

    Parameters
    ----------
    data_frame : pd.DataFrame
        Data to store.
    db_name : str
        Path of the SQLite database file (created by sqlite3 if missing).
    t_name : str
        Name of the table to (re)create.
    """
    to_store = data_frame
    if 'id' not in data_frame.columns:
        # reset_index returns a new frame, so inserting 'id' never touches the caller's object
        to_store = data_frame.reset_index(drop=True)
        to_store.insert(0, 'id', to_store.index + 1)

    # create a connection to sqlite
    sqlconnection = sqlite3.connect(db_name)
    try:
        # replace the table if it exists, do not write the pandas index, and make 'id' the primary key
        to_store.to_sql(t_name, sqlconnection, if_exists="replace", index=False, dtype={"id": "INTEGER PRIMARY KEY"})
    finally:
        # close the connection even when the write fails
        sqlconnection.close()

In [None]:
# call the 'load_to_csv' and 'load_to_database' functions to persist the cleaned data
# (the CSV is written first, from the frame as it is before the database load)
load_to_csv(data_frame=data, file_name="ide_data_cleaned.csv")
load_to_database(data_frame=data, db_name=database_name, t_name=table_name)

In [None]:
# connect to the database written above and create a cursor
connect = sqlite3.connect(database_name)
# database cursor is used to query a database and fetch results
cur = connect.cursor()

In [None]:
# inspect the schema of the ntdl_cleaned table: PRAGMA table_info returns one row per
# column (position, name, declared type, not-null flag, default value, primary-key flag)
ntdl_query = f"PRAGMA table_info({table_name})"
ntdl_result = cur.execute(ntdl_query)
ntdl_result.fetchall()

In [None]:
# close the connection opened for the schema check
connect.close()

In [None]:
## Data has been loaded to a csv file and a SQLite database and is ready for exploratory data analysis

In [None]:
# 6. Reading Data

In [None]:
# reading the data (6.1) into a pandas dataframe from the cleaned CSV written earlier
data = pd.read_csv("ide_data_cleaned.csv")
# defensive: drop a database 'id' key column if the file happens to contain one
if 'id' in data.columns:
    data.drop(columns=['id'], inplace=True)

In [None]:
# 7. Exploratory Data Analysis

In [None]:
## Univariate Analysis

In [None]:
# determine the shape of the data: (number of rows, number of columns)
shape = data.shape
print(f"The dataset contains {shape[0]} rows and {shape[1]} columns.")

In [None]:
# unify the columns/features names
def unify_column_name(name: str) -> str:
    """Normalise a column name to lower-case snake_case.

    Spaces become underscores and every character that is not a letter, digit or
    underscore is removed. A result ending in 'ss' loses its last character
    (e.g. 'Flow Bytes/s' -> 'flow_bytess' -> 'flow_bytes').
    """
    underscored = name.lower().replace(' ', '_')
    cleaned = re.sub(r'[^\w]', '', underscored)
    # a trailing double 's' is collapsed to a single 's'
    return cleaned[:-1] if cleaned.endswith('ss') else cleaned

# rename every column through unify_column_name and show the result
data.columns = data.columns.map(unify_column_name)
print(f"Unified column names: \n{data.columns}")

In [None]:
# identify the unique values in the class label variable (the column is 'label' after the rename)
unique_labels = data['label'].unique()
print(f"The unique values in the 'label' column are: {unique_labels}")

In [None]:
# calculate summary statistics of the data (describe() covers the numeric columns by default)
stats = data.describe()
print(f"Statistics of data: \n{stats}")

In [None]:
# keep only the numeric columns (e.g. int64, float64); the statistics and correlation cells use this frame
numeric_data = data.select_dtypes(include=[np.number])

In [None]:
# calculate the mean of each numeric column
mean = numeric_data.mean()
print(f"Mean of each column: \n{mean}")

In [None]:
# calculate the variance of each numeric column (pandas default: sample variance, ddof=1)
variance = numeric_data.var()
print(f"Variance of each column: \n{variance}")

In [None]:
# calculate the standard deviation of each numeric column (pandas default: ddof=1)
std = numeric_data.std()
print(f"Standard deviation of each column: \n{std}")

In [None]:
# identify low variance columns: raw (unscaled) variance strictly below `limit`
# the resulting list is reused later in the notebook to drop these columns from the feature sets
limit = 0.01
low_var_columns = variance[variance < limit].index.tolist()
print(f"Columns with low or near zero variance: {low_var_columns}")

In [None]:
# visualizations for specific columns with bar plots, pie charts, and boxplot

# bar plot of how many rows each 'label' value has
label_counts = data['label'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
label_counts.plot(kind='bar', ax=ax)
ax.set_title('Bar Plot of Label Distribution')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

In [None]:
# pie chart of the 'label' distribution, slices annotated with their percentage (two decimals)
label_shares = data['label'].value_counts()
fig, ax = plt.subplots(figsize=(8, 8))
label_shares.plot(kind='pie', autopct='%1.2f', ax=ax)
ax.set_title('Pie Chart of Label Distribution')
ax.set_ylabel('')  # hide the automatic y-axis label
plt.show()

In [None]:
# box plot of the 'flow_packets' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data['flow_packets'], ax=ax)
ax.set_title('Box Plot of Flow Packets')
ax.set_xlabel('Flow Packets')
plt.show()

In [None]:
# box plot of the 'fwd_packet_length_max' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data['fwd_packet_length_max'], ax=ax)
ax.set_title('Box Plot of Fwd Packet Length Max')
ax.set_xlabel('Fwd Packet Length Max')
plt.show()

In [None]:
# box plot of the 'fwd_packets' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data['fwd_packets'], ax=ax)
ax.set_title('Box Plot of Fwd Packets')
ax.set_xlabel('Fwd Packets')
plt.show()

In [None]:
## Bivariate Analysis

In [None]:
# generating a pair plot for the same columns we performed univariate analysis on;
# 'label' is included only to colour the points (hue), not as a plotted variable
columns_for_pair_plot = ['fwd_packet_length_max', 'fwd_packets', 'flow_packets', 'label']
sns.pairplot(data[columns_for_pair_plot], hue='label')
plt.show()

In [None]:
# pairwise correlation of the numeric columns, drawn as an annotated heat map
correlation_matrix = numeric_data.corr()
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
## Exploratory data analysis has been completed; the data is now ready for preprocessing.

In [None]:
# 8. Data Preprocessing

In [None]:
# apply lambda function to create binary labels. 0 for benign (case-insensitive match), 1 for any other label (attack)
# assumes every 'label' value is a string - a missing (NaN) label would raise on .lower(); TODO confirm none remain
data['binary_label'] = data['label'].apply(lambda x: 0 if x.lower() == 'benign' else 1)

In [None]:
# splitting the dataset into train, test, and validation splits
# overall proportions: 72% train, 8% validation, 20% test (0.8 * 0.9, 0.8 * 0.1, 0.2)

# defines features (X) and target (y)
X = data.drop(columns=['label', 'binary_label']) # drop the label & binary_label column from features
y = data['binary_label'] # binary labels (0 = benign, 1 = attack) are the target

# split data into training and testing sets (80% train, 20% test); stratify keeps the class ratio in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# split the training set further into training and validation sets (10% of training data for validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10, random_state=42, stratify=y_train)

# print the shapes of the resulting splits to verify
print(f"Training set shape: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Validation set shape: X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"Testing set shape: X_test: {X_test.shape}, y_test: {y_test.shape}")

In [None]:
# initialize the LabelEncoder / create an instance of it
label_encoder = LabelEncoder()

# fit the encoder on the training labels only, then transform them
y_train_encoded = label_encoder.fit_transform(y_train)

# transform the label column in the validation and testing sets with the already fitted encoder
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# verify the encoding
print(f"Encoded training labels: {y_train_encoded[:25]}")
print(f"Encoded validation labels: {y_val_encoded[:25]}")
print(f"Encoded testing labels: {y_test_encoded[:25]}")

# The inputs are already the 0/1 binary labels, so the LabelEncoder keeps them as they are:
# 0 (benign) -> 0
# 1 (attack) -> 1

In [None]:
## Data has been preprocessed and is ready for feature engineering.

In [None]:
# 9. Feature Engineering

In [None]:
# feature scaling

# standardizing the data (each feature gets mean 0 and standard deviation 1 on the training set)

# the scaler learns its statistics from the training data only
scaler = StandardScaler()

# fit + transform the training split and wrap the resulting array in a data frame again
X_train_standardized = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

# validation and testing data are transformed with the statistics learned from the training split
X_val_standardized = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
X_test_standardized = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [None]:
# feature selection

# visualize the features' importance

# apply SelectKBest with the f_classif (ANOVA F-value) statistical test
# k = 'all' keeps every feature, so this step only scores the features; set an integer to keep the top k
k = 'all'
selector = SelectKBest(score_func=f_classif, k=k)  # define 'k' for top number of features
# the selector is fitted on the training split only and then applied to validation and test
X_train_kbest = selector.fit_transform(X_train_standardized, y_train_encoded)
X_val_kbest = selector.transform(X_val_standardized)
X_test_kbest = selector.transform(X_test_standardized)
print(f"Features after SelectKBest: {X_train_kbest.shape[1]}") # SelectKBest scores and ranks features

In [None]:
# per-feature scores computed by the fitted selector
kbest_scores = selector.scores_

# positions and names of the features the selector kept
selected_indices = selector.get_support(indices=True)
feature_names = X_train_standardized.columns[selected_indices]

# feature/score table, highest score first
feature_scores_df = (
    pd.DataFrame({'Feature': feature_names, 'Score': kbest_scores[selected_indices]})
    .sort_values(by='Score', ascending=False)
)

# horizontal bar chart of the scores
fig, ax = plt.subplots(figsize=(12, 12))
ax.barh(feature_scores_df['Feature'], feature_scores_df['Score'], color='skyblue')
ax.set_xlabel('Score')
ax.set_ylabel('Feature')
ax.set_title(f'Top {k} Feature Importance Scores by SelectKBest')
ax.invert_yaxis()  # to display the highest scores on top
plt.show()

In [None]:
# remove constant features: columns with exactly one distinct value in the standardized training split
unique_counts = X_train_standardized.nunique()
constant_features = unique_counts[unique_counts == 1].index.tolist()

# the same columns are dropped from all three splits
X_train_cf, X_val_cf, X_test_cf = (
    frame.drop(columns=constant_features)
    for frame in (X_train_standardized, X_val_standardized, X_test_standardized)
)
print(f"Features after removing constant features: {X_train_cf.shape[1]}")

In [None]:
# remove features with near-zero variance using the previously created list `low_var_columns`
# (that list was derived from the variance of the unscaled numeric columns during the univariate analysis)
X_train_vt = X_train_standardized.drop(columns=low_var_columns)
X_val_vt = X_val_standardized.drop(columns=low_var_columns)
X_test_vt = X_test_standardized.drop(columns=low_var_columns)
print(f"Features after removing near-zero variance: {X_train_vt.shape[1]}")

In [None]:
# perform LASSO feature selection

# 5-fold cross-validated LASSO fitted on the training split (fit returns the estimator itself)
lasso = LassoCV(cv=5, max_iter=20000)
lasso.fit(X_train_standardized, y_train_encoded)

# positions of the features whose coefficient was not shrunk to zero
lasso_selected_features = np.flatnonzero(lasso.coef_)

# keep only the selected feature columns in every split
X_train_lasso, X_val_lasso, X_test_lasso = (
    frame.iloc[:, lasso_selected_features]
    for frame in (X_train_standardized, X_val_standardized, X_test_standardized)
)
print(f"Features after LASSO: {X_train_lasso.shape[1]}")

In [None]:
# perform tree-based feature importance using random forest

# seeded forest fitted on the training split
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_standardized, y_train_encoded)

# impurity-based importance of every feature
importances = rf.feature_importances_

# the 75th percentile of the importances is the cut-off
threshold = np.quantile(importances, 0.75)

# positions of the features whose importance is strictly above the cut-off
top_features = np.flatnonzero(importances > threshold)

# keep only the selected feature columns in every split
X_train_rf, X_val_rf, X_test_rf = (
    frame.iloc[:, top_features]
    for frame in (X_train_standardized, X_val_standardized, X_test_standardized)
)
print(f"Features after Random Forest: {X_train_rf.shape[1]}")

In [None]:
# feature extraction

# perform PCA for dimensionality reduction
# first pass: fit with every component kept, only to obtain the full explained-variance curve
pca = PCA()
pca.fit(X_train_standardized)

# calculate the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance_ratio.cumsum()

# plot the cumulative explained variance to determine the number of components
# x values start at 1 so that a point at x = k reads "variance explained by the first k components"
component_counts = np.arange(1, len(cumulative_explained_variance) + 1)
plt.figure(figsize=(8, 6))
plt.plot(component_counts, cumulative_explained_variance, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by PCA Components')
plt.grid(True)
plt.show()

# a common practice is to retain enough components to explain a significant portion of the variance in the data
# typically aim to keep components that explain around 95% of the variance
# argmax returns the first (0-based) position where the threshold is reached, hence the + 1
n_components = np.argmax(cumulative_explained_variance >= 0.95) + 1
print(f"Number of components explaining 95% variance: {n_components}")

# perform PCA with the selected number of components (fitted on the training split only)
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_standardized)
X_val_pca = pca.transform(X_val_standardized)
X_test_pca = pca.transform(X_test_standardized)
print(f"Features after PCA: {X_train_pca.shape[1]}")

# Print the top contributing features for each principal component
original_features = X_train_standardized.columns
for i in range(n_components):
    component = pca.components_[i]
    # rank the original features by the absolute size of their loading on this component
    feature_contributions = sorted(zip(original_features, component), key=lambda x: -abs(x[1]))
    # named differently from `top_features` (the random-forest index array) so that variable is not overwritten
    top_contributions = feature_contributions[:21]  # get the top 21 features for each component
    print(f"\nPrincipal Component {i+1}:")
    for feature, weight in top_contributions:
        print(f"{feature}: {weight}")

In [None]:
## Feature engineering has been performed, and the processed data is ready to be loaded to a SQLite database.

In [None]:
# 10. Processed Data Loading

In [None]:
# SQLite database file name and table names used for the processed data
# (`database_name` is reassigned here, replacing the name used for the cleaned-data database)
database_name = 'ide_cleaned_and_processed_db'

# one table per split of the standardized feature matrix
X_train_table_name = 'x_train_standardized_data'
X_val_table_name = 'x_val_standardized_data'
X_test_table_name = 'x_test_standardized_data'

In [None]:
# fully processed data loaded into database in separate tables
# copies are passed so that the 'id' key column added for the database never ends up in the
# in-memory feature frames, which must keep exactly the columns the scaler and selectors were fitted on
load_to_database(data_frame=X_train_standardized.copy(), db_name=database_name, t_name=X_train_table_name)
load_to_database(data_frame=X_val_standardized.copy(), db_name=database_name, t_name=X_val_table_name)
load_to_database(data_frame=X_test_standardized.copy(), db_name=database_name, t_name=X_test_table_name)

In [None]:
## Processed data has been loaded to a SQLite database, now model selection, training, and evaluation will be performed.

In [None]:
# 11. Model Selection and Training & 12. Model Evaluation

In [None]:
# model selection

# initialize the models
# candidate classifiers, keyed by the display name used in the result tables
# every estimator with a random component is seeded (42, as for the data splits) so a re-run reproduces the metrics
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000, random_state=42),
    # probability=True enables predict_proba (needed for the AUC); its internal calibration is randomised
    "SVM": SVC(probability=True, max_iter=5000, random_state=42),
    "k-NN": KNeighborsClassifier(),  # deterministic, takes no random_state
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# function to evaluate model
def evaluate_model(m: object, X_: Union[pd.DataFrame, np.ndarray], y_: Union[np.ndarray, pd.Series]) -> Tuple[float, float, float, float, Union[float, str], np.ndarray]:
    """Score a fitted classifier on one data split.

    Parameters
    ----------
    m : object
        Fitted estimator with ``predict`` and, optionally, ``predict_proba``.
    X_ : pd.DataFrame or np.ndarray
        Feature matrix of the split.
    y_ : np.ndarray or pd.Series
        True labels of the split.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, auc, confusion_matrix)``. Precision,
        recall and F1 are class-weighted averages. ``auc`` is the string ``'N/A'``
        when the estimator has no ``predict_proba``.
    """
    predictions = m.predict(X_)
    # check the estimator that was passed in (not a notebook-level variable) for probability support;
    # column 1 holds the probability of the positive class
    positive_prob = m.predict_proba(X_)[:, 1] if hasattr(m, 'predict_proba') else None

    acc = accuracy_score(y_, predictions)
    # zero_division=0 reports 0 instead of warning when a class is never predicted
    pre = precision_score(y_, predictions, average='weighted', zero_division=0)
    rec = recall_score(y_, predictions, average='weighted', zero_division=0)
    f_1 = f1_score(y_, predictions, average='weighted', zero_division=0)
    auc_score = roc_auc_score(y_, positive_prob) if positive_prob is not None else 'N/A'

    con_matrix = confusion_matrix(y_, predictions)

    return acc, pre, rec, f_1, auc_score, con_matrix

In [None]:
# model training and evaluation

# change X_train_, X_val_, X_test_ to equal the other variables that contain different features through feature selection to compare performance:
# kbest = SelectKBest features
# cf = dropped constant features
# vt = dropped low variance threshold features
# lasso = lasso selected features
# rf = random forest selected features
# pca = principal component analysis extracted features
# pd.DataFrame(...) gives array-valued variants (e.g. the PCA output) column labels as well
X_train_ = pd.DataFrame(X_train_vt)
X_val_ = pd.DataFrame(X_val_vt)
X_test_ = pd.DataFrame(X_test_vt)

# ensure all column names are strings (frames built from arrays have integer column labels)
X_train_.columns = X_train_.columns.astype(str)
X_val_.columns = X_val_.columns.astype(str)
X_test_.columns = X_test_.columns.astype(str)

# Dictionary to store model evaluation results for validation and test sets
train_val_results = {}
test_results = {}

# NOTE(review): every candidate is scored on the test set as well; if these test metrics are used to
# choose a model, the test set no longer gives an unbiased estimate - confirm the intended protocol
for model_name, model in models.items():
    model.fit(X_train_, y_train_encoded) # this line trains the model
    
    # evaluate on validation set
    accuracy, precision, recall, f1, auc, cm = evaluate_model(model, X_val_, y_val_encoded)
    train_val_results[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "AUC": auc,
        "Confusion Matrix": cm
    }
    
    # evaluate on the test set
    test_accuracy, test_precision, test_recall, test_f1, test_auc, test_cm = evaluate_model(model, X_test_, y_test_encoded)
    test_results[model_name] = {
        "Accuracy": test_accuracy,
        "Precision": test_precision,
        "Recall": test_recall,
        "F1 Score": test_f1,
        "AUC": test_auc,
        "Confusion Matrix": test_cm
    }

# display the model evaluation results for the validation set (one row per model after the transpose)
train_val_results_df = pd.DataFrame(train_val_results).T
print("Validation Set Results:")
print(train_val_results_df)

# plot confusion matrices for each model on the validation set
for model_name, metrics in train_val_results.items():
    cm = metrics["Confusion Matrix"]
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {model_name} on Validation Set')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# display the model evaluation results for the test set (one row per model after the transpose)
test_results_df = pd.DataFrame(test_results).T
print("Test Set Results:")
print(test_results_df)

# plot confusion matrices for each model on the test set
for model_name, metrics in test_results.items():
    cm = metrics["Confusion Matrix"]
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {model_name} on Test Set')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

In [151]:
# persist the fitted logistic-regression model
model_path = 'feature_selection/variance_threshold/logistic_regression_near_zero_variance_removed.pkl'
# open(..., 'wb') does not create missing folders, so make sure the target directory exists first
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as file:
    pickle.dump(models["Logistic Regression"], file)