In [None]:
# Import necessary libraries for handling data and files
import pandas as pd             # pandas is a powerful data manipulation and analysis library.
import io                       # The io module provides the Python interfaces to stream handling.
from io import StringIO         # StringIO is used to handle text data in memory as file objects.
import string                   # The string module contains various useful string constants.
from collections import Counter # Counter is a dict subclass for counting hashable objects.

# Attempt to import StringIO from the StringIO module for compatibility with older Python versions.
try:
    from StringIO import StringIO  # Try importing from StringIO if it's available in the Python version.
except ImportError:
    from io import StringIO        # If not available, fall back to StringIO from the io module.

# Import the files module from google.colab to interact with the file system.
from google.colab import files

# Prompt for a file upload through the Google Colab interface and keep the uploaded test dataset in a variable.
# This is similar to the previous procedure but now focusing on the test dataframe.
uploaded_files_secondary = files.upload()  # Shows an upload widget and returns a dictionary of the uploaded files:
                                           # the keys are the file names and the values are the uploaded contents
                                           # (the next cell wraps a value in io.BytesIO to read it).

In [None]:
# Read the uploaded CSV file into a pandas DataFrame.
# The originally expected key is used when present; otherwise the first uploaded file is taken,
# so the cell no longer fails with a KeyError when the upload carries a different file name.
EXPECTED_UPLOAD_NAME = "FileName.csv"
if EXPECTED_UPLOAD_NAME in uploaded_files_secondary:
    uploaded_name = EXPECTED_UPLOAD_NAME
elif uploaded_files_secondary:
    uploaded_name = next(iter(uploaded_files_secondary))
else:
    raise ValueError("No file was uploaded; re-run the upload cell and select the test CSV file.")

# 'header=None' means the first row of the file is read as data, not as column names.
test_dataframe = pd.read_csv(io.BytesIO(uploaded_files_secondary[uploaded_name]), header=None)

# Drop every column that contains at least one NaN value:
# 'axis=1' targets columns (not rows) and dropna's default how='any' removes a column
# as soon as it holds a single NaN.
test_dataframe = test_dataframe.dropna(axis=1)

# Display the dataframe
test_dataframe


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43
0,0,tcp,http,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0,0,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,neptune,21
1,0,tcp,http,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1.0,1.0,0.0,0.0,1.00,0.0,0.00,1,1,1.00,0.00,0.00,0.00,1.00,1.0,0.00,0.00,neptune,21
2,0,tcp,http,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1.0,1.0,0.0,0.0,1.00,0.0,0.00,2,2,1.00,0.00,0.00,0.00,1.00,1.0,0.00,0.00,neptune,21
3,0,tcp,http,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,1.0,1.0,0.0,0.0,1.00,0.0,0.00,3,3,1.00,0.00,0.00,0.00,1.00,1.0,0.00,0.00,neptune,21
4,0,tcp,http,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,1.0,1.0,0.0,0.0,1.00,0.0,0.00,4,4,1.00,0.00,0.00,0.00,1.00,1.0,0.00,0.00,neptune,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,0,tcp,smtp,SF,794,333,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.00,0.0,0.00,100,141,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00,normal,21
22540,0,tcp,http,SF,317,938,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,11,0.0,0.0,0.0,0.0,1.00,0.0,0.18,197,255,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00,normal,21
22541,0,tcp,http,SF,54540,8314,0,0,0,2,0,1,1,0,0,0,0,0,0,0,0,0,5,10,0.0,0.0,0.0,0.0,1.00,0.0,0.20,255,255,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07,back,15
22542,0,udp,domain_u,SF,42,42,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,6,0.0,0.0,0.0,0.0,1.00,0.0,0.33,255,252,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00,normal,21


In [None]:
# Column labels for the dataframe, listed in the order of the columns in the given file.
# 'outcome' is the column later extracted as the target labels.
column_labels = (
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'outcome', 'new',
)

# Assigning the labels requires the dataframe to have exactly as many columns as there are labels.
test_dataframe.columns = list(column_labels)

In [None]:
# Define a function to apply z-score normalization to numeric columns in the dataframe
# Mainly done to improve the performance of our algorithm by speeding up learning and convergence.
def encode_numeric_zscore(dataframe, column_name, mean=None, standard_deviation=None):
    """
    Replace a numeric column with its z-scores, (value - mean) / standard deviation.

    The dataframe is modified in place and nothing is returned.

    Parameters:
    - dataframe (pd.DataFrame): Dataframe holding the column; it is updated in place.
    - column_name (str): Label of the column to standardize.
    - mean (float, optional): Mean to subtract. When None, the column's own mean is used.
    - standard_deviation (float, optional): Divisor. When None, the column's own
      standard deviation (as returned by Series.std()) is used.

    Note: no guard is applied against a standard deviation of zero.
    """
    column = dataframe[column_name]

    # Fall back to the column's own statistics for whichever value was not supplied
    center = column.mean() if mean is None else mean
    spread = column.std() if standard_deviation is None else standard_deviation

    # Overwrite the column with the standardized values
    dataframe[column_name] = (column - center) / spread

# Define a function to convert categorical text columns into dummy/one-hot encoded variables and drop the original column
def encode_text_dummy(dataframe, column_name):
    """
    One-hot encode a categorical column in place.

    For every distinct category a new column named "<column_name>-<category>" is added
    to the dataframe, and the original column is then removed. Nothing is returned.

    Parameters:
    - dataframe (pd.DataFrame): Dataframe holding the categorical column; it is updated in place.
    - column_name (str): Label of the categorical column to encode.
    """
    # One indicator column per distinct category of the source column
    indicator_frame = pd.get_dummies(dataframe[column_name])
    prefix = str(column_name) + "-"

    # Copy each indicator into the dataframe under its prefixed name
    for level in indicator_frame.columns:
        dataframe[prefix + str(level)] = indicator_frame[level]

    # The source column is no longer needed once its indicators exist
    dataframe.drop(columns=column_name, inplace=True)

# Encoding numeric features for test data

# Numeric columns of the test dataframe to standardize, in the order they are processed.
numeric_feature_columns = [
    'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]

# Apply z-score normalization in place to each numeric column of the test dataframe.
# Each column is standardized with its own mean and standard deviation.
for numeric_column in numeric_feature_columns:
    encode_numeric_zscore(test_dataframe, numeric_column)

# After encoding all numeric features, remove every column that contains NaN values.
# A constant column has a standard deviation of 0, so its z-scores are NaN and it is dropped here;
# the number of remaining columns therefore depends on the data.
test_dataframe.dropna(inplace=True, axis=1)



In [None]:
# Encoding categorical features for test data

# The string-valued columns that have to be turned into numeric indicator columns
categorical_columns = ['protocol_type', 'service', 'flag']

# One dummy (indicator) frame per categorical column; the dummy columns are named after
# the category values themselves, without a column-name prefix
protocol_dummies, service_dummies, flag_dummies = (
    pd.get_dummies(test_dataframe[categorical_column])
    for categorical_column in categorical_columns
)

# Append the dummy frames to the right of the existing columns
test_dataframe = pd.concat(
    [test_dataframe, protocol_dummies, service_dummies, flag_dummies],
    axis=1,
)

# Remove the original string columns now that their dummy columns exist
for categorical_column in categorical_columns:
    del test_dataframe[categorical_column]

# Display the test_dataframe to verify that the categorical columns were encoded and removed
test_dataframe


0         S0
1         S0
2         S0
3         S0
4         S0
        ... 
22539     SF
22540     SF
22541     SF
22542     SF
22543    REJ
Name: flag, Length: 22544, dtype: object

In [None]:
# Import the NumPy library for numerical operations
import numpy as np

# Extract the 'outcome' column from test_dataframe as the target labels.
# This separates the target variable from the feature set so the model never sees the answer.
test_labels = test_dataframe['outcome'].values

# Build the feature matrix from every column except 'outcome'.
# The column is removed by name rather than by a fixed position: its position depends on how many
# columns the earlier dropna calls removed, so a hard-coded index could silently delete a real
# feature and leave the label inside the features.
# NOTE(review): the 'new' column is still part of the features — confirm that this is intended.
test_features = test_dataframe.drop(columns=['outcome']).values

In [None]:
# Import necessary libraries for model training and timing
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn import tree                            # For Decision Tree classifier
import time                                        # For measuring execution time


def intelect(features, labels, test_size_ratio):
    """
    Train a Decision Tree classifier on one train/test split and report accuracy and timings.

    The data is split with train_test_split (random_state=0), a DecisionTreeClassifier
    (max_depth=1000, random_state=0) is fitted on the training part, and its accuracy on the
    test part is computed. A short report is printed; nothing is returned.

    Parameters:
    - features (np.ndarray): The feature matrix for the dataset.
    - labels (np.ndarray): The target labels for the dataset.
    - test_size_ratio (float): The proportion of the dataset to include in the test split.
    """
    # Split before starting the timer so that the fitting time measures training only
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size_ratio, random_state=0
    )

    # A fixed random_state makes the fitted tree reproducible between runs
    decision_tree_classifier = tree.DecisionTreeClassifier(max_depth=1000, random_state=0)

    # Time the training
    start_time_fitting = time.perf_counter()
    decision_tree_classifier.fit(x_train, y_train)
    fitting_seconds = round(time.perf_counter() - start_time_fitting, 2)

    # Time the evaluation; score() predicts on x_test itself, so no separate predict() call is needed
    start_time_prediction = time.perf_counter()
    accuracy = decision_tree_classifier.score(x_test, y_test)
    prediction_seconds = round(time.perf_counter() - start_time_prediction, 2)

    # test_size_ratio is the share of data held out for testing, so the report labels it as such.
    # The whole report is emitted with a single print call to limit interleaving when several
    # threads run this function at the same time.
    report_lines = [
        'Accuracy with test size {}: {}'.format(test_size_ratio, accuracy),
        'Fitting with test size {} finished in {} second(s).'.format(
            test_size_ratio, fitting_seconds
        ),
        'Predicting and calculating accuracy with test size {} finished in {} second(s).'.format(
            test_size_ratio, prediction_seconds
        ),
        '-----------------------------------',
    ]
    print('\n'.join(report_lines))


In [None]:
# Import necessary modules for threading and timing
import threading  # To create and manage multiple threads


# Record the start time of the session to measure total execution time
session_start_time = time.perf_counter()


# Creating and Starting Threads for Different Test Size Ratios

# One thread per test-size ratio runs `intelect` on the same features and labels.
# Whether this is faster than running the evaluations one after another depends on how much of the
# work releases the interpreter lock — measure before relying on it.
# NOTE(review): 0.4 is not in this list although the sequential decision-tree cells further down
# evaluate it — confirm whether the omission is intended.
thread_test_size_ratios = [0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

evaluation_threads = [
    threading.Thread(target=intelect, args=[test_features, test_labels, test_size_ratio])
    for test_size_ratio in thread_test_size_ratios
]

# Start every thread before waiting on any of them so that they can run concurrently
for evaluation_thread in evaluation_threads:
    evaluation_thread.start()

# Waiting for All Threads to Complete

# Thread.join() only blocks until the thread has finished; it always returns None,
# so there is no result to store here.
for evaluation_thread in evaluation_threads:
    evaluation_thread.join()

# Finalizing and Reporting

# Record the finish time of the session to calculate total execution time
final_finish_time = time.perf_counter()

# Print the total time taken for the entire session
print('Finished in', round(final_finish_time - session_start_time, 2), 'second(s)')


0.9889135254988913
0.9463043376756782
0.8783995698732022
0.9891328454202706
0.9863378282469837
0.9753825681969395
0.982257706808605
0.9889118864577173
0.9703302119270577
0.9787719409416387
Finished in 305.64 second(s)


In [None]:
# Accumulator for benchmark rows: the following cells each append one list of the form
# [train fraction, test fraction, learning time, prediction time, accuracy].
performance_data = []
# Timestamp taken before the benchmark cells run.
# NOTE(review): no visible cell reads sd_start_time (the final summary subtracts
# session_start_time instead) — confirm which start time the summary is meant to use.
sd_start_time = time.perf_counter()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import ensemble

# Start the timer to measure how long it takes to split the data and train the Random Forest classifier
start = time.perf_counter()

# Split the dataset into training and testing sets
# Here, 10% of the data is used for testing and 90% for training
x_train, x_test, y_train, y_test = train_test_split(test_features, test_labels, test_size=0.1, random_state=0)

# Initialize the Random Forest classifier with 250 trees.
# More trees typically improve the model's performance but increase computational load.
# A fixed random_state makes the forest, and therefore the reported accuracy, reproducible between runs.
random_forest_classifier = ensemble.RandomForestClassifier(n_estimators=250, random_state=0)

# Train the Random Forest classifier with the training data
random_forest_classifier.fit(x_train, y_train)

# Stop the timer after training and calculate the duration
finish = time.perf_counter()
learning_time_rf = round(finish - start, 2)  # Round the training time to two decimal places

# Start the timer again to measure the time taken for making predictions and evaluating the model
start = time.perf_counter()

# Predict the outcomes for the test dataset
predictions_rf = random_forest_classifier.predict(x_test)

# Calculate the accuracy of the predictions
accuracy_rf = random_forest_classifier.score(x_test, y_test)

# Stop the timer after predictions and calculate the duration for prediction
finish = time.perf_counter()
prediction_time_rf = round(finish - start, 2)  # Round the prediction time to two decimal places

# Append the train fraction, test fraction, learning time, prediction time, and accuracy
# to the performance data list.
# NOTE(review): this Random Forest row goes into the same list as the decision-tree rows and
# carries no model label — confirm that mixing both models in one table is intended.
performance_data.append([0.9, 0.1, learning_time_rf, prediction_time_rf, accuracy_rf])



0.9870786516853932

In [None]:
# Import the necessary modules for the decision tree classifier and data splitting
from sklearn import tree
from sklearn.model_selection import train_test_split

# Test-set fractions to benchmark; the matching train fraction is 1 - test fraction.
# The same steps are repeated for every fraction, only the train/test distribution changes.
decision_tree_test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

for test_size in decision_tree_test_sizes:
    # Measure the time needed to split the data and train the decision tree
    start = time.perf_counter()

    # Split the dataset into training and testing sets for the current test fraction
    x_train, x_test, y_train, y_test = train_test_split(
        test_features, test_labels, test_size=test_size, random_state=0
    )

    # Decision tree with a maximum depth of 1000; the fixed random_state makes the fitted
    # tree reproducible between runs
    decision_tree_classifier = tree.DecisionTreeClassifier(max_depth=1000, random_state=0)

    # Train the decision tree classifier using the training data
    decision_tree_classifier.fit(x_train, y_train)

    finish = time.perf_counter()
    learning_time_dt = round(finish - start, 2)  # Training duration, two decimal places

    # Measure the time needed to predict and to calculate the accuracy on the testing set
    start = time.perf_counter()
    y_pred_dt = decision_tree_classifier.predict(x_test)
    accuracy_dt = decision_tree_classifier.score(x_test, y_test)
    finish = time.perf_counter()
    prediction_time_dt = round(finish - start, 2)  # Prediction duration, two decimal places

    # Append the train fraction, test fraction, learning time, prediction time, and accuracy.
    # round(..., 2) removes floating-point noise so the train fraction reads 0.9, 0.8, ..., 0.05, 0.01.
    performance_data.append(
        [round(1 - test_size, 2), test_size, learning_time_dt, prediction_time_dt, accuracy_dt]
    )



In [None]:
# Timestamp taken now, at the end of the modeling session
# (the variable keeps its original name even though it marks the end, not the start)
start_full_session = time.perf_counter()

# Elapsed time between session_start_time and now, rounded to two decimal places
elapsed_session_seconds = round(start_full_session - session_start_time, 2)
print('Full session finished in', elapsed_session_seconds, 'second(s)')

# Import the pandas library, necessary for data manipulation and analysis
import pandas as pd

# Column labels for the collected rows; the sizes are fractions of the dataset
performance_columns = [
    'train data size',                              # Fraction of data used for training
    'test data size',                               # Fraction of data used for testing
    'time for learning dataset seconds',            # Time taken to train the model
    'time for predicting and calculating seconds',  # Time taken to predict and calculate accuracy
    'accuracy',                                     # Accuracy of the model on the test data
]

# Organize the performance data collected during the session into a DataFrame
performance_dataframe = pd.DataFrame(performance_data, columns=performance_columns)

# Display the created DataFrame to review the performance metrics of the models
performance_dataframe


Full session finished in 1870.54 second(s)


Unnamed: 0,train data size,test data size,time for learning dataset,time for predicting and calculating,accuracy
0,0.9,0.1,434.49,3.25,0.98764
1,0.8,0.2,216.34,6.27,0.986984
2,0.7,0.3,193.83,9.59,0.983707
3,0.6,0.4,168.52,11.52,0.982442
4,0.5,0.5,144.25,14.34,0.980148
5,0.4,0.6,121.83,16.49,0.97815
6,0.3,0.7,140.72,20.23,0.976001
7,0.2,0.8,96.22,20.61,0.971416
8,0.1,0.9,67.71,23.63,0.96321
9,0.005,0.95,60.52,24.04,0.947818
