# ML Pipeline

This notebook contains the machine learning work for this project. A preprocessing, splitting, and CV pipeline are created. 

In [37]:
# Load the data (raw / feature engineered)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

path = "/Users/djfiume/Desktop/DSI/1030/data-breach-ml"
data = pd.read_csv(path + "/data/cleaned_raw_data.csv", index_col=0)

data.head()
print("Full Data Shape: ", data.shape)

Full Data Shape:  (24586, 29)


## Preprocessing Setup

In [27]:
y = data['Max Records Impacted']

data.drop(columns=['Max Records Impacted'],inplace=True)
# the unprocessed feature matrix
X = data
print(f'feature matrix size: {X.shape}')
# the feature names
ftrs = data.columns

# MODEL MAKES MORE SENSE TO PREDICT THE LOGS
# ex: predicting 100 records for true 10,000 should be the same as predicting 1,000,000 for true 10,000
y = y+1
y = np.log10(y)
y.describe()

feature matrix size: (24586, 28)


count    24586.000000
mean         2.383157
std          1.519120
min          0.000000
25%          0.903090
50%          2.586024
75%          3.464526
max          9.477121
Name: Max Records Impacted, dtype: float64

In [28]:
# Categorize the Features

cat_ftrs = ["Breach Type", "Source", "Organization Type", "breach_location_state"]

num_ftrs = ["Reported Year", "Reported Month", "Days Until Reported", "Length of Breach (Days)", "IDENTIFIER",\
            "COMMERCIAL", "BIOMETRIC", "HEALTH", "INTERNETDATA", "GEOLOCATION", "RECORDING", "EMPLOYMENT",\
            "EDUCATION", "SENSITIVE-GOV", "SENSITIVE-LOGIN", "SENSITIVE-GEOLOCATION", "SENSITIVE-PROTECTED",\
            "SENSITIVE-COMMUNICATIONS", "SENSITIVE-DNA", "Type UNKN", "ENCRYPTED", "ENCRYPTED-WITH-DECRYPTIONKEY",\
            "UNENCRYPTED", "Encrypt UNKN"]

# From the Data README 
# info_types = ["IDENTIFIER", "COMMERCIAL", "BIOMETRIC", "HEALTH", "INTERNETDATA", "GEOLOCATION", "RECORDING", "EMPLOYMENT", "EDUCATION", "SENSITIVE-GOV", "SENSITIVE-LOGIN", "SENSITIVE-GEOLOCATION", "SENSITIVE-PROTECTED", "SENSITIVE-COMMUNICATIONS", "SENSITIVE-DNA", "UNKN"]
# encrypt_types = ["ENCRYPTED", "ENCRYPTED-WITH-DECRYPTIONKEY", "UNENCRYPTED"]
# breach_types = ["CARD", "HACK", "INSD", "PHYS", "PORT", "STAT", "DISC", "UNKN"]
# org_types = ["BSF", "BSO", "BSR", "EDU", "GOV", "MED", "NGO", "UNKN"]

In [38]:
# Set up the Preprocessor 

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# one-hot encoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='UNKN')),
    ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

# standard scaler
numeric_transformer = Pipeline(steps=[
    # This is only for the engineered date features. All other features have no nulls
    # Assume no date breach end date = 0 breach length 
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)), 
    ('scaler', StandardScaler())])

# collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs)])

## Pipelines

In [39]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
import joblib


def MLpipe_KFold_RMSE(X, y, preprocessor, ML_algo, param_grid, num_seeds):
    '''
    This function splits the data to other/test (80/20) and then applies KFold with 5 folds to other.
    The RMSLE (Root Mean Squared Log Error) is minimized in cross-validation.

    Args:
    X : DataFrame or ndarray
        Features for model training.
    y : DataFrame or ndarray
        Labels/target for model training.
    preprocessor : sklearn transformer
        The preprocessor object (e.g., ColumnTransformer).
    ML_algo : sklearn estimator
        The machine learning algorithm (e.g., RandomForestRegressor).
    param_grid : dict
        Grid of parameters for GridSearchCV.
    num_seeds : int
        Number of random seeds to use for cross-validation.
    save_model : bool, default False
        Whether to save the best model and predictions.
    name: string, default None
        What the model name is to name the filenames

    Returns:
    test_scores : list
        List of test RMSE scores.
    best_models : list
        List of best model parameters from GridSearchCV.
    r2_scores : list
        List of R² scores for the test set.
    '''
     
    test_scores = []
    r2_scores = []
    best_models = []

    # Loops through 5 different random states. This is to ensure the best model
    # on average is picked. They are fixed for reproducability. 
    random_states = np.linspace(0,100000, num_seeds, dtype=int)
    
    for i in range(num_seeds): 

        # Split Data Randomly 
        X_other, X_test, y_other, y_test = (train_test_split(X, 
            y, train_size=0.8, random_state=random_states[i]))

        # Fits a model using GridSearchCV with KFold and the predefined Preprocessor 
        kf = KFold(n_splits=5, shuffle=True, random_state=random_states[i]) 

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', ML_algo)
        ])
        
        grid_search = GridSearchCV(pipeline, param_grid, 
                            scoring="neg_root_mean_squared_error", cv=kf, verbose=2) # you can turn this off
        grid_search.fit(X_other, y_other)

        # Calculate the model's error on the test set 
        y_pred = grid_search.predict(X_test)
    
        rmse = root_mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        best_params = grid_search.best_params_
        
        print(rmse)
        print(r2)
        test_scores.append(rmse)
        r2_scores.append(r2)
        best_models.append(best_params)

        # Only need to print this once, can use for debugging
        # if random_state == 0:
            # print("Train_Inputs Shape:", X_other.shape)
            # print("Train_Labels Shape:", y_other.shape)
            # print("Test_Inputs Shape:", X_test.shape)
            # print("Test_Labels Shape:", y_test.shape)

            # print("GridSearch", grid_search.cv_results_)
        
        # print("Random State: ", random_state, " RMSE: ", rmse, 
        #       " Best Model:", best_params)

    print("RMSE: ", test_scores, "\n Models: ", best_models, "r2 score:", r2_scores)
    return test_scores, best_models, r2_scores

### Use the pipeline on regression algorithims for CV / hyperparamter tuning

Tune individual models in the first cell. Only run the last cell when you have narrowed down your search (it takes a long time)

In [40]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

In [41]:
# Use to determine the hyperparameters for the final run 

names_list = ["L1_Linerar_Regression", # i=0
              "L2_Linerar_Regression", # 1
              "Elastic-Net_Linerar_Regression", # 2
              "Random Forest", # 3
              "SVR", # 4
              "kNearestNeighbors", # 5
              "XGBoost", # 6
              "NN_MLP"] # 7
model_list = [Lasso(max_iter=500), 
              Ridge(max_iter=500), 
              ElasticNet(max_iter=500), 
              RandomForestRegressor(), 
              SVR(),
              KNeighborsRegressor(),
              XGBRegressor(),
              MLPRegressor(learning_rate="adaptive", batch_size="auto", solver="adam", early_stopping=True)]

# adjust the grids as you go 
model_grids = [{
                'model__alpha': np.logspace(-4, -2, 10)},
               {
                'model__alpha': np.linspace(1, 4, 15)},
               {
                'model__alpha': np.logspace(-4, -2, 5),
                'model__l1_ratio' : np.linspace(0.01,0.99,5)},
               {
                'model__n_estimators': [100, 150, 200],
                'model__max_depth': [10, 15, 20]},
               { 
                'model__gamma': [1e-3, 1e-2, 1e-1, 1e0],
                'model__C': [1e-1, 1e0, 1e1, 1e2, 1e3]},
               { 
                'model__n_neighbors': [1,3,5,7,10,15,20,25,30,40,50,75,100]},
               {
                'model__max_depth': [6], 
                'model__min_child_weight': [3],  
                'model__learning_rate': [0.03],  # Tune these two, then above two, then the below 2 and then gamma
                'model__n_estimators': [1000], 
                'model__subsample': [0.8],  
                'model__colsample_bytree': [0.8], 
                'model__gamma': [0,0.1,0.2]}, 
                {
                'model__hidden_layer_sizes': [(150,2)],
                'model__alpha': [0.001, 0.0003, 0.001], 
                'model__max_iter': [300]}]

# Pick a model type, run and print results
i = 5
num_seeds = 1 # can use 1 to narrow range, otherwise some models (SVM) will take a long time 
test_scores, best_models, r2_scores = MLpipe_KFold_RMSE(X,y,preprocessor,model_list[i], model_grids[i], num_seeds)
print(best_models)
print(names_list[i], "RMSE Average: ", np.round(np.mean(test_scores), 4))
print(names_list[i], "RMSE Standard Deviation: ", np.round(np.std(test_scores), 4))
print(names_list[i], "R2 Average: ", np.round(np.mean(r2_scores), 4))
print(names_list[i], "R2 Standard Deviation: ", np.round(np.std(r2_scores), 4))

Fitting 5 folds for each of 13 candidates, totalling 65 fits
[CV] END ...............................model__n_neighbors=1; total time=   0.3s
[CV] END ...............................model__n_neighbors=1; total time=   0.2s
[CV] END ...............................model__n_neighbors=1; total time=   0.2s
[CV] END ...............................model__n_neighbors=1; total time=   0.2s
[CV] END ...............................model__n_neighbors=1; total time=   0.2s
[CV] END ...............................model__n_neighbors=3; total time=   0.2s
[CV] END ...............................model__n_neighbors=3; total time=   0.3s
[CV] END ...............................model__n_neighbors=3; total time=   0.3s
[CV] END ...............................model__n_neighbors=3; total time=   0.3s
[CV] END ...............................model__n_neighbors=3; total time=   0.2s
[CV] END ...............................model__n_neighbors=5; total time=   0.2s
[CV] END ...............................model__n

## Final Testing Pipeline

This cell takes the chosen hyperparamters and runs the models with an 80/20 split over 5 random seeds. It saves each model and the predictions to results. 

In [43]:
path = "/Users/djfiume/Desktop/DSI/1030/data-breach-ml"

def MLpipe_Final_RMSE(X, y, preprocessor, ML_algo, param_grid, num_seeds, name=None):
    '''
    This function splits the data to train/test (80/20). It then trains the model and tests it
    for a set number of random seeds, saves the models and predictions, and returns the results  
    
    Args:
    X : DataFrame or ndarray
        Features for model training.
    y : DataFrame or ndarray
        Labels/target for model training.
    preprocessor : sklearn transformer
        The preprocessor object (e.g., ColumnTransformer).
    ML_algo : sklearn estimator
        The machine learning algorithm (e.g., RandomForestRegressor).
    param_grid : dict
        Grid of parameters for GridSearchCV.
    num_seeds : int
        Number of random seeds to use for cross-validation.
    name: string, default None
        What the model name is to name the filenames

    Returns:
    test_scores : list
        List of test RMSE scores.
    best_models : list
        List of best model parameters from GridSearchCV.
    r2_scores : list
        List of R² scores for the test set.
    '''
     
    test_scores = []
    r2_scores = []


    # Loops through a given number of different random states. This is to ensure the best model
    # on average is picked. They are fixed for reproducability (if you choose the same # of seeds). 
    random_states = np.linspace(0,100000, num_seeds, dtype=int)
    
    for i in range(num_seeds): 

        # Split Data Randomly 
        X_train, X_test, y_train, y_test = (train_test_split(X, 
            y, train_size=0.8, random_state=random_states[i]))

        model = ML_algo.set_params(**param_grid)

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        
        pipeline.fit(X_train, y_train)

        # Calculate the model's error on the test set 
        y_pred = pipeline.predict(X_test)
    
        rmse = root_mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        test_scores.append(rmse)
        r2_scores.append(r2)


        # Only need to print this once, can use for debugging
        # if random_state == 0:
            # print("Train_Inputs Shape:", X_other.shape)
            # print("Train_Labels Shape:", y_other.shape)
            # print("Test_Inputs Shape:", X_test.shape)
            # print("Test_Labels Shape:", y_test.shape)

            # print("GridSearch", grid_search.cv_results_)
        
        # print("Random State: ", random_state, " RMSE: ", rmse, 
        #       " Best Model:", best_params)
    
        # Save the best model and its predictions
        if name is not None:
            model_file = path + "/results/" + name + str(i) + ".pkl"
            pred_file = path + "/results/" + name + str(i) + ".npy"
            joblib.dump(pipeline, model_file)
            np.save(pred_file, y_pred)
            # print(f"Best model saved as {model_file}")
            # print(f"Predictions saved as {pred_file}")
        

    print("RMSE: ", test_scores, "r2 score:", r2_scores)
    return test_scores, best_models, r2_scores

In [44]:
# Run all the models with the final chosen hyperparameters. 
model_grids = [{
                'alpha': 0.001},
               {
                'alpha': 3},
               {
                'alpha': 0.001,
                'l1_ratio': 0.99},
               {
                'n_estimators': 250,
                'max_depth': 15},
               { 
                'gamma': 1e-1,
                'C': 1e2},
               { 
                'n_neighbors': 15},
               { 
                'max_depth': 6, 
                'min_child_weight': 3,  
                'learning_rate': 0.03, 
                'n_estimators': 1000, 
                'subsample': 0.8,  
                'colsample_bytree': 0.8, 
                'gamma': 0.1}, 
                {
                'hidden_layer_sizes': (150),
                'alpha': 0.0003, 
                'max_iter': 300}]

# you might want to just do svms and then all other ones
for i in range(len(model_list)):

    test_scores, best_models, r2_scores = MLpipe_Final_RMSE(X,y,preprocessor,model_list[i], 
                                                 model_grids[i], 5, names_list[i])
    
    print(names_list[i], "RMSE Average: ", np.round(np.mean(test_scores), 4))
    print(names_list[i], "RMSE Standard Deviation: ", np.round(np.std(test_scores), 4))
    print(names_list[i], "R2 Average: ", np.round(np.mean(r2_scores), 4))
    print(names_list[i], "R2 Standard Deviation: ", np.round(np.std(r2_scores), 4))

RMSE:  [1.1154520499506049, 1.1166574608561253, 1.1406962921669241, 1.147215018568752, 1.1388048638494588] r2 score: [0.4520576008391215, 0.4512836217630314, 0.4447495145680067, 0.4358424124616791, 0.4386164503859119]
L1_Linerar_Regression RMSE Average:  1.1318
L1_Linerar_Regression RMSE Standard Deviation:  0.0131
L1_Linerar_Regression R2 Average:  0.4445
L1_Linerar_Regression R2 Standard Deviation:  0.0065
RMSE:  [1.1158984289298008, 1.1151074056675285, 1.138904566229831, 1.1459557648552305, 1.1385393229708693] r2 score: [0.4516189644880102, 0.4528059334341582, 0.44649244189670445, 0.43708024081408126, 0.4388782211352066]
L2_Linerar_Regression RMSE Average:  1.1309
L2_Linerar_Regression RMSE Standard Deviation:  0.0128
L2_Linerar_Regression R2 Average:  0.4454
L2_Linerar_Regression R2 Standard Deviation:  0.0064
RMSE:  [1.115452414526689, 1.1166578733946564, 1.1406986722404302, 1.1472029582277343, 1.1387825661720365] r2 score: [0.4520572426583698, 0.4512832163268107, 0.44474719749490

### Aside: Calculating Baseline RMSE / R2 Score:

In [45]:
num_seeds = 5
random_states = np.linspace(0,100000, num_seeds, dtype=int)
test_scores = []
r2_scores = []

for i in range(num_seeds): 

    # Split Data Randomly 
    X_other, X_test, y_other, y_test = (train_test_split(X, 
            y, train_size=0.8, random_state=random_states[i]))

    
    train_mean = np.ones(y_test.shape) * np.mean(y_other)
    rmse = root_mean_squared_error(y_test, train_mean)
    r2 = r2_score(y_test, train_mean)
    test_scores.append(rmse)
    r2_scores.append(r2)
   
print("Baseline RMSE Average: ", np.round(np.mean(test_scores), 4))
print("Baseline RMSE Standard Deviation: ", np.round(np.std(test_scores), 4))
print("Baseline R2 Average: ", np.round(np.mean(r2_scores), 4))
print("Baseline R2 Standard Deviation: ", np.round(np.std(r2_scores), 4))
    

Baseline RMSE Average:  1.5186
Baseline RMSE Standard Deviation:  0.0099
Baseline R2 Average:  -0.0001
Baseline R2 Standard Deviation:  0.0001


### Aside: Store the (Preprocessed) Test Sets

In [46]:
# Make sure this is the same number of seeds
num_seeds = 5
random_states = np.linspace(0,100000, num_seeds, dtype=int)

for i in range(num_seeds): 

    # Split Data Randomly Same as in pipeline
    X_other, X_test, y_other, y_test = (train_test_split(X, 
            y, train_size=0.8, random_state=random_states[i]))
    
    preprocessor.fit_transform(X_other)
    X_test_pp = preprocessor.transform(X_test)
    all_features = num_ftrs + list(preprocessor.transformers_[1][1].get_feature_names_out())
    df = pd.DataFrame(X_test_pp, columns=all_features, index=X_test.index)
    df["y"] = y_test
    print(y_test)
    print(X_test_pp)
    df.to_csv(path + "/results/processed_testdf" + str(i), index=False)
    

1467     3.164353
8690     3.778224
88       2.008600
12721    2.201397
6415     2.885361
           ...   
12855    4.177421
10005    1.812913
16699    3.839227
5153     0.602060
2820     1.778151
Name: Max Records Impacted, Length: 4918, dtype: float64
[[ 1.37624999 -0.09003737  0.24160603 ...  0.          0.
   0.        ]
 [ 1.0388647  -1.2877749  -0.2341902  ...  0.          0.
   0.        ]
 [ 0.7014794   0.20939702 -0.09989288 ...  0.          0.
   0.        ]
 ...
 [ 1.37624999  0.20939702 -0.08838168 ...  0.          0.
   0.        ]
 [ 1.0388647  -0.38947175  0.09963457 ...  0.          0.
   0.        ]
 [ 1.71363528 -0.68890613 -0.2341902  ...  0.          0.
   0.        ]]
5832     4.253265
11340    3.529045
24068    3.073352
20960    0.778151
2364     3.668199
           ...   
8427     3.602386
17721    4.074487
10458    0.301030
21125    3.579898
4827     1.748188
Name: Max Records Impacted, Length: 4918, dtype: float64
[[ 1.38386412  1.11071976  1.38812802 ...  0. 