## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.pipeline import Pipeline

In [2]:
# Load the dataset
data = pd.read_csv('./data/breached_services_info.csv')
data.head()

Unnamed: 0,Record,Name,Title,Domain,BreachDate,AddedDate,ModifiedDate,PwnCount,Description,LogoPath,DataClasses,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,financial_estimated_loss
0,0,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,"In approximately March 2015, the free web host...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False,14937170.0
1,1,123RF,123RF,123rf.com,2020-03-22,2020-11-15T00:59:50Z,2020-11-15T01:07:10Z,8661578,"In March 2020, the stock photo site <a href=""h...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False,8662078.0
2,2,126,126,126.com,2012-01-01,2016-10-08T07:46:05Z,2016-10-08T07:46:05Z,6414191,"In approximately 2012, it's alleged that the C...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'Passwords']",False,False,False,False,False,False,False,5131852.8
3,3,17Media,17,17app.co,2016-04-19,2016-07-08T01:55:03Z,2016-07-08T01:55:03Z,4009640,"In April 2016, customer data obtained from the...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Device information', 'Email addresses', 'IP ...",True,False,False,False,False,False,False,4010140.0
4,4,17173,17173,17173.com,2011-12-28,2018-04-28T04:53:15Z,2018-04-28T04:53:15Z,7485802,"In late 2011, <a href=""https://news.softpedia....",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'Passwords', 'Usernames']",False,False,False,False,False,False,False,5989141.6


### 2. Dataset preparation

In [3]:
# Feature engineering
data['AddedDate'] = pd.to_datetime(data['AddedDate'], utc=True).dt.tz_localize(None)
data['BreachDate'] = pd.to_datetime(data['BreachDate'], utc=True).dt.tz_localize(None)
data['TimeToDiscovery'] = (data['AddedDate'] - data['BreachDate']).dt.days
data['DataClassesCount'] = data['DataClasses'].str.count(',') + 1

# One-hot encoding for DataClasses field
data_encoded = pd.get_dummies(data, columns=['DataClasses'])

# Feature selection
features_to_keep = ['PwnCount', 'IsSensitive', 'IsRetired', 'IsSpamList', 'IsMalware', 'IsSubscriptionFree', 
                    'TimeToDiscovery', 'DataClassesCount'] + [col for col in data_encoded.columns if col.startswith('DataClasses_')]
X = data_encoded[features_to_keep]
y = data_encoded['financial_estimated_loss']

In [4]:
# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

### 3. Model training

In [5]:
# Define models and hyperparameters
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(objective='reg:squarederror', random_state=42)
}

params = {
    'Linear Regression': {},
    'Random Forest': {'model__n_estimators': [100, 200, 300], 'model__max_depth': [None, 10, 20, 30], 'model__min_samples_split': [2, 5, 10]},
    'Gradient Boosting': {'model__n_estimators': [100, 200, 300], 'model__learning_rate': [0.01, 0.1, 0.2], 'model__max_depth': [3, 5, 7]},
    'XGBoost': {'model__n_estimators': [100, 200, 300], 'model__learning_rate': [0.01, 0.1, 0.2], 'model__max_depth': [3, 5, 7]}
}

# Perform Grid Search with cross-validation
best_models = {}
for model_name in models:
    pipe = Pipeline([('model', models[model_name])])
    grid_search = GridSearchCV(pipe, params[model_name], cv=5, scoring='neg_mean_absolute_percentage_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_
    print(f'Best parameters for {model_name}: {grid_search.best_params_}')

# Evaluate the best model on the test set
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'\nModel: {model_name}')
    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')
    print(f'Mean Absolute Percentage Error: {mape}')
    print(f'R^2 Score: {r2}')

Best parameters for Linear Regression: {}
Best parameters for Random Forest: {'model__max_depth': 20, 'model__min_samples_split': 2, 'model__n_estimators': 300}
Best parameters for Gradient Boosting: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 300}
Best parameters for XGBoost: {'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 300}

Model: Linear Regression
Mean Squared Error: 7.525669726821233e+43
Mean Absolute Error: 4.5903729456026175e+21
Mean Absolute Percentage Error: 1.7704926649605288e+16
R^2 Score: -2.6470363240004497e+28

Model: Random Forest
Mean Squared Error: 108450101616751.61
Mean Absolute Error: 2488386.2647414566
Mean Absolute Percentage Error: 0.06335526280679114
R^2 Score: 0.9618543772525695

Model: Gradient Boosting
Mean Squared Error: 129831334391709.72
Mean Absolute Error: 2725881.7733108676
Mean Absolute Percentage Error: 0.06869495496006575
R^2 Score: 0.9543338638814454

Model: XGBoost
Mean Squared Error: 8

In [6]:
# Identify the best model
best_model_name = min(best_models, key=lambda x: mean_absolute_percentage_error(y_test, best_models[x].predict(X_test)))
best_model_name

'Random Forest'

### 4. Test

In [7]:
# Test
def test_model(model, scaler, entry, feature_names):
    # Convert the entry to a DataFrame
    df = pd.DataFrame([entry])
    
    # Perform the same feature engineering as in the training data
    df['AddedDate'] = pd.to_datetime(df['AddedDate'], utc=True).dt.tz_localize(None)
    df['BreachDate'] = pd.to_datetime(df['BreachDate'], utc=True).dt.tz_localize(None)
    df['TimeToDiscovery'] = (df['AddedDate'] - df['BreachDate']).dt.days
    df['DataClassesCount'] = df['DataClasses'].str.count(',') + 1
    
    # Create a DataFrame with all possible features, initialized to 0
    X = pd.DataFrame(0, index=[0], columns=feature_names)
    
    # Fill in the values we have
    for feature in feature_names:
        if feature in df.columns:
            X[feature] = df[feature]
        elif feature.startswith('DataClasses_'):
            class_name = feature.split('DataClasses_')[1]
            if class_name in df['DataClasses'].iloc[0]:
                X[feature] = 1
    
    # Scale the features
    X_scaled = scaler.transform(X)
    
    # Make prediction
    prediction = model.predict(X_scaled)
    
    return prediction[0]

# New fictional test entry
test_entry = {
    'Name': 'TechNovaHub',
    'Title': 'TechNovaHub Data Breach',
    'Domain': 'technovahub.com',
    'BreachDate': '2023-09-15',
    'AddedDate': '2023-11-20T14:30:00Z',
    'ModifiedDate': '2023-11-25T09:15:00Z',
    'PwnCount': 1,
    'Description': 'In September 2023, the tech community platform TechNovaHub suffered a significant data breach affecting approximately 2.75 million users. The breach was discovered in November and involved unauthorized access to user accounts. Compromised data included email addresses, hashed passwords, and some personal information.',
    'LogoPath': 'https://example.com/logos/technovahub.png',
    'DataClasses': "['Email addresses', 'Passwords', 'Names', 'Dates of birth', 'Phone numbers']",
    'IsVerified': None,  # Ignored in the test
    'IsFabricated': True,
    'IsSensitive': False,
    'IsRetired': False,
    'IsSpamList': False,
    'IsMalware': False,
    'IsSubscriptionFree': True
}

# Test the best model
best_model = best_models[best_model_name]
prediction = test_model(best_model, scaler, test_entry, features_to_keep)
print(f'\nEstimated financial loss for TechNovaHub: ${prediction:.2f}')


Estimated financial loss for TechNovaHub: $3650.30
