# Telco Platform Engineering Security Operations <br> AI Insights Generation with Fine Tuning of Meta's BART Model
Author: Fatih E. NAR <br>
This project aims to deliver security risk factor insights. <br>
Model Card: https://huggingface.co/google-t5/t5-small <br>

In [None]:
# Install the packages listed in requirements.txt.
# `%pip` is an IPython magic (not plain Python), so this cell has to be run
# from a notebook/IPython session.
%pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import seaborn as sns
import onnxmltools
from skl2onnx.common.data_types import FloatTensorType


# Load the dataset from its Parquet file.
# Parquet is a columnar storage format that is optimized for reading and
# writing data, which makes it a good choice for storing large datasets.
dataset_path = 'data/5G_SecOps_Data_10K.parquet'
data = pd.read_parquet(dataset_path)

# Preprocess the data
def preprocess_data(data):
    """Encode the categorical columns as integers and drop identifier columns.

    The caller's DataFrame is left untouched: every change is applied to a
    copy, so calling this function twice on the same frame gives the same
    result both times.

    Args:
        data: DataFrame containing the 'Service Configurations',
            'Service Reachability', 'RBAC Configuration', 'Cluster' and
            'Namespace' columns.

    Returns:
        A new DataFrame in which the three categorical columns hold their
        integer codes and the 'Cluster' and 'Namespace' columns are removed.
        Values that are not present in a mapping become NaN.

    Raises:
        KeyError: If one of the columns listed above is missing.
    """
    # Label -> integer code for each categorical column.
    category_codes = {
        'Service Configurations': {'Poor': 0, 'Medium': 1, 'Good': 2},
        'Service Reachability': {'Internal': 0, 'External': 1},
        'RBAC Configuration': {'Low': 0, 'Medium': 1, 'High': 2},
    }

    # Work on a copy so the column assignments below never leak into the
    # frame owned by the caller.
    encoded = data.copy()
    for column, codes in category_codes.items():
        encoded[column] = encoded[column].map(codes)

    # Drop non-numeric columns
    return encoded.drop(columns=['Cluster', 'Namespace'])

data = preprocess_data(data)

# Separate the target column from the feature columns
target_column = 'Risk Score'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train the XGBoost regressor on the standardized training features
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    # Histogram-based algorithm on CPU; the original note suggests
    # 'gpu_hist' instead when an NVIDIA GPU is available
    'tree_method': 'hist',
}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)

In [None]:
# Score the held-out test split
y_pred = xgb_model.predict(X_test_scaled)

# Compute the three regression metrics against the true test targets
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report each metric on its own line
metric_report = (
    ('Mean Squared Error', mse),
    ('Mean Absolute Error', mae),
    ('R^2 Score', r2),
)
for metric_label, metric_value in metric_report:
    print(f'{metric_label}: {metric_value}')

In [None]:
# Histogram (with KDE overlay) of the prediction errors on the test split
residuals = y_test - y_pred
residual_fig, residual_ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, bins=30, kde=True, ax=residual_ax)
residual_ax.set_xlabel('Residuals')
residual_ax.set_ylabel('Frequency')
residual_ax.set_title('Distribution of Residuals')
plt.show()

# Predicted vs. actual risk scores, with a dashed y = x reference line
# spanning the range of the actual values
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=y_test,
    y=y_pred,
    alpha=0.5,
    color='red',
    label='Predicted Risk Scores',
    ax=scatter_ax,
)
lowest_actual, highest_actual = y_test.min(), y_test.max()
scatter_ax.plot(
    [lowest_actual, highest_actual],
    [lowest_actual, highest_actual],
    'k--',
    lw=2,
    label='Actual Risk Scores',
)
scatter_ax.set_xlabel('Actual Risk Scores')
scatter_ax.set_ylabel('Predicted Risk Scores')
scatter_ax.set_title('Actual vs. Predicted Risk Scores')
scatter_ax.legend(loc='upper left')
plt.show()

# Horizontal bar chart of the model's feature importances, one bar per
# feature column, coloured by score and annotated with the score as a percentage
feature_names = list(X.columns)
importance = xgb_model.feature_importances_
importance_dict = dict(zip(feature_names, importance))
importance_scores = list(importance_dict.values())

# Map each score onto the viridis colour map, normalised to the observed range
norm = plt.Normalize(min(importance_scores), max(importance_scores))
colors = plt.cm.viridis(norm(importance_scores))

plt.figure(figsize=(12, 8))
sns.barplot(x=importance_scores, y=list(importance_dict), palette=colors)
plt.title('Feature Importance')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.xticks(rotation=45)
plt.tight_layout()

# Write the percentage next to the end of each bar (bar index = y position)
for bar_index, score in enumerate(importance_scores):
    plt.text(score, bar_index, f'{score * 100:.2f}%', ha='left', va='center')

plt.show()

In [None]:
import os

# Convert the model to ONNX format.
# The graph takes one float tensor named 'float_input' with a free batch
# dimension and one column per training feature.
# NOTE: the regressor was fitted on X_train_scaled, so anything consuming the
# exported file must feed it features standardized the same way; the fitted
# scaler is not written out by this cell.
initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]
onnx_model = onnxmltools.convert_xgboost(xgb_model, initial_types=initial_type)

# Save the ONNX model, creating the output directory first so a fresh
# checkout without a 'model/' folder does not fail after training.
onnx_model_path = 'model/xgb_risk_score_model.onnx'
os.makedirs(os.path.dirname(onnx_model_path), exist_ok=True)
with open(onnx_model_path, 'wb') as f:
    f.write(onnx_model.SerializeToString())

print(f"XGBoost model saved to {onnx_model_path}")