In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
# Load the prepared dataset and show which columns it arrives with.
final_dataset = pd.read_csv('prepared_dataset.csv')
print(final_dataset.keys())

# Columns excluded from this experiment: every 'Pod Event Age_*',
# 'Pod Event Source_*', 'Event Age_*' and 'Event Source_*' column listed here.
columns_to_discard = [
    'Pod Event Age_0:00:25',
    'Pod Event Age_0:00:28',
    'Pod Event Age_0:00:55',
    'Pod Event Age_0:01:48',
    'Pod Event Age_0:02:02',
    'Pod Event Age_0:02:18',
    'Pod Event Age_0:03:41',
    'Pod Event Age_0:09:03',
    'Pod Event Age_0:17:18',
    'Pod Event Age_0:19:58',
    'Pod Event Age_Other',
    'Pod Event Source_kubelet',
    'Pod Event Source_node-controller',
    'Event Age_0:00:13',
    'Event Age_0:00:35',
    'Event Age_0:09:58',
    'Event Age_0:14:03',
    'Event Age_0:14:24',
    'Event Age_0:15:23',
    'Event Age_0:20:49',
    'Event Age_0:26:12',
    'Event Age_0:27:32',
    'Event Age_0:28:52',
    'Event Age_Other',
    'Event Source_container-runtime-custom-plugin-monitor',
    'Event Source_custom-scheduledevents-consolidated-condition-plugin-monitor',
    'Event Source_custom-scheduledevents-consolidated-plugin-monitor',
    'Event Source_kernel-monitor',
    'Event Source_kubelet-custom-plugin-monitor',
    'Event Source_node-controller',
]
# Raises KeyError if any listed column is missing from the CSV.
final_dataset = final_dataset.drop(columns=columns_to_discard)

print("after dropping")

# Also discard whatever column comes first (selected by position, not by
# name). In the captured cell output that column is 'Unnamed: 0'.
leading_column = final_dataset.columns[0]
final_dataset = final_dataset.drop(columns=leading_column)

# Show the columns that remain for modelling.
print(final_dataset.keys())
# Split on the DataFrame's index values: everything up to 80% of the way
# between the smallest and largest index value is training data, the rest is
# test data.
# NOTE(review): despite the *_time names, this is a chronological split only
# if the index increases with time -- confirm against how the CSV was built.
row_index = final_dataset.index
start_time, end_time = row_index.min(), row_index.max()
train_cutoff = start_time + 0.8 * (end_time - start_time)

# Train-test partition (rows exactly at the cutoff go to the training side)
is_train_row = row_index <= train_cutoff
is_test_row = row_index > train_cutoff
train_df = final_dataset[is_train_row]
test_df = final_dataset[is_test_row]

# 'will_fail' is the target; every other remaining column is a feature.
y_train = train_df['will_fail']
X_train = train_df.drop(columns=['will_fail'])
y_test = test_df['will_fail']
X_test = test_df.drop(columns=['will_fail'])


# Compute the class weighting manually (XGBoost doesn't take class_weight
# directly; it takes a single scale_pos_weight for the positive class).
# minlength=2 guarantees that both the class-0 and class-1 slots exist even if
# the training split happens to contain no positive rows.
class_counts = np.bincount(y_train, minlength=2)
# Not used by the visible code below; kept because they are module-level names.
total_samples = len(y_train)
num_classes = len(class_counts)

# Fail with a clear message instead of an IndexError / division by zero when
# the training labels contain no class-1 rows.
if class_counts[1] == 0:
    raise ValueError(
        "y_train contains no positive (will_fail == 1) samples; "
        "cannot compute scale_pos_weight"
    )

# Count of class-0 rows per class-1 row in the training labels.
scale_pos_weight = class_counts[0] / class_counts[1]

# Hyperparameters for the XGBoost classifier, gathered in one mapping.
xgb_params = {
    'objective': 'binary:logistic',        # binary classification
    'scale_pos_weight': scale_pos_weight,  # compensates for class imbalance
    'eval_metric': 'logloss',              # logarithmic loss
    'n_estimators': 200,                   # number of boosting rounds
    'learning_rate': 0.05,                 # step size shrinkage
    'max_depth': 5,                        # depth of each tree
    'subsample': 0.8,                      # row subsample ratio
    'colsample_bytree': 0.8,               # fraction of features per tree
    'random_state': 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit on the training split.
xgb_model.fit(X_train, y_train)

# Report on the training split itself (the model has already seen these rows).
training = xgb_model.predict(X_train)
train_report = classification_report(y_train, training)
print(train_report)

# Persist the fitted model before scoring the held-out rows.
xgb_model.save_model("xgboost_withoutD.json")

# Score the held-out test split and report on it.
y_pred_xgb = xgb_model.predict(X_test)
test_report = classification_report(y_test, y_pred_xgb)
print(test_report)




Index(['Unnamed: 0', 'CPU Usage (%)', 'Memory Usage (%)', 'Pod Restarts',
       'Ready Containers', 'Total Containers', 'Memory Usage (MB)',
       'Active Memory Requests (MB)', 'Memory Requests (%)',
       'Active Memory Limits (MB)emory Limits Utilization (%)',
       'Memory Usage (MB).1', 'Memory Usage (Cache) (MB)',
       'Memory Usage (Swap) (MB)', 'Network Receive Bytes',
       'Network Transmit Bytes', 'Network Receive Packets (p/s)',
       'Network Transmit Packets (p/s)',
       'Network Receive Packets Dropped (p/s)',
       'Network Transmit Packets Dropped (p/s)', 'FS Reads Total (MB)',
       'FS Writes Total (MB)', 'FS Reads/Writes Total (MB)',
       'FS Reads Bytes Total (MB)', 'FS Writes Bytes Total (MB)',
       'FS Reads/Writes Bytes Total (MB)', 'will_fail', 'cpu_avg_5min',
       'cpu_avg_15min', 'memory_avg_5min', 'memory_avg_15min', 'net_traffic',
       'net_traffic_avg_5min', 'net_errors', 'net_errors_5min', 'cpu_trend',
       'memory_trend', 'restart_c

sample_data
