In [None]:
import time

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV

# Read the training and testing CSV files into DataFrames and report how long
# the load took (wall-clock seconds).
start_time = time.time()
print("Loading datasets...")
train_file_path = r"C:\Users\Elakkiya\Downloads\microsoft cybersecurity\reordered_features_training.csv"
test_file_path = r"C:\Users\Elakkiya\Downloads\microsoft cybersecurity\reordered_features_testing.csv"

# The training file is read first, then the testing file.
train_data = pd.read_csv(filepath_or_buffer=train_file_path)
test_data = pd.read_csv(filepath_or_buffer=test_file_path)
end_time = time.time()
print("Datasets loaded in {:.2f} seconds".format(end_time - start_time))

# Separate features and labels for the training and testing data.
# 'IncidentGrade' is the label column; every other column is used as a feature.
print("Separating features and labels...")
X_train = train_data.drop(columns=['IncidentGrade'])
y_train = train_data['IncidentGrade']
X_test = test_data.drop(columns=['IncidentGrade'])
y_test = test_data['IncidentGrade']

# Verify the shapes of the data
print("X_train Shape:", X_train.shape)
print("y_train Shape:", y_train.shape)
print("X_test Shape:", X_test.shape)
print("y_test Shape:", y_test.shape)

# Check for inconsistencies.
# Explicit raises are used instead of `assert` so the checks still run when
# Python is started with -O (which strips assert statements).
if X_train.shape[0] != y_train.shape[0]:
    raise ValueError("Mismatch in training data samples")
if X_test.shape[0] != y_test.shape[0]:
    raise ValueError("Mismatch in testing data samples")

# Fail fast if the two files do not expose the same feature columns in the
# same order; otherwise the mismatch would only surface (or silently skew the
# predictions) after the long training run further down.
if list(X_train.columns) != list(X_test.columns):
    raise ValueError(
        "Training and testing feature columns differ: "
        f"train={list(X_train.columns)}, test={list(X_test.columns)}"
    )

# Hold out 20% of the training rows as a validation set. The split is
# stratified on the labels and seeded so it is reproducible.
start_time = time.time()
print("Splitting data for validation...")
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)
end_time = time.time()
print("Data split for validation in {:.2f} seconds".format(end_time - start_time))

# Report the size of each partition (features, then labels)
print("Training set shape:", X_train_split.shape, y_train_split.shape)
print("Validation set shape:", X_val_split.shape, y_val_split.shape)

# Define the model: gradient-boosted trees with default hyperparameters and a
# fixed seed for reproducibility.
print("Defining the model...")
model = GradientBoostingClassifier(random_state=42)

# Define cross-validation.
# Stratified folds keep the label proportions of the full set in every fold,
# matching the stratified hold-out split above; plain KFold can produce folds
# with skewed class ratios, which makes the macro-F1 estimate noisier.
print("Setting up cross-validation...")
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate using cross-validation.
# n_jobs=-1 fits the folds in parallel on all available cores (the same
# setting the grid search in this script uses); the scores are unaffected.
start_time = time.time()
print("Performing cross-validation...")
cv_results = cross_val_score(
    model,
    X_train_split,
    y_train_split,
    cv=kf,
    scoring='f1_macro',
    n_jobs=-1,
)
end_time = time.time()
print(f"Cross-Validation F1 Macro Score: {cv_results.mean()} in {end_time - start_time:.2f} seconds")

# Draw a reproducible 20% random sample of the training rows so the
# hyperparameter search runs on a smaller dataset.
print("Sampling 20% of the data for hyperparameter tuning...")
train_sample = train_data.sample(frac=0.2, random_state=42)
y_sample = train_sample['IncidentGrade']
X_sample = train_sample.drop(columns=['IncidentGrade'])

# Candidate hyperparameter values; the search tries every combination
# (2 x 2 x 2 = 8 settings).
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 4],
)

# Exhaustive search scored by macro F1, reusing the model and the
# cross-validation splitter defined earlier; all cores are used.
print("Setting up GridSearchCV...")
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='f1_macro',
    n_jobs=-1,
    cv=kf,
)

# Run the search on the sample and time it
start_time = time.time()
print("Fitting GridSearchCV...")
grid_search.fit(X_sample, y_sample)
end_time = time.time()
print("GridSearchCV completed in {:.2f} seconds".format(end_time - start_time))

# Report the winning combination
print('Best Parameters: {}'.format(grid_search.best_params_))

# Train the final model: take the estimator carrying the best parameters found
# by the grid search and fit it on the full training set.
print("Training the final model with best parameters...")
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Predict on the test set
print("Making predictions on the test set...")
y_pred = best_model.predict(X_test)

# Evaluate the model.
# The report rows are derived from the classes the model was trained on rather
# than from a hard-coded list of three names, so the report no longer raises
# when the number of distinct labels in y_test/y_pred is not exactly three.
# For integer classes 0, 1, 2 the row names are still 'Class 0' .. 'Class 2'.
print("Evaluating the model...")
class_labels = best_model.classes_
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=[f'Class {label}' for label in class_labels],
))

# Calculate F1, Precision, Recall, and Accuracy (macro = unweighted mean over classes)
f1 = f1_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)

print(f'Macro F1 Score: {f1}')
print(f'Macro Precision: {precision}')
print(f'Macro Recall: {recall}')
print(f'Accuracy: {accuracy}')

print("Script completed successfully!")


Loading datasets...
Datasets loaded in 15.86 seconds
Separating features and labels...
X_train Shape: (6503132, 17)
y_train Shape: (6503132,)
X_test Shape: (4105554, 17)
y_test Shape: (4105554,)
Splitting data for validation...
Data split for validation in 11.59 seconds
Training set shape: (5202505, 17) (5202505,)
Validation set shape: (1300627, 17) (1300627,)
Defining the model...
Setting up cross-validation...
Performing cross-validation...
