In [31]:
#Importing libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report
from xgboost import XGBClassifier
import warnings

In [38]:
#  FIXED: load_and_prepare_data function - Proper indentation & logic

def load_and_prepare_data(filepath='cloud_dataset.csv', label_col='Label', horizon=60):
    """Load a CSV, detect numeric telemetry columns and add a ``future_outage`` target.

    The target for row ``i`` is 1 when an event occurs in any of the next
    ``horizon`` rows (``i+1 .. i+horizon``), otherwise 0.  The event series is
    ``label_col`` when that column exists; otherwise a synthetic event is
    derived: a row counts as an event when any numeric column exceeds that
    column's own 95th percentile.

    Rows in the last ``horizon`` positions have no complete look-ahead window
    and are labelled 0.

    Args:
        filepath: Path of the CSV file to read.
        label_col: Name of the per-row event/anomaly column, if present.
        horizon: Number of future rows inspected for an event.

    Returns:
        ``(df, num_cols)`` where ``df`` is the loaded frame with the extra
        ``future_outage`` column and ``num_cols`` lists the numeric telemetry
        columns (``label_col`` is excluded so it is not treated as telemetry).

    Raises:
        ValueError: if ``horizon`` is not positive, or if no label column and
            no numeric column is available to build a target from.
    """
    if horizon < 1:
        raise ValueError("horizon must be a positive number of rows")

    df = pd.read_csv(filepath)
    print(" ACTUAL Columns:", df.columns.tolist())
    print("\n First 3 rows:")
    print(df.head(3))
    print(f"\nShape: {df.shape}")

    # Auto-detect numeric telemetry columns; the label itself is not telemetry.
    num_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                if col != label_col]
    print(f"\n Numeric columns (telemetry): {num_cols}")

    # Build the per-row event series from labels OR from high resource usage.
    if label_col in df.columns:
        print(f" Using '{label_col}' column for target")
        event = df[label_col]
    else:
        if not num_cols:
            raise ValueError(
                f"No '{label_col}' column and no numeric columns to derive a target from")
        # Fallback: high usage threshold
        print(" Creating synthetic target from high usage")
        # Compare every column against its OWN 95th percentile (aligned on the
        # column labels) and flag the row when any column is above it.
        thresholds = df[num_cols].quantile(0.95)
        event = df[num_cols].gt(thresholds, axis='columns').any(axis=1).astype(int)

    # rolling(...).max() looks back over `horizon` rows; shifting by -horizon
    # turns that into "max over the next `horizon` rows" for each row.
    df['future_outage'] = (
        event.rolling(window=horizon, min_periods=1).max()
        .shift(-horizon)
        .fillna(0)
        .astype(int)
    )

    print(f" Outage rate: {df['future_outage'].mean():.1%}")
    return df, num_cols


In [33]:
def engineer_features(df, base_cols):
    """Derive rolling, ratio and encoded-categorical features from existing columns.

    Works on a copy, so the frame passed in is left untouched and the function
    can be re-run on the same input with the same result.

    Args:
        df: Input frame containing every column named in ``base_cols``.
        base_cols: Numeric column names; only the first four get rolling
            statistics and only the first two are used for the ratio/sum.

    Returns:
        A new DataFrame with the original columns plus:
        ``<col>_roll_mean_5`` and ``<col>_roll_std_10`` for each of the first
        four base columns, ``ratio_01`` / ``ratio_sum`` when at least two base
        columns exist, and ``<col>_encoded`` for every object-dtype column.
    """
    print(f"\n🔧 Engineering features from: {base_cols[:3]}...")

    # Copy so the caller's frame is not mutated as a side effect.
    df = df.copy()

    # Rolling stats on base telemetry
    for col in base_cols[:4]:  # Limit to top 4 to avoid explosion
        df[f'{col}_roll_mean_5'] = df[col].rolling(5, min_periods=1).mean()
        # Sample std of a single observation is NaN, so the first row is NaN here.
        df[f'{col}_roll_std_10'] = df[col].rolling(10, min_periods=1).std()

    # Simple ratios
    if len(base_cols) >= 2:
        # The small constant keeps a zero denominator from dividing by zero.
        df['ratio_01'] = df[base_cols[0]] / (df[base_cols[1]] + 1e-6)
        df['ratio_sum'] = df[base_cols[:2]].sum(axis=1)

    # Encode categorical columns
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        le = LabelEncoder()
        # Fill BEFORE casting: astype(str) turns NaN into the string 'nan',
        # after which fillna() has nothing left to fill.
        df[f'{col}_encoded'] = le.fit_transform(df[col].fillna('missing').astype(str))

    print(" Features engineered!")
    return df

In [34]:
def preprocess_data(df, target_col='future_outage', train_frac=0.7):
    """Select numeric features, split by row order and standardise.

    The split is positional (first ``train_frac`` of the cleaned rows train,
    the remainder test); no shuffling is done.  The scaler is fitted on the
    training rows only and then applied to the test rows.

    Args:
        df: Frame holding the feature columns and ``target_col``.
        target_col: Name of the target column.
        train_frac: Fraction of cleaned rows used for training (0 < f < 1).

    Returns:
        ``(X_train_scaled, y_train, X_test_scaled, y_test, scaler, feature_cols)``.

    Raises:
        KeyError: if ``target_col`` is not a column of ``df``.
        ValueError: if ``train_frac`` is out of range, no numeric feature
            exists, or cleaning leaves too few rows to form both splits.
    """
    if not 0.0 < train_frac < 1.0:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac!r}")
    if target_col not in df.columns:
        raise KeyError(f"Target column '{target_col}' not found in DataFrame")

    # All numeric features except target.  select_dtypes(np.number) accepts
    # every integer/float width (not just int64/float64) and skips booleans.
    feature_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                    if col != target_col]
    if not feature_cols:
        raise ValueError("No numeric feature columns available")

    print(f"\n Using {len(feature_cols)} features")

    # Treat +/-inf like missing values (the scaler rejects them), then drop
    # incomplete rows and split.
    df_clean = (
        df[feature_cols + [target_col]]
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )
    split_idx = int(train_frac * len(df_clean))
    if split_idx == 0 or split_idx == len(df_clean):
        raise ValueError(
            f"Only {len(df_clean)} usable rows after dropping NaN/inf; "
            "cannot form both a train and a test split")

    train_part = df_clean.iloc[:split_idx]
    test_part = df_clean.iloc[split_idx:]

    X = train_part[feature_cols].values
    y = train_part[target_col].values
    X_test = test_part[feature_cols].values
    y_test = test_part[target_col].values

    # Scale
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    print(f"Train: {len(X_train_scaled)} | Test: {len(X_test_scaled)}")
    print(f"   Train positive rate: {y.mean():.1%} | Test: {y_test.mean():.1%}")

    return X_train_scaled, y, X_test_scaled, y_test, scaler, feature_cols

In [43]:
def train_and_eval(model, X_tr, y_tr, X_te, y_te, name):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit`` and ``predict`` plus either
            ``predict_proba`` or ``decision_function``.
        X_tr, y_tr: Training features and labels.
        X_te, y_te: Test features and labels.
        name: Display name used in the printed report header.

    Returns:
        ``(metrics, model)`` where ``metrics`` is a dict with the keys
        ``'ROC-AUC'``, ``'Precision'``, ``'Recall'`` and ``'F1'`` and ``model``
        is the fitted estimator.  ``'ROC-AUC'`` is NaN when the test labels
        contain a single class, because the score is undefined in that case.
    """
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)

    # Ranking scores for ROC-AUC: positive-class probability when the model
    # offers it, otherwise the raw decision score.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_te)[:, 1]
    else:
        y_score = model.decision_function(X_te)

    # roc_auc_score raises when y_te holds only one class; report NaN instead
    # of aborting the whole comparison.
    if len(np.unique(y_te)) < 2:
        print(f"   Warning: only one class in test labels for {name}; ROC-AUC is undefined")
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(y_te, y_score)

    metrics = {
        'ROC-AUC': roc_auc,
        'Precision': precision_score(y_te, y_pred, zero_division=0),
        'Recall': recall_score(y_te, y_pred, zero_division=0),
        'F1': f1_score(y_te, y_pred, zero_division=0)
    }

    print(f"\n{name} Results:")
    print(classification_report(y_te, y_pred, zero_division=0))
    return metrics, model

# === RUN EVERYTHING ===
if __name__ == "__main__":
    # End-to-end run: load -> engineer features -> split/scale -> train three
    # classifiers -> compare -> persist the scaler and the highest-recall model.
    print(" CLOUD OUTAGE PREDICTOR v2.0 - WORKS WITH ANY DATASET!")
    print("=" * 60)

    try:
        # PIPELINE
        df, telemetry_cols = load_and_prepare_data('cloud_dataset.csv')
        df = engineer_features(df, telemetry_cols)
        X_tr, y_tr, X_te, y_te, scaler, feats = preprocess_data(df)

        print("\n" + "="*60)
        print(" TRAINING MODELS...")

        # XGBoost's scale_pos_weight is meant to be (#negatives / #positives).
        # Derive it from the training labels instead of assuming a fixed 1:10
        # imbalance, which is wrong whenever positives are the majority class.
        n_pos = int((y_tr == 1).sum())
        n_neg = int(len(y_tr) - n_pos)
        pos_weight = n_neg / n_pos if n_pos and n_neg else 1.0

        # (result key, display name, estimator)
        candidates = [
            # Logistic Regression (baseline)
            ('Logistic', "Logistic Regression",
             LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)),
            # Random Forest
            ('Random Forest', "Random Forest",
             RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)),
            # XGBoost
            ('XGBoost', "XGBoost",
             XGBClassifier(n_estimators=100, scale_pos_weight=pos_weight, random_state=42)),
        ]

        metrics_results = {}
        trained_models = {}
        for key, display_name, estimator in candidates:
            metrics_results[key], trained_models[key] = train_and_eval(
                estimator, X_tr, y_tr, X_te, y_te, display_name)

        # FINAL COMPARISON
        print("\n" + "="*60)
        print(" MODEL COMPARISON")
        print("="*60)
        results_df = pd.DataFrame(metrics_results).T.round(3)
        print(results_df[['ROC-AUC', 'Precision', 'Recall', 'F1']])

        # Recall is the selection metric: the first model with the highest
        # recall on the test split is treated as the production pick.
        best_model_name = results_df['Recall'].idxmax()
        best_recall = results_df['Recall'].max()
        print(f"\n PRODUCTION PICK: {best_model_name}")
        print(f"   Recall: {best_recall:.3f} (catches {best_recall:.1%} of outages)")

        # Save the best model and the scaler
        import joblib
        model_path = f'{best_model_name.lower().replace(" ", "_")}_model.joblib'
        joblib.dump(scaler, 'scaler.joblib')
        joblib.dump(trained_models[best_model_name], model_path)
        print(f"\nSaved scaler.joblib and {model_path}")

        print("\n PIPELINE COMPLETE - Deploy Ready!")

    except FileNotFoundError:
        print("\n FILE MISSING - Download from:")
        print("https://www.kaggle.com/datasets/programmer3/cloud-resource-usage-dataset-for-anomaly-detection")
    except Exception as e:
        # Deliberate best-effort: report the failure with a full traceback
        # rather than letting the cell die without context.
        print(f"\n Error: {e}")
        import traceback
        traceback.print_exc() # Print full traceback for debugging
        print("This pipeline works with ANY CSV containing numeric telemetry columns!")

 CLOUD OUTAGE PREDICTOR v2.0 - WORKS WITH ANY DATASET!
 ACTUAL Columns: ['Timestamp', 'CPU_Usage', 'Memory_Usage', 'Disk_IO', 'Network_IO', 'Workload_Type', 'User_ID', 'Anomaly_Label']

 First 3 rows:
             Timestamp  CPU_Usage  Memory_Usage  Disk_IO  Network_IO  \
0  2025-07-01 00:00:00      18.88         43.19    11.40        6.01   
1  2025-07-01 00:01:00      25.31         45.43     7.68       17.67   
2  2025-07-01 00:02:00       3.87         49.50    14.08        3.48   

     Workload_Type User_ID  Anomaly_Label  
0   Database_Query  user_1              0  
1  Video_Streaming  user_1              0  
2   Database_Query  user_1              0  

Shape: (14400, 8)
 Using 'Anomaly_Label' column for target

 Numeric columns (telemetry for features): ['CPU_Usage', 'Memory_Usage', 'Disk_IO', 'Network_IO']
 Outage rate: 98.5%

ðŸ”§ Engineering features from: ['CPU_Usage', 'Memory_Usage', 'Disk_IO']...
 Features engineered!

 Using 18 features
Train: 10079 | Test: 4320
   Train p

In [45]:
import joblib
import pandas as pd
import numpy as np
from pathlib import Path

print("\n--- Demonstrating Production Prediction Workflow ---")

# 1. Load the saved scaler and model
# NOTE: joblib files are pickles; only load artefacts produced by a trusted run.
print("Loading saved scaler and model...")
loaded_scaler = joblib.load('scaler.joblib')

# The training cell names the model file after whichever model won, so do not
# assume a fixed filename: take the most recently written '*_model.joblib'.
model_files = sorted(Path('.').glob('*_model.joblib'), key=lambda p: p.stat().st_mtime)
if not model_files:
    raise FileNotFoundError("No '*_model.joblib' file found - run the training cell first")
loaded_model = joblib.load(model_files[-1])
print("Scaler and model loaded successfully.")

# Number of features the scaler (and therefore the model) was fitted on.
num_features_expected = len(loaded_scaler.mean_)

# 2. A hand-written sample that is ALREADY in scaled feature space, so it is
#    passed to the model directly rather than through loaded_scaler.transform.
synthetic_scaled_new_data = np.array([[-0.1, 0.5, -0.3, 0.2, 0.0, 0.1, -0.2, 0.3, -0.1, 0.0, 0.1, -0.1, 0.2, 0.5, 0.8, -1.0, 0.5, 1.0]])

# Fail early with a clear message if the sample does not match the fitted width.
if synthetic_scaled_new_data.shape[1] != num_features_expected:
    raise ValueError(
        f"Sample has {synthetic_scaled_new_data.shape[1]} features but the "
        f"saved scaler expects {num_features_expected}")

print(f"\nSynthetic new scaled data point: {synthetic_scaled_new_data}")

# 3. Predict the class and the positive-class probability
prediction = loaded_model.predict(synthetic_scaled_new_data)
probability = loaded_model.predict_proba(synthetic_scaled_new_data)[:, 1]

print(f"\nPrediction for new data (0=No Outage, 1=Outage): {prediction[0]}")
print(f"Probability of Outage: {probability[0]:.4f}")

print("\n--- End of Production Prediction Demo ---")



--- Demonstrating Production Prediction Workflow ---
Loading saved scaler and model...
Scaler and model loaded successfully.

Synthetic new scaled data point: [[-0.1  0.5 -0.3  0.2  0.   0.1 -0.2  0.3 -0.1  0.   0.1 -0.1  0.2  0.5
   0.8 -1.   0.5  1. ]]

Prediction for new data (0=No Outage, 1=Outage): 1
Probability of Outage: 1.0000

--- End of Production Prediction Demo ---
