In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb
import lightgbm as lgb
import joblib
import warnings
# Install a blanket 'ignore' filter so no warnings are shown for the rest of the
# session; this keeps cell output uncluttered.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set style for plots
# NOTE(review): 'seaborn-v0_8' presumably requires a matplotlib release that
# ships this style-sheet name — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" colour palette for subsequent plots.
sns.set_palette("husl")


In [None]:
# Read the CSV at ../data/splunk_logs.csv into `df`, the frame used by the cells below
df = pd.read_csv('../data/splunk_logs.csv')

# Report the frame's dimensions and column names
print("Dataset shape:", df.shape)
print("\nColumn names:", list(df.columns))

# Preview the leading rows; the bare expression is rendered as the cell's output
print("\nFirst few rows:")
df.head()


In [None]:
# Structural overview of the frame as printed by DataFrame.info()
print("Dataset Info:")
df.info()

# Per-column count of missing entries, printed under its own heading
print("\nMissing values:", df.isna().sum(), sep="\n")

# Summary statistics from describe(); the bare expression is the cell's output
print("\nBasic statistics:")
df.describe()


In [None]:
# Check target variable distribution.
# The target is guessed from the column names: every column whose lower-cased
# name contains one of `target_keywords` is a candidate, and the first candidate
# (in column order) is used. Update the keywords, or assign `target_col`
# directly, to match the actual column names in your dataset.
target_keywords = ('critical', 'incident', 'target')
target_cols = [
    col for col in df.columns
    # str() keeps the match from raising on non-string column labels
    if any(keyword in str(col).lower() for keyword in target_keywords)
]

if target_cols:
    target_col = target_cols[0]
    if len(target_cols) > 1:
        # Substring matching can catch more than one column; surface the
        # ambiguity instead of silently taking the first match.
        print(f"Multiple candidate target columns found: {target_cols}")
    print(f"Target column: {target_col}")

    # Count each class once and reuse the result for both the table and the plot
    target_counts = df[target_col].value_counts()
    print("Target distribution:")
    print(target_counts)

    # Plot target distribution on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(8, 6))
    target_counts.plot(kind='bar', ax=ax, rot=45)
    ax.set_title('Target Variable Distribution')
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()
else:
    # Nothing matched: list the columns so the target can be chosen by hand
    print("No obvious target column found. Available columns:")
    print(df.columns.tolist())

# Next steps: continue with feature engineering, model training, and evaluation.
# This notebook is intended as a starting framework for ML model development.
