In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import joblib
from google.cloud import bigquery
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# 1. Data Loading from BigQuery (VPC and VM Logs)
# Each query selects the last 7 days of rows from its log table, together
# with the `attack_label` column that is used as the training target below.

# Example queries (adapt to your actual table schemas and data)
vpc_query = """
SELECT src_ip, dst_ip, protocol, bytes_sent, timestamp, attack_label
FROM `your-gcp-project-id.your_dataset.vpc_logs`
WHERE timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
"""

vm_query = """
SELECT vm_name, event_type, user, timestamp, attack_label
FROM `your-gcp-project-id.your_dataset.vm_logs`
WHERE timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
"""

# No explicit project or credentials are passed to the client.
client = bigquery.Client()

# Run the VPC query first, then the VM query, materialising each result as a
# pandas DataFrame.
vpc_df, vm_df = (
    client.query(sql).to_dataframe() for sql in (vpc_query, vm_query)
)

# 2. Data Preprocessing
# Turn both log sources into purely numeric, standardised feature matrices.
# The fitted encoders and scalers are kept in module-level names so the same
# transformation can be replayed on new data.

# VPC Logs
vpc_df = vpc_df.fillna(0)  # handle nulls
vpc_features = ["src_ip", "dst_ip", "protocol", "bytes_sent"]
# .copy() gives an independent frame, so the in-place encoding below never
# writes through to (or warns about) a slice of vpc_df.
vpc_X = vpc_df[vpc_features].copy()
vpc_y = vpc_df["attack_label"]

# VM Logs
vm_df = vm_df.fillna("unknown")  # handle nulls
vm_features = ["vm_name", "event_type", "user"]
vm_X = vm_df[vm_features].copy()  # independent copy, see vpc_X above
vm_y = vm_df["attack_label"]

# Label Encoding for categorical features (VPC logs)
# StandardScaler only accepts numeric input, so every non-numeric VPC column
# (e.g. IP addresses stored as strings) is mapped to integer codes first.
# Columns that are already numeric are left untouched.
# NOTE: persist vpc_label_encoders together with vpc_scaler before serving the
# VPC model on raw logs; they are required to reproduce this encoding.
vpc_label_encoders = {}
for col in vpc_features:
    if pd.api.types.is_numeric_dtype(vpc_X[col]):
        continue
    le = LabelEncoder()
    # fillna(0) can leave integer zeros mixed in with strings; LabelEncoder
    # rejects mixed types, so normalise everything to str before fitting.
    vpc_X[col] = le.fit_transform(vpc_X[col].astype(str))
    vpc_label_encoders[col] = le

# Label Encoding for categorical features (VM logs)
label_encoders = {}
for col in vm_features:
    le = LabelEncoder()
    vm_X[col] = le.fit_transform(vm_X[col])
    label_encoders[col] = le

# Feature Scaling
# One scaler per source, each fitted on that source's encoded features.
vpc_scaler = StandardScaler()
vpc_X_scaled = vpc_scaler.fit_transform(vpc_X)

vm_scaler = StandardScaler()
vm_X_scaled = vm_scaler.fit_transform(vm_X)

# 3. Model Training (Anomaly Detection - VPC)
# Unsupervised model fitted on all scaled VPC rows; labels are not used here.
# random_state is pinned to the same seed as the split and classifier below so
# that repeated runs produce the same anomaly model.
vpc_anomaly_model = IsolationForest(contamination=0.01, random_state=42)  # Adjust contamination
vpc_anomaly_model.fit(vpc_X_scaled)

# 4. Model Training (Classification - VM Logs)
# Hold out 20% of the VM rows for the evaluation step further down.
vm_X_train, vm_X_test, vm_y_train, vm_y_test = train_test_split(
    vm_X_scaled, vm_y, test_size=0.2, random_state=42
)

vm_classification_model = RandomForestClassifier(n_estimators=100, random_state=42)  # Example classifier
vm_classification_model.fit(vm_X_train, vm_y_train)

# 5. Deep Learning Model (Combined VPC/VM analysis - Optional)
# The two sources share no row-level key, so they are combined as separate
# samples: rows are stacked (VPC rows first, then VM rows) and each row carries
# only its own source's features. The other source's columns are filled with
# 0.0, which is the mean of a standardised feature and therefore neutral.
# This keeps combined_X row-aligned with combined_y, which is also stacked
# row-wise.
combined_X = pd.concat(
    [
        pd.DataFrame(vpc_X_scaled, columns=vpc_features),
        pd.DataFrame(vm_X_scaled, columns=vm_features),
    ],
    axis=0,
    ignore_index=True,
).fillna(0.0)
# Labels from the two sources may differ in type (e.g. 0 from fillna(0) vs
# "unknown"); cast to str so a single LabelEncoder can handle all of them.
combined_y = pd.concat([vpc_y, vm_y], axis=0, ignore_index=True).astype(str)

# Fit ONE encoder on the full label set so that train and test share the same
# class-id mapping even if a class is missing from one of the splits.
combined_label_encoder = LabelEncoder().fit(combined_y)

combined_X_train, combined_X_test, combined_y_train, combined_y_test = train_test_split(
    combined_X, combined_y, test_size=0.2, random_state=42
)
combined_y_train_encoded = combined_label_encoder.transform(combined_y_train)
combined_y_test_encoded = combined_label_encoder.transform(combined_y_test)

combined_model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(combined_X_train.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    # One output unit per label class known to the encoder.
    layers.Dense(len(combined_label_encoder.classes_), activation='softmax')  # output layer
])

# Integer class ids as targets -> sparse categorical cross-entropy.
combined_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
combined_model.fit(combined_X_train, combined_y_train_encoded, epochs=10, batch_size=32, validation_split=0.2)

# 6. Model Evaluation
def _print_classification_summary(report_title, accuracy_prefix, y_true, y_pred):
    """Print a title line, the per-class report and the overall accuracy."""
    print(report_title)
    print(classification_report(y_true, y_pred))
    print(accuracy_prefix, accuracy_score(y_true, y_pred))


# VPC Anomaly Detection
# Scored on the same rows the model was fitted on; only the split between the
# two predicted values is reported.
vpc_anomaly_predictions = vpc_anomaly_model.predict(vpc_X_scaled)
print("VPC Anomaly Detection Report:")
vpc_prediction_counts = pd.Series(vpc_anomaly_predictions).value_counts()
print(vpc_prediction_counts)  # -1 anomalies , 1 normal

# VM Classification
# Evaluated on the held-out VM test split.
vm_y_pred = vm_classification_model.predict(vm_X_test)
_print_classification_summary(
    "VM Classification Report:", "VM Accuracy:", vm_y_test, vm_y_pred
)

# Combined Deep Learning
# The network outputs one score per class; the predicted class is the index
# of the largest score in each row.
combined_y_pred = combined_model.predict(combined_X_test)
combined_y_pred_classes = combined_y_pred.argmax(axis=1)
_print_classification_summary(
    "Combined Deep Learning Report:",
    "Combined Accuracy:",
    combined_y_test_encoded,
    combined_y_pred_classes,
)

# 7. Model Saving
# scikit-learn objects (models and scalers) are written with joblib, in the
# order listed here: target filename -> fitted object.
sklearn_artifacts = {
    "vpc_anomaly_model.joblib": vpc_anomaly_model,
    "vm_classification_model.joblib": vm_classification_model,
    "vpc_scaler.joblib": vpc_scaler,
    "vm_scaler.joblib": vm_scaler,
}
for artifact_path, artifact in sklearn_artifacts.items():
    joblib.dump(artifact, artifact_path)

# One file per VM feature encoder, named after the feature it encodes.
for feature_name, feature_encoder in label_encoders.items():
    joblib.dump(feature_encoder, f"vm_label_encoder_{feature_name}.joblib")

# The Keras model is written with its own save method, to an HDF5 (.h5) file.
combined_model.save("combined_model.h5")

: 