In [1]:
# Show the GPU attached to this runtime (driver/CUDA versions, memory usage)
!nvidia-smi

Wed Dec 10 07:49:28 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install torch-geometric -q

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m63.7/63.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.3/1.3 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# config.py
import torch

class config:
    """Central settings shared by the model, the data pipeline and the training loop."""

    # --- Feature settings ---
    # Length of the text vector built from each ticket's log text (Description)
    TEXT_EMBEDDING_DIM = 16

    # --- Model hyper-parameters ---
    HIDDEN_CHANNELS = 64
    DROPOUT = 0.2
    LEARNING_RATE = 1e-3
    EPOCHS = 50
    # Use the GPU when one is visible to torch, otherwise fall back to the CPU
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # --- Input files ---
    NODE_FILE = '/content/dataset_nodes_info.csv'
    EDGE_FILE = '/content/dataset_topology_edges.csv'
    TICKET_FILE = '/content/dataset_tickets.csv'

    # --- Output artifacts ---
    MODEL_PATH = '/content/rca_gnn_model.pth'
    VECTORIZER_PATH = '/content/vectorizer.pkl'

In [4]:
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv
# import config  <-- Removed import

# --- ADDED CLASS: focal loss ---
class FocalLoss(nn.Module):
    """Focal loss for binary targets, computed from raw logits.

    The element-wise binary cross-entropy is scaled by ``alpha * (1 - pt) ** gamma``,
    where ``pt = exp(-bce)`` is the probability the model assigns to the true label.

    Args:
        alpha: constant scale applied to every element.
        gamma: exponent of the ``(1 - pt)`` modulating factor.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced element-wise loss.
    """

    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction

    def forward(self, inputs, targets):
        """Return the focal loss of ``inputs`` (logits, before sigmoid) against ``targets`` (0 or 1)."""
        per_element_bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        # exp(-bce) recovers the predicted probability of the correct label
        prob_correct = torch.exp(-per_element_bce)
        weighted = self.alpha * (1 - prob_correct) ** self.gamma * per_element_bce

        if self.reduction == 'sum':
            return weighted.sum()
        if self.reduction == 'mean':
            return weighted.mean()
        return weighted


class RootCauseGNN(torch.nn.Module):
    """Three-layer graph attention network that emits one logit per node.

    Args:
        num_node_features: width of the per-node input feature vector.
        hidden_channels: per-head output width of the two hidden GAT layers.
        num_classes: number of output values per node (1 for a single logit).
        dropout: dropout probability used inside the GAT layers and between
            layers. ``None`` (the default) keeps the previous behaviour of
            reading ``config.DROPOUT``.
    """

    def __init__(self, num_node_features, hidden_channels, num_classes=1, dropout=None):
        super().__init__()

        # Resolve the dropout rate once and keep it on the module, so forward()
        # no longer depends on a global that may change after construction.
        self.dropout_p = config.DROPOUT if dropout is None else dropout

        # Layer 1: GAT conv; heads=4 lets the model learn several different relations
        self.conv1 = GATConv(num_node_features, hidden_channels, heads=4, dropout=self.dropout_p)

        # Layer 2: GAT conv; input width = hidden_channels * heads of layer 1
        self.conv2 = GATConv(hidden_channels * 4, hidden_channels, heads=2, dropout=self.dropout_p)

        # Layer 3: output layer; returns a single value (logit) per node
        # so the result can be fed to BCEWithLogitsLoss
        self.conv3 = GATConv(hidden_channels * 2, num_classes, heads=1, concat=False, dropout=self.dropout_p)

    def forward(self, data):
        """Return raw logits (before sigmoid) for the nodes of ``data``.

        ``data`` must expose ``x`` (node features) and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index

        # Layer 1
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout_p, training=self.training)

        # Layer 2
        x = self.conv2(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout_p, training=self.training)

        # Layer 3: logits, no activation
        return self.conv3(x, edge_index)

In [5]:
# data_processor.py
import pandas as pd
import torch
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from torch_geometric.data import Data
# import config <-- Removed import
import pickle
import os

class TelcoGraphDataset:
    """Turns telecom topology and ticket CSVs into PyTorch Geometric graph snapshots.

    The node and edge files define a static graph (edges are added in both
    directions). Each time window of tickets becomes one ``Data`` object whose
    node features are the one-hot encoded ``type``/``vendor`` columns
    concatenated with a TF-IDF vector of the ticket descriptions, and whose
    labels mark nodes that have a ticket with ``Is_Root_Cause == 1``.

    Args:
        mode: ``'train'`` fits the TF-IDF vectorizer and the one-hot encoder and
            pickles them to ``config.VECTORIZER_PATH``; any other value loads
            the previously pickled pair from that path.

    Raises:
        ValueError: in train mode when no tickets could be loaded.
        FileNotFoundError: in non-train mode when the pickled vectorizers are missing.
    """

    def __init__(self, mode='train'):
        self.node_mapping = {}
        self.reverse_mapping = {}
        self.tfidf = TfidfVectorizer(max_features=config.TEXT_EMBEDDING_DIM, stop_words='english')
        self.type_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

        # 1. Load data
        self.nodes_df = pd.read_csv(config.NODE_FILE)
        self.edges_df = pd.read_csv(config.EDGE_FILE)

        # Load the ticket file when it exists; otherwise keep an empty frame
        # (for later inference without historical tickets)
        if os.path.exists(config.TICKET_FILE):
            self.tickets_df = pd.read_csv(config.TICKET_FILE)
            self.tickets_df['Timestamp'] = pd.to_datetime(self.tickets_df['Timestamp'])
        else:
            self.tickets_df = pd.DataFrame()

        # 2. Prepare the vectorizers (they must be fitted BEFORE they are used)
        if mode == 'train':
            if self.tickets_df.empty:
                # Without this guard the fit below fails with an opaque KeyError
                raise ValueError(
                    f"No tickets available in {config.TICKET_FILE}; cannot fit the text vectorizer."
                )

            print("Fitting Vectorizers on Training Data...")
            # NOTE(review): the text vectorizer is fitted on every loaded ticket,
            # including tickets a caller may later hold out for evaluation.
            self.tfidf.fit(self.tickets_df['Description'].fillna(""))
            # Fit the static node features
            self.type_encoder.fit(self.nodes_df[['type', 'vendor']])

            # Persist the fitted vectorizers for later eval/inference runs
            with open(config.VECTORIZER_PATH, 'wb') as f:
                pickle.dump((self.tfidf, self.type_encoder), f)
        else:
            # Eval/inference mode: reuse the vectorizers fitted during training
            if not os.path.exists(config.VECTORIZER_PATH):
                raise FileNotFoundError(
                    f"Vectorizer file {config.VECTORIZER_PATH} not found! Run training first."
                )
            print("Loading Vectorizers...")
            # SECURITY: pickle.load can execute arbitrary code; only load a file
            # that this notebook wrote itself.
            with open(config.VECTORIZER_PATH, 'rb') as f:
                self.tfidf, self.type_encoder = pickle.load(f)

        # 3. Only once the encoder is ready, build the mappings and static features
        self._prepare_mappings_and_static_data()

    def _prepare_mappings_and_static_data(self):
        """Build the node-id/index mappings, the edge index and the static node features."""
        # Map node id <-> integer index. The index is the row POSITION, so it
        # always lines up with the rows of static_x (which transform() returns
        # in positional order) even if nodes_df carries a non-default index.
        for position, node_id in enumerate(self.nodes_df['id'].tolist()):
            self.node_mapping[node_id] = position
            self.reverse_mapping[position] = node_id

        # Build the edge index (static graph structure)
        src, dst = [], []
        for source_id, target_id in zip(self.edges_df['Source'], self.edges_df['Target']):
            u = self.node_mapping.get(source_id)
            v = self.node_mapping.get(target_id)
            if u is None or v is None:
                continue  # edge references a node missing from the node file
            # Undirected graph (both directions) so information propagates better
            src.extend([u, v])
            dst.extend([v, u])
        self.edge_index = torch.tensor([src, dst], dtype=torch.long)

        # Static features (type, vendor) for all nodes; the encoder has already
        # been fitted or loaded at this point, so transform() is safe to call
        self.static_x = self.type_encoder.transform(self.nodes_df[['type', 'vendor']])

    def create_time_windows(self, window_size_min=10):
        """Split all tickets into consecutive fixed-size time windows.

        Windows are half-open intervals ``[start + k*w, start + (k+1)*w)`` anchored
        at the earliest ticket timestamp. Only windows that contain at least one
        ticket are returned, in chronological order; tickets stamped exactly at
        the latest timestamp are included.

        Args:
            window_size_min: window length in minutes; must be positive.

        Returns:
            A list of DataFrames, one per non-empty window. Empty list when
            there are no tickets.

        Raises:
            ValueError: if ``window_size_min`` is not positive.
        """
        if self.tickets_df.empty:
            return []
        if window_size_min <= 0:
            raise ValueError("window_size_min must be positive")

        timestamps = self.tickets_df['Timestamp']
        window = pd.Timedelta(minutes=window_size_min)

        # Integer index of the window each ticket falls into. Grouping on it
        # visits every ticket once and never materialises empty windows.
        window_ids = ((timestamps - timestamps.min()) // window).to_numpy()

        return [batch_df for _, batch_df in self.tickets_df.groupby(window_ids, sort=True)]

    def df_to_graph_data(self, batch_df):
        """Convert the tickets of one time window into a PyG ``Data`` object.

        Args:
            batch_df: tickets with ``Device_ID`` and ``Description`` columns and
                an optional ``Is_Root_Cause`` column. Tickets whose device is
                not in the node file are ignored.

        Returns:
            ``Data`` with ``x`` (static features followed by
            ``config.TEXT_EMBEDDING_DIM`` text features per node), the shared
            ``edge_index`` and ``y`` (1.0 for root-cause nodes, else 0.0).
        """
        num_nodes = len(self.nodes_df)
        labels = np.zeros(num_nodes, dtype=float)

        # Dynamic features (text embedding) start at zero
        dynamic_x = np.zeros((num_nodes, config.TEXT_EMBEDDING_DIM))

        # Collect the tickets that map onto a known node
        node_rows, texts = [], []
        for _, row in batch_df.iterrows():
            idx = self.node_mapping.get(row['Device_ID'])
            if idx is None:
                continue

            # A missing description is treated as empty text, mirroring the
            # fillna("") applied when the vectorizer was fitted
            description = row['Description']
            texts.append("" if pd.isna(description) else str(description))
            node_rows.append(idx)

            # Root-cause label
            if row.get('Is_Root_Cause', 0) == 1:
                labels[idx] = 1.0

        if texts:
            # Vectorise all descriptions in one call (transform only, no re-fit)
            vectors = self.tfidf.transform(texts).toarray()  # type: ignore
            for idx, vec in zip(node_rows, vectors):
                # The fitted vocabulary can be smaller than TEXT_EMBEDDING_DIM;
                # the remaining columns then simply stay zero
                dynamic_x[idx, :vec.shape[0]] += vec

        # Node feature vector = static features followed by dynamic features
        final_x = np.hstack([self.static_x, dynamic_x])  # type: ignore

        return Data(
            x=torch.tensor(final_x, dtype=torch.float),
            edge_index=self.edge_index,
            y=torch.tensor(labels, dtype=torch.float)
        )

In [8]:
# main.py
import torch
import torch.nn as nn
from torch_geometric.loader import DataLoader
# import config <-- Removed import
# from data_processor import TelcoGraphDataset <-- Removed import
# from model import RootCauseGNN, FocalLoss <-- Removed import
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
import os

def train():
    """Train the root-cause GNN on time-windowed ticket graphs.

    Builds 30-minute ticket windows, uses the first 80% (chronologically) for
    training and the rest for validation, trains for ``config.EPOCHS`` epochs
    with a positive-class-weighted BCE loss, and saves the state dict with the
    best validation F1 to ``config.MODEL_PATH``. Returns ``None``.
    """
    print(">>> Initializing Dataset...")
    dataset_handler = TelcoGraphDataset(mode='train')

    print(">>> Creating Time Windows...")
    # 30-minute windows to gather enough context
    all_windows = dataset_handler.create_time_windows(window_size_min=30)

    if len(all_windows) == 0:
        print("ERROR: No data windows created.")
        return

    # Split Train/Test
    split_idx = int(len(all_windows) * 0.8)
    train_windows = all_windows[:split_idx]
    test_windows = all_windows[split_idx:]

    print(f"Total Windows: {len(all_windows)}. Train: {len(train_windows)}, Test: {len(test_windows)}")

    if not train_windows:
        # With a single window the 80% split is empty and nothing can be trained
        print("ERROR: Not enough windows to build a training split.")
        return

    print(">>> Converting to Graphs...")
    train_data_list = [dataset_handler.df_to_graph_data(df) for df in train_windows]
    test_data_list = [dataset_handler.df_to_graph_data(df) for df in test_windows]

    train_loader = DataLoader(train_data_list, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_data_list, batch_size=32, shuffle=False)

    # Setup Model
    sample_data = train_data_list[0]
    num_features = sample_data.num_features
    print(f"Model Input Features: {num_features}")

    model = RootCauseGNN(num_features, config.HIDDEN_CHANNELS, num_classes=1).to(config.DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.LEARNING_RATE, weight_decay=1e-4)

    # --- Positive-class weight ---
    total_pos = sum(d.y.sum().item() for d in train_data_list)
    total_neg = sum((d.y == 0).sum().item() for d in train_data_list)

    # The raw ratio total_neg / total_pos came out far too large (about 14000),
    # so it is damped with a square root. Plain ** 0.5 is used so this cell
    # does not rely on numpy having been imported by another cell.
    # If the result is still too high, a fixed value (e.g. 50.0) is an alternative.
    pos_weight_val = (total_neg / (total_pos + 1e-5)) ** 0.5

    # Explicit float32 so the weight matches the dtype of the logits and labels
    pos_weight = torch.tensor([pos_weight_val], dtype=torch.float, device=config.DEVICE)
    print(f"‚öñÔ∏è Adjusted Pos Weight: {pos_weight.item():.2f}")

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    print("\n--- START TRAINING ---")
    # Start below any reachable F1 so the first evaluation always writes a
    # checkpoint; otherwise a run whose F1 stays at 0 leaves a stale or missing file.
    best_f1 = -1.0

    for epoch in range(config.EPOCHS):
        model.train()
        total_loss = 0

        for batch in train_loader:
            batch = batch.to(config.DEVICE)
            optimizer.zero_grad()
            out = model(batch)
            # reshape(-1) flattens (N, 1) logits to (N,) and, unlike squeeze(),
            # keeps a 1-D tensor even when the batch holds a single node
            loss = criterion(out.reshape(-1), batch.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)

        # Validate every 5 epochs and always after the last one
        if (epoch + 1) % 5 == 0 or (epoch + 1) == config.EPOCHS:
            val_metrics = evaluate(model, test_loader)
            print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f} | "
                  f"Val Recall: {val_metrics['recall']:.2f} | "
                  f"Val Prec: {val_metrics['precision']:.2f} | "
                  f"Val F1: {val_metrics['f1']:.2f}")

            # Save the model when F1 improves
            if val_metrics['f1'] > best_f1:
                best_f1 = val_metrics['f1']
                torch.save(model.state_dict(), config.MODEL_PATH)
                print(f"   >>> New Best Model Saved (F1: {best_f1:.2f})")

def evaluate(model, loader, threshold=0.7):
    """Compute node-level recall, precision and F1 of ``model`` over ``loader``.

    Args:
        model: module called as ``model(batch)`` that returns one logit per node.
        loader: iterable of batches exposing ``.to(device)`` and ``.y``.
        threshold: probability above which a node is predicted positive.
            The default of 0.7 is deliberately high to filter out false
            positives; 0.8 is a stricter alternative.

    Returns:
        Dict with keys ``'recall'``, ``'precision'`` and ``'f1'``. All three are
        0.0 when the loader yields no batches.

    Side effects:
        Leaves the model in eval mode.
    """
    model.eval()
    pred_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(config.DEVICE)
            # reshape(-1) keeps a 1-D tensor even for a single-node batch,
            # where squeeze() would collapse to a 0-d scalar
            probs = torch.sigmoid(model(batch).reshape(-1))
            pred_chunks.append((probs > threshold).float().cpu())
            label_chunks.append(batch.y.reshape(-1).cpu())

    if not pred_chunks:
        # Nothing to score: report zeros instead of handing empty inputs to sklearn
        return {'recall': 0.0, 'precision': 0.0, 'f1': 0.0}

    all_preds = torch.cat(pred_chunks).numpy()
    all_labels = torch.cat(label_chunks).numpy()

    return {
        'recall': recall_score(all_labels, all_preds, zero_division=0),
        'precision': precision_score(all_labels, all_preds, zero_division=0),
        'f1': f1_score(all_labels, all_preds, zero_division=0)
    }

# --- INFERENCE SECTION (RUN AFTER TRAINING) ---
def run_inference_demo():
    """Load the saved vectorizers and model, then score two hand-written ticket scenarios.

    Prints one alert line per node whose predicted probability exceeds 0.5, or
    a "system normal" line when no node does. Returns early (after printing a
    warning) when the vectorizer file or the model file is missing.
    """
    print("\n--- RUNNING DEMO INFERENCE ---")

    # 1. Data handler in test mode, which loads the vectorizers fitted during training
    if not os.path.exists(config.VECTORIZER_PATH):
        print("‚ö†Ô∏è Ch∆∞a c√≥ file vectorizer, c·∫ßn train tr∆∞·ªõc!")
        return

    dataset_handler = TelcoGraphDataset(mode='test')

    # 2. Model: a dummy ticket is converted to a graph only to read num_features
    placeholder_ticket = {"Device_ID": "DUMMY", "Description": "", "Timestamp": pd.Timestamp.now()}
    dummy_data = dataset_handler.df_to_graph_data(pd.DataFrame([placeholder_ticket]))

    model = RootCauseGNN(dummy_data.num_features, config.HIDDEN_CHANNELS, num_classes=1).to(config.DEVICE)

    if not os.path.exists(config.MODEL_PATH):
        print("‚ö†Ô∏è Model file not found!")
        return

    model.load_state_dict(torch.load(config.MODEL_PATH, map_location=config.DEVICE))
    print(f"üìÇ Loaded model from {config.MODEL_PATH}")

    model.eval()

    # 3. Test cases, written as (device id, log text) pairs

    # CASE A: real incident (root cause: AGG-005 has an OSPF failure),
    # which drags along ACCESS-0021 (Link Down) and ONT-00156 (Lost)
    incident_events = (
        ("AGG-005", "OSPF State Change to Down"),
        ("ACCESS-0021", "Link Down"),
        ("ONT-00156", "ERROR_Lost 100%"),
        ("CORE-01", "Interface Up"),  # noise
    )

    # CASE B: normal operation (maintenance only)
    normal_events = (
        ("AGG-010", "B·∫£o tr√¨ ƒë·ªãnh k·ª≥ h·ªá th·ªëng"),
        ("ONT-00888", "Kh√°ch h√†ng b√°o ch·∫≠m"),
    )

    case_incident = [
        {"Device_ID": device, "Description": text, "Is_Root_Cause": "?"}
        for device, text in incident_events
    ]
    case_normal = [
        {"Device_ID": device, "Description": text, "Is_Root_Cause": "?"}
        for device, text in normal_events
    ]

    # Helper that scores a single case
    def predict_case(case_name, tickets_list):
        print(f"\nTesting {case_name} ({len(tickets_list)} tickets)...")
        graph_data = dataset_handler.df_to_graph_data(pd.DataFrame(tickets_list)).to(config.DEVICE)

        with torch.no_grad():
            probs = torch.sigmoid(model(graph_data).squeeze())

        # Keep only the nodes above the display threshold
        flagged = [(idx, p_val) for idx, p_val in enumerate(probs.tolist()) if p_val > 0.5]

        if not flagged:
            print("üü¢ System Normal (No Root Cause Detected)")
            return

        # Print the results
        for idx, p_val in flagged:
            node_id = dataset_handler.reverse_mapping[idx]
            # Use the description from the input tickets when the node has one
            desc = next((t['Description'] for t in tickets_list if t['Device_ID'] == node_id), "N/A (No Ticket)")
            print(f"üî¥ ALERT: {node_id} | Prob: {p_val:.4f} | Log: {desc}")

    # Run the scenarios
    predict_case("CASE A (S·ª± c·ªë OSPF)", case_incident)
    predict_case("CASE B (B·∫£o tr√¨)", case_normal)

if __name__ == "__main__":
    # The whole pipeline needs the ticket file; without it only a hint is printed
    if os.path.exists(config.TICKET_FILE):
        # Step 1: train the model
        train()

        # Step 2: score the demo test cases
        run_inference_demo()
    else:
        print("Please run generate_data.py first!")

>>> Initializing Dataset...
Fitting Vectorizers on Training Data...
>>> Creating Time Windows...
Total Windows: 1409. Train: 1127, Test: 282
>>> Converting to Graphs...
Model Input Features: 31
‚öñÔ∏è Adjusted Pos Weight: 120.86

--- START TRAINING ---
Epoch 005 | Loss: 0.0386 | Val Recall: 0.39 | Val Prec: 0.12 | Val F1: 0.19
   >>> New Best Model Saved (F1: 0.19)
Epoch 010 | Loss: 0.0356 | Val Recall: 0.36 | Val Prec: 0.12 | Val F1: 0.18
Epoch 015 | Loss: 0.0325 | Val Recall: 0.75 | Val Prec: 0.08 | Val F1: 0.15
Epoch 020 | Loss: 0.0299 | Val Recall: 0.34 | Val Prec: 0.19 | Val F1: 0.25
   >>> New Best Model Saved (F1: 0.25)
Epoch 025 | Loss: 0.0286 | Val Recall: 0.75 | Val Prec: 0.19 | Val F1: 0.30
   >>> New Best Model Saved (F1: 0.30)
Epoch 030 | Loss: 0.0266 | Val Recall: 0.82 | Val Prec: 0.07 | Val F1: 0.12
Epoch 035 | Loss: 0.0248 | Val Recall: 0.23 | Val Prec: 0.22 | Val F1: 0.22
Epoch 040 | Loss: 0.0233 | Val Recall: 0.55 | Val Prec: 0.25 | Val F1: 0.35
   >>> New Best Model 