# Logs Dataset: Preprocessing and Problem Formulation

This notebook describes and preprocesses the logs dataset, then formulates the classification problem for a lean log-intelligence pipeline.

In [None]:
# Coding style: clear imports and configuration
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide plotting defaults and the directory holding the synthetic raw data.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (11, 5)})
DATA = Path('../../data/raw/synthetic')



In [None]:
# Generate the synthetic log file on first run, then load it.
logs_csv = DATA / 'logs.csv'
if not logs_csv.exists():
    import subprocess
    import sys

    # Create the output directory up front in case the generator script
    # does not create missing parent folders itself.
    logs_csv.parent.mkdir(parents=True, exist_ok=True)
    subprocess.check_call([
        sys.executable, '../../scripts/generate_synthetic_logs.py',
        '--out', str(logs_csv),
        '--n', '5000',
    ])

# Parse `timestamp` into datetimes at read time so later cells can use `.dt`.
logs = pd.read_csv(logs_csv, parse_dates=['timestamp'])
logs.head()


In [None]:
# Preprocess: normalise timestamps to UTC, encode severity, de-duplicate, and
# add a message-length feature.
severity_map = {'INFO': 0, 'WARN': 1, 'ERROR': 2}
logs = (
    logs
    .assign(
        # to_datetime(utc=True) localises naive values as UTC and converts
        # tz-aware ones, so this cell is safe to re-run (tz_localize raises
        # TypeError on an already tz-aware column).
        timestamp=lambda df: pd.to_datetime(df['timestamp'], utc=True),
        # Severities missing from severity_map become NaN.
        severity_num=lambda df: df['severity'].map(severity_map),
    )
    .drop_duplicates(subset=['timestamp', 'host', 'message'])
    # .assign returns a new frame, avoiding SettingWithCopyWarning on the
    # de-duplicated result.
    .assign(len=lambda df: df['message'].str.len())
)
logs.describe(include='all')


In [None]:
# Stratified split
from sklearn.model_selection import train_test_split
# 80% train; the remaining 20% holdout is halved into validation and test.
# Both splits are stratified on the `category` label and seeded.
train_df, holdout_df = train_test_split(
    logs,
    test_size=0.2,
    stratify=logs['category'],
    random_state=42,
)
val_df, test_df = train_test_split(
    holdout_df,
    test_size=0.5,
    stratify=holdout_df['category'],
    random_state=42,
)
len(train_df), len(val_df), len(test_df)


## Problem Formulation
- Goal: classify log chunks into incident categories; extract root cause / remediation text.
- Constraints: p95 latency < 1s per 1k lines; cost budget <$0.50 per 100k lines.
- Metrics: accuracy, macro‑F1, per‑class PR/F1, calibration (ECE/Brier).
- Baseline: keyword rules; Model: compact GPT‑4‑family with streaming.


## ML Baseline: TF‑IDF + Logistic Regression


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Baseline: word uni-/bi-gram TF-IDF features feeding a class-balanced
# logistic regression.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=50000, sublinear_tf=True)),
    # lbfgs fits one multinomial (softmax) model, so predict_proba returns a
    # joint distribution over categories. liblinear only supports one-vs-rest
    # (renormalised per-class sigmoids) and is deprecated for multiclass
    # targets in recent scikit-learn releases. lbfgs is also deterministic.
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')),
])

pipeline.fit(train_df['message'], train_df['category'])

# Keep the probabilities: the calibration cell below reuses `probs`.
probs = pipeline.predict_proba(val_df['message'])
preds = pipeline.classes_[probs.argmax(axis=1)]
print(classification_report(val_df['category'], preds))

# Label rows/columns and pin the class order so the matrix lines up with the
# columns of `probs`, even if a class is absent from the validation split.
pd.DataFrame(
    confusion_matrix(val_df['category'], preds, labels=pipeline.classes_),
    index=pd.Index(pipeline.classes_, name='true'),
    columns=pd.Index(pipeline.classes_, name='predicted'),
)


## Calibration: ECE and Brier Score


In [None]:
import numpy as np
from sklearn.metrics import brier_score_loss

# Multiclass Brier score: a binary (one-vs-all) Brier score per class,
# followed by an unweighted mean across classes.
classes = list(pipeline.classes_)
y_true = val_df['category'].values

brier_list = [
    brier_score_loss((y_true == label).astype(int), probs[:, col])
    for col, label in enumerate(classes)
]

brier = float(np.mean(brier_list))
print('Brier (multiclass mean):', round(brier, 4))

# Expected Calibration Error (ECE)
def expected_calibration_error(y_true, probas, bins=10, classes=None):
    """Expected Calibration Error (ECE) of the top-1 predictions.

    Samples are grouped into `bins` equal-width confidence intervals
    ``(lo, hi]`` over [0, 1]. The result is the sum over non-empty bins of
    ``(fraction of samples in bin) * |accuracy - mean confidence|``.

    Parameters
    ----------
    y_true : array-like of shape (N,)
        True labels, expressed in the same label space as `classes`.
    probas : array-like of shape (N, K)
        Predicted class probabilities; column ``j`` belongs to ``classes[j]``.
    bins : int, default=10
        Number of equal-width confidence bins.
    classes : array-like of shape (K,), optional
        Label for each column of `probas`. When omitted, the notebook-level
        ``classes_np`` array is used so existing calls keep working.

    Returns
    -------
    float
        The ECE in [0, 1]; 0.0 when there are no samples.

    Raises
    ------
    ValueError
        If `probas` is not 2-D, its column count differs from the number of
        class labels, or its row count differs from ``len(y_true)``.
    """
    probas = np.asarray(probas, dtype=float)
    y_true = np.asarray(y_true)
    # Prefer the explicit argument; fall back to the notebook global only for
    # backward compatibility with callers that do not pass `classes`.
    class_labels = classes_np if classes is None else np.asarray(classes)

    if probas.ndim != 2 or probas.shape[1] != len(class_labels):
        raise ValueError('probas must have shape (N, K) with one column per class label')
    if probas.shape[0] != len(y_true):
        raise ValueError('y_true and probas must contain the same number of samples')

    confidences = probas.max(axis=1)
    # Map argmax column indices back to labels before comparing with y_true.
    correct = class_labels[probas.argmax(axis=1)] == y_true

    bin_bounds = np.linspace(0.0, 1.0, bins + 1)
    ece = 0.0
    for lo, hi in zip(bin_bounds[:-1], bin_bounds[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if not mask.any():
            continue
        gap = abs(correct[mask].mean() - confidences[mask].mean())
        # mask.mean() is the fraction of all samples that fall in this bin.
        ece += mask.mean() * gap
    return float(ece)

# Label lookup array read by expected_calibration_error, then ECE on the
# validation probabilities with 15 confidence bins.
classes_np = np.asarray(classes)
ece = expected_calibration_error(y_true, probs, bins=15)
print('ECE:', round(ece, 4))
