In [13]:
import numpy as np
import pandas as pd
import random
from collections import deque, defaultdict

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [14]:
import sys
sys.path.append('..')
import datetime
import time
import random
from collections import Counter, defaultdict, deque
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from kneed import KneeLocator
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, log_loss, roc_auc_score
import json
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from IPython.display import display, HTML

# display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [15]:
import warnings

# suppress only the “y_pred values do not sum to one” warning
warnings.filterwarnings(
    "ignore",
    message=".*y_pred values do not sum to one.*"
)

In [16]:
def cross_entropy_loss(model, x_test, y_test):
    """
    For each sample i:
      loss_i = -∑_c [1{c = y_true_i} · log P_model(c | x_i)]
    If the true label isn’t in model.classes_, returns a default high loss.
    Works for any len(x_test) >= 1, including the single-class case.
    """
    probs = model.predict_proba(x_test)
    default = log_loss([[1, 0]], [[0, 1]]) + 1  # fallback loss

    losses = []
    for i, true_label in enumerate(y_test):
        sample_probs = probs[i]
        classes = model.classes_

        # if only one class in the model
        if sample_probs.size == 1:
            if classes[0] == true_label:
                losses.append(0.0)  # perfect prediction
            else:
                losses.append(default)
            continue

        # find index of the true label
        idx_arr = np.where(classes == true_label)[0]
        if idx_arr.size == 0:
            losses.append(default)
        else:
            y_true_onehot = np.zeros_like(sample_probs)
            y_true_onehot[idx_arr[0]] = 1

            # normalize just in case
            sample_probs = sample_probs / sample_probs.sum()
            y_true_onehot = y_true_onehot / y_true_onehot.sum()

            loss_i = log_loss([y_true_onehot], [sample_probs])
            losses.append(loss_i)

    return np.array(losses)


In [17]:
def normal_loss(model, x_test, y_test):
    """
    For each sample i:
      loss_i = 1 - P_model(y_true_i | x_i)
    If the true label isn’t in model.classes_, we return 1.1 as before.
    Works for any len(x_test) >= 1.
    """
    # predict_proba returns shape (n_samples, n_classes)
    probs = model.predict_proba(x_test)
    
    losses = []
    for i, true_label in enumerate(y_test):
        sample_probs = probs[i]
        # find index of the true label in model.classes_
        idx_arr = np.where(model.classes_ == true_label)[0]
        if idx_arr.size == 0:
            losses.append(1.1)
        else:
            col_index = idx_arr[0]
            losses.append(1 - sample_probs[col_index])
    
    return np.array(losses)

In [18]:
def safe_transform_target(encoder, targets, unknown_value=-1):
    classes = set(encoder.classes_)
    transformed = []
    for t in targets:
        if t in classes:
            transformed.append(encoder.transform([t])[0])
        else:
            transformed.append(unknown_value)
    return np.array(transformed)

In [19]:
def get_clean_loss(normal_loss_value, cross_entropy_loss_value):
    normal_loss_dist = []
    cross_loss_dist = []
    for pos, prediction in  enumerate(normal_loss_value):
        if prediction != 1:
            cross_loss_dist.append(cross_entropy_loss_value[pos])
            normal_loss_dist.append(prediction)

    return np.array(normal_loss_dist), np.array(cross_loss_dist)

In [20]:
def cleaning_cls_result(classification_result):
    
    for i in classification_result.keys():
        print(i, classification_result[i].keys())

        if '1' not in classification_result[i].keys():
            classification_result[i]['1'] = {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0.0}
    return classification_result

In [21]:
def sample_with_min_anomalies(gt_labels, num_samples=10, min_anomalies=3, random_state=None):
    """
    Randomly sample `num_samples` indices from gt_labels (0/1 array),
    ensuring at least `min_anomalies` true-anomaly (1) indices are included.

    Parameters
    ----------
    gt_labels : array-like, shape (n_samples,)
        Ground-truth labels (0 = normal, 1 = anomaly).
    num_samples : int, default=10
        Total number of indices to sample.
    min_anomalies : int, default=3
        Minimum number of anomaly indices to include.
    random_state : int or None
        Seed for reproducibility.

    Returns
    -------
    selected_indices : ndarray, shape (<= num_samples,)
        Shuffled indices, containing at least `min_anomalies` anomalies
        (or as many as available if fewer exist).
    """
    gt_labels = np.asarray(gt_labels)
    if random_state is not None:
        np.random.seed(random_state)

    # locate anomaly vs normal indices
    anomaly_idx = np.where(gt_labels == 1)[0]
    normal_idx  = np.where(gt_labels == 0)[0]

    # determine how many anomalies we can pick
    n_anom = min(len(anomaly_idx), min_anomalies)
    # pick anomalies without replacement
    picked_anom = np.random.choice(anomaly_idx, n_anom, replace=False) if n_anom > 0 else np.array([], dtype=int)

    # fill the rest from normals
    n_normal = num_samples - n_anom
    n_normal = min(n_normal, len(normal_idx))
    picked_norm = np.random.choice(normal_idx, n_normal, replace=False) if n_normal > 0 else np.array([], dtype=int)

    # combine and shuffle
    selected = np.concatenate([picked_anom, picked_norm])
    np.random.shuffle(selected)

    return selected

In [60]:
def compute_gap_cutoff(rf, Xw, yw):
    """
    Given a fitted RandomForest `rf` and its training data (Xw, yw),
    compute cross‐entropy losses for each sample. If there are fewer than
    2 samples, just return the single loss (or 0 if somehow empty). Otherwise
    use find_largest_gap to get a gap‐based cutoff.
    """
    ce_losses = cross_entropy_loss(rf, Xw, yw)
    n = ce_losses.size

    if n == 0:
        return 0.0
    if n == 1:
        # Only one loss → no “gap” to find. Use the single value as cutoff.
        return float(ce_losses[0])

    # Now we have ≥2 losses; sorting in descending order ensures diff is nonempty
    _, cutoff_gap = find_largest_gap(ce_losses)
    return cutoff_gap


In [61]:
def find_largest_gap(losses):
    if len(losses) ==1:
        return 0, -1
    else:
        y = sorted(losses, reverse=True)
        diffs = abs(np.diff(y))
        idx = np.argmax(diffs) + 1   # +1 because diffs[i] = y[i+1]-y[i]
    return idx, y[idx]

In [62]:
def ce_loss_topk_balanced(model, x_pool, y_pool, k=20):
    """
    Selects k samples with highest CE-loss, ensuring at least min_anom anomalies.

    Args:
        model:      Classifier supporting predict_proba.
        x_pool:     array-like, shape (n_samples, n_features)
        y_pool:     array-like, shape (n_samples,), true binary labels (0=normal,1=anomaly)
        k:          Total number of samples to select.
        min_anom:   Minimum number of anomalies to include.

    Returns:
        np.ndarray: Shuffled indices of selected samples (length k).
    """
    # 1) Compute CE loss for each sample
    losses = np.array(cross_entropy_loss(model, x_pool, y_pool))
    # 2) Sort indices by loss descending
    idx_desc = np.argsort(losses)[::-1]

    # 3) Separate anomaly and normal indices in loss-sorted order
    anomaly_idxs = [i for i in idx_desc if y_pool[i] == 1]
    normal_idxs  = [i for i in idx_desc if y_pool[i] == 0]

    # 4) Select at least min_anom anomalies
    min_anom = k//2
    n_anom = min(len(anomaly_idxs), min_anom)
    sel_anom = anomaly_idxs[:n_anom]

    # 5) Fill remaining from all candidates (excluding already selected)
    remaining = k - len(sel_anom)
    candidates = [i for i in idx_desc if i not in sel_anom]
    sel_rest = candidates[:remaining]

    # 6) Combine and shuffle
    selected_idx = sel_anom + sel_rest
    random.shuffle(selected_idx)

    return np.array(selected_idx)

In [73]:
# ----------------------------
# Step 1: Read and Process the Data
# ----------------------------
dataset = '0.049_sample.csv'
df = pd.read_csv("../data/%s" % (dataset))
df = df.sort_values(by='Timestamp')
# Process the 'noise' column:
# - If NaN, assume Normal (0).
# - Otherwise, treat True/1/'True' as anomaly (1); everything else as Normal (0).
df['noise'] = df['noise'].fillna(0).apply(lambda x: 1 if (x == True or x == 1 or x == 'True' or x=='true') else 0)
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print(dataset)

# ----------------------------
# 2) Prefixes & global window settings
# ----------------------------
prefix_range = range(2, 35)   # prefix lengths 2..15
WINDOW_EVENTS = 2500          # keep the last 2 500 raw events

0.049_sample.csv


In [74]:
# --- 3) Pre‐fit encoders for each prefix’s NAP ---
all_acts = df["Activity"].unique()
ohe_nap  = {p: OneHotEncoder(sparse_output=False, handle_unknown="ignore")
              .fit(np.array([[a]*(p-1) for a in all_acts]))
            for p in prefix_range}
le_nap   = {p: LabelEncoder().fit(all_acts) for p in prefix_range}


In [75]:
# ----------------------------
# 4) Buffers & global state
# ----------------------------
# Create a single sliding window for the last WINDOW_EVENTS raw prefix-events
global_events = deque(maxlen=WINDOW_EVENTS)

# Per-prefix buffers (unbounded; we’ll evict manually when global_events drops)
buffers = {}
for p in prefix_range:
    buffers[p] = {
        "raw_feats":      deque(),   # stores list of activities for each prefix-event
        "raw_tgts":       deque(),   # stores the target activity string
        "X":               deque(),   # one-hot–encoded feature vectors
        "y":               deque(),   # label‐encoded target indices
        "noise":           deque(),   # true anomaly flag (0/1)
        "model":          None,       # RandomForest NAP model
        "filled":         False,      # has the NAP model been trained at least once?
        "cutoff":         None,       # CE‐loss cutoff for anomaly flagging
        "update_counter": 0           # no longer used per-prefix
    }

case_events      = defaultdict(list)
detect_pool      = []   # accumulated AD training samples (dicts)
anom_clf         = None
enc_ad           = None
max_prob_ad      = 0
max_pfx          = max(prefix_range) - 1

online_nap_reports = []
online_ad_reports  = []


In [76]:
# ----------------------------
# 5) Streaming loop with NAP + AD
# ----------------------------
total = len(df)
# single sliding window of the last WINDOW_EVENTS raw events
global_update_counter = 0
global_retrain_batch = WINDOW_EVENTS // 2   # 1250

for i, (_, row) in enumerate(df.iterrows(), start=1):
    # progress logging
    if i % 1000 == 0 or i == total:
        pct = i / total * 100
        print(f"Processed {i}/{total} rows ({pct:.1f}%)")
    global_update_counter += 1
    
    cid = row["ID"]
    case_events[cid].append(row)
    cur_len = len(case_events[cid])

   # Only process when a case first reaches prefix length p
    for p in prefix_range:
        if cur_len != p:
            continue

        # 5.1) Build current sample
        group      = case_events[cid]
        feats      = [e.Activity for e in group[: p - 1]]
        target_act = group[p - 1].Activity
        noise_flag = group[p - 1].noise

        buf = buffers[p]

        # --- Slide the global window: peek dropped if full ---
        dropped = None
        if len(global_events) == WINDOW_EVENTS:
            dropped = global_events[0]  # will be auto-evicted on append()

        # Transform features/target for NAP
        Xp_vec = ohe_nap[p].transform([feats]).ravel()
        yp     = le_nap[p].transform([target_act])[0]

        # Append to global_events: store (prefix, X_vec, y_label, noise, raw_feats, raw_target)
        global_events.append((p, Xp_vec, yp, noise_flag, feats, target_act))

        # Append to this prefix’s buffers (unbounded deques)
        buf["raw_feats"].append(feats)
        buf["raw_tgts"].append(target_act)
        buf["X"].append(Xp_vec)
        buf["y"].append(yp)
        buf["noise"].append(noise_flag)

        # If something was dropped from global_events, evict it from its prefix buffer
        if dropped is not None:
            old_p, old_Xp, old_yp, old_noise, old_feats, old_tgt = dropped
            old_buf = buffers[old_p]
            if old_buf["X"]:
                old_buf["raw_feats"].popleft()
                old_buf["raw_tgts"].popleft()
                old_buf["X"].popleft()
                old_buf["y"].popleft()
                old_buf["noise"].popleft()

        # --- 5.2) Initial NAP training (once we have the first sample) ---
        if buf["model"] is None:
            Xw = np.vstack(buf["X"])
            yw = np.array(buf["y"])

            rf = RandomForestClassifier(
                n_estimators=100, random_state=42, n_jobs=-1
            )
            rf.fit(Xw, yw)

            # Compute CE‐loss cutoff (gap‐based) on current buffer
            cutoff = compute_gap_cutoff(rf, Xw, yw)

            buf["model"]  = rf
            buf["filled"] = True
            buf["cutoff"] = cutoff

            # print(f"Prefix {p} NAP initial train (buffer size = {len(buf['X'])})")

            # 5.2a) Bootstrap the AD pool
            MAX_ANOM = 25
            TOTAL_SAMPLES = 50
            
            sel_idxs = ce_loss_topk_balanced(rf, Xw, yw, k = TOTAL_SAMPLES)
            random.shuffle(sel_idxs)

            for idx in sel_idxs:
                prob_vec = rf.predict_proba(buf["X"][idx].reshape(1, -1))[0].tolist()
                ce0      = cross_entropy_loss(rf, buf["X"][idx].reshape(1, -1), [buf["y"][idx]])[0]
                detect_pool.append({
                    "raw_feats": buf["raw_feats"][idx],
                    "target":    buf["raw_tgts"][idx],
                    "prefix":    p,
                    "prob":      prob_vec,
                    "ce_loss":   ce0,
                    "anomaly":   buf["noise"][idx]
                })

            # 5.2b) Train the AD classifier if we have ≥20 samples
            cat_rows = []
            for d in detect_pool:
                row_cat = d["raw_feats"] + [None] * (max_pfx - len(d["raw_feats"]))
                row_cat += [d["prefix"], d["target"]]
                cat_rows.append(row_cat)
            enc_ad = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(cat_rows)
            X_cat = enc_ad.transform(cat_rows)

            max_prob_ad = max(len(d["prob"]) for d in detect_pool)
            prob_mat = [
                d["prob"] + [0.0] * (max_prob_ad - len(d["prob"]))
                for d in detect_pool
            ]
            ce_vec = [[d["ce_loss"]] for d in detect_pool]
            X_num = np.hstack([prob_mat, ce_vec])

            y_ad = np.array([d["anomaly"] for d in detect_pool])
            X_ad = np.hstack([X_cat, X_num])

            anom_clf = RandomForestClassifier(n_estimators=10, random_state=42, n_jobs=-1)
            anom_clf.fit(X_ad, y_ad)
            # anom_clf = LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced',  # if your anomalies are rare
            #     random_state=42).fit(X_ad, y_ad)
            # anom_clf = XGBClassifier(objective='binary:logistic', n_estimators=10, learning_rate=0.01, eval_metric='logloss',
            #                          random_state=42).fit(X_ad, y_ad)

            # print(f"Prefix {p} AD initial train on {len(detect_pool)} samples")

            AD_CAT_FEATS = X_cat.shape[1]
            AD_NUM_FEATS = X_num.shape[1]
            # Skip further processing of this new event
    

        # --- 5.3) Prequential NAP prediction & store ---
        rf   = buf["model"]
        Xp   = Xp_vec.reshape(1, -1)
        y_sp = yp
        cutoff_nap = buf["cutoff"]

        ce_cur = cross_entropy_loss(rf, Xp, [y_sp])[0]
        pred_nap_anom = int(ce_cur > cutoff_nap)

        online_nap_reports.append({
            "i":             i,
            "prefix":        p,
            "case_id":       cid,
            "true_noise":    noise_flag,
            "pred_nap_anom": pred_nap_anom,
            "cutoff":        cutoff_nap
        })

        # --- 5.4) Global retrain trigger (increment once per prefix-event) ---
        if global_update_counter >= global_retrain_batch:
            # print("=== Global retrain of all prefix NAP models ===")

            # Retrain each NAP model on its current buffer, recompute cutoff, 
            # and sample AD points
            for q in prefix_range:
                buf_q = buffers[q]
                if len(buf_q["X"]) == 0:
                    continue

                Xw = np.vstack(buf_q["X"])
                yw = np.array(buf_q["y"])
                rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
                rf.fit(Xw, yw)
                buf_q["model"] = rf

                # Recompute CE‐loss cutoff (gap-based)
                cutoff_q = compute_gap_cutoff(rf, Xw, yw)
                buf_q["cutoff"] = cutoff_q
                # print(f"  Recomputed cutoff for prefix {q} (buffer size = {len(buf_q['X'])})")

                sel_idxs = ce_loss_topk_balanced(rf, Xw, yw, k = TOTAL_SAMPLES)

                random.shuffle(sel_idxs)
    
                for idx in sel_idxs:
                    prob_vec = rf.predict_proba(buf_q["X"][idx].reshape(1, -1))[0].tolist()
                    ce0      = cross_entropy_loss(rf, buf_q["X"][idx].reshape(1, -1), [buf_q["y"][idx]])[0]
                    detect_pool.append({
                        "raw_feats": buf_q["raw_feats"][idx],
                        "target":    buf_q["raw_tgts"][idx],
                        "prefix":    p,
                        "prob":      prob_vec,
                        "ce_loss":   ce0,
                        "anomaly":   buf_q["noise"][idx]
                    })

            # Retrain AD classifier if we have ≥20 samples
            if len(detect_pool) >= 20:
                cat_rows = []
                for d in detect_pool:
                    row_cat = d["raw_feats"] + [None] * (max_pfx - len(d["raw_feats"]))
                    row_cat += [d["prefix"], d["target"]]
                    cat_rows.append(row_cat)
                enc_ad = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(cat_rows)
                X_cat = enc_ad.transform(cat_rows)
                max_prob_ad = max(len(d["prob"]) for d in detect_pool)
                prob_mat = [
                    d["prob"] + [0.0] * (max_prob_ad - len(d["prob"]))
                    for d in detect_pool
                ]
                ce_vec = [[d["ce_loss"]] for d in detect_pool]
                X_num = np.hstack([prob_mat, ce_vec])

                y_ad = np.array([d["anomaly"] for d in detect_pool])
                X_ad = np.hstack([X_cat, X_num])

                anom_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
                anom_clf.fit(X_ad, y_ad)
                AD_CAT_FEATS = X_cat.shape[1]
                AD_NUM_FEATS = X_num.shape[1]
                                
                # anom_clf = LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced',  # if your anomalies are rare
                #     random_state=42).fit(X_ad, y_ad)
                # anom_clf = XGBClassifier(objective='binary:logistic', n_estimators=10, learning_rate=0.01, eval_metric='logloss',
                #                          random_state=42).fit(X_ad, y_ad)
                # print(f"  AD retrain on {len(detect_pool)} samples")
                # detect_pool = []
            global_update_counter = 0

        # --- 5.5) Prequential AD classification for current event ---
        if anom_clf is not None:
            # Build AD feature vector: categorical + numeric
            row_cat = feats + [None] * (max_pfx - len(feats)) + [p, y_sp]
            # 1) Categorical part
            Xc = enc_ad.transform([row_cat])
            if Xc.shape[1] != AD_CAT_FEATS:
               raise ValueError(f"Expected {AD_CAT_FEATS} cat features, got {Xc.shape[1]}")
            
            # 2) Numeric part (prob_vector + ce_loss)
            model = buffers[p]['model']
            pvec = model.predict_proba(Xp)[0].tolist()
            pad_len = AD_NUM_FEATS - 1
            pvec_padded = pvec + [0.0] * (pad_len - len(pvec))
            Xn = np.array([pvec_padded + [ce_cur]])
            if Xn.shape[1] != AD_NUM_FEATS:
               raise ValueError(f"Expected {AD_NUM_FEATS} num features, got {Xn.shape[1]}")
            
            # 3) Combine & predict
            Xa = np.hstack([Xc, Xn])
            pred_ad = anom_clf.predict(Xa)[0]

            if len(anom_clf.predict_proba(Xa)[0]) ==1:
                prob_ad = anom_clf.predict_proba(Xa)[0, 0]
            else:
                prob_ad = anom_clf.predict_proba(Xa)[0, 1]
            
            online_ad_reports.append({
                "i":            i,
                "prefix":       p,
                "case_id":      cid,
                "true_noise":   noise_flag,
                "pred_ad_anom": int(pred_ad),
                "score":        float(prob_ad)
            })


Processed 1000/6630 rows (15.1%)
Processed 2000/6630 rows (30.2%)
Processed 3000/6630 rows (45.2%)
Processed 4000/6630 rows (60.3%)
Processed 5000/6630 rows (75.4%)
Processed 6000/6630 rows (90.5%)
Processed 6630/6630 rows (100.0%)


In [72]:
pred_ad

np.int64(0)

In [48]:
detect_pool

[{'raw_feats': ['start_event_Loan  application received'],
  'target': 'Check  application  form completeness',
  'prefix': 2,
  'prob': [1.0],
  'ce_loss': np.float64(0.0),
  'anomaly': 0},
 {'raw_feats': ['start_event_Loan  application received',
   'Check  application  form completeness'],
  'target': 'Return application back to applicant',
  'prefix': 3,
  'prob': [1.0],
  'ce_loss': np.float64(0.0),
  'anomaly': 0},
 {'raw_feats': ['start_event_Loan  application received',
   'Check  application  form completeness',
   'Return application back to applicant'],
  'target': 'Receive updated application',
  'prefix': 4,
  'prob': [1.0],
  'ce_loss': np.float64(0.0),
  'anomaly': 0},
 {'raw_feats': ['start_event_Loan  application received',
   'Check  application  form completeness',
   'Return application back to applicant',
   'Receive updated application'],
  'target': 'Check  application  form completeness',
  'prefix': 5,
  'prob': [1.0],
  'ce_loss': np.float64(0.0),
  'anomaly':

In [77]:
# 6) Summarize
reports_df = pd.DataFrame(online_ad_reports)
for p in prefix_range:
    sub = reports_df[reports_df["prefix"] == p]
    if not sub.empty:
        print(f"\n--- Prefix {p} ---")
        print(classification_report(
            sub["true_noise"], sub["pred_ad_anom"], zero_division=0
        ))
reports_df 
# reports_df.to_csv('../result/%s_classifier_xgb_%s_random_sample_cumulative.csv'%(dataset, TOTAL_SAMPLES), index=False)


--- Prefix 2 ---
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       482
           1       1.00      0.78      0.88        18

    accuracy                           0.99       500
   macro avg       1.00      0.89      0.94       500
weighted avg       0.99      0.99      0.99       500


--- Prefix 3 ---
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       476
           1       1.00      0.71      0.83        24

    accuracy                           0.99       500
   macro avg       0.99      0.85      0.91       500
weighted avg       0.99      0.99      0.98       500


--- Prefix 4 ---
              precision    recall  f1-score   support

           0       0.99      0.91      0.95       476
           1       0.30      0.79      0.43        24

    accuracy                           0.90       500
   macro avg       0.64      0.85      0.69       500
weighted avg       0

Unnamed: 0,i,prefix,case_id,true_noise,pred_ad_anom,score
0,2,2,0,0,0,1.00
1,4,2,1,0,0,1.00
2,5,3,0,0,0,1.00
3,6,3,1,0,0,1.00
4,7,4,0,0,0,1.00
...,...,...,...,...,...,...
6121,6626,6,499,0,0,0.25
6122,6627,7,499,0,0,0.08
6123,6628,8,499,0,0,0.26
6124,6629,16,498,0,0,0.17
