In [497]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from lightgbm import LGBMClassifier
from tqdm import tqdm
import warnings
import gc

# Read Data

In [464]:
AS_UPDATES_ROOT_DIR = './data/asn_updates'

as_updates = {}
as_updates_dirs = sorted(glob(AS_UPDATES_ROOT_DIR+'/*'))
for dir_path in tqdm(as_updates_dirs):
    asn = dir_path.split('/')[-1]
    as_updates_files = sorted(glob(dir_path+'/*'))
    as_df_list = [pd.read_csv(file_path) for file_path in as_updates_files]
    as_df =  pd.concat(as_df_list).reset_index(drop=True)
    del as_df_list
    gc.collect()
    as_df = as_df.sort_values('time')
    as_updates[asn] = as_df    

100%|████████████████████████████| 100/100 [01:31<00:00,  1.09it/s]


# Data Preparation Functions

In [465]:
def calculate_features(df):
    df['minute'] = df.time // 60
    df = df.groupby('minute')[['prefix']]\
                       .count()\
                       .rename(columns={'prefix': 'updates'})
    minutes = pd.Series(df.index, index=df.index)
    df['periods_before_update'] = minutes - minutes.shift(1)
    df = df.dropna()
    return df

In [466]:
def interpolate_index(df):
    index_interpolated = np.arange(int(df.index.min()), int(df.index.max()))
    df = df.reindex(index_interpolated).fillna({'updates': 0})
    return df

In [467]:
def make_sequences(df, feature_cols, target_col, lag):    
    X = []
    y = []
    for i in range(df.shape[0]-lag):
        X.append(df.iloc[i:i+lag][feature_cols].to_numpy().reshape(-1))
        y.append(df.iloc[i+lag][target_col])
        
    return np.array(X), np.array(y)

In [468]:
def insert_croston_zero_rows(X, y):
    zero_rows = {}
    
    for i in range(X.shape[0]):
        if i == 0:
            continue
        
        row_ = X[i].copy()
        n_zero_rows = int(row_[-1] - 1)

        if n_zero_rows > 0:
            row_[-2] = 0
            row_[-1] = 1
            zero_rows[i] = [row_]
        
        for _ in range(n_zero_rows-1):
            row_ = row_.copy()
            row_[-1] += 1
            zero_rows[i].append(row_)
    
    for i in sorted(zero_rows.keys())[-1::-1]:
        X_zeros = np.array(zero_rows[i])
        y_zeros = np.zeros(X_zeros.shape[0])
        
        X_before = X[:i]
        y_before = y[:i]
        
        X_after = X[i:]
        y_after = y[i:]
    
        X = np.concatenate([X_before, X_zeros, X_after])
        y = np.concatenate([y_before, y_zeros, y_after])

    return X, y

# Single AS

In [293]:
df = calculate_features(as_updates['25139'])

## AR

In [145]:
df_ar = interpolate_index(df)

In [146]:
X, y = make_sequences(df_ar, ['updates'], 'updates', 20)

In [147]:
X.shape, y.shape

((20049, 20), (20049,))

In [148]:
test_size = 24 * 60
test_size

1440

In [149]:
X_train, y_train = X[:-test_size], (y[:-test_size] > 0).astype(int)
X_test, y_test = X[-test_size:], (y[-test_size:] > 0).astype(int)

In [150]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [151]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [152]:
y_pred = model.predict(X_test)

In [153]:
accuracy_score(y_test, y_pred)

0.9618055555555556

In [154]:
recall_score(y_test, y_pred)

0.7814207650273224

In [155]:
precision_score(y_test, y_pred)

0.9050632911392406

In [156]:
f1_score(y_test, y_pred)

0.8387096774193549

## Croston-like AR

In [157]:
X, y = make_sequences(df, ['updates', 'periods_before_update'], 'updates', 20)

In [158]:
X, y = insert_croston_zero_rows(X, y)

In [159]:
test_size = 1000

In [160]:
X_train, y_train = X[:-test_size], (y[:-test_size] > 0).astype(int)
X_test, y_test = X[-test_size:], (y[-test_size:] > 0).astype(int)

In [161]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [162]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [163]:
y_pred = model.predict(X_test)

In [164]:
accuracy_score(y_test, y_pred)

0.993

In [165]:
recall_score(y_test, y_pred)

0.944

In [166]:
precision_score(y_test, y_pred)

1.0

In [167]:
f1_score(y_test, y_pred)

0.9711934156378601

# Multiple AS

In [469]:
test_size = 1000

In [470]:
X_trains = []
y_trains = []
X_tests = []
y_tests = []
X_zero_shots = []
y_zero_shots = []

for i, (asn, df) in enumerate(as_updates.items()):
    print(i, '| Processing AS:', asn)
    df = calculate_features(df)
    X, y = make_sequences(df, ['updates', 'periods_before_update'], 'updates', 20)
    X, y = insert_croston_zero_rows(X, y)

    if i % 10 == 0:
        X_zero_shots.append(X)
        y_zero_shots.append((y > 0).astype(int))
    
    X_train, y_train = X[:-test_size], (y[:-test_size] > 0).astype(int)
    X_test, y_test = X[-test_size:], (y[-test_size:] > 0).astype(int)
    X_trains.append(X_train)
    y_trains.append(y_train)
    X_tests.append(X_test)
    y_tests.append(y_test)

X_train = np.concatenate(X_trains)
y_train = np.concatenate(y_trains)
X_test = np.concatenate(X_tests)
y_test = np.concatenate(y_tests)
X_zero_shot = np.concatenate(X_zero_shots)
y_zero_shot = np.concatenate(y_zero_shots)

del X_trains
del y_trains
del X_tests
del y_tests
del X_zero_shots
del y_zero_shots
gc.collect()

0 | Processing AS: 11913
1 | Processing AS: 131292
2 | Processing AS: 133840
3 | Processing AS: 134645
4 | Processing AS: 135101
5 | Processing AS: 136844
6 | Processing AS: 136991
7 | Processing AS: 138146
8 | Processing AS: 138630
9 | Processing AS: 138645
10 | Processing AS: 139002
11 | Processing AS: 139054
12 | Processing AS: 139245
13 | Processing AS: 141139
14 | Processing AS: 142354
15 | Processing AS: 147182
16 | Processing AS: 149001
17 | Processing AS: 149282
18 | Processing AS: 151853
19 | Processing AS: 152438
20 | Processing AS: 18036
21 | Processing AS: 18109
22 | Processing AS: 19263
23 | Processing AS: 197915
24 | Processing AS: 198239
25 | Processing AS: 200179
26 | Processing AS: 200400
27 | Processing AS: 200536
28 | Processing AS: 200914
29 | Processing AS: 201547
30 | Processing AS: 2018
31 | Processing AS: 202140
32 | Processing AS: 202188
33 | Processing AS: 202627
34 | Processing AS: 204446
35 | Processing AS: 20783
36 | Processing AS: 208115
37 | Processing AS

0

In [471]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_zero_shot.shape, y_zero_shot.shape

((1752279, 40), (1752279,), (100000, 40), (100000,), (188800, 40), (188800,))

In [472]:
scaler1 = StandardScaler()
X_train = scaler1.fit_transform(X_train)
X_test = scaler1.transform(X_test)

In [473]:
X_train_test, y_train_test = np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test])

In [474]:
scaler2 = StandardScaler()
X_train_test = scaler2.fit_transform(X_train_test)
X_zero_shot = scaler2.transform(X_zero_shot)

### Logistic Regression

In [475]:
model = LogisticRegression()

In [476]:
model.fit(X_train, y_train)

In [477]:
y_pred = model.predict(X_test)

In [478]:
accuracy_score(y_test, y_pred)

0.99888

In [479]:
recall_score(y_test, y_pred)

0.9821143404663047

In [480]:
precision_score(y_test, y_pred)

1.0

In [481]:
f1_score(y_test, y_pred)

0.9909764743796327

In [482]:
feature_names = []
for i in range(1, 21):
    feature_names.extend([f'updates_{i}', f'periods_before_{i}'])
    
pd.DataFrame(model.coef_, columns=feature_names).T

Unnamed: 0,0
updates_1,-0.02177
periods_before_1,-0.05465
updates_2,0.004575
periods_before_2,-0.039186
updates_3,0.00035
periods_before_3,-0.039378
updates_4,-0.015964
periods_before_4,-0.056665
updates_5,-0.001545
periods_before_5,-0.033234


In [483]:
model = LogisticRegression()

In [484]:
model.fit(X_train_test, y_train_test)

In [485]:
y_pred = model.predict(X_zero_shot)

In [486]:
accuracy_score(y_zero_shot, y_pred)

0.9177277542372881

In [488]:
recall_score(y_zero_shot, y_pred)

0.9943460794524625

In [489]:
precision_score(y_zero_shot, y_pred)

0.46372688477951635

In [491]:
f1_score(y_zero_shot, y_pred)

0.6324855081036318

### GBDT

In [357]:
model = LGBMClassifier(objective='binary', n_estimators=5)

In [358]:
model.fit(X_train, y_train)



[LightGBM] [Info] Number of positive: 135834, number of negative: 1616445
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.282467 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10077
[LightGBM] [Info] Number of data points in the train set: 1752279, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.077518 -> initscore=-2.476551
[LightGBM] [Info] Start training from score -2.476551




In [359]:
y_pred = model.predict(X_test)



In [360]:
accuracy_score(y_test, y_pred)

1.0

In [361]:
recall_score(y_test, y_pred)

1.0

In [362]:
precision_score(y_test, y_pred)

1.0

In [363]:
f1_score(y_test, y_pred)

1.0

In [390]:
model = LGBMClassifier(objective='binary', n_estimators=10)

In [391]:
model.fit(X_train_test, y_train_test)



[LightGBM] [Info] Number of positive: 142096, number of negative: 1710183
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.280188 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10083
[LightGBM] [Info] Number of data points in the train set: 1852279, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076714 -> initscore=-2.487853
[LightGBM] [Info] Start training from score -2.487853




In [392]:
y_pred = model.predict(X_zero_shot)



In [393]:
accuracy_score(y_zero_shot, y_pred)

0.07119703389830509

In [394]:
recall_score(y_zero_shot, y_pred)

1.0

In [395]:
precision_score(y_zero_shot, y_pred)

0.07119703389830509

In [396]:
f1_score(y_zero_shot, y_pred)

0.13292985631075643