In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from lightgbm import LGBMClassifier
from tqdm import tqdm
import warnings
import gc

# Read Data

In [63]:
AS_UPDATES_ROOT_DIR = './data/asn_updates'

as_updates = {}
as_updates_dirs = sorted(glob(AS_UPDATES_ROOT_DIR+'/*'))
for dir_path in tqdm(as_updates_dirs):
    asn = dir_path.split('/')[-1]
    as_updates_files = sorted(glob(dir_path+'/*'))
    as_df_list = [pd.read_csv(file_path) for file_path in as_updates_files]
    as_df =  pd.concat(as_df_list).reset_index(drop=True)
    del as_df_list
    gc.collect()
    as_df = as_df.sort_values('time')
    as_updates[asn] = as_df    

100%|████████████████████████████| 100/100 [01:07<00:00,  1.49it/s]


# Data Preparation Functions

In [64]:
def calculate_features(df):
    df['minute'] = df.time // 60
    df = df.groupby('minute')[['prefix']]\
                       .count()\
                       .rename(columns={'prefix': 'updates'})
    minutes = pd.Series(df.index, index=df.index)
    df['periods_before_update'] = minutes - minutes.shift(1)
    df = df.dropna()
    return df

In [65]:
def interpolate_index(df):
    index_interpolated = np.arange(int(df.index.min()), int(df.index.max()))
    df = df.reindex(index_interpolated).fillna({'updates': 0})
    return df

In [66]:
def make_sequences(df, feature_cols, target_col, lag):    
    X = []
    y = []
    for i in range(df.shape[0]-lag):
        X.append(df.iloc[i:i+lag][feature_cols].to_numpy().reshape(-1))
        y.append(df.iloc[i+lag][target_col])
        
    return np.array(X), np.array(y)

In [67]:
def insert_croston_zero_rows(X, y):
    zero_rows = {}
    
    for i in range(X.shape[0]):
        if i == 0:
            continue
        
        row_ = X[i].copy()
        n_zero_rows = int(row_[-1] - 1)

        if n_zero_rows > 0:
            row_[-2] = 0
            row_[-1] = 1
            zero_rows[i] = [row_]
        
        for _ in range(n_zero_rows-1):
            row_ = row_.copy()
            row_[-1] += 1
            zero_rows[i].append(row_)
    
    for i in sorted(zero_rows.keys())[-1::-1]:
        X_zeros = np.array(zero_rows[i])
        y_zeros = np.zeros(X_zeros.shape[0])
        
        X_before = X[:i]
        y_before = y[:i]
        
        X_after = X[i:]
        y_after = y[i:]
    
        X = np.concatenate([X_before, X_zeros, X_after])
        y = np.concatenate([y_before, y_zeros, y_after])

    return X, y

# Single AS

In [70]:
test_size = 24 * 60
test_size

1440

## AR

In [81]:
df = calculate_features(as_updates['25139'])
df_ar = interpolate_index(df)
X, y = make_sequences(df_ar, ['updates'], 'updates', 20)

In [82]:
X_train, y_train = X[:-test_size], (y[:-test_size] > 0).astype(int)
X_test, y_test = X[-test_size:], (y[-test_size:] > 0).astype(int)

In [83]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((18609, 20), (18609,), (1440, 20), (1440,))

In [84]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [85]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))

Accuracy: 0.9680555555555556
Recall: 0.7704918032786885
Precision: 0.9724137931034482
F1: 0.8597560975609756


In [86]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))

Accuracy: 0.9784722222222222
Recall: 0.8961748633879781
Precision: 0.9318181818181818
F1: 0.9136490250696379


In [87]:
model = LGBMClassifier(objective='binary')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))

[LightGBM] [Info] Number of positive: 2687, number of negative: 15922
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1080
[LightGBM] [Info] Number of data points in the train set: 18609, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.144392 -> initscore=-1.779276
[LightGBM] [Info] Start training from score -1.779276




Accuracy: 0.98125
Recall: 0.8852459016393442
Precision: 0.9642857142857143
F1: 0.9230769230769231




## Croston-like AR

In [88]:
df = calculate_features(as_updates['25139'])
X, y = make_sequences(df, ['updates', 'periods_before_update'], 'updates', 20)
X, y = insert_croston_zero_rows(X, y)

In [89]:
X_train, y_train = X[:-test_size], (y[:-test_size] > 0).astype(int)
X_test, y_test = X[-test_size:], (y[-test_size:] > 0).astype(int)

In [90]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((18447, 40), (18447,), (1440, 40), (1440,))

In [91]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [92]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))

Accuracy: 0.9923611111111111
Recall: 0.9402173913043478
Precision: 1.0
F1: 0.969187675070028


In [93]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))

Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0


In [94]:
model = LGBMClassifier(objective='binary')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))



[LightGBM] [Info] Number of positive: 2670, number of negative: 15777
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002732 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1675
[LightGBM] [Info] Number of data points in the train set: 18447, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.144739 -> initscore=-1.776475
[LightGBM] [Info] Start training from score -1.776475
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0




# Multiple AS

## AR

### Data Preparation

In [95]:
X_trains = []
y_trains = []
X_tests = []
y_tests = []
X_zero_shots = []
y_zero_shots = []

for i, (asn, df) in enumerate(as_updates.items()):
    print(i, '| Processing AS:', asn)
    df = calculate_features(df)
    df = interpolate_index(df)
    X, y = make_sequences(df, ['updates'], 'updates', 20)

    if i % 10 == 0:
        X_zero_shots.append(X)
        y_zero_shots.append((y > 0).astype(int))
    
    X_train, y_train = X[:-test_size], (y[:-test_size] > 0).astype(int)
    X_test, y_test = X[-test_size:], (y[-test_size:] > 0).astype(int)
    X_trains.append(X_train)
    y_trains.append(y_train)
    X_tests.append(X_test)
    y_tests.append(y_test)

X_train = np.concatenate(X_trains)
y_train = np.concatenate(y_trains)
X_test = np.concatenate(X_tests)
y_test = np.concatenate(y_tests)
X_zero_shot = np.concatenate(X_zero_shots)
y_zero_shot = np.concatenate(y_zero_shots)

del X_trains
del y_trains
del X_tests
del y_tests
del X_zero_shots
del y_zero_shots
gc.collect()

0 | Processing AS: 11913
1 | Processing AS: 131292
2 | Processing AS: 133840
3 | Processing AS: 134645
4 | Processing AS: 135101
5 | Processing AS: 136844
6 | Processing AS: 136991
7 | Processing AS: 138146
8 | Processing AS: 138630
9 | Processing AS: 138645
10 | Processing AS: 139002
11 | Processing AS: 139054
12 | Processing AS: 139245
13 | Processing AS: 141139
14 | Processing AS: 142354
15 | Processing AS: 147182
16 | Processing AS: 149001
17 | Processing AS: 149282
18 | Processing AS: 151853
19 | Processing AS: 152438
20 | Processing AS: 18036
21 | Processing AS: 18109
22 | Processing AS: 19263
23 | Processing AS: 197915
24 | Processing AS: 198239
25 | Processing AS: 200179
26 | Processing AS: 200400
27 | Processing AS: 200536
28 | Processing AS: 200914
29 | Processing AS: 201547
30 | Processing AS: 2018
31 | Processing AS: 202140
32 | Processing AS: 202188
33 | Processing AS: 202627
34 | Processing AS: 204446
35 | Processing AS: 20783
36 | Processing AS: 208115
37 | Processing AS

0

In [96]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_zero_shot.shape, y_zero_shot.shape

((1814183, 20), (1814183,), (144000, 20), (144000,), (198765, 20), (198765,))

In [97]:
scaler1 = StandardScaler()
X_train = scaler1.fit_transform(X_train)
X_test = scaler1.transform(X_test)

In [98]:
X_train_test, y_train_test = np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test])

In [99]:
scaler2 = StandardScaler()
X_train_test = scaler2.fit_transform(X_train_test)
X_zero_shot = scaler2.transform(X_zero_shot)

### Logistic Regression

In [100]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))

Accuracy: 0.9500347222222222
Recall: 0.23836824696802647
Precision: 0.8828093099224174
F1: 0.3753798072749371


In [101]:
feature_names = []
for i in range(1, 21):
    feature_names.extend([f'updates_{i}'])
    
pd.DataFrame(model.coef_, columns=feature_names).T

Unnamed: 0,0
updates_1,0.274407
updates_2,0.036595
updates_3,0.050525
updates_4,0.067109
updates_5,0.129664
updates_6,0.161972
updates_7,0.03961
updates_8,0.104192
updates_9,0.094269
updates_10,0.033865


In [102]:
model = LogisticRegression()
model.fit(X_train_test, y_train_test)
y_pred = model.predict(X_zero_shot)

print('Accuracy:', accuracy_score(y_zero_shot, y_pred))
print('Recall:', recall_score(y_zero_shot, y_pred))
print('Precision:', precision_score(y_zero_shot, y_pred))
print('F1:', f1_score(y_zero_shot, y_pred))

Accuracy: 0.8517545845596559
Recall: 0.4126506024096386
Precision: 0.20736119314825754
F1: 0.276019656019656


### Decision Tree

In [103]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))

Accuracy: 0.9573541666666666
Recall: 0.5317530319735392
Precision: 0.7180288819413428
F1: 0.6110090580857668


In [104]:
model = DecisionTreeClassifier()
model.fit(X_train_test, y_train_test)
y_pred = model.predict(X_zero_shot)

print('Accuracy:', accuracy_score(y_zero_shot, y_pred))
print('Recall:', recall_score(y_zero_shot, y_pred))
print('Precision:', precision_score(y_zero_shot, y_pred))
print('F1:', f1_score(y_zero_shot, y_pred))

Accuracy: 0.07122481322164365
Recall: 0.992873934763444
Precision: 0.06824861381838748
F1: 0.12771808465398463


### Gradient Boosted Decision Tree

In [105]:
model = LGBMClassifier(objective='binary')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))



[LightGBM] [Info] Number of positive: 134681, number of negative: 1679502
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.184458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 1814183, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.074238 -> initscore=-2.523344
[LightGBM] [Info] Start training from score -2.523344




Accuracy: 0.9643958333333333
Recall: 0.5119073869900772
Precision: 0.8689874602283362
F1: 0.6442794699229862


In [106]:
model = LGBMClassifier(objective='binary')
model.fit(X_train_test, y_train_test)
y_pred = model.predict(X_zero_shot)

print('Accuracy:', accuracy_score(y_zero_shot, y_pred))
print('Recall:', recall_score(y_zero_shot, y_pred))
print('Precision:', precision_score(y_zero_shot, y_pred))
print('F1:', f1_score(y_zero_shot, y_pred))



[LightGBM] [Info] Number of positive: 143751, number of negative: 1814432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.182315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 1958183, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073410 -> initscore=-2.535445
[LightGBM] [Info] Start training from score -2.535445




Accuracy: 0.06853319246346187
Recall: 0.9999265354099324
Precision: 0.06848198517758222
F1: 0.12818496456572412


## Croston-like AR

### Data Preparation

In [107]:
X_trains = []
y_trains = []
X_tests = []
y_tests = []
X_zero_shots = []
y_zero_shots = []

for i, (asn, df) in enumerate(as_updates.items()):
    print(i, '| Processing AS:', asn)
    df = calculate_features(df)
    X, y = make_sequences(df, ['updates', 'periods_before_update'], 'updates', 20)
    X, y = insert_croston_zero_rows(X, y)

    if i % 10 == 0:
        X_zero_shots.append(X)
        y_zero_shots.append((y > 0).astype(int))
    
    X_train, y_train = X[:-test_size], (y[:-test_size] > 0).astype(int)
    X_test, y_test = X[-test_size:], (y[-test_size:] > 0).astype(int)
    X_trains.append(X_train)
    y_trains.append(y_train)
    X_tests.append(X_test)
    y_tests.append(y_test)

X_train = np.concatenate(X_trains)
y_train = np.concatenate(y_trains)
X_test = np.concatenate(X_tests)
y_test = np.concatenate(y_tests)
X_zero_shot = np.concatenate(X_zero_shots)
y_zero_shot = np.concatenate(y_zero_shots)

del X_trains
del y_trains
del X_tests
del y_tests
del X_zero_shots
del y_zero_shots
gc.collect()

0 | Processing AS: 11913
1 | Processing AS: 131292
2 | Processing AS: 133840
3 | Processing AS: 134645
4 | Processing AS: 135101
5 | Processing AS: 136844
6 | Processing AS: 136991
7 | Processing AS: 138146
8 | Processing AS: 138630
9 | Processing AS: 138645
10 | Processing AS: 139002
11 | Processing AS: 139054
12 | Processing AS: 139245
13 | Processing AS: 141139
14 | Processing AS: 142354
15 | Processing AS: 147182
16 | Processing AS: 149001
17 | Processing AS: 149282
18 | Processing AS: 151853
19 | Processing AS: 152438
20 | Processing AS: 18036
21 | Processing AS: 18109
22 | Processing AS: 19263
23 | Processing AS: 197915
24 | Processing AS: 198239
25 | Processing AS: 200179
26 | Processing AS: 200400
27 | Processing AS: 200536
28 | Processing AS: 200914
29 | Processing AS: 201547
30 | Processing AS: 2018
31 | Processing AS: 202140
32 | Processing AS: 202188
33 | Processing AS: 202627
34 | Processing AS: 204446
35 | Processing AS: 20783
36 | Processing AS: 208115
37 | Processing AS

0

In [108]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_zero_shot.shape, y_zero_shot.shape

((1708279, 40), (1708279,), (144000, 40), (144000,), (188800, 40), (188800,))

In [109]:
scaler1 = StandardScaler()
X_train = scaler1.fit_transform(X_train)
X_test = scaler1.transform(X_test)

In [110]:
X_train_test, y_train_test = np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test])

In [111]:
scaler2 = StandardScaler()
X_train_test = scaler2.fit_transform(X_train_test)
X_zero_shot = scaler2.transform(X_zero_shot)

### Logistic Regression

In [112]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))

Accuracy: 0.9988402777777777
Recall: 0.9818340041335799
Precision: 1.0
F1: 0.9908337449914923


In [113]:
feature_names = []
for i in range(1, 21):
    feature_names.extend([f'updates_{i}', f'periods_before_{i}'])
    
pd.DataFrame(model.coef_, columns=feature_names).T

Unnamed: 0,0
updates_1,-0.024737
periods_before_1,-0.054723
updates_2,0.006528
periods_before_2,-0.035427
updates_3,-0.006182
periods_before_3,-0.026815
updates_4,-0.009391
periods_before_4,-0.042382
updates_5,-0.001742
periods_before_5,-0.026195


In [114]:
model = LogisticRegression()
model.fit(X_train_test, y_train_test)
y_pred = model.predict(X_zero_shot)

print('Accuracy:', accuracy_score(y_zero_shot, y_pred))
print('Recall:', recall_score(y_zero_shot, y_pred))
print('Precision:', precision_score(y_zero_shot, y_pred))
print('F1:', f1_score(y_zero_shot, y_pred))

Accuracy: 0.9175794491525424
Recall: 0.9947180479095372
Precision: 0.4632895603062957
F1: 0.6321537479611375


### Decision Tree

In [115]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))

Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0


In [116]:
model = DecisionTreeClassifier()
model.fit(X_train_test, y_train_test)
y_pred = model.predict(X_zero_shot)

print('Accuracy:', accuracy_score(y_zero_shot, y_pred))
print('Recall:', recall_score(y_zero_shot, y_pred))
print('Precision:', precision_score(y_zero_shot, y_pred))
print('F1:', f1_score(y_zero_shot, y_pred))

Accuracy: 0.07119703389830509
Recall: 1.0
Precision: 0.07119703389830509
F1: 0.13292985631075643


### GBDT

In [117]:
model = LGBMClassifier(objective='binary', n_estimators=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))



[LightGBM] [Info] Number of positive: 132903, number of negative: 1575376
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.187011 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10082
[LightGBM] [Info] Number of data points in the train set: 1708279, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.077799 -> initscore=-2.472630
[LightGBM] [Info] Start training from score -2.472630
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0




In [118]:
model = LGBMClassifier(objective='binary', n_estimators=5)
model.fit(X_train_test, y_train_test)
y_pred = model.predict(X_zero_shot)

print('Accuracy:', accuracy_score(y_zero_shot, y_pred))
print('Recall:', recall_score(y_zero_shot, y_pred))
print('Precision:', precision_score(y_zero_shot, y_pred))
print('F1:', f1_score(y_zero_shot, y_pred))



[LightGBM] [Info] Number of positive: 142096, number of negative: 1710183
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.205022 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10087
[LightGBM] [Info] Number of data points in the train set: 1852279, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076714 -> initscore=-2.487853
[LightGBM] [Info] Start training from score -2.487853
Accuracy: 0.07119703389830509
Recall: 1.0
Precision: 0.07119703389830509
F1: 0.13292985631075643


