In [1]:
from __future__ import absolute_import, print_function

# --- System ---
import os
import sys
import time
import warnings
warnings.filterwarnings('ignore')

# --- Utility ---
import pandas as pd
import numpy as np
import math
import random
import logging
import pickle
from tqdm import tqdm
from datetime import datetime

# --- Pytorch ---
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn
from torch.utils.data import random_split
from torch.utils.data import Dataset, DataLoader, TensorDataset

# --- sklearn ---
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer

# --- Plot ---
import matplotlib.pyplot as plt
import seaborn as sns


# --- Sklearn ---
from sklearn.datasets import fetch_openml
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn import decomposition, discriminant_analysis, linear_model, svm, tree, neural_network
from sklearn.model_selection import GridSearchCV

# --- Models ---
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import neural_network

# random weight initialization
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()
# ----------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
root_dir = os.getcwd()

# data loading and preprocessing
dataPath = "../data/statistics-5.csv"
df = pd.read_csv(dataPath)
# ----------------------------------

In [2]:
# Dropping columns that are not required at the moment
df = df.drop(columns=['Unnamed: 0', 'HOSTNAME', 'UUID', 'TIMESTAMP', 'CONGESTION (Receiver)'])

In [3]:
df.head()

Unnamed: 0,ALIAS,STREAMS,PACING,THROUGHPUT (Sender),THROUGHPUT (Receiver),LATENCY (min.),LATENCY (max.),LATENCY (mean),RETRANSMITS,CONGESTION (Sender),BYTES (Receiver)
0,hostA,1,1gbit,1623277000.0,1623277000.0,30062.0,30264.5,30185.5,1535.0,cubic,12186858284
1,hostA,1,1gbit,1652145000.0,3275422000.0,60206.5,60572.0,60378.5,2879.0,cubic,12403578844
2,hostA,1,1gbit,983358400.0,4258781000.0,91576.5,92073.0,91801.0,2879.0,cubic,7382614448
3,hostA,1,2gbit,1965511000.0,6224292000.0,122954.0,123533.5,123221.5,2879.0,cubic,14756187004
4,hostA,1,3gbit,2946649000.0,9170940000.0,154383.5,155109.0,154710.5,2879.0,cubic,22122117968


In [4]:
# Pre-processing
pacing = df['PACING'].values
for i, p in enumerate(pacing):
    v, _ = p.split("gbit")
    pacing[i] = float(v)

df['PACING'] = pacing
# Dropping rows with pacing rate 10.5, glitch in the training data
df.drop( df[ df['PACING'] == 10.5 ].index, inplace=True)
# Supervised training approach needs total number of classes for classification task
num_of_classes = len(df['PACING'].unique())

df.head()

Unnamed: 0,ALIAS,STREAMS,PACING,THROUGHPUT (Sender),THROUGHPUT (Receiver),LATENCY (min.),LATENCY (max.),LATENCY (mean),RETRANSMITS,CONGESTION (Sender),BYTES (Receiver)
0,hostA,1,1,1623277000.0,1623277000.0,30062.0,30264.5,30185.5,1535.0,cubic,12186858284
1,hostA,1,1,1652145000.0,3275422000.0,60206.5,60572.0,60378.5,2879.0,cubic,12403578844
2,hostA,1,1,983358400.0,4258781000.0,91576.5,92073.0,91801.0,2879.0,cubic,7382614448
3,hostA,1,2,1965511000.0,6224292000.0,122954.0,123533.5,123221.5,2879.0,cubic,14756187004
4,hostA,1,3,2946649000.0,9170940000.0,154383.5,155109.0,154710.5,2879.0,cubic,22122117968


In [5]:
df['CONGESTION (Sender)'] = (df['CONGESTION (Sender)'] == 'cubic').astype(int)
df['ALIAS'] = pd.factorize(df['ALIAS'])[0]
# df['HOSTNAME'] = pd.factorize(df['HOSTNAME'])[0]
df.head()

Unnamed: 0,ALIAS,STREAMS,PACING,THROUGHPUT (Sender),THROUGHPUT (Receiver),LATENCY (min.),LATENCY (max.),LATENCY (mean),RETRANSMITS,CONGESTION (Sender),BYTES (Receiver)
0,0,1,1,1623277000.0,1623277000.0,30062.0,30264.5,30185.5,1535.0,1,12186858284
1,0,1,1,1652145000.0,3275422000.0,60206.5,60572.0,60378.5,2879.0,1,12403578844
2,0,1,1,983358400.0,4258781000.0,91576.5,92073.0,91801.0,2879.0,1,7382614448
3,0,1,2,1965511000.0,6224292000.0,122954.0,123533.5,123221.5,2879.0,1,14756187004
4,0,1,3,2946649000.0,9170940000.0,154383.5,155109.0,154710.5,2879.0,1,22122117968


In [6]:
# df['ALIAS'] = (df['ALIAS'] == '').astype(int)
# for name in df['ALIAS'].unique():
#     df[name] = (df['ALIAS']==name).astype(int)

# df = df.drop(['ALIAS'],axis=1)
# df.head()

In [7]:
y = df['PACING'].values
y = y.astype('float')

df_train = df.drop(['PACING'], axis=1)
X = df_train[df_train.columns.values].values

# Normalization
minmax_scale = preprocessing.MinMaxScaler().fit(df_train[df_train.columns.values])
df_minmax = minmax_scale.transform(df_train[df_train.columns.values])

final_df = pd.DataFrame(df_minmax, columns=df_train.columns.values)
final_df.head(10)

Unnamed: 0,ALIAS,STREAMS,THROUGHPUT (Sender),THROUGHPUT (Receiver),LATENCY (min.),LATENCY (max.),LATENCY (mean),RETRANSMITS,CONGESTION (Sender),BYTES (Receiver)
0,0.0,0.0,0.162861,8e-05,0.0,0.0,0.0,0.014931,1.0,0.164083
1,0.0,0.0,0.165783,0.000172,0.000132,0.000129,0.00013,0.028003,1.0,0.167005
2,0.0,0.0,0.09808,0.000227,0.000269,0.000263,0.000265,0.028003,1.0,0.099302
3,0.0,0.0,0.197506,0.000336,0.000407,0.000397,0.000401,0.028003,1.0,0.198729
4,0.0,0.0,0.29683,0.000499,0.000544,0.000531,0.000536,0.028003,1.0,0.298052
5,0.0,0.0,0.395975,0.000717,0.000681,0.000666,0.000672,0.028003,1.0,0.397198
6,0.0,0.0,0.494802,0.000989,0.00082,0.000801,0.000809,0.028003,1.0,0.496026
7,0.0,0.0,0.593956,0.001315,0.000959,0.000937,0.000946,0.028003,1.0,0.59518
8,0.125,0.0,0.023091,0.001329,0.001178,0.001156,0.001163,0.028217,1.0,0.024329
9,0.125,0.0,0.018696,0.00134,0.001398,0.001373,0.001381,0.028529,1.0,0.01993


In [8]:
X = final_df[df_train.columns.values].values

# ----------------------------------
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.10, random_state=1)

In [9]:
def train_and_tune(X, y, model, parameters, scoring='f1_macro', kfold=5, verbose=0):
    """
    X:          array-like of shape (n_samples, n_features)
    y:          array-like of shape (n_samples,)
    model:      (object) a sklearn model class
    parameters: (dict) contains the parameters you want to tune in the model
    metric:     (str) the metric used to evaluate the quality of the model
    return:     a trained model with the best parameters
    """
    cvSearchObj = GridSearchCV(model,
                               parameters,
                               scoring=scoring,
                               n_jobs=-1,
                               cv=kfold,
                               verbose=verbose)
    cvSearchObj.fit(X,y)
    return cvSearchObj.best_estimator_

def save_model(filename, model):
    """
    filename: Filename to save the model
    model:    Model weights to be saved
    """
    pickle.dump(model, open(filename, 'wb'))
    print("Model Saved")

def load_model(filename):
    """
    filename: Filename to load the model
    return:   Model weights that are reloaded
    """
    model_reloaded = pickle.load(open(filename, 'rb'))
    return model_reloaded

def DecisionTree(train, save, test): 
    filename = "dtreeBest.pkl"
    decisiontreeclassifier = DecisionTreeClassifier(random_state=999)
    if train:
        '''
        Train
        '''
        params = {'min_samples_leaf':[1,2,3]}

        dtreeBest = train_and_tune(X_train,
                                   Y_train,
                                   decisiontreeclassifier,
                                   params,
                                   scoring='f1_macro',
                                   kfold=5)

        if save:
            save_model(filename, dtreeBest)

    if test:
        '''
        Test
        '''
        dtreeBest_reloaded = load_model(filename)
        pred = dtreeBest_reloaded.predict(X_test)
        acc  = dtreeBest_reloaded.score(X_test, Y_test)
        
        cf_matrix = confusion_matrix(Y_test, pred)
        df_cm = pd.DataFrame(cf_matrix/np.sum(cf_matrix) *10, index = [i for i in classes],
                             columns = [i for i in classes])
        plt.figure(figsize = (12,10))
        sn.heatmap(df_cm, annot=True)
        
        print("Accuracy: ", acc)
    print("Decision Tree Completed!")


DecisionTree(train=True, save=True, test=True)

ValueError: Unknown label type: 'continuous'

In [None]:
# X_train = torch.tensor(X_train)
# y_train = torch.tensor(y_train)
# X_test  = torch.tensor(X_test)
# y_test  = torch.tensor(y_test)

# # Hyperparameters
# EPOCH = 1000
# BATCH = 256
# LEARNING_RATE = 0.01

# INTERVAL = 50
# SAVE = False
# BESTLOSS = 10

# lossfn  = nn.CrossEntropyLoss()
# BCE = nn.BCELoss(reduction='mean')
# MSE = nn.MSELoss(reduction='mean') # 'mean', 'sum'. 'none'

# # Custom data loader for ELK stack dataset
# class PacingDataset(Dataset):
#     """ TensorDataset with support of transforms. """
#     def __init__(self, tensors, transform=None):
#         assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors)
#         self.tensors = tensors
#         self.transform = transform

#     def __getitem__(self, index):
#         x = self.tensors[0][index]
#         if self.transform:
#             x = self.transform(x)
#         y = self.tensors[1][index]
#         return x, y

#     def __len__(self):
#         return self.tensors[0].size(0)

# # Dataset w/o any tranformations
# traindata   = PacingDataset(tensors=(X_train, y_train), transform=None)
# trainloader = torch.utils.data.DataLoader(traindata, batch_size=BATCH, shuffle=True)
# testdata    = PacingDataset(tensors=(X_test, y_test), transform=None)
# testloader = torch.utils.data.DataLoader(testdata, batch_size=BATCH, shuffle=True)

# inputFea = len(traindata[0][0])
# print("Total Input Features: ", inputFea)
# # model definition
# class PacingClassifier (nn.Module):
#     # https://visualstudiomagazine.com/Articles/2021/02/11/pytorch-define.aspx?Page=2
#     def __init__(self, nc=20, inputFeatures=10):
#         super(PacingClassifier, self).__init__()

#         self.fc1 = torch.nn.Linear(inputFeatures, 20)
#         self.fc2 = torch.nn.Linear(20, 20)
#         self.fc3 = torch.nn.Linear(20, nc)

# #         self.fc1 = torch.nn.Linear(inputFeatures, 32)
# #         self.fc2 = torch.nn.Linear(32, 64)        
# #         self.fc3 = torch.nn.Linear(64, 128)
# #         self.fc4 = torch.nn.Linear(128, 128)
# #         self.fc5 = torch.nn.Linear(128, 64)
# #         self.fc6 = torch.nn.Linear(64, nc)

#         torch.nn.init.xavier_uniform_(self.fc1.weight)
#         torch.nn.init.zeros_(self.fc1.bias)
#         torch.nn.init.xavier_uniform_(self.fc2.weight)
#         torch.nn.init.zeros_(self.fc2.bias)
#         torch.nn.init.xavier_uniform_(self.fc3.weight)
#         torch.nn.init.zeros_(self.fc3.bias)
# #         torch.nn.init.xavier_uniform_(self.fc4.weight)
# #         torch.nn.init.zeros_(self.fc4.bias)
# #         torch.nn.init.xavier_uniform_(self.fc5.weight)
# #         torch.nn.init.zeros_(self.fc5.bias)
# #         torch.nn.init.xavier_uniform_(self.fc6.weight)
# #         torch.nn.init.zeros_(self.fc6.bias)

#         self.lrelu = torch.nn.LeakyReLU(negative_slope=0.02)

#     def forward(self, x):
#         z = self.lrelu(self.fc1(x))
#         z = self.lrelu(self.fc2(z))
#         z = self.fc3(z)  # no activation
#         return z

# #         z = self.lrelu(self.fc1(x))
# #         z = self.lrelu(self.fc2(z))
# #         z = self.lrelu(self.fc3(z))
# #         z = self.lrelu(self.fc4(z))
# #         z = self.lrelu(self.fc5(z))
# #         z = self.fc6(z)  # no activation
# #         return z

# model = PacingClassifier (nc=num_of_classes, inputFeatures=inputFea)
# print(model)

In [None]:
# optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9) #, weight_decay=5e-4, nesterov=True)
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[700], gamma=0.1)

# print("\nBatch Size = %3d " % BATCH)
# print("Loss = " + str(lossfn))
# print("Optimizer = SGD")
# print("Max Epochs = %3d " % EPOCH)
# print("Learning Rate = %0.3f " % LEARNING_RATE)
# print("Number of Classes = %d " % num_of_classes)
# print("\nStarting training ...")

# model.train()
# trainloss = []
# for epoch in range(0, EPOCH):
#     torch.manual_seed(epoch+1)              # recovery reproducibility
#     epoch_loss = 0                          # for one full epoch

#     for (batch_idx, batch) in enumerate(trainloader):
#         (xs, ys) = batch                    # (predictors, targets)
#         xs, ys = xs.float(), ys.float()
#         optimizer.zero_grad()               # prepare gradients

#         output = model(xs)                  # predicted pacing rate
#         loss = lossfn(output, ys.long())    # avg per item in batch

#         epoch_loss += loss.item()           # accumulate averages
#         loss.backward()                     # compute gradients
#         optimizer.step()                    # update weights
    
#     scheduler.step()
#     trainloss.append(epoch_loss)
#     if epoch % INTERVAL == 0:

#         model.eval()                        # evaluation phase
#         correct, acc = 0, 0
#         with torch.no_grad():
#             for xs, ys in testloader:
#                 xs, ys = xs.float(), ys.long()
#                 pred = torch.max(model(xs), 1)[1]
#                 correct += (pred == ys).sum().item()
#             acc = (100 * float(correct / len(testdata)) )

#         print("Epoch = %4d      Loss = %0.4f      Accuracy = %0.4f" % (epoch, epoch_loss, acc))

#         dt = time.strftime("%Y_%m_%d-%H_%M_%S")
#         fn = "../checkpoint/" + str(dt) + str("-") + str(epoch) + "_ckpt.pt"

#         info_dict = {
#             'epoch' : epoch,
#             'model_state' : model.state_dict(),
#             'optimizer_state' : optimizer.state_dict()
#         }
#         if SAVE:
#             torch.save(info_dict, fn)       # save checkpoint

# print("\nDone")