In [None]:
from __future__ import absolute_import, print_function

# --- System ---
import os
import sys
import warnings

# --- Utility ---
import pandas as pd
import numpy as np
import math
import random
import logging
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# --- Plot --
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# --- Pytorch ---
import torch
import torch.nn as nn
import torch.distributions
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn

from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm import tqdm
from datetime import datetime
from torch.utils.data import random_split

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
root_dir = os.getcwd()

In [None]:
dataPath = "data/statistics-3.csv"
df = pd.read_csv(dataPath)
# columnList = df.columns

# Dropping columns that are not required at the moment
df = df.drop(columns=[ 'Unnamed: 0', 'UUID', 'HOSTNAME', 'ALIAS', 'TIMESTAMP',
                       'THROUGHPUT (Receiver)', 'LATENCY (min.)', 'LATENCY (max.)', 
                       'CONGESTION (Receiver)', 'BYTES (Receiver)'
                     ])

# Pre-processing
pacing = df['PACING'].values
for i, p in enumerate(pacing):
    v, _ = p.split("gbit")
    pacing[i] = int(v)

df['PACING'] = pacing
df['CONGESTION (Sender)'] = (df['CONGESTION (Sender)'] == 'cubic').astype(int) # Cubic = 1 & BBRV2 = 0

X = df[['THROUGHPUT (Sender)', 'LATENCY (mean)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)']].values
y = df['PACING'].values
y = y.astype('float')

In [None]:
df['ALIAS'].unique()

In [None]:
df.head(5)

In [None]:
# # Standerdization
# std_scale = preprocessing.StandardScaler().fit(df[['THROUGHPUT (Sender)', 'LATENCY (mean)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)']])
# df_std = std_scale.transform(df[['THROUGHPUT (Sender)', 'LATENCY (mean)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)']])

# # Normalization
minmax_scale = preprocessing.MinMaxScaler().fit(df[['THROUGHPUT (Sender)', 'LATENCY (mean)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)']])
df_minmax = minmax_scale.transform(df[['THROUGHPUT (Sender)', 'LATENCY (mean)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)']])

final_df = pd.DataFrame(df_minmax, columns=['THROUGHPUT (Sender)', 'LATENCY (mean)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)'])

X = final_df[['THROUGHPUT (Sender)', 'LATENCY (mean)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)']].values

final_df.head(5)

In [None]:
EPOCH = 400
BATCH = 32
LEARNING_RATE = 0.001

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_test  = torch.tensor(X_test)
y_test  = torch.tensor(y_test) 

class CustomTensorDataset(Dataset):
    """
    TensorDataset with support of transforms.
    """
    def __init__(self, tensors, transform=None):
        assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors)
        self.tensors = tensors
        self.transform = transform

    def __getitem__(self, index):
        x = self.tensors[0][index]

        if self.transform:
            x = self.transform(x)

        y = self.tensors[1][index]

        return x, y

    def __len__(self):
        return self.tensors[0].size(0)

# Dataset w/o any tranformations
traindata   = CustomTensorDataset(tensors=(X_train, y_train), transform=None)
trainloader = torch.utils.data.DataLoader(traindata, batch_size=BATCH)

testdata    = CustomTensorDataset(tensors=(X_test, y_test), transform=None)
testloader = torch.utils.data.DataLoader(testdata, batch_size=BATCH)

print(len(traindata), len(testdata))


In [None]:
input_feature = 5
latent_feature = 16

class VAERegressor(nn.Module):
    def __init__(self):
        super(VAERegressor, self).__init__()
 
        # encoder
        self.enc1 = nn.Linear(in_features=input_feature, out_features=128)
        self.enc2 = nn.Linear(in_features=128, out_features=latent_feature*2)
 
        # decoder
        self.dec1 = nn.Linear(in_features=latent_feature, out_features=128)
        self.dec2 = nn.Linear(in_features=128, out_features=5)

        # Regressor
        self.fc1 = torch.nn.Linear (5, 32)
        self.fc2 = torch.nn.Linear (32, 1)

    def reparameterize(self, mu, log_var):
        """
        :param mu: mean from the encoder's latent space
        :param log_var: log variance from the encoder's latent space
        """
        std = torch.exp(0.5*log_var) # standard deviation
        eps = torch.randn_like(std) # `randn_like` as we need the same size
        sample = mu + (eps * std) # sampling as if coming from the input space
        return sample
 
    def forward(self, x):
        
        # encoding
        x = self.enc1(x)
        x = F.relu(x)
        x = self.enc2(x).view(-1, 2, latent_feature)

        # get `mu` and `log_var`
        mu      = x[:, 0, :]    # the first feature values as mean
        log_var = x[:, 1, :]    # the other feature values as variance

        # get the latent vector through reparameterization
        z = self.reparameterize(mu, log_var)
 
        # decoding
        x = self.dec1(z)
        x = F.relu(x)
        x = self.dec2(x)
        recon = torch.sigmoid(x)

        # regressor
        x = self.fc1(recon)
        x = F.relu(x)
        x = self.fc2(x)

        return x, recon, mu, log_var


model = VAERegressor()
print( f"====================\nTotal params: {len(list(model.parameters()))}\n====================" )
# print(model)

In [None]:
CE  = nn.CrossEntropyLoss()
BCE = nn.BCELoss(reduction='mean')
MSE = nn.MSELoss(reduction='mean')

def criterion(bce_loss, mu, logvar):
    """
    This function will add the reconstruction loss (BCELoss) and the 
    KL-Divergence.
    KL-Divergence = 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    :param bce_loss: recontruction loss
    :param mu: the mean from the latent vector
    :param logvar: log variance from the latent vector
    """
    BCE = bce_loss 
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD


optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
bestloss = 10

for epoch in range(EPOCH):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        xs, ys = data
        xs, ys = xs.float(), ys.float()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        output, recon, mu, log_var = model(xs)
        # print(output, ys)
        
        # loss = CE(output, ys)
        mse_loss = MSE(ys, output)
        bce_loss = BCE(recon, xs)
        loss = criterion(bce_loss, mu, log_var) + mse_loss

        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()

    print(f"[{epoch+1}/{EPOCH}] loss: {running_loss/len(trainloader.dataset):.3f}")
    running_loss = 0.0

print('Finished Training')

correct, acc, total = 0, 0, 0
with torch.no_grad():
    for xs, ys in testloader:
        xs, ys = xs.float(), ys.long()

        output = model(xs)
        
        mse_loss = MSE(ys, output)
        bce_loss = BCE(recon, xs)
        loss = criterion(bce_loss, mu, log_var) + mse_loss

        running_loss += loss.item()
        # total += ys.size(0)
        # pred = torch.max(output, 1)[1]
        # correct += (pred == ys).sum().item()
    # acc = (100 * correct / total)
# print(acc)

In [None]:
# import os
# import sys

# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# import warnings

# %matplotlib inline

# # --- Sklearn ---
# from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
# from sklearn import decomposition, discriminant_analysis
# from sklearn.model_selection import GridSearchCV

# # --- Models ---
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn import svm
# from sklearn import neural_network
# from sklearn.linear_model import LogisticRegression

# from sklearn.preprocessing import LabelEncoder
# from sklearn.cross_validation import train_test_split

# # --- Utility ---
# import os
# import pickle, torch
# import numpy as np, pandas as pd
# import seaborn as sn
# import matplotlib.pyplot as plt


# dataPath = "data/statistics (pacing).csv"
# df_full = pd.read_csv(dataPath)
# print(df_full.describe())

In [None]:
columnList = df_full.columns
print(f"Total columns list: {columnList}")

# Dropping columns that are not required at the moment
df = df_full.drop(columns=[ 'Unnamed: 0', 'UUID', 'HOSTNAME', 'ALIAS', 'TIMESTAMP', 'STREAMS',
                            'THROUGHPUT (Receiver)', 'LATENCY (min.)', 'LATENCY (max.)', 
                            'CONGESTION (Receiver)', 'BYTES (Receiver)'
                          ])

print(f"New columns list: {df.columns}")

In [None]:
df.describe()

In [None]:
# df.head(5)

# Preprocessing

In [None]:
pacing = df['PACING'].values
for i, p in enumerate(pacing):
    v, _ = p.split("gbit")
    pacing[i] = int(v)

df['PACING'] = pacing
# df['CONGESTION (Sender)'] = df['CONGESTION (Sender)']=="cubic"=1
# df['CONGESTION (Sender)'] = df['CONGESTION (Sender)']=="bbr2"=0
df['CONGESTION (Sender)'] = (df['CONGESTION (Sender)'] == 'cubic').astype(int)

In [None]:

df.head(5)

In [None]:
# sns.set(style='whitegrid', context='notebook')
# cols = ['THROUGHPUT (Sender)', 'LATENCY (mean)', 'RETRANSMITS', 'CONGESTION (Sender)', 'PACING']

# sns.pairplot(df[cols], height=3)
# plt.tight_layout()
# # plt.savefig('./figures/scatter.png', dpi=300)
# plt.show()

# Dataset

In [None]:
X = df[['THROUGHPUT (Sender)', 'LATENCY (mean)', 'RETRANSMITS']].values
y = df['PACING'].values
y = y.astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
def train_and_tune(X, y, model, parameters, scoring='f1_macro', kfold=5, verbose=0):
    """
    X:          array-like of shape (n_samples, n_features)
    y:          array-like of shape (n_samples,)
    model:      (object) a sklearn model class
    parameters: (dict) contains the parameters you want to tune in the model
    metric:     (str) the metric used to evaluate the quality of the model
    return:     a trained model with the best parameters
    """
    cvSearchObj = GridSearchCV(model,
                               parameters,
                               scoring=scoring,
                               n_jobs=-1,
                               cv=kfold,
                               verbose=verbose)
    cvSearchObj.fit(X,y)
    return cvSearchObj.best_estimator_

def save_model(filename, model):
    """
    filename: Filename to save the model
    model:    Model weights to be saved
    """
    pickle.dump(model, open(filename, 'wb'))
    print("Model Saved")

def load_model(filename):
    """
    filename: Filename to load the model
    return:   Model weights that are reloaded
    """
    model_reloaded = pickle.load(open(filename, 'rb'))
    return model_reloaded


def MLP(train, save, test):
    filename = "./mlpBest.pkl"
    mlp = neural_network.MLPClassifier(random_state=999)
    if train:
        '''
        Train
        '''
        params = {"alpha" : [0.0001],
                "learning_rate_init" : [0.001],
                "batch_size" : [32, 64, 128],
                "activation" : ["relu"],
                "early_stopping" : [True],
                "hidden_layer_sizes" : [10, 50, 100],
                }

        mlpBest = train_and_tune(X, y,
                                 mlp,
                                 params,
                                 scoring='f1_macro',
                                 kfold=5)

        if save:
            save_model(filename, mlpBest)

    if test:
        '''
        Test
        '''
        mlpBest_reloaded = load_model(filename)
        pred = mlpBest_reloaded.predict(X)
        acc  = mlpBest_reloaded.score(X, y)
        
        # cf_matrix = confusion_matrix(y, pred)
        # df_cm = pd.DataFrame(cf_matrix/np.sum(cf_matrix) *10, index = [i for i in classes],
        #                      columns = [i for i in classes])
        # plt.figure(figsize = (12,10))
        # sn.heatmap(df_cm, annot=True)
        
        print("Accuracy: ", acc)

MLP(train=True, save=True, test=True)

In [None]:
le = LabelEncoder()
y = le.fit_transform(y)
# le.transform(['M', 'B'])

model = LogisticRegression()
model.fit(X_train, y_train)
print('Test Accuracy: %.3f' % model.score(X_test, y_test))

In [None]:
from __future__ import absolute_import, print_function

# --- System ---
import os
import sys
import time
import warnings

# --- Utility ---
import pandas as pd
import numpy as np
import math
import random
import logging
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# --- Plot ---
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns

# --- Pytorch ---
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn

from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm import tqdm
from datetime import datetime
from torch.utils.data import random_split

from lib.dataloader import PacingDataset
from lib.classifier import PacingClassifier, resnet50
import lib.utils

# random weight initialization
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()
# ----------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
root_dir = os.getcwd()

# data loading and preprocessing
dataPath = "data/statistics-5.csv"
df = pd.read_csv(dataPath)
# ----------------------------------
# Dropping columns that are not required at the moment
df = df.drop(columns=['Unnamed: 0', 'UUID', 'HOSTNAME', 'TIMESTAMP', 'THROUGHPUT (Receiver)', 'LATENCY (mean)', 'CONGESTION (Receiver)', 'BYTES (Receiver)'])

# Pre-processing
pacing = df['PACING'].values
for i, p in enumerate(pacing):
    v, _ = p.split("gbit")
    pacing[i] = float(v) # int(v)

df['PACING'] = pacing
df['CONGESTION (Sender)'] = (df['CONGESTION (Sender)'] == 'cubic').astype(int)
df['ALIAS'] = pd.factorize(df['ALIAS'])[0]

num_of_classes = len(df['PACING'].unique())

X = df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']].values
y = df['PACING'].values
y = y.astype('int')

# Normalization
minmax_scale = preprocessing.MinMaxScaler().fit(df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']])
df_minmax = minmax_scale.transform(df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']])

final_df = pd.DataFrame(df_minmax, columns=['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS'])
X = final_df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']].values
# ----------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    random_state=1)

X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_test  = torch.tensor(X_test)
y_test  = torch.tensor(y_test)

# Hyperparameters
EPOCH = 1500
BATCH = 512
LEARNING_RATE = 0.01

INTERVAL = 50
SAVE = False
BESTLOSS = 10

CE  = nn.CrossEntropyLoss()
BCE = nn.BCELoss(reduction='mean')
MSE = nn.MSELoss(reduction='mean') # 'mean', 'sum'. 'none'

# Dataset w/o any tranformations
traindata   = PacingDataset(tensors=(X_train, y_train), transform=None)
trainloader = torch.utils.data.DataLoader(traindata, batch_size=BATCH)
testdata    = PacingDataset(tensors=(X_test, y_test), transform=None)
testloader = torch.utils.data.DataLoader(testdata, batch_size=1)

inputFea = len(traindata[0][0])
model = PacingClassifier (nc=num_of_classes, inputFeatures=inputFea)
print(model)

optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9, weight_decay=5e-4, nesterov=True)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[350,500], gamma=0.1)

print("\nBatch Size = %3d " % BATCH)
print("Loss = " + str(CE))
print("Optimizer = SGD")
print("Max Epochs = %3d " % EPOCH)
print("Learning Rate = %0.3f " % LEARNING_RATE)
print("Number of Classes = %d " % num_of_classes)
print("\nStarting training ...")

model.train()
for epoch in range(0, EPOCH):
    torch.manual_seed(epoch+1) # recovery reproducibility
    epoch_loss = 0             # for one full epoch

    for (batch_idx, batch) in enumerate(trainloader):
        (xs, ys) = batch                # (predictors, targets)
        xs, ys = xs.float(), ys.float()
        optimizer.zero_grad()           # prepare gradients

        output = model(xs)              # predicted pacing rate
        loss = CE(output, ys.long())    # avg per item in batch

        epoch_loss += loss.item()       # accumulate averages
        loss.backward()                 # compute gradients
        optimizer.step()                # update weights
    
    scheduler.step()
    if epoch % INTERVAL == 0:
        print("Epoch = %4d    Loss = %0.4f" % (epoch, epoch_loss))

        # save checkpoint
        dt = time.strftime("%Y_%m_%d-%H_%M_%S")
        fn = str(dt) + str("-") + str(epoch) + "_ckpt.pt"

        info_dict = {
            'epoch' : epoch,
            'model_state' : model.state_dict(),
            'optimizer_state' : optimizer.state_dict()
        }
        if SAVE:
            torch.save(info_dict, fn)

print("\nDone")

model.eval()
correct, acc = 0, 0
with torch.no_grad():
    for xs, ys in testloader:
        xs, ys = xs.float(), ys.long()
        pred = torch.max(model(xs), 1)[1]
        correct += (pred == ys).sum().item()
    acc = (100 * float(correct / len(testdata)) )

print(f"Accuracy: {acc:.3f}%")


In [None]:
df.PACING.hist(figsize=(14,4))
plt.title('Pacing Rates')
df[['THROUGHPUT (Sender)']].hist(figsize=(14,4))
plt.title('Throughputs')
plt.show()

In [None]:
sns.pairplot(final_df, diag_kind="kde")


In [None]:
X_train_ = pd.get_dummies(X_train)

In [None]:
X_test.shape

In [None]:
X_test_ = pd.get_dummies(X_test)

In [None]:
from __future__ import absolute_import, print_function

# --- System ---
import os
import sys
import time
import warnings

# --- Utility ---
import pandas as pd
import numpy as np
import math
import random
import logging
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# --- Plot ---
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns

# --- Pytorch ---
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn

from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm import tqdm
from datetime import datetime
from torch.utils.data import random_split

from lib.dataloader import PacingDataset
from lib.classifier import PacingClassifier, resnet50
import lib.utils

# random weight initialization
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()
# ----------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
root_dir = os.getcwd()

# data loading and preprocessing
dataPath = "data/statistics-5.csv"
df = pd.read_csv(dataPath)
# ----------------------------------
# Dropping columns that are not required at the moment
df = df.drop(columns=['Unnamed: 0', 'UUID', 'HOSTNAME', 'TIMESTAMP', 'THROUGHPUT (Receiver)', 'LATENCY (mean)', 'CONGESTION (Receiver)', 'BYTES (Receiver)'])

# Pre-processing
pacing = df['PACING'].values
for i, p in enumerate(pacing):
    v, _ = p.split("gbit")
    pacing[i] = float(v) # int(v)

df['PACING'] = pacing
df['CONGESTION (Sender)'] = (df['CONGESTION (Sender)'] == 'cubic').astype(int)
df['ALIAS'] = pd.factorize(df['ALIAS'])[0]

num_of_classes = len(df['PACING'].unique())

X = df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']].values
y = df['PACING'].values
y = y.astype('float')

# Normalization
minmax_scale = preprocessing.MinMaxScaler().fit(df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']])
df_minmax = minmax_scale.transform(df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']])

final_df = pd.DataFrame(df_minmax, columns=['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS'])
X = final_df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']].values
# ----------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    random_state=1)

In [None]:
X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_test  = torch.tensor(X_test)
y_test  = torch.tensor(y_test)

In [None]:

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from xgboost import XGBRegressor
from ml_metrics import rmse

model_up = XGBRegressor(n_estimators=350,max_depth=15,random_state=5,learning_rate=.1,
                        n_jobs=10, subsample=1,min_child_weight=0.6) # Your code here

# Fit the model
model_up.fit(X_train, y_train)

# Get validation predictions and MAE
preds = model_up.predict(X_test)

print("RMSE (Your appraoch):")
print(rmse(y_test, preds))

In [1]:
from __future__ import absolute_import, print_function

# --- System ---
import os
import sys
import time
import warnings

# --- Utility ---
import pandas as pd
import numpy as np
import math
import random
import logging
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# --- Plot ---
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns

# --- Pytorch ---
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn

from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm import tqdm
from datetime import datetime
from torch.utils.data import random_split


# random weight initialization
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()
# ----------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
root_dir = os.getcwd()

# data loading and preprocessing
dataPath = "data/statistics-5.csv"
df = pd.read_csv(dataPath)
# ----------------------------------
# Dropping columns that are not required at the moment
df = df.drop(columns=['Unnamed: 0', 'UUID', 'HOSTNAME', 'TIMESTAMP', 'THROUGHPUT (Receiver)', 'LATENCY (mean)', 'CONGESTION (Receiver)', 'BYTES (Receiver)'])
original_df = df

# Pre-processing
pacing = original_df['PACING'].values
for i, p in enumerate(pacing):
    v, _ = p.split("gbit")
    pacing[i] = float(v) # int(v)

original_df['PACING'] = pacing
original_df.drop( original_df[ original_df['PACING'] == 10.5 ].index, inplace=True)
num_of_classes = len(original_df['PACING'].unique())

# w/o preprocessing
original_df.head(5)

Unnamed: 0,ALIAS,STREAMS,PACING,THROUGHPUT (Sender),LATENCY (min.),LATENCY (max.),RETRANSMITS,CONGESTION (Sender)
0,hostA,1,1.0,1623277000.0,30062.0,30264.5,1535.0,cubic
1,hostA,1,1.0,1652145000.0,60206.5,60572.0,2879.0,cubic
2,hostA,1,1.0,983358400.0,91576.5,92073.0,2879.0,cubic
3,hostA,1,2.0,1965511000.0,122954.0,123533.5,2879.0,cubic
4,hostA,1,3.0,2946649000.0,154383.5,155109.0,2879.0,cubic


In [2]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)

alias_df = original_df.join(pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(original_df.pop('ALIAS')),
                                                            index=original_df.index,
                                                            columns=mlb.classes_))

df_ = alias_df.join(pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(alias_df.pop('CONGESTION (Sender)')),
                                                            index=alias_df.index,
                                                            columns=mlb.classes_),
                                                            how = 'left', lsuffix='left', rsuffix='right')

# df_['CONGESTION (Sender)'] = (df_['CONGESTION (Sender)'] == 'cubic').astype(int)
df_

Unnamed: 0,STREAMS,PACING,THROUGHPUT (Sender),LATENCY (min.),LATENCY (max.),RETRANSMITS,A,B,C,D,...,h,o,s,t,2,bright,cright,i,r,u
0,1,1.0,1.623277e+09,30062.0,30264.5,1535.0,1,0,0,0,...,1,1,1,1,0,1,1,1,0,1
1,1,1.0,1.652145e+09,60206.5,60572.0,2879.0,1,0,0,0,...,1,1,1,1,0,1,1,1,0,1
2,1,1.0,9.833584e+08,91576.5,92073.0,2879.0,1,0,0,0,...,1,1,1,1,0,1,1,1,0,1
3,1,2.0,1.965511e+09,122954.0,123533.5,2879.0,1,0,0,0,...,1,1,1,1,0,1,1,1,0,1
4,1,3.0,2.946649e+09,154383.5,155109.0,2879.0,1,0,0,0,...,1,1,1,1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5121,1,2.0,1.612733e+09,3131908.0,4538737.5,10626.0,0,0,0,1,...,1,1,1,1,0,1,1,1,0,1
5122,1,2.0,1.866827e+09,3163305.0,4570179.0,10626.0,1,0,0,0,...,1,1,1,1,0,1,1,1,0,1
5123,1,1.0,8.338280e+08,3243547.5,4651874.0,10626.0,0,0,1,0,...,1,1,1,1,0,1,1,1,0,1
5124,1,1.0,4.970197e+08,3298707.5,4709556.5,10635.0,0,1,0,0,...,1,1,1,1,0,1,1,1,0,1


In [3]:
X = df_[df_.columns.values].values
y = df_['PACING'].values
y = y.astype('float')

# Normalization
minmax_scale = preprocessing.MinMaxScaler().fit(df_[df_.columns.values])
df_minmax = minmax_scale.transform(df_[df_.columns.values])

final_df = pd.DataFrame(df_minmax, columns=df_.columns.values)
X = final_df[df_.columns.values].values
# ----------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    random_state=1)

X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_test  = torch.tensor(X_test)
y_test  = torch.tensor(y_test)

# normalized data
final_df.head(5)

Unnamed: 0,STREAMS,PACING,THROUGHPUT (Sender),LATENCY (min.),LATENCY (max.),RETRANSMITS,A,B,C,D,...,h,o,s,t,2,bright,cright,i,r,u
0,0.0,0.052632,0.162861,0.0,0.0,0.014931,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
1,0.0,0.052632,0.165783,0.000132,0.000129,0.028003,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
2,0.0,0.052632,0.09808,0.000269,0.000263,0.028003,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
3,0.0,0.157895,0.197506,0.000407,0.000397,0.028003,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
4,0.0,0.263158,0.29683,0.000544,0.000531,0.028003,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0


In [8]:
# Hyperparameters
EPOCH = 300
BATCH = 256
LEARNING_RATE = 0.001

INTERVAL = 50
SAVE = False
BESTLOSS = 10

CE  = nn.CrossEntropyLoss()
BCE = nn.BCELoss(reduction='mean')
MSE = nn.MSELoss(reduction='mean') # 'mean', 'sum'. 'none'

# Custom data loader for ELK stack dataset
class PacingDataset(Dataset):
    """ TensorDataset with support of transforms. """
    def __init__(self, tensors, transform=None):
        assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors)
        self.tensors = tensors
        self.transform = transform

    def __getitem__(self, index):
        x = self.tensors[0][index]
        if self.transform:
            x = self.transform(x)
        y = self.tensors[1][index]
        return x, y

    def __len__(self):
        return self.tensors[0].size(0)

# Dataset w/o any tranformations
traindata   = PacingDataset(tensors=(X_train, y_train), transform=None)
trainloader = torch.utils.data.DataLoader(traindata, batch_size=BATCH)
testdata    = PacingDataset(tensors=(X_test, y_test), transform=None)
testloader = torch.utils.data.DataLoader(testdata, batch_size=1)

inputFea = len(traindata[0][0])

# model definition
class PacingClassifier (nn.Module):
    # https://visualstudiomagazine.com/Articles/2021/02/11/pytorch-define.aspx?Page=2
    def __init__(self, nc=20, inputFeatures=7):
        super(PacingClassifier, self).__init__()

        self.fc1 = torch.nn.Linear(inputFeatures, 32)
        self.fc2 = torch.nn.Linear(32, 64)
        self.fc3 = torch.nn.Linear(64, 128)
        self.fc4 = torch.nn.Linear(128, 128)
        self.fc5 = torch.nn.Linear(128, 64)
        self.fc6 = torch.nn.Linear(64, nc)

        torch.nn.init.xavier_uniform_(self.fc1.weight)
        torch.nn.init.zeros_(self.fc1.bias)
        torch.nn.init.xavier_uniform_(self.fc2.weight)
        torch.nn.init.zeros_(self.fc2.bias)
        torch.nn.init.xavier_uniform_(self.fc3.weight)
        torch.nn.init.zeros_(self.fc3.bias)
        torch.nn.init.xavier_uniform_(self.fc4.weight)
        torch.nn.init.zeros_(self.fc4.bias)
        torch.nn.init.xavier_uniform_(self.fc5.weight)
        torch.nn.init.zeros_(self.fc5.bias)
        torch.nn.init.xavier_uniform_(self.fc6.weight)
        torch.nn.init.zeros_(self.fc6.bias)

        self.lrelu = torch.nn.LeakyReLU(negative_slope=0.02)

    def forward(self, x):
        z = self.lrelu(self.fc1(x))
        z = self.lrelu(self.fc2(z))
        z = self.lrelu(self.fc3(z))
        z = self.lrelu(self.fc4(z))
        z = self.lrelu(self.fc5(z))
        z = self.fc6(z)  # no activation
        return z

model = PacingClassifier (nc=num_of_classes, inputFeatures=inputFea)
print(model)

optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9, weight_decay=5e-4, nesterov=True)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[250], gamma=0.1)

print("\nBatch Size = %3d " % BATCH)
print("Loss = " + str(CE))
print("Optimizer = SGD")
print("Max Epochs = %3d " % EPOCH)
print("Learning Rate = %0.3f " % LEARNING_RATE)
print("Number of Classes = %d " % num_of_classes)
print("\nStarting training ...")

model.train()
trainloss = []
for epoch in range(0, EPOCH):
    torch.manual_seed(epoch+1) # recovery reproducibility
    epoch_loss = 0             # for one full epoch

    for (batch_idx, batch) in enumerate(trainloader):
        (xs, ys) = batch                # (predictors, targets)
        xs, ys = xs.float(), ys.float()
        optimizer.zero_grad()           # prepare gradients

        output = model(xs)              # predicted pacing rate
        loss = CE(output, ys.long())    # avg per item in batch

        epoch_loss += loss.item()       # accumulate averages
        loss.backward()                 # compute gradients
        optimizer.step()                # update weights
    
    scheduler.step()
    if epoch % INTERVAL == 0:
        # print("Epoch = %4d    Loss = %0.4f" % (epoch, epoch_loss))

        model.eval()
        correct, acc = 0, 0
        with torch.no_grad():
            for xs, ys in testloader:
                xs, ys = xs.float(), ys.long()
                pred = torch.max(model(xs), 1)[1]
                correct += (pred == ys).sum().item()
            acc = (100 * float(correct / len(testdata)) )

        print("Epoch = %4d    Loss = %0.4f    Accuracy = %0.4f" % (epoch, epoch_loss, acc))

        # save checkpoint
        dt = time.strftime("%Y_%m_%d-%H_%M_%S")
        fn = str(dt) + str("-") + str(epoch) + "_ckpt.pt"

        info_dict = {
            'epoch' : epoch,
            'model_state' : model.state_dict(),
            'optimizer_state' : optimizer.state_dict()
        }
        if SAVE:
            torch.save(info_dict, fn)

print("\nDone")

PacingClassifier(
  (fc1): Linear(in_features=27, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=128, bias=True)
  (fc5): Linear(in_features=128, out_features=64, bias=True)
  (fc6): Linear(in_features=64, out_features=20, bias=True)
  (lrelu): LeakyReLU(negative_slope=0.02)
)

Batch Size = 256 
Loss = CrossEntropyLoss()
Optimizer = SGD
Max Epochs = 300 
Learning Rate = 0.001 
Number of Classes = 20 

Starting training ...
Epoch =    0    Loss = 44.5977    Accuracy = 6.0918
Epoch =   50    Loss = 33.3877    Accuracy = 16.5348
Epoch =  100    Loss = 15.5130    Accuracy = 73.0222
Epoch =  150    Loss = 3.0257    Accuracy = 97.1519
Epoch =  200    Loss = 0.8853    Accuracy = 99.2089
Epoch =  250    Loss = 0.3739    Accuracy = 99.8418

Done
