In [None]:
"""
==============================================
Name   : Eashan Adhikarla
Course : CSE498 - Adversarial Machine Learning
==============================================

Homework 1
"""
# --- Sklearn ---
from sklearn.datasets import fetch_openml
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn import decomposition, discriminant_analysis, linear_model, svm, tree, neural_network
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# --- Models ---
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import neural_network

# --- Utility ---
import os
import pickle, torch
import numpy as np, pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

rootdir = os.getcwd()

# data loading and preprocessing
dataPath = "data/statistics-5.csv"
df = pd.read_csv(dataPath)
# ----------------------------------
# Dropping columns that are not required at the moment
UNUSED_COLUMNS = ['Unnamed: 0', 'UUID', 'HOSTNAME', 'TIMESTAMP', 'THROUGHPUT (Receiver)',
                  'LATENCY (mean)', 'CONGESTION (Receiver)', 'BYTES (Receiver)']
df = df.drop(columns=UNUSED_COLUMNS)

# Single source of truth for the model inputs, so the scaler, the normalised
# frame and the feature matrix can never drift apart.
FEATURE_COLUMNS = ['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)',
                   'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']

# Pre-processing
# PACING holds strings of the form "<number>gbit": strip the unit and parse the
# number for the whole column at once instead of mutating the backing array
# element by element.
df['PACING'] = df['PACING'].str.replace("gbit", "", regex=False).astype(float)
pacing = df['PACING'].values

# Binary flag: 1 for 'cubic', 0 for any other sender congestion algorithm.
df['CONGESTION (Sender)'] = (df['CONGESTION (Sender)'] == 'cubic').astype(int)
# Replace each distinct alias with an integer code.
df['ALIAS'] = pd.factorize(df['ALIAS'])[0]

num_of_classes = len(df['PACING'].unique())

# NOTE(review): the integer cast truncates fractional pacing rates (e.g. 0.5 -> 0),
# which would merge classes that num_of_classes counts separately — confirm that
# every pacing value is a whole number.
y = df['PACING'].values.astype('int')

# Normalization
# NOTE(review): the scaler is fitted on the full dataset before the split, so
# test-set min/max leak into training features — consider fitting on X_train only.
minmax_scale = preprocessing.MinMaxScaler().fit(df[FEATURE_COLUMNS])
df_minmax = minmax_scale.transform(df[FEATURE_COLUMNS])

final_df = pd.DataFrame(df_minmax, columns=FEATURE_COLUMNS)
X = final_df[FEATURE_COLUMNS].values
# ----------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=1)

X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_test  = torch.tensor(X_test)
y_test  = torch.tensor(y_test)

In [None]:
def train_and_tune(X, y, model, parameters, scoring='f1_macro', kfold=5, verbose=0):
    """
    Grid-search `parameters` for `model` with cross-validation and return the winner.

    X:          array-like of shape (n_samples, n_features)
    y:          array-like of shape (n_samples,)
    model:      (object) a sklearn estimator instance to tune
    parameters: (dict) parameter grid to search over
    scoring:    (str) the metric used to rank parameter combinations
    kfold:      (int) value passed to GridSearchCV as `cv`
    verbose:    (int) verbosity level passed to GridSearchCV
    return:     the best estimator found by the search
    """
    search_options = dict(scoring=scoring,
                          n_jobs=-1,  # use every available core
                          cv=kfold,
                          verbose=verbose)
    grid = GridSearchCV(model, parameters, **search_options)
    # fit() returns the search object itself, so the result can be read off directly.
    return grid.fit(X, y).best_estimator_

def save_model(filename, model):
    """
    Pickle `model` to `filename` and print a confirmation.

    filename: Filename to save the model
    model:    Model object to be saved
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way through.
    with open(filename, 'wb') as checkpoint:
        pickle.dump(model, checkpoint)
    print("Model Saved")

def load_model(filename):
    """
    Load a pickled model from `filename`.

    filename: Filename to load the model
    return:   The unpickled model object
    """
    # SECURITY: unpickling executes arbitrary code embedded in the file; only
    # load checkpoints that were produced by this project.
    with open(filename, 'rb') as checkpoint:
        model_reloaded = pickle.load(checkpoint)
    return model_reloaded



'''
==================================
Method 1: Decision Tree Classifier
==================================
'''
def DecisionTree(train, save, test, dataset=None):
    """
    Tune, optionally checkpoint, and evaluate a decision-tree classifier.

    Uses the module-level X_train / y_train / X_test / y_test splits and
    `rootdir`.

    train:   (bool) grid-search `min_samples_leaf` on the training split
    save:    (bool) pickle the tuned model to the checkpoint file (only
             honoured when `train` is true)
    test:    (bool) print the accuracy on the test split
    dataset: (str) label embedded in the checkpoint filename. When omitted, a
             module-level `dataset` variable is used if one exists, otherwise
             the label "pacing".
    """
    if dataset is None:
        # Keeps working when the notebook defines a global `dataset`, and no
        # longer raises NameError when it does not.
        dataset = globals().get("dataset", "pacing")

    # os.getcwd() has no trailing separator, so the path must be joined rather
    # than concatenated.
    filename = os.path.join(str(rootdir), "checkpoint", "traditional",
                            "dtreeBest_" + str(dataset) + ".pkl")
    decisiontreeclassifier = DecisionTreeClassifier(random_state=999)
    dtreeBest = None

    if train:
        '''
        Train
        '''
        params = {'min_samples_leaf': [1, 2, 3]}

        dtreeBest = train_and_tune(X_train,
                                   y_train,
                                   decisiontreeclassifier,
                                   params,
                                   scoring='f1_macro',
                                   kfold=5)

        if save:
            # Make sure the checkpoint directory exists before writing to it.
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            save_model(filename, dtreeBest)

    if test:
        '''
        Test
        '''
        # Evaluate the model tuned in this call when there is one; fall back to
        # the checkpoint only when training was skipped. This makes
        # train=True, save=False, test=True work without a file on disk.
        evaluated = dtreeBest if dtreeBest is not None else load_model(filename)
        acc = evaluated.score(X_test, y_test)

        print("Accuracy: ", acc)
    print("Method-1 completed!")

# -------------------------------------------------
# Run Method 1: tune/train the decision tree (no checkpoint is written because
# save=False) and then execute the test branch.
DecisionTree(train=True, save=False, test=True)

In [None]:
"""Load the Boston dataset and examine its target (label) distribution."""

# Load libraries
import numpy as np
import pylab as pl
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor

################################
### ADD EXTRA LIBRARIES HERE ###
################################
from sklearn.metrics import mean_squared_error,median_absolute_error,r2_score,mean_absolute_error
from sklearn import grid_search
from sklearn.cross_validation import train_test_split

def load_data():
    """Return the Boston housing dataset object provided by ``sklearn.datasets``."""
    # NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
    # installed version still ships it.
    return datasets.load_boston()


def explore_city_data(city_data):
    """Print summary statistics of the housing dataset.

    city_data: object exposing `.data` (2-D feature array) and `.target`
               (1-D array of prices).

    Prints the number of houses and features, followed by the max, min, mean,
    median and standard deviation of the prices. Returns nothing.
    """

    # Get the labels and features from the housing data
    housing_prices = city_data.target
    housing_features = city_data.data

    # Rows are houses, columns are features.
    number_of_houses = housing_features.shape[0]
    number_of_features = housing_features.shape[1]
    max_price = np.max(housing_prices)
    min_price = np.min(housing_prices)
    mean_price = np.mean(housing_prices)
    median_price = np.median(housing_prices)
    standard_deviation = np.std(housing_prices)

    # Single-argument print calls produce the same text under Python 2 and 3;
    # the original print statements are a SyntaxError on Python 3.
    print("number of houses: {}".format(number_of_houses))
    print("number of features: {}".format(number_of_features))
    print("max price of house: {}".format(max_price))
    print("min price of house: {}".format(min_price))
    print("mean price of house: {}".format(mean_price))
    print("median price of house: {}".format(median_price))
    print("standard deviation for prices of house: {}".format(standard_deviation))

def performance_metric(label, prediction):
    """Score predictions against the true labels with mean squared error."""

    # Metric catalogue:
    # http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics
    # Alternatives that were considered: median_absolute_error, r2_score,
    # mean_absolute_error.
    error = mean_squared_error(label, prediction)
    return error


def split_data(city_data):
    """Shuffle the samples and split them 70 % / 30 % into train and test sets.

    Returns (X_train, y_train, X_test, y_test) — note that this order differs
    from the one train_test_split itself uses.
    """

    features = city_data.data
    targets = city_data.target

    # Fixed random_state keeps the partition reproducible between runs.
    train_X, test_X, train_y, test_y = train_test_split(
        features, targets, test_size=0.30, train_size=0.70, random_state=42)

    return train_X, train_y, test_X, test_y


def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Measure train/test error of a depth-limited tree as the training set grows.

    depth:            max_depth given to every DecisionTreeRegressor
    X_train, y_train: training features / targets; prefixes of these are used
    X_test, y_test:   held-out features / targets, always used in full

    Saves the residuals of the last fitted model to "residual_plot.png" and
    delegates the error curves to learning_curve_graph.
    """

    # 50 training-set sizes between 1 and len(X_train). They are cast to int
    # because slice bounds must be integers (float bounds raise TypeError on
    # current NumPy); truncation matches what old NumPy did implicitly.
    sizes = np.linspace(1, len(X_train), 50).astype(int)
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print("Decision Tree with Max Depth: ")
    print(depth)

    for i, s in enumerate(sizes):

        # Create and fit the decision tree regressor on the first s samples
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_train[:s], y_train[:s])

        # Error on the samples the model has seen ...
        train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))

        # ... and on the full held-out set
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Residuals of the final model (the one fitted on the whole training set).
    # NOTE(review): the filename does not include `depth`, so each call
    # overwrites the previous residual plot.
    pl.figure()
    pl.plot(y_train - regressor.predict(X_train))
    pl.savefig("residual_plot.png")

    # Plot learning curve graph
    learning_curve_graph(sizes, train_err, test_err, depth)


def learning_curve_graph(sizes, train_err, test_err, depth):
    """Save a plot of test and training error against training-set size.

    The figure is written to "learning_curve_<depth>.png"; it is not shown
    interactively.
    """

    output_name = "learning_curve" + "_" + str(depth) + ".png"
    curves = (
        (test_err, 'test error'),
        (train_err, 'training error'),
    )

    pl.figure()
    pl.title('Decision Trees: Performance vs Training Size')
    for errors, legend_text in curves:
        pl.plot(sizes, errors, lw=2, label=legend_text)
    pl.legend()
    pl.xlabel('Training Size')
    pl.ylabel('Error')
    pl.savefig(output_name)


def model_complexity(X_train, y_train, X_test, y_test):
    """Measure train/test error of the tree as its maximum depth increases.

    Fits one DecisionTreeRegressor per depth on the full training set, scores
    it on both splits with performance_metric, and hands the two error curves
    to model_complexity_graph.
    """

    print("Model Complexity: ")

    # Depths 1 through 24 (np.arange excludes the upper bound)
    max_depth = np.arange(1, 25)
    train_err = np.zeros(len(max_depth))
    test_err = np.zeros(len(max_depth))

    for i, d in enumerate(max_depth):
        # Setup a Decision Tree Regressor so that it learns a tree with depth d
        regressor = DecisionTreeRegressor(max_depth=d)

        # Fit the learner to the training data
        regressor.fit(X_train, y_train)

        # Find the performance on the training set
        train_err[i] = performance_metric(y_train, regressor.predict(X_train))

        # Find the performance on the testing set
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot the model complexity graph
    model_complexity_graph(max_depth, train_err, test_err)


def model_complexity_graph(max_depth, train_err, test_err):
    """Save a plot of test and training error against decision-tree depth.

    The figure is written to "model_complexity.png"; it is not shown
    interactively.
    """

    curves = (
        (test_err, 'test error'),
        (train_err, 'training error'),
    )

    pl.figure()
    pl.title('Decision Trees: Performance vs Max Depth')
    for errors, legend_text in curves:
        pl.plot(max_depth, errors, lw=2, label=legend_text)
    pl.legend()
    pl.xlabel('Max Depth')
    pl.ylabel('Error')
    pl.savefig("model_complexity.png")


def fit_predict_model(city_data):
    """Grid-search a decision-tree regressor and predict one sample's price.

    city_data: object exposing `.data` (features) and `.target` (prices).

    Prints the tuned model, the sample house and its predicted price.
    Returns nothing.
    """

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {
        'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
        # A split needs at least two samples; scikit-learn rejects the value 1.
        'min_samples_split': (2, 3),
        'min_samples_leaf': (1, 2, 3),
    }

    # Rank candidates by mean squared error, the same metric performance_metric
    # uses. scikit-learn scorers follow "greater is better", so the scorer is
    # the negated error. GridSearchCV is the sklearn.model_selection class
    # imported at the top of the notebook (sklearn.grid_search no longer exists).
    regressors = GridSearchCV(regressor, parameters, scoring='neg_mean_squared_error')

    regressors.fit(X, y)

    # pick the best
    reg = regressors.best_estimator_

    # Fit the learner to the training data
    print("Final Model: ")
    print(reg.fit(X, y))

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    # predict() expects a 2-D (n_samples, n_features) input, so wrap the single
    # sample in a list. A separate name avoids clobbering the targets `y`.
    prediction = reg.predict([x])
    print("House: " + str(x))
    print("Prediction: " + str(prediction))


def main():
    """Run the whole Boston housing analysis.

    Loads and summarises the data, evaluates a decision-tree regressor with
    learning curves and a model-complexity curve, then tunes the model and
    predicts the price of one unseen house.
    """

    city_data = load_data()
    explore_city_data(city_data)

    # Training/Test dataset split
    X_train, y_train, X_test, y_test = split_data(city_data)

    # One learning-curve figure per tree depth from 1 to 10
    for depth in range(1, 11):
        learning_curve(depth, X_train, y_train, X_test, y_test)

    # Error as a function of tree depth
    model_complexity(X_train, y_train, X_test, y_test)

    # Tune and predict Model
    fit_predict_model(city_data)


if __name__ == "__main__":
    main()
{"mode":"full","isActive":false}

In [1]:
# from __future__ import absolute_import, print_function

# # --- System ---
# import os
# import sys
# import time
# import warnings

# # --- Utility ---
# import pandas as pd
# import numpy as np
# import math
# import random
# import logging
# import pickle
# import warnings
# warnings.filterwarnings('ignore')
# from sklearn.model_selection import train_test_split
# from sklearn import preprocessing

# # --- Plot ---
# import matplotlib.pyplot as plt
# # %matplotlib inline
# import seaborn as sns

# # --- Pytorch ---
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# import torchvision
# import torchvision.transforms as transforms
# import torch.backends.cudnn as cudnn

# from torch.utils.data import Dataset, DataLoader, TensorDataset
# from tqdm import tqdm
# from datetime import datetime
# from torch.utils.data import random_split

# from lib.dataloader import PacingDataset
# from lib.classifier import PacingClassifier
# import lib.utils

# # random weight initialization
# def seed_everything(seed=42):
#     random.seed(seed)
#     os.environ['PYTHONHASHSEED'] = str(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False

# seed_everything()
# # ----------------------------------
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# root_dir = os.getcwd()

# # data loading and preprocessing
# dataPath = "data/statistics-5.csv"
# df = pd.read_csv(dataPath)
# # ----------------------------------
# # Dropping columns that are not required at the moment
# df = df.drop(columns=['Unnamed: 0', 'UUID', 'HOSTNAME', 'TIMESTAMP', 'THROUGHPUT (Receiver)', 'LATENCY (mean)', 'CONGESTION (Receiver)', 'BYTES (Receiver)'])

# # Pre-processing
# pacing = df['PACING'].values
# for i, p in enumerate(pacing):
#     v, _ = p.split("gbit")
#     pacing[i] = float(v) # int(v)

# df['PACING'] = pacing
# df['CONGESTION (Sender)'] = (df['CONGESTION (Sender)'] == 'cubic').astype(int)
# df['ALIAS'] = pd.factorize(df['ALIAS'])[0]

# num_of_classes = len(df['PACING'].unique())

# X = df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']].values
# y = df['PACING'].values
# y = y.astype('int')

# # Normalization
# minmax_scale = preprocessing.MinMaxScaler().fit(df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']])
# df_minmax = minmax_scale.transform(df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']])

# final_df = pd.DataFrame(df_minmax, columns=['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS'])
# X = final_df[['THROUGHPUT (Sender)', 'LATENCY (min.)', 'LATENCY (max.)', 'RETRANSMITS', 'STREAMS', 'CONGESTION (Sender)', 'ALIAS']].values
# # ----------------------------------
# X_train, X_test, y_train, y_test = train_test_split(X, y, 
#                                                     test_size=0.25,
#                                                     random_state=1)

# X_train = torch.tensor(X_train)
# y_train = torch.tensor(y_train)
# X_test  = torch.tensor(X_test)
# y_test  = torch.tensor(y_test)

# # Hyperparameters
# EPOCH = 1000
# BATCH = 512
# LEARNING_RATE = 0.05

# INTERVAL = 50
# SAVE = False
# BESTLOSS = 10

# CE  = nn.CrossEntropyLoss()
# BCE = nn.BCELoss(reduction='mean')
# MSE = nn.MSELoss(reduction='mean') # 'mean', 'sum'. 'none'

# # Dataset w/o any transformations
# traindata   = PacingDataset(tensors=(X_train, y_train), transform=None)
# trainloader = torch.utils.data.DataLoader(traindata, batch_size=BATCH)

# testdata    = PacingDataset(tensors=(X_test, y_test), transform=None)
# testloader = torch.utils.data.DataLoader(testdata, batch_size=1) # BATCH)

# inputFea = len(traindata[0][0])
# model = PacingClassifier (nc=num_of_classes, inputFeatures=inputFea)
# print(model)

# # optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9, weight_decay=5e-4)

# print("\nBatch Size = %3d " % BATCH)
# print("Loss = " + str(CE))
# print("Optimizer = SGD")
# print("Max Epochs = %3d " % EPOCH)
# print("Learning Rate = %0.3f " % LEARNING_RATE)
# print("Number of Classes = %d " % num_of_classes)

# print("\nStarting training with saved checkpoints")

# model.train()
# for epoch in range(0, EPOCH):
#     torch.manual_seed(epoch+1) # recovery reproducibility
#     epoch_loss = 0             # for one full epoch

#     for (batch_idx, batch) in enumerate(trainloader):
#         (xs, ys) = batch                # (predictors, targets)
#         xs, ys = xs.float(), ys.float()
#         optimizer.zero_grad()           # prepare gradients

#         output = model(xs)              # predicted pacing rate
#         loss = CE(output, ys.long())    # avg per item in batch

#         epoch_loss += loss.item()       # accumulate averages
#         loss.backward()                 # compute gradients
#         optimizer.step()                # update weights

#     if epoch % INTERVAL == 0:
#         print("Epoch = %4d    Loss = %0.4f" % (epoch, epoch_loss))

#         # save checkpoint
#         dt = time.strftime("%Y_%m_%d-%H_%M_%S")
#         fn = str(dt) + str("-") + str(epoch) + "_ckpt.pt"

#         info_dict = {
#             'epoch' : epoch,
#             'model_state' : model.state_dict(),
#             'optimizer_state' : optimizer.state_dict()
#         }
#         if SAVE:
#             torch.save(info_dict, fn)

# print("\nDone")

PacingClassifier(
  (fc1): Linear(in_features=7, out_features=32, bias=True)
  (drop1): Dropout(p=0.25, inplace=False)
  (fc2): Linear(in_features=32, out_features=64, bias=True)
  (drop2): Dropout(p=0.7, inplace=False)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (drop3): Dropout(p=0.7, inplace=False)
  (fc4): Linear(in_features=64, out_features=32, bias=True)
  (drop4): Dropout(p=0.7, inplace=False)
  (fc5): Linear(in_features=32, out_features=21, bias=True)
)

Batch Size = 512 
Loss = CrossEntropyLoss()
Optimizer = SGD
Max Epochs = 1000 
Learning Rate = 0.050 
Number of Classes = 21 

Starting training with saved checkpoints
Epoch =    0    Loss = 24.5920
Epoch =   50    Loss = 17.9660
Epoch =  100    Loss = 16.5130
Epoch =  150    Loss = 16.1677
Epoch =  200    Loss = 15.8473
Epoch =  250    Loss = 15.7375
Epoch =  300    Loss = 15.4794
Epoch =  350    Loss = 15.6110
Epoch =  400    Loss = 15.4521
Epoch =  450    Loss = 15.5056
Epoch =  500    Loss = 15.3059
Epoch 

In [None]:
# Evaluate the trained classifier on the test loader: accuracy (in percent) and
# summed cross-entropy loss. Relies on `model` and `testloader` already being
# defined in the kernel.

# Switch dropout layers to inference mode so predictions are deterministic.
# Call model.train() again before any further training.
model.eval()

correct, acc, total = 0, 0, 0
running_loss = 0.0
with torch.no_grad():
    for xs, ys in testloader:
        xs, ys = xs.float(), ys.long()

        output = model(xs)  # one score per class for every sample

        # Cross-entropy is the objective the training log reports. The previous
        # MSE/BCE/KL terms referenced names (recon, mu, log_var, criterion)
        # that this notebook never defines. Summing lets running_loss / total
        # give the mean loss per sample regardless of batch size.
        loss = torch.nn.functional.cross_entropy(output, ys, reduction='sum')
        running_loss += loss.item()

        total += ys.size(0)
        pred = output.argmax(dim=1)  # index of the highest-scoring class
        correct += (pred == ys).sum().item()
    # Guard against an empty loader instead of dividing by zero.
    acc = (100 * correct / total) if total else 0.0
print(acc)