# COMPARE NETWORK ACCURACIES FOR DIFFERENT DATA AMOUNTS

Import necessary packages:

In [12]:
# Jupyter packages
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
from keras.callbacks import EarlyStopping
import ipywidgets as widgets

# Common packages, you know them from before 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import math
import copy
import string
from datetime import date
from typing import *
from sklearn.model_selection import train_test_split
import pickle
import time
import os

# Setup jupyter
%matplotlib inline

# Define constants:

In [13]:
PATH = (os.getcwd().replace("notebooks", ""))
execfile(PATH + "/scripts/pymoo_implementation/problem/problem_mlp_optimization.py")

In [14]:
# Data constants
PC_NAME = "thinkPad"
DATASET_NUMBER = "random2"
DATA_PATH = f"~/belle_neural_trigger/data/{DATASET_NUMBER}.csv"

# Random seed
RANDOM_SEED = 42

# mlp setup constants
MAX_HIDDENLAYERS = 10
MAX_NEURON_PER_LAYER = 300
NUM_EPOCHS = 500 #100 # plot showed that no more than 40 epochs are necessary, else the network will be overfitting 

METRIC = "accuracy"

# genetic algorithm constants
NUM_GENERATIONS = 5 #min 2, step size instead of complete number of generations
POPULATION_SIZE = 250

example = np.array([300, 300, 300, 300, 300])
original_data = pd.read_csv(DATA_PATH, delimiter='\t');  
today = date.today() 

PLOT_PREFIX = f"plots/{PC_NAME}-set{DATASET_NUMBER}-{example}-{today}"
PICKLE_NAME = f"pickled_data/{PC_NAME}-plot-data-set{DATASET_NUMBER}-{example}{today}.p"


# Define necessary functions:

Read data function:

In [15]:
def get_data(datacount:int):
    data = original_data.iloc[:
    datacount
    , 9:]
    input = data.iloc[:, :-9]
    target = data.iloc[:, -9:-7]

    return input, target

Store data and plots function:

In [16]:
def store_data_and_plots(accuracy_list, duration_list, data_count_list, part_number):
    pickle.dump([accuracy_list, duration_list, data_count_list], open(PICKLE_NAME, "wb"))

    figure, axis = plt.subplots(2,1)

    axis[0].plot(data_count_list[1:], accuracy_list[1:], label='accuracy', color = "blue")
    axis[0].plot(data_count_list[1:], np.ones(len(accuracy_list[1:])) * accuracy_list[0], label='500.000_data_accuracy', color = "red")

    axis[0].set_xlabel("instances of data")
    axis[0].set_ylabel("accuracy")
    axis[0].set_title(f"MLP-parameters: {example}")
    axis[0].grid(True)
    axis[0].legend()

    axis[1].plot(data_count_list[1:], duration_list[1:])
    axis[1].plot(data_count_list[1:], duration_list[1:], label='duration', color="blue")
    axis[1].plot(data_count_list[1:], np.ones(len(duration_list[1:])) * duration_list[0], label='500.000_data_duration', color = "red")
    axis[1].set_xlabel("instances of data")
    axis[1].set_ylabel("calculation time [min]")
    axis[1].grid(True)
    axis[1].legend()


    plt.tight_layout()
    plt.show()
    figure.savefig(f"{PLOT_PREFIX}part{part_number}.png")


Train the network function:

In [19]:
def train_network(start, end, step, accuracy_list, duration_list, data_count_list):
    time0 = time.time()

    for i in range(start, end, step):
        input, target = get_data(i)
        accuracy = - evaluate_MLP(create_MLP_from_parameter(example), example, input, target)[0]
        accuracy_list.append(accuracy)

        time_duration = (time.time() - time0) /60
        time0 = time.time()
        duration_list.append(time_duration)

        data_count_list.append(i)
        print (f"data_count: {i}, acc:{accuracy:0.03f}, time:{time_duration:0.03f} min")
    
    return (accuracy_list, duration_list, data_count_list)

# Run Evaluation:

High datacount as best possible accuracy solution to compare to:

In [21]:
accuracy_list = []
duration_list = []
data_count_list = []

acc_res, duration_res, data_count_res = train_network(500, 501, 1, accuracy_list, duration_list, data_count_list)
store_data_and_plots(acc_res, duration_res, data_count_res, 1)

data_count: 500, acc:0.130, time:0.064 min


FileNotFoundError: [Errno 2] No such file or directory: 'pickled_data/thinkPad-plot-data-setrandom2-[300 300 300 300 300]2023-02-10.p'

Evalute accuracy with datacounts from 100 to 10000 instances in 100er steps:

In [None]:

acc_res, duration_res, data_count_res = train_network(100, 10100, 100, accuracy_list, duration_list, data_count_list)
store_data_and_plots(acc_res, duration_res, data_count_res, 1)