In [1]:
import requests
import pandas as pd
import numpy as np
import re
import sys, getopt
import csv
import pickle
import copy
import os
import math

pd.set_option('display.max_rows', 500)

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline  
import seaborn as sns
sns.set_style("darkgrid")

import umap
from sklearn.decomposition import TruncatedSVD as tsvd

def nearZeroVarDropAuto(df,thresh=0.99):
    vVal=df.var(axis=0).values
    cs=pd.Series(vVal).sort_values(ascending=False).cumsum()
    remove=cs[cs>cs.values[-1]*thresh].index.values
    return df.drop(df.columns[remove],axis=1)

%run SodaKick_download_functions.ipynb


import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import SGD, Adagrad, Adam, Adagrad
import torch.nn as nn
import torch.nn.functional as F

In [2]:
#from ray import tune
#from ray.tune import CLIReporter
#from ray.tune.schedulers import ASHAScheduler

from hyperopt import hp, tpe, fmin, Trials
from hyperopt import STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope

In [3]:
class EarlyStopping:

    """ Stops the training if loss doesn't improve after a given number of epochs. """

    def __init__(self, patience=3, epsilon=1e-5, keepBest=True, silent=True):

        """
        Args:
            patience (int): Number of epochs without change before stopping the learning (default 3).
            epsilon (float): Minimum change in loss to be considered for early stopping (default 1e-5).
            keepBest (bool): Keep track of the best model (memory consuming).
        """

        self.patience = patience
        self.epsilon = epsilon
        self.counter = 0
        
        self.bestScore = np.inf
     
        self.keepBest = keepBest 
        self.bestModel = None

        self.earlyStop = False
        self.silent = silent

    def __call__(self, loss, model):


        """ Evaluate the loss change between epochs and activates early stop if below epsilon.

        Args:
            loss (float): current loss.
            model (torch model): the current model.
        """

        if loss > self.bestScore - self.epsilon:

            self.counter += 1
            if not self.silent:
                print('EarlyStopping counter: {:d}/{:d}'.format(self.counter,self.patience))

            if self.counter >= self.patience:
                self.earlyStop = True

        else:   

            self.counter = 0
            self.bestScore = loss

            if self.keepBest:
                self.bestModel = copy.deepcopy(model)


In [4]:
class matchesDataset(Dataset):

    """ Extend pytorch Dataset class to include cleaning and training set creation, """
    
    def __init__(self, matches, results):

        self.matches = torch.tensor(matches, dtype=torch.float32)
        self.results = torch.tensor(results, dtype=torch.float32)

    def __len__(self):

        """ Returns the len of the training sample. """
        
        return len(self.matches)
        

    def __getitem__(self, index): 

        """ Returns a word, a context word and a list of negative words for training for a given index. 

        Args:
            index (int): index for the word selection.

        Returns:
            (string, string, list of strings): selected word, context word and a randomly drawn list 
                                               of negative words.

        """
        return self.matches[index], self.results[index]

In [5]:
#https://towardsdatascience.com/quirky-keras-custom-and-asymmetric-loss-functions-for-keras-in-r-a8b5271171fe
#weighted asimmetric square error, errors by going below the value (not seeing a goal when it's there) are weighted more

def WSE(output, target, a=1.5, b=.5):
    loss = torch.mean(a/(a+b)*torch.minimum(torch.zeros(output.shape[1]),output - target)**2+\
                      b/(a+b)*torch.maximum(torch.zeros(output.shape[1]),output - target)**2)      
    return loss

def WSEl1(output, target, a=1.5, b=.5):
    loss = torch.mean(a/(a+b)*torch.abs(torch.minimum(torch.zeros(output.shape[1]),output - target))+\
                      b/(a+b)*torch.abs(torch.maximum(torch.zeros(output.shape[1]),output - target)))      
    return loss

def WSE2(output, target, a=1.5, b=.5):
    loss = np.mean(a/(a+b)*np.minimum(np.zeros(output.shape[0]),output - target)**2+\
                      b/(a+b)*np.maximum(np.zeros(output.shape[0]),output - target)**2)      
    return loss

def WSEl12(output, target, a=1.5, b=.5):
    loss = np.mean(a/(a+b)*np.abs(np.minimum(np.zeros(output.shape[0]),output - target))+\
                      b/(a+b)*np.abs(np.maximum(np.zeros(output.shape[0]),output - target)))      
    return loss

def log_cosh_loss(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
    def _log_cosh(x: torch.Tensor) -> torch.Tensor:
        return x + torch.nn.functional.softplus(-2. * x) - math.log(2.0)
    return torch.mean(_log_cosh(y_pred - y_true))

class LogCoshLoss(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(
        self, y_pred: torch.Tensor, y_true: torch.Tensor
    ) -> torch.Tensor:
        return log_cosh_loss(y_pred, y_true)

In [6]:
def normalize_mins(vec):
    for i in range(vec.shape[0]):
        vec[i][::8]=vec[i][::8]/90

def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

def NormalizeMatrix(data):   
    for i in range(data.shape[1]):
        data[:,i] = NormalizeData(data[:,i])

def norm_max(out):
    
    maxes=[]
    for i in range(int(out.shape[1]/8.0)):
        maxes.append(out[:,8*int(i):8*(int(i)+1)].max(axis=0))

        #maxes.append(out.max(axis=1)[8*int(i):8*(int(i)+1):8])
    denominator=np.tile(np.max(maxes,axis=0),int(out.shape[1]/8))
    return out/denominator, denominator 

with open(r'/Users/federico comitani/GitHub/sodakick/data/wainp_220303.pkl', 'rb') as pk:
    inp=pickle.load(pk)
with open(r'/Users/federico comitani/GitHub/sodakick/data/out_220303.pkl', 'rb') as pk:
    out=np.array(pickle.load(pk),dtype=float)
    
### skipping norm for now since it's already tsvd 
#NormalizeMatrix(inp)
#np.nan_to_num(inp, copy=False)

from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
inp = scaler.fit_transform(inp)

#normalize_mins(out)
out, denominator= norm_max(out)

In [8]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
         inp[:50000], out[:50000], test_size=0.2, random_state=32)


x_train = x_train.reshape(x_train.shape[0],1,x_train.shape[1])
#y_train = y_train.reshape(y_train.shape[0],1,y_train.shape[1])
x_test = x_test.reshape(x_test.shape[0],1,x_test.shape[1])
#y_test = y_test.reshape(y_test.shape[0],1,y_test.shape[1])

In [9]:
from numpy import floor


def conv_out_shape(h_w, kernel_size=1, stride=1, pad=0, dilation=1):

	if isinstance(h_w, list):
	    if type(kernel_size) is not tuple:
	        kernel_size = (kernel_size, kernel_size)
	    h = floor( ((h_w[0] + (2 * pad) - ( dilation * (kernel_size[0] - 1) ) - 1 )/ stride) + 1)
	    w = floor( ((h_w[1] + (2 * pad) - ( dilation * (kernel_size[1] - 1) ) - 1 )/ stride) + 1)
	    return h, w
	else:
		return floor( ((h_w + (2 * pad) - ( dilation * (kernel_size - 1) ) - 1 )/ stride) + 1)


class PrintSize(nn.Module):
    def __init__(self):
        super(PrintSize, self).__init__()
    
    def forward(self, x):
        print(x.shape)
        return x

class CNet(nn.Module):
    
    def __init__(self, inp_nodes, out_nodes, activation, final_activation, 
                 conv_layers, conv_filter_exp, conv_scaling, conv_kernel_size, conv_stride, 
                 batchnorm, pooling, 
                 dense_layers, dense_nodes, dense_scaling, dropout_percent):
        
        super(CNet, self).__init__()    
            
        self.fc = []
        self.cv = []
        self.lr_cv = []
        self.lr_fc = []
        self.pl = []
        self.act = activation
        self.fact = final_activation
        self.cl = conv_layers
        self.dl = dense_layers
        self.bn_fc = []
        self.bn_cv = []
        self.dp = []
        
        self.ps = PrintSize()
        
        if pooling<=0:
            pooling=1
            
        power=0
        for i in range(self.cl):
            if i==0:
                self.cv.append(nn.Conv1d(in_channels = 1,
                                    out_channels = 2**conv_filter_exp,
                                    kernel_size = conv_kernel_size,
                                    stride = conv_stride))
                cos=int(conv_out_shape(inp_nodes,
                                    kernel_size = conv_kernel_size,
                                    stride = conv_stride)/pooling)
            else:
                self.cv.append(nn.Conv1d(in_channels =  int(2**conv_filter_exp*(conv_scaling**(power-1))),
                                    out_channels = int(2**conv_filter_exp*(conv_scaling**(power))),
                                    kernel_size = conv_kernel_size,
                                    stride = conv_stride))
                cos=int(conv_out_shape(cos,
                                    kernel_size = conv_kernel_size,
                                    stride = conv_stride)/pooling)
                      
            self.lr_cv.append(self.act)

            if pooling>1:
                self.pl.append(nn.MaxPool1d(pooling))
            if batchnorm:
                self.bn_cv.append(nn.BatchNorm1d(int(cos*2**conv_filter_exp*(conv_scaling**(power)))))
                                                            
            power+=1
            
        self.flat = nn.Flatten()        
                
        cos*=2**conv_filter_exp*(conv_scaling**(power-1))
        power=0
        for j in range(self.dl): 
            if j==0:
                self.fc.append(nn.Linear(in_features = int(cos),
                                    out_features = dense_nodes))
            else:
                self.fc.append(nn.Linear(in_features = int(dense_nodes*(dense_scaling**(power-1))),
                                    out_features = int(dense_nodes*(dense_scaling**power))))
     
            self.lr_fc.append(nn.LeakyReLU())
                                                           
            if batchnorm:
                self.bn_fc.append(nn.BatchNorm1d(int(dense_nodes*(dense_scaling**power))))
            if dropout_percent>0:
                self.dp.append(nn.Dropout(dropout_percent))
            power+=1
            
        self.oupt = nn.Linear(int(dense_nodes*(dense_scaling**(power-1))), int(out_nodes))
     
    def reset_weights(self):

        """ Resets network weights according to chosen distribution. """

        for f in self.cv:
            nn.init.xavier_uniform_(f.weight, gain=np.sqrt(2/(1+0.01**2)))
        for f in self.fc:
            nn.init.xavier_uniform_(f.weight, gain=np.sqrt(2/(1+0.01**2)))
            
    def forward(self, x):
        
        z = x
        for i in range(self.cl):        
            z = self.cv[i](z)
            #z = self.ps(z)
            if len(self.pl)>0:
                z=self.pl[i](z)
            if len(self.bn_cv)>0:
                z=self.bn_cv[i](z)
            z=self.lr_cv[i](z)     
                        
        z = self.flat(z)
        #z = self.ps(z)
                  
        for i in range(self.dl):
            z = self.fc[i](z)
            if len(self.bn_fc)>0:
                z=self.bn_fc[i](z)
            z=self.lr_fc[i](z)
            if len(self.dp)>0:
                z=self.dp[i](z)
                
        if self.fact is not None:
            z = self.oupt(self.fact(z))
        else:
            z = self.oupt(z)
        return z
    
    def clp(self):
        with torch.no_grad():
            for i in range(self.nl):
                self.fc[i].weight.copy_ (self.fc[i].weight.data.clamp(min=0)) 
            self.oupt.weight.copy_ (self.oupt.weight.data.clamp(min=0))

In [10]:
def train(config, model=CNet, silent=True, checkpoint_dir=None):
    
    try:
        phases = ['train','val']

        #x_train, x_test, y_train, y_test = data[0], data[1], data[2], data[3]

        training_set = matchesDataset(x_train, y_train)
        trainBatch = torch.utils.data.DataLoader(training_set, batch_size=config['batch_size'], shuffle=config['shuffle'], num_workers=config['num_workers'])

        validation_set = matchesDataset(x_test, y_test)
        valBatch = torch.utils.data.DataLoader(validation_set, batch_size=config['batch_size'], shuffle=config['shuffle'], num_workers=config['num_workers'])

        earlStop = EarlyStopping(patience=config['patience'], keepBest=False)

        net = model(config['inp_nodes'], config['out_nodes'], config['activation'], config['final_activation'], 
                    config['conv_layers'], config['conv_filter_exp'], config['conv_scaling'], 
                    config['conv_kernel_size'], config['conv_stride'], 
                    config['batchnorm'], config['pooling'], 
                    config['dense_layers'], config['dense_nodes'], config['dense_scaling'], 
                    config['dropout_percent'])
            
        #net = model(config['num_layers'], config['num_nodes'], config['scaling_factor'], 
        #            config['num_nodes_out'], config['final_activation'], config['batch_norm'], config['dropout'], config['activation'])

        net.reset_weights()

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
            if torch.cuda.device_count() > 1:
                net = nn.DataParallel(net)
        net.to(device)

        if checkpoint_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(checkpoint_dir, "checkpoint"))
            net.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

        if config['optim']=='adam':
            optimizer = Adam(net.parameters(), lr=config['lr'])
        elif config['optim']=='adagrad':
            optimizer = Adagrad(net.parameters(), lr=config['lr'])
        else:
            print('optim error')
            return


        losses=[[],[]]
        mses=[]
        diffs=[]
        exit=False

        #for epoch in tqdm(range(epochs), desc='Epoch'):
        for epoch in range(config['epochs']):

            if exit:
                break

            for phase in phases:
                if phase == 'train':
                    net.train(True) 

                    """ Run the training of the model. """    

                    losses_batch=[]
                    for batchNum, batch in enumerate(trainBatch):

                        x = batch[0]
                        y = batch[1]

                        """ Move batches to GPU if available. """

                        if torch.cuda.is_available():
                            x = x.cuda()
                            y = y.cuda()

                        """ Core of training. """

                        loss = config['loss_f'](net(x), y)

                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                        if config['clip']:
                            net.clp()

                        losses_batch.append(loss)

                    """ Early stop check. """

                    earlStop(loss, net)
                    finalepoch = epoch

                    if earlStop.earlyStop:

                        if not silent:
                            print('Limit loss improvement reached, stopping the training.')

                        exit=True 

                    #losses[0].append(torch.mean(torch.stack(losses_batch)).detach().cpu().numpy())
                else:
                    net.train(False)
                    net.eval()

                    val_loss=0
                    val_mse=0

                    losses_batch=[]
                    for batchNum, batch in enumerate(valBatch):

                        x = batch[0]
                        y = batch[1]

                        """ Move batches to GPU if available. """

                        if torch.cuda.is_available():
                            x = x.cuda()
                            y = y.cuda()

                        """ Core of training. """

                        optimizer.zero_grad()
                        output=net(x)
                        target=y
                        loss = config['loss_f'](output, target)

                        #losses_batch.append(loss)
                        val_loss+=loss.detach().numpy()
                        val_mse+=nn.MSELoss()(output, target).detach().numpy()

                    #losses[1].append(torch.mean(torch.stack(losses_batch)).detach().cpu().numpy())

                    #with tune.checkpoint_dir(epoch) as checkpoint_dir:
                    #    path = os.path.join(checkpoint_dir, "checkpoint")
                    #    torch.save((net.state_dict(), optimizer.state_dict()), path)

                    #tune.report(loss=(val_loss/batchNum), mse=(val_mse/batchNum))
                    #tune.report(loss=torch.mean(torch.stack(losses_batch)))

        return {'loss': (val_loss/batchNum), 'status': STATUS_OK , 'mse': (val_mse/batchNum)}
    
    except:
        
        return {'loss': np.nan, 'status': STATUS_FAIL, 'mse': np.nan}



In [11]:
def revert_output(output,multiplier=denominator,lineup=None):

    reframe=pd.DataFrame(output.reshape(48,8),
                 columns=['minutes','goals','assists','cards_yellow','cards_red','own_goals','goals_against','saves'])
    
    reframe[reframe<0] = 0
    if lineup is not None:
        reframe.index=lineup
        reframe.drop([x for x in reframe.index if x.startswith('dummy')], axis=0, inplace=True)
    
    
    #reframe['minutes']*=90
    reframe=reframe*denominator[:8]
    byteamframe=pd.concat([reframe.iloc[:24,:].sum(axis=0),reframe.iloc[24:,:].sum(axis=0)], axis=1).T
    
    return reframe, byteamframe[byteamframe.columns[1:]]

In [12]:
print('Baseline WSE: {:.3f}'.format(WSE2(np.array([0]*out[0].shape[0]),out[0])))
print('Baseline WSE l1: {:.3f}'.format(WSEl12(np.array([0]*out[0].shape[0]),out[0])))
print('Baseline MSE: {:.3f}'.format(WSE2(np.array([0]*out[0].shape[0]),out[0], a=1, b=1)))
print('Baseline MSE l1: {:.3f}'.format(WSEl12(np.array([0]*out[0].shape[0]),out[0], a=1, b=1)))
print('Baseline logcosh: {:.3f}'.format(log_cosh_loss(torch.tensor(np.array([0]*out[0].shape[0])),out[0])))

print(np.abs(out[1]-out[10]).sum())
print(np.abs(out[50]-out[60]).sum())
print(np.abs(out[100]-out[110]).sum())

Baseline WSE: 0.039
Baseline WSE l1: 0.052
Baseline MSE: 0.026
Baseline MSE l1: 0.035
Baseline logcosh: 0.023
36.36507936507937
24.09365079365079
34.76825396825397


In [13]:
def run_hopt(config, num_samples=10):#, gpus_per_trial=2):
    
    trials = Trials()
    result = fmin(
            fn=train,
            space=config,
            algo=tpe.suggest,
            max_evals=num_samples,
            trials=trials,
            show_progressbar=True),
            #early_stop_fn=10,
            #trials_save_file=None)
    
    
    return trials
    #return best_trained_model
    #test_acc = test_accuracy(best_trained_model, device)
    #print("Best trial test set accuracy: {}".format(test_acc))

In [13]:
config = {
        "inp_nodes": inp.shape[1],
        "out_nodes": out.shape[1], 
        "batch_size": 32, #[16, 32, 64, 128]
        "clip": False, #hp.choice('clip',[True, False]),
        "final_activation" : None,
        "optim": 'adam',#hp.choice('optim',['adam', 'adagrad']),
        "lr": 0.0001,#hp.choice('lr',[0.0001,0.001,.00001]),#hp.loguniform('lr', np.exp(np.log(1e-4)), np.exp(np.log(1e-1))),
        "batchnorm": False,#hp.choice('batch_norm',[True, False]),
        "dropout_percent": 0.0,#hp.choice('dropout',[0.0,0.1,0.2,0.3]),#hp.sample_from(lambda _: np.random.uniform(low=0.0, high=.6)),
        "shuffle": True,
        "num_workers": 4,
        "patience": 5,
        "epochs": 50,
    
        "activation": nn.LeakyReLU(),#nn.SELU(),
        "loss_f": nn.MSELoss(),#log_cosh_loss,#WSE,#hp.choice('loss_f',[WSE, nn.MSELoss()]), #, nn.L1Loss()
    
        "conv_layers": hp.choice('conv_layers', [1, 2, 3]),
        "conv_filter_exp": scope.int(hp.quniform('conv_filter_exp', 3, 5, q=1)),
        "conv_scaling": 2, #hp.uniform('conv_scaling', 2, 4),
        "conv_kernel_size": 3,#hp.choice('conv_kernel_size', [3, 5]),
        "conv_stride": 1,#hp.choice('conv_stride', [1, 2]),
        "pooling": hp.choice('pooling', [1, 2]),
        "dense_layers": hp.choice('dense_layers', [1, 2]),
        "dense_nodes": scope.int(hp.quniform('dense_nodes', 100, 400, q=50)),
        "dense_scaling": hp.uniform('dense_scaling', 0.25, 1),
    }
    
btm = run_hopt(config, num_samples=100)

100%|██████████| 100/100 [13:04:33<00:00, 470.74s/trial, best loss: 0.007982965325936675] 


In [14]:
results_df=[]

for trial in btm.trials:
    results_df.append([trial['result']['loss'],
    trial['result']['mse'],
                       
    [1,2,3][trial['misc']['vals']["conv_layers"][0]],
    #trial['misc']['vals']["conv_scaling"][0],
                       
    trial['misc']['vals']["conv_filter_exp"][0],
    #[3,5][trial['misc']['vals']["conv_kernel_size"][0]],
    #[1,2][trial['misc']['vals']["conv_stride"][0]],
    [1,2][trial['misc']['vals']["pooling"][0]],
                       
    [1,2][trial['misc']['vals']["dense_layers"][0]],
    trial['misc']['vals']["dense_scaling"][0],
    trial['misc']['vals']["dense_nodes"][0],
                       
    ])


results_df=pd.DataFrame(results_df,columns=['loss',
                                            'mse',
                                            'conv_layers',
                                            #'conv_scaling',
                                            'conv_filter_exp',
                                            #'conv_kernel_size',
                                            #'conv_stride',
                                            'conv_pooling',
                                            'dense_layers',
                                            'dense_scaling',
                                            'dense_nodes',
                                            ]).sort_values('loss')

results_df.to_hdf(r'/Users/federico comitani/GitHub/sodakick/data/hp_conv_mse_lrelu_220308.h5',key='df')

In [15]:
results_df.sort_values('loss')

Unnamed: 0,loss,mse,conv_layers,conv_filter_exp,conv_pooling,dense_layers,dense_scaling,dense_nodes
27,0.007983,0.007983,1,3.0,2,1,0.724374,400.0
33,0.008063,0.008063,1,3.0,2,1,0.555275,400.0
73,0.008067,0.008067,1,3.0,2,1,0.463533,350.0
89,0.008153,0.008153,1,3.0,2,1,0.372101,400.0
80,0.008154,0.008154,1,3.0,2,1,0.762351,350.0
37,0.008156,0.008156,1,3.0,2,1,0.47443,400.0
96,0.008181,0.008181,1,3.0,2,1,0.490358,400.0
10,0.008184,0.008184,1,3.0,2,1,0.809954,400.0
68,0.008206,0.008206,1,3.0,2,1,0.421507,400.0
49,0.008242,0.008242,1,4.0,2,1,0.9536,400.0


In [14]:
config = {
        "inp_nodes": inp.shape[1],
        "out_nodes": out.shape[1], 
        "batch_size": 32,#[16, 32, 64, 128]
        "clip": False, #hp.choice('clip',[True, False]),
        "final_activation" : None,
        "optim": 'adam',#hp.choice('optim',['adam', 'adagrad']),
        "lr": 0.0001,#hp.choice('lr',[0.0001,0.001,.00001]),#hp.loguniform('lr', np.exp(np.log(1e-4)), np.exp(np.log(1e-1))),
        "batchnorm": False,#hp.choice('batch_norm',[True, False]),
        "dropout_percent": 0.0,#hp.choice('dropout',[0.0,0.1,0.2,0.3]),#hp.sample_from(lambda _: np.random.uniform(low=0.0, high=.6)),
        "shuffle": True,
        "num_workers": 4,
        "patience": 5,
        "epochs": 50,
    
        "activation": nn.LeakyReLU(),#nn.SELU(),
        "loss_f": nn.MSELoss(),#log_cosh_loss,#WSE,#hp.choice('loss_f',[WSE, nn.MSELoss()]), #, nn.L1Loss()
    
        "conv_layers": hp.choice('conv_layers', [1, 2]),
        "conv_filter_exp": scope.int(hp.quniform('conv_filter_exp', 3, 4, q=1)),
        "conv_scaling": hp.uniform('conv_scaling', 2, 4),
        "conv_kernel_size": hp.choice('conv_kernel_size', [3, 5]),
        "conv_stride": hp.choice('conv_stride', [1, 2]),
        "pooling": hp.choice('pooling', [2, 3, 4]),
        "dense_layers": hp.choice('dense_layers', [1, 2]),
        "dense_nodes": scope.int(hp.quniform('dense_nodes', 300, 500, q=50)),
        "dense_scaling": .5,#hp.uniform('dense_scaling', 0.25, 1),
    }
    
btm = run_hopt(config, num_samples=50)

100%|██████████| 50/50 [1:52:05<00:00, 134.50s/trial, best loss: 0.007694495580672549]  


In [15]:
results_df=[]

for trial in btm.trials:
    results_df.append([trial['result']['loss'],
    trial['result']['mse'],
                       
    [1,2][trial['misc']['vals']["conv_layers"][0]],
    trial['misc']['vals']["conv_scaling"][0],
                       
    trial['misc']['vals']["conv_filter_exp"][0],
    [3,5][trial['misc']['vals']["conv_kernel_size"][0]],
    [1,2][trial['misc']['vals']["conv_stride"][0]],
    [2,3,4][trial['misc']['vals']["pooling"][0]],
                       
    [1,2][trial['misc']['vals']["dense_layers"][0]],
    trial['misc']['vals']["dense_scaling"][0],
    trial['misc']['vals']["dense_nodes"][0],
                       
    ])


results_df=pd.DataFrame(results_df,columns=['loss',
                                            'mse',
                                            'conv_layers',
                                            'conv_scaling',
                                            'conv_filter_exp',
                                            'conv_kernel_size',
                                            'conv_stride',
                                            'conv_pooling',
                                            'dense_layers',
                                            'dense_scaling',
                                            'dense_nodes',
                                            ]).sort_values('loss')

#results_df.to_hdf(r'/Users/federico comitani/GitHub/sodakick/data/hp_conv_mse_lrelu_220308.h5',key='df')

KeyError: 'dense_scaling'

In [15]:
results_df=[]

for trial in btm.trials:
    results_df.append([trial['result']['loss'],
    trial['result']['mse'],
                       
    [1,2][trial['misc']['vals']["conv_layers"][0]],
    trial['misc']['vals']["conv_scaling"][0],
                       
    trial['misc']['vals']["conv_filter_exp"][0],
    [3,5][trial['misc']['vals']["conv_kernel_size"][0]],
    [1,2][trial['misc']['vals']["conv_stride"][0]],
    [2,3,4][trial['misc']['vals']["pooling"][0]],
                       
    [1,2][trial['misc']['vals']["dense_layers"][0]],
    #trial['misc']['vals']["dense_scaling"][0],
    trial['misc']['vals']["dense_nodes"][0],
                       
    ])


results_df=pd.DataFrame(results_df,columns=['loss',
                                            'mse',
                                            'conv_layers',
                                            'conv_scaling',
                                            'conv_filter_exp',
                                            'conv_kernel_size',
                                            'conv_stride',
                                            'conv_pooling',
                                            'dense_layers',
                                            #'dense_scaling',
                                            'dense_nodes',
                                            ]).sort_values('loss')

results_df.to_hdf(r'/Users/federico comitani/GitHub/sodakick/data/hp_conv_mse_lrelu_220309.h5',key='df')

In [17]:
results_df.sort_values('loss')

Unnamed: 0,loss,mse,conv_layers,conv_scaling,conv_filter_exp,conv_kernel_size,conv_stride,conv_pooling,dense_layers,dense_nodes
46,0.008155,0.008155,1,3.560791,3.0,5,2,3,1,450.0
4,0.008187,0.008187,1,3.171815,3.0,3,2,4,1,350.0
48,0.008385,0.008385,2,3.877083,3.0,3,2,3,1,500.0
19,0.008426,0.008426,1,3.241657,3.0,3,1,4,1,400.0
21,0.008441,0.008441,1,2.190875,4.0,5,2,3,1,450.0
1,0.008454,0.008454,1,2.483373,4.0,5,1,3,1,350.0
3,0.008656,0.008656,1,3.340688,4.0,5,2,4,1,300.0
20,0.009257,0.009257,1,2.195715,4.0,5,1,2,2,450.0
41,0.009311,0.009311,1,2.776706,4.0,3,1,2,2,450.0
11,0.009389,0.009389,1,2.007851,3.0,5,2,2,2,400.0


In [18]:
config = {
        "inp_nodes": inp.shape[1],
        "out_nodes": out.shape[1], 
        "batch_size": 32,#[16, 32, 64, 128]
        "clip": False, #hp.choice('clip',[True, False]),
        "final_activation" : None,
        "optim": 'adam',#hp.choice('optim',['adam', 'adagrad']),
        "lr": 0.0001,#hp.choice('lr',[0.0001,0.001,.00001]),#hp.loguniform('lr', np.exp(np.log(1e-4)), np.exp(np.log(1e-1))),
        "batchnorm": False,#hp.choice('batch_norm',[True, False]),
        "dropout_percent": 0.0,#hp.choice('dropout',[0.0,0.1,0.2,0.3]),#hp.sample_from(lambda _: np.random.uniform(low=0.0, high=.6)),
        "shuffle": True,
        "num_workers": 4,
        "patience": 5,
        "epochs": 50,
    
        "activation": nn.LeakyReLU(),#nn.SELU(),
        "loss_f": log_cosh_loss,#WSE,#hp.choice('loss_f',[WSE, nn.MSELoss()]), #, nn.L1Loss()
    
        "conv_layers": hp.choice('conv_layers', [1, 2]),
        "conv_filter_exp": scope.int(hp.quniform('conv_filter_exp', 3, 4, q=1)),
        "conv_scaling": hp.uniform('conv_scaling', 2, 4),
        "conv_kernel_size": hp.choice('conv_kernel_size', [3, 5]),
        "conv_stride": hp.choice('conv_stride', [1, 2]),
        "pooling": hp.choice('pooling', [2, 3, 4]),
        "dense_layers": hp.choice('dense_layers', [1, 2]),
        "dense_nodes": scope.int(hp.quniform('dense_nodes', 300, 500, q=50)),
        "dense_scaling": .5,#hp.uniform('dense_scaling', 0.25, 1),
    }
    
btm = run_hopt(config, num_samples=50)

100%|██████████| 50/50 [47:16<00:00, 56.72s/trial, best loss: 0.003879781099012456]   


In [16]:
import copy

conf_final=copy.deepcopy(config)

for key,value in results_df.sort_values('loss').iloc[0].to_dict().items():
    if key in conf_final:
        conf_final[key]=value
        
conf_final

{'inp_nodes': 816,
 'out_nodes': 384,
 'batch_size': 32,
 'clip': False,
 'final_activation': None,
 'optim': 'adam',
 'lr': 0.0001,
 'batchnorm': False,
 'dropout_percent': 0.0,
 'shuffle': True,
 'num_workers': 4,
 'patience': 5,
 'epochs': 50,
 'activation': LeakyReLU(negative_slope=0.01),
 'loss_f': MSELoss(),
 'conv_layers': 1.0,
 'conv_filter_exp': 3.0,
 'conv_scaling': 3.5607912740180527,
 'conv_kernel_size': 5.0,
 'conv_stride': 2.0,
 'pooling': <hyperopt.pyll.base.Apply at 0x13d220290>,
 'dense_layers': 1.0,
 'dense_nodes': 450.0,
 'dense_scaling': 0.5}

In [25]:
from tqdm import tqdm

def train_alone(config, model=Net, silent=True, checkpoint_dir=None):
    
    phases = ['train','val']

    #x_train, x_test, y_train, y_test = data[0], data[1], data[2], data[3]

    training_set = matchesDataset(x_train, y_train)
    trainBatch = torch.utils.data.DataLoader(training_set, batch_size=config['batch_size'], shuffle=config['shuffle'], num_workers=config['num_workers'])

    validation_set = matchesDataset(x_test, y_test)
    valBatch = torch.utils.data.DataLoader(validation_set, batch_size=config['batch_size'], shuffle=config['shuffle'], num_workers=config['num_workers'])

    earlStop = EarlyStopping(patience=int(config['patience']), keepBest=True)

    net = model(int(config['num_layers']), int(config['num_nodes']), config['scaling_factor'], 
                int(config['num_nodes_out']), config['final_activation'], config['batch_norm'], config['dropout'])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    if config['optim']=='adam':
        optimizer = Adam(net.parameters(), lr=config['lr'])
    elif config['optim']=='adagrad':
        optimizer = Adagrad(net.parameters(), lr=config['lr'])
    else:
        print('optim error')
        return


    losses=[[],[]]
    mses=[]
    diffs=[]
    exit=False

    for epoch in tqdm(range(config['epochs']), desc='Epoch'):
    #for epoch in range(config['epochs']):

        if exit:
            break

        for phase in phases:
            if phase == 'train':
                net.train(True) 

                """ Run the training of the model. """    

                losses_batch=[]
                for batchNum, batch in enumerate(trainBatch):

                    x = batch[0]
                    y = batch[1]

                    """ Move batches to GPU if available. """

                    if torch.cuda.is_available():
                        x = x.cuda()
                        y = y.cuda()

                    """ Core of training. """

                    loss = config['loss_f'](net(x), y)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    if config['clip']:
                        net.clp()

                    losses_batch.append(loss)

                """ Early stop check. """

                earlStop(loss, net)
                finalepoch = epoch

                if earlStop.earlyStop:

                    if not silent:
                        print('Limit loss improvement reached, stopping the training.')

                    exit=True 

                #losses[0].append(torch.mean(torch.stack(losses_batch)).detach().cpu().numpy())
            else:
                net.train(False)
                net.eval()

                val_loss=0
                val_mse=0

                losses_batch=[]
                for batchNum, batch in enumerate(valBatch):

                    x = batch[0]
                    y = batch[1]

                    """ Move batches to GPU if available. """

                    if torch.cuda.is_available():
                        x = x.cuda()
                        y = y.cuda()

                    """ Core of training. """

                    optimizer.zero_grad()
                    output=net(x)
                    target=y
                    loss = config['loss_f'](output, target)

                    #losses_batch.append(loss)
                    val_loss+=loss.detach().numpy()
                    val_mse+=nn.MSELoss()(output, target).detach().numpy()

                #losses[1].append(torch.mean(torch.stack(losses_batch)).detach().cpu().numpy())

                #with tune.checkpoint_dir(epoch) as checkpoint_dir:
                #    path = os.path.join(checkpoint_dir, "checkpoint")
                #    torch.save((net.state_dict(), optimizer.state_dict()), path)

                #tune.report(loss=(val_loss/batchNum), mse=(val_mse/batchNum))
                #tune.report(loss=torch.mean(torch.stack(losses_batch)))

    return net, val_loss/batchNum


In [26]:
net,loss=train_alone(conf_final, model=Net, silent=True, checkpoint_dir=None)

Epoch:  23%|██▎       | 23/100 [02:17<07:39,  5.97s/it]


In [27]:
pred=net(torch.Tensor(inp)).detach().cpu().numpy()

In [32]:
i=1000
cats=['minutes','goals','assists','cards_yellow','cards_red','own_goals']+['goals_against','saves']

reframe, byteamframe = revert_output(pred[i])
print(byteamframe.astype(int))
reframe, byteamframe = revert_output(out[i])
print(byteamframe.astype(int))

   goals  assists  cards_yellow  cards_red  own_goals  goals_against  saves
0      2        1             2          0          0              1      3
1      1        1             2          0          0              2      4
   goals  assists  cards_yellow  cards_red  own_goals  goals_against  saves
0      1        1             4          1          0              0      2
1      0        0             1          0          0              1      1


In [None]:
#RELU? He normal initialization, scaling elu?
#Huber loss? MSLE?
