#### Key variables:
> <b>regions_stats:</b> <br>
>> <b>(KPI) error:</b> MAE metric <br>
>> <b>(KPI) mean_diff:</b> checks if the percentage of win/lose is reliable. The higher, the better. <br>
>> <b>threshold:</b> target win/lose percentage to commit a prediction. Example: if the threshold is 0.35, the model will consider only predictions above 65% or below 35% certainty <br>
>> <b>(KPI) len lost:</b> percentage of predictions lost after threshold filtering <br>
>> <b>cut_off_var:</b> cutoff for outlier removal. Not being used.

# Setup

In [1]:
import pandas as pd
import numpy as np
import random
import sklearn.metrics as skm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
    
import json

from Utils.constants import *
import Utils.utils_file as utils_file
import Utils.model_file as model_file

import warnings
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Load

In [2]:
cache = True

Utils = utils_file.Utils_Class(target='Score'
                                ,default_model=0
                                ,model_type='logistic'
                                ,cache_model=cache
                                ,cache_scraping=True)

Utils.TARGET_DF = Utils.TARGET_DF[Utils.TARGET_DF[Utils.TARGET]!=2]

regions_to_feed, regions_to_predict = Utils.region_lists()
regions_to_feed = list(set(regions_to_feed + regions_to_predict))

if cache:
    try:    
        regions_feature_cols = Utils.regions_feature_cols
        regions_train_data = Utils.regions_train_data
        regions_stats = Utils.regions_stats
    except:
        print('files not found')

# SKLEARN
Initialize regions_stats: cache and metrics file

In [3]:
regions_stats = pd.DataFrame(columns=['region','model','test_size','train_size'])
regions_stats['region'] = regions_to_predict
regions_list = regions_stats['region']
regions_stats['model'] = Utils.DEFAULT_MODEL

regions_train_data = dict(zip(regions_to_predict,[[x] for x in regions_to_predict]))
regions_stats['error'] = np.nan
regions_stats['cut_off_var'] = 1.5
regions_stats['threshold'] = 0.35
regions_stats['len_lost'] = np.nan
regions_stats['mean_diff'] = np.nan

regions_feature_cols = dict(zip(regions_list,[0]*len(regions_list)))
for key in regions_feature_cols:
    regions_feature_cols[key] = PLAYER_SIMPLE_FEATURE_COLS.copy()

### TRAIN DATA SELECTION
Wich regions to use as train data. Generates a cache file.


In [4]:
regions_train_data = dict(zip(regions_to_predict, [[x] for x in regions_to_predict]))
#regions_train_data = dict(zip(regions_to_predict, [regions_to_feed]*len(regions_to_predict)))

##########

for n,region in enumerate(regions_to_predict):
    print('=========\n')
    current_accuracy = regions_stats['mean_diff'][n]
    cut_off_var = regions_stats['cut_off_var'][n]
    region_model_number = regions_stats['model'][n]
    Utils.logistic_threshold = regions_stats['threshold'][n]
    
    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {current_accuracy}:\n')
    
    regionsToTest = [x for x in regions_to_feed if x!=region]
    random.shuffle(regionsToTest)
    for nn,regionToTest in enumerate(regionsToTest):
        regions_train_data[region].append(regionToTest)
        metric, pred, ytrain, mean_diff = Utils.generate_metric(region_model_number, regions_feature_cols[region]
                                                    , regions_train_data[region], region, reps=1)
        error=metric
        metric = mean_diff
        if round(metric,2) > round(current_accuracy,2) or np.isnan(current_accuracy):
            current_accuracy = metric
            
            print(f'{current_accuracy} -> {regionToTest} added')
        else:
            regions_train_data[region].remove(regionToTest)
    
    train_len = Utils.train_len
    regions_stats['error'][n] = error
    regions_stats['train_size'][n] = train_len
    regions_stats['test_size'][n] = len(pred)
    regions_stats['len_lost'][n] = Utils.len_lost
    regions_stats['mean_diff'][n] = current_accuracy
    
    print(f'\nlen lost: {Utils.len_lost}%')
    print(f'accuracy: {current_accuracy}')
    print(f'{region} train data: {regions_train_data[region]}\nlen: {train_len}')
    print(f'test data len: {len(pred)}\n')

mean_acc = np.mean(regions_stats['error'])
print(mean_acc)


[1 of 30] region LFL -> nan:

-0.1385714285714286 -> LJL_Tier2 added
-0.11999999999999997 -> NLC added
-0.10600000000000002 -> MSI added
-0.09800000000000002 -> REL added
-0.07999999999999999 -> Baltic added
-0.05285714285714284 -> VCS added

len lost: 0.53%
accuracy: -0.05285714285714284
LFL train data: ['LFL', 'LJL_Tier2', 'NLC', 'MSI', 'REL', 'Baltic', 'VCS']
len: 2768
test data len: 950


[2 of 30] region LPL -> nan:

-0.11833333333333333 -> Iberian added
-0.07833333333333332 -> Dutch added
-0.06 -> LCK_Tier2 added
-0.05499999999999996 -> NLC added
-0.03999999999999999 -> PG added

len lost: 0.33%
accuracy: -0.03999999999999999
LPL train data: ['LPL', 'Iberian', 'Dutch', 'LCK_Tier2', 'NLC', 'PG']
len: 4670
test data len: 1577


[3 of 30] region Prime_Tier2 -> nan:

-0.27499999999999997 -> Kespa added
-0.24 -> Ultraliga added
-0.04249999999999998 -> LFL added

len lost: 0.74%
accuracy: -0.04249999999999998
Prime_Tier2 train data: ['Prime_Tier2', 'Kespa', 'Ultraliga', 'LFL']
len: 20

### FEATURE SELECTION
Bruteforce feature selection. Generates a cache file

In [5]:
for key in regions_list:
    regions_feature_cols[key] = PLAYER_SIMPLE_FEATURE_COLS.copy()

##########

for n,region in enumerate(regions_to_predict):
    print('=========\n')

    current_accuracy = regions_stats['mean_diff'][n]
    cut_off_var = regions_stats['cut_off_var'][n]
    region_model_number = regions_stats['model'][n]
    Utils.logistic_threshold = regions_stats['threshold'][n]

    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {current_accuracy}:\n')
    print(f'model: {region_model_number}')
    
    initialFeatures = regions_feature_cols[region].copy()
    for nn,feature in enumerate(initialFeatures):
        regions_feature_cols[region].remove(feature)
        
        metric, pred, ytrain, mean_diff = Utils.generate_metric(region_model_number, regions_feature_cols[region]
                                                    , regions_train_data[region], region, reps=1)
        error=metric
        metric = mean_diff
        if round(metric,2) > round(current_accuracy,2) or np.isnan(current_accuracy):
            current_accuracy = metric
            print(f'{feature} removed for {metric}                                                ')
        else:
            regions_feature_cols[region].append(feature)
    
    train_len = Utils.train_len
    regions_stats['error'][n] = error
    regions_stats['train_size'][n] = train_len
    regions_stats['len_lost'][n] = Utils.len_lost
    regions_stats['mean_diff'][n] = current_accuracy

    print(f'\n\nlen lost: {Utils.len_lost}%')
    print(f'accuracy: {current_accuracy}')
    print(f'{region} feature count: {len(regions_feature_cols[region])}')
    print(f'test data len: {len(pred)}\n')
    
mean_acc = np.mean(regions_stats['error'])
print(mean_acc)


[1 of 30] region LFL -> -0.05285714285714284:

model: 0


len lost: 0.52%
accuracy: -0.05285714285714284
LFL feature count: 22
test data len: 950


[2 of 30] region LPL -> -0.03999999999999999:

model: 0


len lost: 0.36%
accuracy: -0.03999999999999999
LPL feature count: 22
test data len: 1577


[3 of 30] region Prime_Tier2 -> -0.04249999999999998:

model: 0


len lost: 0.7%
accuracy: -0.04249999999999998
Prime_Tier2 feature count: 22
test data len: 128


[4 of 30] region Ultraliga -> -0.06000000000000001:

model: 0


len lost: 0.48%
accuracy: -0.06000000000000001
Ultraliga feature count: 22
test data len: 432


[5 of 30] region SuperLiga_Tier2 -> -0.03999999999999998:

model: 0
VSPM removed for -0.034999999999999976                                                


len lost: 0.85%
accuracy: -0.034999999999999976
SuperLiga_Tier2 feature count: 21
test data len: 120


[6 of 30] region SuperLiga -> -0.14666666666666667:

model: 0


len lost: 0.27%
accuracy: -0.14666666666666667
SuperLig

### MODEL SELECTION
Bruteforce model selection. Generates a cache file

In [6]:
regions_stats['error'] = np.nan

##########

for n,region in enumerate(regions_to_predict):
    print('=========\n')

    current_accuracy = regions_stats['mean_diff'][n]
    currModel = regions_stats['model'][n]
    cut_off_var = regions_stats['cut_off_var'][n]
    Utils.logistic_threshold = regions_stats['threshold'][n]
    bestModel = currModel

    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {current_accuracy}:\n')
    print(f'current model: {currModel}\n')
    
    for model in range(len(Utils.BASE_MODELS)):
        metric, pred, ytrain, mean_diff = Utils.generate_metric(model, regions_feature_cols[region]
                                                    , regions_train_data[region], region, reps=1)
        error=metric
        metric=mean_diff
        if round(metric,2) > round(current_accuracy,2) or np.isnan(current_accuracy):
            current_accuracy = metric
            bestModel = model
        print(f'model {model} -> {metric}')

    train_len = Utils.train_len
    regions_stats['train_size'][n] = train_len
    regions_stats['model'][n] = bestModel
    regions_stats['error'][n] = error
    regions_stats['len_lost'][n] = Utils.len_lost
    regions_stats['mean_diff'][n] = current_accuracy
    
    print(f'\nlen lost: {Utils.len_lost}%')
    print(f'accuracy: {current_accuracy}')
    print(f'best model: {bestModel}\n')
    
mean_acc = np.mean(regions_stats['error'])
print(mean_acc)


[1 of 30] region LFL -> -0.05285714285714284:

current model: 0

model 0 -> -0.05285714285714284
model 1 -> -0.0675
model 2 -> -0.1275
model 3 -> -0.1275
model 4 -> -0.05500000000000001
model 5 -> -0.20333333333333334

len lost: 0.14%
accuracy: -0.05285714285714284
best model: 0


[2 of 30] region LPL -> -0.03999999999999999:

current model: 0

model 0 -> -0.03999999999999999
model 1 -> -0.0433333333333333
model 2 -> -0.16500000000000004
model 3 -> -0.16500000000000004
model 4 -> -0.03999999999999998
model 5 -> -0.16285714285714287

len lost: 0.09%
accuracy: -0.03999999999999999
best model: 0


[3 of 30] region Prime_Tier2 -> -0.04249999999999998:

current model: 0

model 0 -> -0.04249999999999998
model 1 -> -1
model 2 -> -0.16625000000000004
model 3 -> -0.16625000000000004
model 4 -> -1
model 5 -> -0.296

len lost: 0.12%
accuracy: -0.04249999999999998
best model: 0


[4 of 30] region Ultraliga -> -0.06000000000000001:

current model: 0

model 0 -> -0.06000000000000001
model 1 -> -0.0

In [7]:
print(np.mean(regions_stats['error']))
print(np.mean(regions_stats['mean_diff']))
regions_stats

0.3813333333333333
-0.04983611111111112


Unnamed: 0,region,model,test_size,train_size,error,cut_off_var,threshold,len_lost,mean_diff
0,LFL,0,950,2500,0.386,1.5,0.35,0.14,-0.052857
1,LPL,0,1577,3735,0.319,1.5,0.35,0.09,-0.04
2,Prime_Tier2,0,128,2055,0.411,1.5,0.35,0.12,-0.0425
3,Ultraliga,0,432,4140,0.403,1.5,0.35,0.12,-0.06
4,SuperLiga_Tier2,0,120,3126,0.423,1.5,0.35,0.07,-0.035
5,SuperLiga,0,121,2005,0.459,1.5,0.35,0.19,-0.146667
6,MSI,0,153,4725,0.237,1.5,0.35,0.25,-0.04
7,EMEA,0,139,2919,0.496,1.5,0.35,0.07,-0.11
8,LCO,0,377,2964,0.249,1.5,0.35,0.15,-0.01
9,VCS,0,617,1822,0.389,1.5,0.35,0.08,-0.094286


In [8]:
Utils.save_model_cache(regions_stats, regions_feature_cols, regions_train_data)

Cache saved!


# Notes