# Setup

In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
    
import json

from Utils.constants import *
import Utils.utils_file as utils_file
import Utils.model_file as model_file

import warnings
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Load

In [2]:
cache = True

Utils = utils_file.Utils_Class(target='Score'
                                ,default_model=7
                                ,model_type='binary'
                                ,cache_model=cache
                                ,cache_scraping=True)

Utils.TARGET_DF = Utils.TARGET_DF[Utils.TARGET_DF[Utils.TARGET]!=2]

regions_to_feed, regions_to_predict = Utils.region_lists()
regions_to_feed = list(set(regions_to_feed + regions_to_predict))

if cache:
    regions_feature_cols = Utils.regions_feature_cols
    regions_train_data = Utils.regions_train_data
    regions_stats = Utils.regions_stats
    regions_list = regions_stats['region']
    regions_stats['train_size'] = 0
    regions_stats['test_size'] = 0
    regions_stats = regions_stats[regions_stats['region'].isin(regions_to_predict)].reset_index(drop=True)

#regions_to_predict.remove('MSI')
#regions_to_predict.remove('World')

#regions_to_predict = ['MSI','World']

# SKLEARN

In [3]:
regions_stats = pd.DataFrame(columns=['region','model','test_size','train_size'])
regions_stats['region'] = regions_to_predict
regions_list = regions_stats['region']
regions_stats['model'] = Utils.DEFAULT_MODEL

regions_train_data = dict(zip(regions_to_predict,[[x] for x in regions_to_predict]))
regions_stats['accuracy_0'] = np.nan
regions_stats['accuracy_1'] = np.nan
regions_stats['accuracy_2'] = np.nan
regions_stats['cut_off_var'] = 1.5

regions_feature_cols = dict(zip(regions_list,[0]*len(regions_list)))
for key in regions_feature_cols:
    regions_feature_cols[key] = PLAYER_SIMPLE_FEATURE_COLS.copy()

### TRAIN DATA SELECTION

In [4]:
regions_train_data = dict(zip(regions_to_predict, [[x] for x in regions_to_predict]))
regions_stats['accuracy_0'] = np.nan

##########

for n,region in enumerate(regions_to_predict):
    print('=========\n')
    regionFinalAcc = regions_stats['accuracy_0'][n]
    cut_off_var = regions_stats['cut_off_var'][n]
    region_model_number = regions_stats['model'][n]
    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {regionFinalAcc}:\n')
    
    regionsToTest = [x for x in regions_to_feed]
    regionsToTest.remove(region)
    random.shuffle(regionsToTest)
    for nn,regionToTest in enumerate(regionsToTest):
        regions_train_data[region].append(regionToTest)
        
        metric, pred = Utils.generate_metric(region_model_number, regions_feature_cols[region]
                                             , regions_train_data[region], region, reps=1)
        
        if metric < regionFinalAcc or np.isnan(regionFinalAcc):
            regionFinalAcc = metric
            
            print(f'{regionFinalAcc} -> {regionToTest} added                                           ')
        else:
            regions_train_data[region].remove(regionToTest)
    
    train_len = Utils.train_len
    regions_stats['accuracy_0'][n] = regionFinalAcc
    regions_stats['train_size'][n] = train_len
    regions_stats['test_size'][n] = len(pred)
    
    print(f'\naccuracy: {regionFinalAcc}')
    print(f'{region} train data: {regions_train_data[region]}\nlen: {train_len}')
    print(f'test data len: {len(pred)}\n')

mean_acc = np.mean(regions_stats['accuracy_0'])
print(mean_acc)


[1 of 27] region LPL -> nan:

0.313 -> BRCC added                                           
0.31 -> Baltic added                                           
0.308 -> NLC added                                           
0.307 -> LVP added                                           
0.299 -> KeSPA added                                           

accuracy: 0.299
LPL train data: ['LPL', 'BRCC', 'Baltic', 'NLC', 'LVP', 'KeSPA']
len: 3991
test data len: 626


[2 of 27] region LCK -> nan:

0.374 -> LMS added                                           
0.366 -> LST added                                           
0.36 -> LJL_Tier2 added                                           
0.358 -> LCL added                                           
0.357 -> Asia added                                           

accuracy: 0.357
LCK train data: ['LCK', 'LMS', 'LST', 'LJL_Tier2', 'LCL', 'Asia']
len: 2478
test data len: 628


[3 of 27] region PCS -> nan:

0.339 -> EBL added                                 

### FEATURE SELECTION

In [5]:
regions_stats['accuracy_1'] = np.nan
for key in regions_list:
    regions_feature_cols[key] = PLAYER_SIMPLE_FEATURE_COLS.copy()

##########

for n,region in enumerate(regions_to_predict):
    print('=========\n')
    regionFinalAcc = regions_stats['accuracy_0'][n]
    cut_off_var = regions_stats['cut_off_var'][n]
    region_model_number = regions_stats['model'][n]
    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {regionFinalAcc}:\n')
    print(f'model: {region_model_number}')
    
    initialFeatures = regions_feature_cols[region].copy()
    for nn,feature in enumerate(initialFeatures):
        regions_feature_cols[region].remove(feature)
        
        metric, pred = Utils.generate_metric(region_model_number, regions_feature_cols[region]
                                             , regions_train_data[region], region, reps=1)
        
        if metric < regionFinalAcc or np.isnan(regionFinalAcc):
            regionFinalAcc = metric
            print(f'{feature} removed for {metric}                                                ')
        else:
            regions_feature_cols[region].append(feature)
    
    train_len = Utils.train_len
    regions_stats['accuracy_1'][n] = regionFinalAcc
    regions_stats['train_size'][n] = train_len

    print(f'\n\naccuracy: {regionFinalAcc}')
    print(f'{region} feature count: {len(regions_feature_cols[region])}')
    print(f'test data len: {len(pred)}\n')
    
mean_acc = np.mean(regions_stats['accuracy_1'])
mean_train = np.mean(regions_stats['train_size'])

print(mean_train)
print(mean_acc)



[1 of 27] region LPL -> 0.299:

model: 7


accuracy: 0.299
LPL feature count: 22
test data len: 626


[2 of 27] region LCK -> 0.357:

model: 7
CSD@15 removed for 0.35                                                
VSPM removed for 0.346                                                


accuracy: 0.346
LCK feature count: 20
test data len: 628


[3 of 27] region PCS -> 0.296:

model: 7


accuracy: 0.296
PCS feature count: 22
test data len: 274


[4 of 27] region VCS -> 0.367:

model: 7


accuracy: 0.367
VCS feature count: 22
test data len: 300


[5 of 27] region Ultraliga -> 0.391:

model: 7
VSPM removed for 0.379                                                


accuracy: 0.379
Ultraliga feature count: 21
test data len: 174


[6 of 27] region LLA -> 0.348:

model: 7
Avg_assists removed for 0.335                                                


accuracy: 0.335
LLA feature count: 21
test data len: 164


[7 of 27] region SuperLiga_Tier2 -> 0.409:

model: 7


accuracy: 0.409
SuperLiga_Ti

### MODEL SELECTION

In [6]:
regions_stats['accuracy_2'] = np.nan

##########

for n,region in enumerate(regions_to_predict):
    print('=========\n')
    currAcc = regions_stats['accuracy_0'][n]
    currModel = regions_stats['model'][n]
    cut_off_var = regions_stats['cut_off_var'][n]
    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {currAcc}:\n')
    print(f'current model: {currModel}\n')
    
    bestModelAbs = (regions_stats[regions_stats['region']==region])['accuracy_2'].iloc[0]
    for model in range(len(Utils.BASE_MODELS)):
        metric, pred = Utils.generate_metric(model, regions_feature_cols[region]
                                             , regions_train_data[region], region, reps=5)
        if metric < bestModelAbs or np.isnan(bestModelAbs):
            bestModelAbs = metric
            bestModel = model
        print(f'model {model} -> {metric}')

    train_len = Utils.train_len
    regions_stats['train_size'][n] = train_len
    regions_stats['model'][n] = bestModel
    regions_stats['accuracy_2'][n] = bestModelAbs
    
    print(f'\naccuracy: {bestModelAbs}')
    print(f'best model: {bestModel}\n')
    
mean_acc = np.mean(regions_stats['accuracy_2'])
print(mean_acc)
mean_train = np.mean(regions_stats['train_size'])
print(mean_train)


[1 of 27] region LPL -> 0.299:

current model: 7

model 0 -> 0.349
model 1 -> 0.529
model 2 -> 0.494
model 3 -> 0.367
model 4 -> 0.362
model 5 -> 0.419
model 6 -> 0.339
model 7 -> 0.299

accuracy: 0.299
best model: 7


[2 of 27] region LCK -> 0.357:

current model: 7

model 0 -> 0.434
model 1 -> 0.457
model 2 -> 0.49
model 3 -> 0.442
model 4 -> 0.456
model 5 -> 0.454
model 6 -> 0.392
model 7 -> 0.346

accuracy: 0.346
best model: 7


[3 of 27] region PCS -> 0.296:

current model: 7

model 0 -> 0.409
model 1 -> 0.42
model 2 -> 0.45
model 3 -> 0.385
model 4 -> 0.404
model 5 -> 0.488
model 6 -> 0.423
model 7 -> 0.296

accuracy: 0.296
best model: 7


[4 of 27] region VCS -> 0.367:

current model: 7

model 0 -> 0.433
model 1 -> 0.413
model 2 -> 0.481
model 3 -> 0.467
model 4 -> 0.412
model 5 -> 0.523
model 6 -> 0.423
model 7 -> 0.367

accuracy: 0.367
best model: 7


[5 of 27] region Ultraliga -> 0.391:

current model: 7

model 0 -> 0.485
model 1 -> 0.46
model 2 -> 0.486
model 3 -> 0.482
mod

In [7]:
print(np.mean(regions_stats['accuracy_0']))
print(np.mean(regions_stats['accuracy_1']))
print(np.mean(regions_stats['accuracy_2']))
regions_stats

0.3191481481481482
0.31133333333333335
0.3091481481481481


Unnamed: 0,region,model,test_size,train_size,accuracy_0,accuracy_1,accuracy_2,cut_off_var
0,LPL,7,626,3833,0.299,0.299,0.299,1.5
1,LCK,7,628,2478,0.357,0.346,0.346,1.5
2,PCS,7,274,2850,0.296,0.296,0.296,1.5
3,VCS,7,300,6498,0.367,0.367,0.367,1.5
4,Ultraliga,7,174,1114,0.391,0.379,0.379,1.5
5,LLA,7,164,3167,0.348,0.335,0.335,1.5
6,SuperLiga_Tier2,0,115,3985,0.409,0.409,0.363,1.5
7,TCL,6,161,6197,0.348,0.348,0.335,1.5
8,LFL,7,368,1077,0.38,0.37,0.37,1.5
9,Prime,7,179,2904,0.352,0.324,0.324,1.5


In [8]:
print(np.mean(regions_stats['accuracy_0']))
print(np.mean(regions_stats['accuracy_1']))
print(np.mean(regions_stats['accuracy_2']))
regions_stats

0.3191481481481482
0.31133333333333335
0.3091481481481481


Unnamed: 0,region,model,test_size,train_size,accuracy_0,accuracy_1,accuracy_2,cut_off_var
0,LPL,7,626,3833,0.299,0.299,0.299,1.5
1,LCK,7,628,2478,0.357,0.346,0.346,1.5
2,PCS,7,274,2850,0.296,0.296,0.296,1.5
3,VCS,7,300,6498,0.367,0.367,0.367,1.5
4,Ultraliga,7,174,1114,0.391,0.379,0.379,1.5
5,LLA,7,164,3167,0.348,0.335,0.335,1.5
6,SuperLiga_Tier2,0,115,3985,0.409,0.409,0.363,1.5
7,TCL,6,161,6197,0.348,0.348,0.335,1.5
8,LFL,7,368,1077,0.38,0.37,0.37,1.5
9,Prime,7,179,2904,0.352,0.324,0.324,1.5


In [9]:
regions_cache = dict()
regions_cache[CURRENT_YEAR_SEMESTER]['features'] = regions_feature_cols
regions_cache[CURRENT_YEAR_SEMESTER]['train_data'] = regions_train_data

with open(f'Data/raw_data/regions_cache.json', 'w') as fp:
    json.dump(regions_cache, fp)

regions_stats.to_pickle("Data/raw_data/regions_stats.pkl")

KeyError: '20231'

# Notes