# Setup

In [1]:
import pandas as pd
import numpy as np
import random
import sklearn.metrics as skm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
    
import json

from Utils.constants import *
import Utils.utils_file as utils_file
import Utils.model_file as model_file

import warnings
# Show up to 100 rows and every column when a DataFrame is rendered.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
# NOTE(review): this suppresses every warning for the whole session, which also
# hides pandas chained-assignment / deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load

In [2]:
# When True, the Utils object is built with cache_model=True and the previously
# saved selection results are read back from it below.
cache = True

Utils = utils_file.Utils_Class(target='Score'
                                ,default_model=0
                                ,model_type='logistic'
                                ,cache_model=cache
                                ,cache_scraping=True)

# Keep only the rows whose target column differs from 2.
# NOTE(review): the meaning of target value 2 is not visible here — confirm and document it.
Utils.TARGET_DF = Utils.TARGET_DF[Utils.TARGET_DF[Utils.TARGET]!=2]

regions_to_feed, regions_to_predict = Utils.region_lists()
# Every predicted region is also a candidate training region; going through a
# set removes duplicates, so the resulting order is arbitrary.
regions_to_feed = list(set(regions_to_feed + regions_to_predict))

if cache:
    try:
        regions_feature_cols = Utils.regions_feature_cols
        regions_train_data = Utils.regions_train_data
        regions_stats = Utils.regions_stats
    except Exception as exc:
        # Best effort: keep the notebook running, but show the real cause rather
        # than swallowing it (a bare `except:` would also trap KeyboardInterrupt).
        print(f'files not found: {exc!r}')

# SKLEARN

In [3]:
# Fresh bookkeeping for a full selection run. This rebinds regions_stats,
# regions_train_data and regions_feature_cols, replacing any values the Load
# cell read from the cache.
stats_columns = ['region', 'model', 'test_size', 'train_size']
regions_stats = pd.DataFrame(columns=stats_columns)
regions_stats['region'] = regions_to_predict
regions_stats['model'] = Utils.DEFAULT_MODEL
regions_list = regions_stats['region']

# Each region starts with itself as its only training region.
regions_train_data = {region: [region] for region in regions_to_predict}

# Scalar defaults broadcast to every row; column order matches the original cell.
for column, initial in (('error', np.nan),
                        ('cut_off_var', 1.5),
                        ('threshold', 0.35),
                        ('len_lost', np.nan)):
    regions_stats[column] = initial

# Every region gets its own independent copy of the base feature list.
regions_feature_cols = {region: PLAYER_SIMPLE_FEATURE_COLS.copy() for region in regions_list}

### TRAIN DATA SELECTION

In [4]:
# Greedy forward selection of training regions, one target region at a time.
# Each candidate region is tentatively added to the target's training set and
# kept only if the metric from Utils.generate_metric, rounded to 2 decimals,
# becomes strictly lower (or no metric has been recorded yet). The printed
# label says "accuracy", but the loop minimises the value.
regions_train_data = {region: [region] for region in regions_to_predict}

##########

for n, region in enumerate(regions_to_predict):
    print('=========\n')
    current_accuracy = regions_stats.loc[n, 'error']
    region_model_number = regions_stats.loc[n, 'model']
    Utils.logistic_threshold = regions_stats.loc[n, 'threshold']

    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {current_accuracy}:\n')

    # Candidates: every feed region except the target itself, in random order.
    # NOTE(review): `random` is never seeded, so the selected set can differ between runs.
    regionsToTest = [x for x in regions_to_feed]
    regionsToTest.remove(region)
    random.shuffle(regionsToTest)

    # Reset per region so a prediction from the previous region is never reused.
    pred = None
    for nn, regionToTest in enumerate(regionsToTest):
        regions_train_data[region].append(regionToTest)

        metric, pred, ytrain = Utils.generate_metric(region_model_number, regions_feature_cols[region]
                                                    , regions_train_data[region], region, reps=5)

        if np.isnan(current_accuracy) or round(metric, 2) < round(current_accuracy, 2):
            current_accuracy = metric

            print(f'{current_accuracy} -> {regionToTest} added                                           ')
        else:
            # No improvement: undo the tentative addition.
            regions_train_data[region].remove(regionToTest)

    # NOTE(review): if generate_metric refreshes Utils.train_len / Utils.len_lost on
    # every call, these describe the last candidate evaluated, not necessarily the
    # accepted training set — confirm.
    train_len = Utils.train_len
    test_len = len(pred) if pred is not None else np.nan

    # Single-step .loc writes: the chained form `regions_stats[col][n] = value` may
    # write to a temporary and is a no-op under pandas Copy-on-Write.
    regions_stats.loc[n, 'error'] = current_accuracy
    regions_stats.loc[n, 'train_size'] = train_len
    regions_stats.loc[n, 'test_size'] = test_len
    regions_stats.loc[n, 'len_lost'] = Utils.len_lost

    print(f'\nlen lost: {Utils.len_lost}%')
    print(f'accuracy: {current_accuracy}')
    print(f'{region} train data: {regions_train_data[region]}\nlen: {train_len}')
    print(f'test data len: {test_len}\n')

mean_acc = np.mean(regions_stats['error'])
print(mean_acc)


[1 of 32] region LVP -> nan:

0.444 -> Iberian added                                           
0.405 -> LCL added                                           
0.312 -> LLA added                                           
0.305 -> TCL added                                           
0.278 -> Asia added                                           
0.268 -> Prime added                                           
0.255 -> Ultraliga added                                           
0.253 -> OPL added                                           
0.244 -> EU added                                           

len lost: 0.53%
accuracy: 0.244
LVP train data: ['LVP', 'Iberian', 'LCL', 'LLA', 'TCL', 'Asia', 'Prime', 'Ultraliga', 'OPL', 'EU']
len: 2917
test data len: 410


[2 of 32] region Belgian -> nan:

0.25 -> CK added                                           
0.148 -> NA_Tier2 added                                           
0.077 -> LJL added                                           
0.069 -> LPLO

### FEATURE SELECTION

In [5]:
# Greedy backward elimination of features, one target region at a time.
# Start every region from a fresh copy of the base feature list, then try
# dropping each feature in turn; the drop is kept only if the metric, rounded
# to 2 decimals, becomes strictly lower (or no metric has been recorded yet).
for key in regions_list:
    regions_feature_cols[key] = PLAYER_SIMPLE_FEATURE_COLS.copy()

##########

for n, region in enumerate(regions_to_predict):
    print('=========\n')

    current_accuracy = regions_stats.loc[n, 'error']
    region_model_number = regions_stats.loc[n, 'model']
    Utils.logistic_threshold = regions_stats.loc[n, 'threshold']

    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {current_accuracy}:\n')
    print(f'model: {region_model_number}')

    # Iterate over a snapshot because the live list is mutated inside the loop.
    initialFeatures = regions_feature_cols[region].copy()
    # Reset per region so a prediction from the previous region is never reused.
    pred = None
    for nn, feature in enumerate(initialFeatures):
        regions_feature_cols[region].remove(feature)

        metric, pred, ytrain = Utils.generate_metric(region_model_number, regions_feature_cols[region]
                                                    , regions_train_data[region], region, reps=5)
        if np.isnan(current_accuracy) or round(metric, 2) < round(current_accuracy, 2):
            current_accuracy = metric
            print(f'{feature} removed for {metric}                                                ')
        else:
            # No improvement: put the feature back (it moves to the end of the list).
            regions_feature_cols[region].append(feature)

    # NOTE(review): if generate_metric refreshes Utils.train_len / Utils.len_lost on
    # every call, these describe the last trial, not necessarily the kept feature set — confirm.
    train_len = Utils.train_len
    test_len = len(pred) if pred is not None else np.nan

    # Single-step .loc writes: the chained form `regions_stats[col][n] = value` may
    # write to a temporary and is a no-op under pandas Copy-on-Write.
    regions_stats.loc[n, 'error'] = current_accuracy
    regions_stats.loc[n, 'train_size'] = train_len
    regions_stats.loc[n, 'len_lost'] = Utils.len_lost

    print(f'\n\nlen lost: {Utils.len_lost}%')
    print(f'accuracy: {current_accuracy}')
    print(f'{region} feature count: {len(regions_feature_cols[region])}')
    print(f'test data len: {test_len}\n')

mean_acc = np.mean(regions_stats['error'])
print(mean_acc)


[1 of 32] region LVP -> 0.244:

model: 0
KP% removed for 0.232                                                


len lost: 0.53%
accuracy: 0.232
LVP feature count: 21
test data len: 410


[2 of 32] region Belgian -> 0.0:

model: 0


len lost: 0.21%
accuracy: 0.0
Belgian feature count: 22
test data len: 34


[3 of 32] region GLL -> 0.136:

model: 0
DPM removed for 0.133                                                


len lost: 0.28%
accuracy: 0.133
GLL feature count: 21
test data len: 61


[4 of 32] region MSI -> 0.26:

model: 0


len lost: 0.4%
accuracy: 0.26
MSI feature count: 22
test data len: 81


[5 of 32] region EU -> 0.27:

model: 0
Games removed for 0.26                                                
CSD@15 removed for 0.249                                                


len lost: 0.39%
accuracy: 0.249
EU feature count: 20
test data len: 457


[6 of 32] region LCS_Tier2 -> 0.208:

model: 0
GD@15 removed for 0.186                                                
CSD@15 remo

### MODEL SELECTION

In [6]:
# Per-region model selection: evaluate every entry of Utils.BASE_MODELS with the
# region's chosen features and training regions, and keep the model whose
# metric, rounded to 2 decimals, is strictly lowest (ties keep the earlier model).
# The stored error is cleared first, so the first model evaluated always
# becomes the initial best.
regions_stats['error'] = np.nan

##########

for n, region in enumerate(regions_to_predict):
    print('=========\n')

    current_accuracy = regions_stats.loc[n, 'error']
    currModel = regions_stats.loc[n, 'model']
    Utils.logistic_threshold = regions_stats.loc[n, 'threshold']

    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {current_accuracy}:\n')
    print(f'current model: {currModel}\n')

    bestModelAbs = (regions_stats[regions_stats['region']==region])['error'].iloc[0]
    # Default to the current model so bestModel is never undefined, or carried over
    # from the previous region, when no model is evaluated.
    bestModel = currModel
    for model in range(len(Utils.BASE_MODELS)):
        metric, pred, ytrain = Utils.generate_metric(model, regions_feature_cols[region]
                                                    , regions_train_data[region], region, reps=5)
        if np.isnan(bestModelAbs) or round(metric, 2) < round(bestModelAbs, 2):
            bestModelAbs = metric
            bestModel = model
        print(f'model {model} -> {metric}')

    # NOTE(review): if generate_metric refreshes Utils.train_len / Utils.len_lost on
    # every call, these describe the last model evaluated, not the best one — confirm.
    train_len = Utils.train_len

    # Single-step .loc writes: the chained form `regions_stats[col][n] = value` may
    # write to a temporary and is a no-op under pandas Copy-on-Write.
    regions_stats.loc[n, 'train_size'] = train_len
    regions_stats.loc[n, 'model'] = bestModel
    regions_stats.loc[n, 'error'] = bestModelAbs
    regions_stats.loc[n, 'len_lost'] = Utils.len_lost

    print(f'\nlen lost: {Utils.len_lost}%')
    print(f'accuracy: {bestModelAbs}')
    print(f'best model: {bestModel}\n')

mean_acc = np.mean(regions_stats['error'])
print(mean_acc)


[1 of 32] region LVP -> nan:

current model: 0

model 0 -> 0.232
model 1 -> 0.262
model 2 -> 0.263
model 3 -> 0.28
model 4 -> 0.265
model 5 -> 0.268
model 6 -> 0.262
model 7 -> 0.232
model 8 -> 0.269
model 9 -> 0.269
model 10 -> 0.23
model 11 -> 0.23
model 12 -> 0.23

len lost: 0.66%
accuracy: 0.232
best model: 0


[2 of 32] region Belgian -> nan:

current model: 0

model 0 -> 0.0
model 1 -> 0.074
model 2 -> 0.148
model 3 -> 0.143
model 4 -> 0.074
model 5 -> 0.038
model 6 -> 0.074
model 7 -> 0.0
model 8 -> 0.115
model 9 -> 0.115
model 10 -> 0.115
model 11 -> 0.115
model 12 -> 0.115

len lost: 0.24%
accuracy: 0.0
best model: 0


[3 of 32] region GLL -> nan:

current model: 0

model 0 -> 0.133
model 1 -> 0.217
model 2 -> 0.283
model 3 -> 0.279
model 4 -> 0.222
model 5 -> 0.234
model 6 -> 0.217
model 7 -> 0.133
model 8 -> 0.267
model 9 -> 0.267
model 10 -> 0.289
model 11 -> 0.289
model 12 -> 0.289

len lost: 0.26%
accuracy: 0.133
best model: 0


[4 of 32] region MSI -> nan:

current mode

### PERCENTAGE VALIDATION

In [3]:
# Collect one frame of (predicted, true, region) per target region using the
# selected model / features / training regions, then derive:
#   round     - prediction rounded to the nearest integer class
#   result    - 1 when the rounded prediction equals the true label, else 0
#   predicted - folded onto [0.5, 1]: values below 0.5 become 1 - value
region_frames = []

##########

for n, region in enumerate(regions_to_predict):
    region_model_number = regions_stats.loc[n, 'model']
    Utils.logistic_threshold = regions_stats.loc[n, 'threshold']

    metric, pred, ytest = Utils.generate_metric(region_model_number, regions_feature_cols[region]
                                                , regions_train_data[region], region, reps=1)
    df_temp = pd.DataFrame({'predicted': pred, 'true': ytest})
    df_temp['region'] = region
    region_frames.append(df_temp)

# Concatenate once instead of growing the frame inside the loop (which is
# quadratic and starts from an empty object-dtype frame).
if region_frames:
    perc_val_df = pd.concat(region_frames, ignore_index=True)
else:
    # No regions to predict: keep the expected columns with numeric dtypes.
    perc_val_df = pd.DataFrame({'predicted': pd.Series(dtype=float),
                                'true': pd.Series(dtype=float),
                                'region': pd.Series(dtype=object)})

perc_val_df['round'] = perc_val_df['predicted'].round().astype(int)
perc_val_df['result'] = (perc_val_df['true'] == perc_val_df['round']).astype(int)

# Fold after 'round'/'result' are computed, since those need the raw prediction.
raw_predicted = perc_val_df['predicted']
perc_val_df['predicted'] = raw_predicted.mask(raw_predicted < 0.5, 1 - raw_predicted)
    


In [4]:
# Calibration check: for each region and each bin [threshold, threshold + 0.1]
# of the folded prediction (both edges inclusive), compare the share of correct
# predictions ('result') with the bin's lower edge ('threshold').
# Rows are collected in a list and the frame is built once, because
# DataFrame.append was removed in pandas 2.0.
calibration_rows = []
for reg in regions_to_predict:
    print('\n======================================\n')
    print(f'region: {reg}')
    # Region filter hoisted out of the threshold loop.
    region_df = perc_val_df[perc_val_df['region'] == reg]
    for x in np.arange(0.5,0.9,0.1):
        print('==========')
        print(f'threshold: {round(x,2)}')
        threshold = x
        test_df = region_df[(region_df['predicted'] >= threshold)
                            & (region_df['predicted'] <= threshold + 0.1)]

        # Only bins containing both correct and incorrect predictions are recorded.
        # NOTE(review): bins that are all-correct or all-wrong are skipped by this
        # guard — confirm that is intended.
        if len(test_df['result'].unique()) > 1:
            result = round((test_df['result'] == 1).sum() / len(test_df), 2)
            print(f'result: {result}')
            calibration_rows.append((reg, round(threshold, 2), result, len(test_df)))

        print(f'len: {len(test_df)}')

# Naming the columns up front also works when no bin qualified.
perc_val_result_df = pd.DataFrame(calibration_rows, columns=['region','threshold','result','len'])
perc_val_result_df['diff'] = perc_val_result_df['result'] - perc_val_result_df['threshold']



region: Demacia
threshold: 0.5
result: 0.7
len: 33
threshold: 0.6
result: 0.71
len: 34
threshold: 0.7
result: 0.87
len: 31
threshold: 0.8
result: 0.82
len: 28


region: Kespa
threshold: 0.5
len: 7
threshold: 0.6
result: 0.25
len: 8
threshold: 0.7
result: 0.67
len: 12
threshold: 0.8
len: 14


region: VCS
threshold: 0.5
result: 0.58
len: 136
threshold: 0.6
result: 0.67
len: 139
threshold: 0.7
result: 0.88
len: 139
threshold: 0.8
result: 0.71
len: 122


region: GLL
threshold: 0.5
result: 0.55
len: 31
threshold: 0.6
result: 0.58
len: 33
threshold: 0.7
result: 0.84
len: 43
threshold: 0.8
result: 0.76
len: 37


region: Prime
threshold: 0.5
result: 0.57
len: 121
threshold: 0.6
result: 0.55
len: 130
threshold: 0.7
result: 0.7
len: 134
threshold: 0.8
result: 0.69
len: 67


region: LPLOL
threshold: 0.5
result: 0.45
len: 74
threshold: 0.6
result: 0.74
len: 43
threshold: 0.7
result: 0.85
len: 40
threshold: 0.8
result: 0.86
len: 66


region: World
threshold: 0.5
result: 0.43
len: 65
threshold: 0.

In [7]:
# Bins whose observed hit rate is more than 0.05 below the bin's lower edge:
# print their share of all recorded bins, then count them per region.
# .assign builds a new frame, so no column is written onto a filtered slice
# (which triggers SettingWithCopyWarning).
bad_diffs = perc_val_result_df[perc_val_result_df['diff'] < -0.05].assign(cont=1)
print(len(bad_diffs)/len(perc_val_result_df))
bad_diffs.groupby(by='region',as_index=False)['cont'].sum()

0.18461538461538463


Unnamed: 0,region,cont
0,Belgian,1
1,CBLOL,1
2,EU,1
3,Kespa,1
4,LCK,1
5,LCK_Tier2,2
6,LCO,1
7,LCS,1
8,LCS_Tier2,2
9,LEC,1


In [5]:
# Bins whose observed hit rate is more than 0.1 below the bin's lower edge:
# print their share of all recorded bins, then count them per region.
# .assign builds a new frame, so no column is written onto a filtered slice
# (which triggers SettingWithCopyWarning).
bad_diffs = perc_val_result_df[perc_val_result_df['diff'] < -0.1].assign(cont=1)
print(len(bad_diffs)/len(perc_val_result_df))
bad_diffs.groupby(by='region',as_index=False)['cont'].sum()

0.06722689075630252


Unnamed: 0,region,cont
0,Elite_Tier2,1
1,LCK_Tier2,2
2,LCS_Tier2,1
3,LVP,1
4,LVP2,1
5,Prime,1
6,Ultraliga,1


In [6]:
# Same -0.1 check as the preceding cell; the output below it comes from a different run.
# .assign builds a new frame, so no column is written onto a filtered slice
# (which triggers SettingWithCopyWarning).
bad_diffs = perc_val_result_df[perc_val_result_df['diff'] < -0.1].assign(cont=1)
print(len(bad_diffs)/len(perc_val_result_df))
bad_diffs.groupby(by='region',as_index=False)['cont'].sum()

0.042735042735042736


Unnamed: 0,region,cont
0,LCK_Tier2,1
1,LVP,1
2,Turkey_Tier2,1
3,VCS,1
4,World,1


In [6]:
# Same -0.1 check again; the output below it comes from yet another run.
# .assign builds a new frame, so no column is written onto a filtered slice
# (which triggers SettingWithCopyWarning).
bad_diffs = perc_val_result_df[perc_val_result_df['diff'] < -0.1].assign(cont=1)
print(len(bad_diffs)/len(perc_val_result_df))
bad_diffs.groupby(by='region',as_index=False)['cont'].sum()

0.06140350877192982


Unnamed: 0,region,cont
0,EMEA,2
1,Elite_Tier2,1
2,NACL,1
3,Prime,1
4,SuperLiga,1
5,VCS,1


In [6]:
# Mean of the per-region 'error' column, followed by the full stats table.
print(regions_stats['error'].mean())
regions_stats

0.1905806451612903


Unnamed: 0,region,model,test_size,train_size,error,cut_off_var,threshold,len_lost
0,Prime,0,478,2388,0.283,1.5,0.35,0.55
1,LVP,0,368,3953,0.272,1.5,0.35,0.63
2,NLC,10,255,2081,0.154,1.5,0.35,0.59
3,LVP2,0,5,1374,0.0,1.5,0.35,0.6
4,Ultraliga,0,319,3031,0.193,1.5,0.35,0.45
5,LFL,0,567,1467,0.222,1.5,0.35,0.57
6,GLL,0,231,1133,0.204,1.5,0.35,0.42
7,MSI,0,158,4725,0.058,1.5,0.35,0.59
8,EU,0,531,1637,0.23,1.5,0.35,0.71
9,LCS,2,567,2533,0.27,1.5,0.35,0.62


In [8]:
# Hand the per-region stats table, feature lists and training-region lists to the
# Utils helper for saving.
# NOTE(review): presumably these are what the Load cell reads back through
# Utils.regions_stats / regions_feature_cols / regions_train_data — confirm.
Utils.save_model_cache(regions_stats, regions_feature_cols, regions_train_data)

Cache saved!


# Notes