# Setup

In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
    
import json

from Utils.constants import *
import Utils.utils_file as utils_file
import Utils.model_file as model_file

import warnings
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Load

In [2]:
cache = False

Utils = utils_file.Utils_Class(target='Score'
                                ,default_model=0
                                ,model_type='logistic'
                                ,cache_model=cache
                                ,cache_scraping=True)

Utils.TARGET_DF = Utils.TARGET_DF[Utils.TARGET_DF[Utils.TARGET]!=2]

regions_to_feed, regions_to_predict = Utils.region_lists()
regions_to_predict = ['LCS']
regions_to_feed = list(set(regions_to_feed + regions_to_predict))

if cache:
    regions_feature_cols = Utils.regions_feature_cols
    regions_train_data = Utils.regions_train_data
    regions_stats = Utils.regions_stats
    regions_list = regions_stats['region']
    regions_stats['train_size'] = 0
    regions_stats['test_size'] = 0
    regions_stats = regions_stats[regions_stats['region'].isin(regions_to_predict)].reset_index(drop=True)

#regions_to_predict.remove('MSI')
#regions_to_predict.remove('World')

#regions_to_predict = ['MSI','World']

# SKLEARN

In [3]:
regions_stats = pd.DataFrame(columns=['region','model','test_size','train_size'])
regions_stats['region'] = regions_to_predict
regions_list = regions_stats['region']
regions_stats['model'] = Utils.DEFAULT_MODEL

regions_train_data = dict(zip(regions_to_predict,[[x] for x in regions_to_predict]))
regions_stats['accuracy_0'] = np.nan
regions_stats['accuracy_1'] = np.nan
regions_stats['accuracy_2'] = np.nan
regions_stats['cut_off_var'] = 1.5

regions_feature_cols = dict(zip(regions_list,[0]*len(regions_list)))
for key in regions_feature_cols:
    regions_feature_cols[key] = PLAYER_SIMPLE_FEATURE_COLS.copy()

### TRAIN DATA SELECTION

In [4]:
regions_train_data = dict(zip(regions_to_predict, [[x] for x in regions_to_predict]))
regions_stats['accuracy_0'] = np.nan

##########

for n,region in enumerate(regions_to_predict):
    print('=========\n')
    regionFinalAcc = regions_stats['accuracy_0'][n]
    cut_off_var = regions_stats['cut_off_var'][n]
    region_model_number = regions_stats['model'][n]
    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {regionFinalAcc}:\n')
    
    regionsToTest = [x for x in regions_to_feed]
    regionsToTest.remove(region)
    random.shuffle(regionsToTest)
    for nn,regionToTest in enumerate(regionsToTest):
        regions_train_data[region].append(regionToTest)
        
        metric, pred = Utils.generate_metric(region_model_number, regions_feature_cols[region]
                                             , regions_train_data[region], region, reps=1)
        
        if metric < regionFinalAcc or np.isnan(regionFinalAcc):
            regionFinalAcc = metric
            
            print(f'{regionFinalAcc} -> {regionToTest} added                                           ')
        else:
            regions_train_data[region].remove(regionToTest)
    
    train_len = Utils.train_len
    regions_stats['accuracy_0'][n] = regionFinalAcc
    regions_stats['train_size'][n] = train_len
    regions_stats['test_size'][n] = len(pred)
    
    print(f'\nlen lost: {round(Utils.len_lost,2)}%')
    print(f'accuracy: {regionFinalAcc}')
    print(f'{region} train data: {regions_train_data[region]}\nlen: {train_len}')
    print(f'test data len: {len(pred)}\n')

mean_acc = np.mean(regions_stats['accuracy_0'])
print(mean_acc)


[1 of 1] region LCS -> nan:

0.318 -> TCL added                                           
0.235 -> Demacia added                                           
0.233 -> Iberian added                                           
0.23 -> LLA added                                           
0.2 -> NLC added                                           
0.19 -> NA_Tier2 added                                           
0.177 -> EU added                                           
0.169 -> LCS_Tier2 added                                           
0.148 -> LJL added                                           

len lost: 0.7%
accuracy: 0.148
LCS train data: ['LCS', 'TCL', 'Demacia', 'Iberian', 'LLA', 'NLC', 'NA_Tier2', 'EU', 'LCS_Tier2', 'LJL']
len: 5917
test data len: 214

0.148


### FEATURE SELECTION

In [9]:
regions_stats['accuracy_1'] = np.nan
for key in regions_list:
    regions_feature_cols[key] = PLAYER_SIMPLE_FEATURE_COLS.copy()

##########

for n,region in enumerate(regions_to_predict):
    print('=========\n')
    regionFinalAcc = regions_stats['accuracy_0'][n]
    cut_off_var = regions_stats['cut_off_var'][n]
    region_model_number = regions_stats['model'][n]
    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {regionFinalAcc}:\n')
    print(f'model: {region_model_number}')
    
    initialFeatures = regions_feature_cols[region].copy()
    for nn,feature in enumerate(initialFeatures):
        regions_feature_cols[region].remove(feature)
        
        metric, pred = Utils.generate_metric(region_model_number, regions_feature_cols[region]
                                             , regions_train_data[region], region, reps=1)
        
        if metric < regionFinalAcc or np.isnan(regionFinalAcc):
            regionFinalAcc = metric
            print(f'{feature} removed for {metric}                                                ')
        else:
            regions_feature_cols[region].append(feature)
    
    train_len = Utils.train_len
    regions_stats['accuracy_1'][n] = regionFinalAcc
    regions_stats['train_size'][n] = train_len

    print(f'\n\nlen lost: {round(Utils.len_lost,2)}%')
    print(f'accuracy: {regionFinalAcc}')
    print(f'{region} feature count: {len(regions_feature_cols[region])}')
    print(f'test data len: {len(pred)}\n')
    
mean_acc = np.mean(regions_stats['accuracy_1'])
mean_train = np.mean(regions_stats['train_size'])

print(mean_train)
print(mean_acc)



[1 of 1] region LCS -> 0.148:

model: 0
DMG% removed for 0.143                                                


len lost: 0.74%
accuracy: 0.143
LCS feature count: 21
test data len: 214

5759.0
0.143


### MODEL SELECTION

In [10]:
regions_stats['accuracy_2'] = np.nan

##########

for n,region in enumerate(regions_to_predict):
    print('=========\n')
    currAcc = regions_stats['accuracy_0'][n]
    currModel = regions_stats['model'][n]
    cut_off_var = regions_stats['cut_off_var'][n]
    print(f'[{n+1} of {len(regions_to_predict)}] region {region} -> {currAcc}:\n')
    print(f'current model: {currModel}\n')
    
    bestModelAbs = (regions_stats[regions_stats['region']==region])['accuracy_2'].iloc[0]
    for model in range(len(Utils.BASE_MODELS)):
        metric, pred = Utils.generate_metric(model, regions_feature_cols[region]
                                             , regions_train_data[region], region, reps=5)
        if metric < bestModelAbs or np.isnan(bestModelAbs):
            bestModelAbs = metric
            bestModel = model
        print(f'model {model} -> {metric}')

    train_len = Utils.train_len
    regions_stats['train_size'][n] = train_len
    regions_stats['model'][n] = bestModel
    regions_stats['accuracy_2'][n] = bestModelAbs
    
    print(f'\nlen lost: {round(Utils.len_lost,2)}%')
    print(f'accuracy: {bestModelAbs}')
    print(f'best model: {bestModel}\n')
    
mean_acc = np.mean(regions_stats['accuracy_2'])
print(mean_acc)
mean_train = np.mean(regions_stats['train_size'])
print(mean_train)


[1 of 1] region LCS -> 0.148:

current model: 0

model 0 -> 0.143

len lost: 0.74%
accuracy: 0.143
best model: 0

0.143
5759.0


In [11]:
print(np.mean(regions_stats['accuracy_0']))
print(np.mean(regions_stats['accuracy_1']))
print(np.mean(regions_stats['accuracy_2']))
regions_stats

0.148
0.143
0.143


Unnamed: 0,region,model,test_size,train_size,accuracy_0,accuracy_1,accuracy_2,cut_off_var
0,LCS,0,214,5759,0.148,0.143,0.143,1.5


In [12]:
print(np.mean(regions_stats['accuracy_0']))
print(np.mean(regions_stats['accuracy_1']))
print(np.mean(regions_stats['accuracy_2']))
regions_stats

0.148
0.143
0.143


Unnamed: 0,region,model,test_size,train_size,accuracy_0,accuracy_1,accuracy_2,cut_off_var
0,LCS,0,214,5759,0.148,0.143,0.143,1.5


In [24]:
regions_cache = dict()
regions_cache[tuple(CURRENT_YEAR_SEMESTER, 'features', 'logistic')] = regions_feature_cols
regions_cache[tuple(CURRENT_YEAR_SEMESTER, 'train_data', 'logistic')] = regions_train_data

with open(f'Data/raw_data/regions_cache.json', 'w') as fp:
    json.dump(regions_cache, fp)

regions_stats.to_pickle("Data/raw_data/regions_stats.pkl")

KeyError: '20231'

# Notes