# SETUP

Data to export:
- regionsStats
- regionsTrainData
- base_models
- regionsFeatureCols
 
To do:
 - plotOverview
 - update notebook
 - streamlit interface for use

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import stats
import random

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, RepeatedKFold, cross_val_score
import sklearn.metrics as skm
from sklearn.metrics import accuracy_score
from sklearn.cluster import AffinityPropagation as AP
from sklearn.preprocessing import MinMaxScaler
from sklearn.covariance import EllipticEnvelope

from sklearn.ensemble import GradientBoostingRegressor, IsolationForest, RandomForestRegressor
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge, LinearRegression
from sklearn.svm import SVR, OneClassSVM
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor

import re
import time
import datetime as dt
import json

import sys
import os
sys.path.append(os.path.abspath
                (os.path.join
                 (os.path.dirname("constants.py"), '..')))
from constants import *
from scripts import *

import warnings
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

hide_toggle()

# FUNCTIONS

In [27]:
def matchListToDfs(df):
    matchListDateFilter = (df[df['Date'] >= pd.to_datetime('2019-7-01',format='%Y-%m-%d')]
                                        .reset_index(drop=True).copy())
    matchListDateFilter['realSemesterYear'] = (matchListDateFilter['realSemester'].astype(str)
                                               +matchListDateFilter['realYear'].astype(str))
    matchListDateFilter['tournament_id'] = (matchListDateFilter['TournamentRegion'].astype(str)
                                               +matchListDateFilter['realSemesterYear'].astype(str))
    
    playerMatchList = matchListDateFilter.copy()
    teamMatchList = matchListDateFilter.copy()

    for color in ['Blue','Red']:
        for feature in meanFeatures:
                teamMatchList[f'Team_{color}_{feature}'] = (matchListDateFilter[[f"{position}_{color}_{feature}" for position in positions]]
                                                        .mean(skipna=True,axis=1).copy())
                teamMatchList.drop([f"{position}_{color}_{feature}" for position in positions],axis=1,inplace=True)

        for feature in sumFeatures:
                teamMatchList[f'Team_{color}_{feature}'] = (matchListDateFilter[[f"{position}_{color}_{feature}" for position in positions]]
                                                        .sum(skipna=True,axis=1).copy())
                teamMatchList.drop([f"{position}_{color}_{feature}" for position in positions],axis=1,inplace=True)

        teamMatchList.drop([f"{position}_{color}" for position in positions],axis=1,inplace=True)
        
    return playerMatchList, teamMatchList

def regionLists(df, currentYear):
    regions = df['TournamentRegion'].unique()
    regionsToFeed = [x for x in df['TournamentRegion'].unique()]
    regionsFilterTemp = ([x for x in regions if currentYear in (df[df['TournamentRegion']==x])['realYear'].unique()
                                            and currentYear-1 in (df[df['TournamentRegion']==x])['realYear'].unique()])
    regionsToPredict = []
    for region in regionsFilterTemp:
        regionsFilterSize = df[(df['realYear']==currentYear) & (df['TournamentRegion']==region)]
        if len(regionsFilterSize)>=10:
            regionsToPredict.append(region)

    #regionsToPredict = regionsToPredict[0:5]
    
    return regions, regionsToFeed, regionsToPredict

def filterNan(df,dropType):
    dfRet = df
    
    if dropType==0:
        dfRet = dfRet.dropna()
        
    elif dropType==1:
        for col in dfRet.columns:
            dfRet[col] = dfRet[col].fillna(0)
            
    return dfRet

hide_toggle()

# LOAD

In [28]:
teamDataTable = pd.read_pickle("Data/raw_data/teamDataTable.pkl")
playerDataTable = pd.read_pickle("Data/raw_data/playerDataTable.pkl")

matchList = pd.read_pickle("Data/raw_data/matchList.pkl")
matchListFill = pd.read_pickle("Data/raw_data/matchListFill.pkl")

teamMatchList = pd.read_pickle("Data/raw_data/teamMatchList.pkl")
playerMatchList = pd.read_pickle("Data/raw_data/playerMatchList.pkl")

regionsStats = pd.read_pickle("./Data/raw_data/regionsStats.pkl")

with open(f'./Data/raw_data/regionsFeatureCols.json', 'r') as fp:
    regionsFeatureCols = json.load(fp)
with open(f'./Data/raw_data/regionsTrainData.json', 'r') as fp:
    regionsTrainData = json.load(fp)
    

# PREPROCESS

In [29]:
#            0          1
dfsFilter = [matchList, matchListFill]
dfTemp = dfsFilter[1].copy()

playerMatchList, teamMatchList = matchListToDfs(dfTemp)

#             0              1
dfsContent = [teamMatchList, playerMatchList]
dfToSplit = dfsContent[0].copy()

print(f'main df size: {len(dfToSplit)}')

base_models = [
              GradientBoostingRegressor(loss='absolute_error') #0
              ,ElasticNet() #1
              ,BayesianRidge() #2
              ,LinearRegression() #3
              ,SVR() #4
              ,CatBoostRegressor(verbose=False) #5
              ,KernelRidge() #6
              ,XGBRegressor() #7
              ,RandomForestRegressor() #8
              ]

currentTarget = 'totalKills'
defaultModel = 4
infoCols = ['Date','tournament_id',currentTarget,'TournamentRegion']
featureCols = [x for x in dfToSplit.columns if x not in offCols+infoCols]

regions, regionsToFeed, regionsToPredict = regionLists(dfToSplit, currentYear)

main df size: 26314


# SKLEARN

In [30]:
regionsStats = pd.DataFrame(columns=['region','model','size'])
regionsStats['region'] = regionsToPredict
regionsList = regionsStats['region']
regionsStats['model'] = defaultModel

regionsTrainData = dict(zip(regionsList,regionsList.apply(lambda x: [x])))
regionsStats['accuracy_0'] = np.nan
regionsStats['accuracy_1'] = np.nan
regionsStats['accuracy_2'] = np.nan
regionsStats['drop_nan_type'] = 0

### TRAIN DATA SELECTION

In [31]:
%%time

regionsTrainData = dict(zip(regionsList,regionsList.apply(lambda x: [x])))
regionsStats['accuracy_0'] = np.nan

for n,region in enumerate(regionsToPredict):
    print('=========\n')
    regionFinalAcc = (regionsStats[regionsStats['region']==region])['accuracy_0'].iloc[0]
    print(f'[{n+1} of {len(regionsToPredict)}] region {region} -> {round(regionFinalAcc,3)}:\n')
    tempTournamentId = region+currentSemesterYear
    
    regionsToTest = [x for x in regionsToFeed]
    regionsToTest.remove(region)
    random.shuffle(regionsToTest)
    for nn,regionToTest in enumerate(regionsToTest):
        regionsTrainData[region].append(regionToTest)
        
        dfSklearnRegions = dfToSplit[dfToSplit['TournamentRegion'].isin(regionsTrainData[region])].copy()
        dfSklearnRegions = dfSklearnRegions.sort_values(by='Date',ascending=True).copy()
        dfSklearnRegions = filterNan(dfSklearnRegions,regionsStats['drop_nan_type'][n])
        
        xtrain,ytrain,xtest,ytest = train_test_split2(dfSklearnRegions,tempTournamentId,currentTarget,verbose=False)
        
        region_model_number = (regionsStats[regionsStats['region']==region])['model'].iloc[0]
        region_model = base_models[region_model_number]
        region_model.fit(xtrain, ytrain)
        pred = region_model.predict(xtest)
        
        metric = round(skm.mean_absolute_error(ytest, pred),2)
        
        if metric < regionFinalAcc or np.isnan(regionFinalAcc):
            regionFinalAcc = metric
            
            print(f'{round(regionFinalAcc,3)} -> {regionToTest} added                                       ')
        else:
            regionsTrainData[region].remove(regionToTest)
            
        print(f'[{nn+1} of {len(regionsToTest)}] testing: {regionToTest}                        ',end='\r')
    
    regionsStats['accuracy_0'][n] = regionFinalAcc
    regionsStats['size'][n] = len(pred)
    
    print(f'\n\naccuracy: {round(regionFinalAcc,3)}')
    print(f'{region} train data: {regionsTrainData[region]}\nlen:{len(regionsTrainData[region])}')
    print(f'test data len: {len(pred)}\n')
    
printFinalResults(regionsStats, 'accuracy_0')

hide_toggle()


[1 of 22] region LPL -> nan:

7.21 -> EU added                                       
7.19 -> Demacia added                                       
7.16 -> LCO added                                       
7.13 -> World added                                       
[53 of 53] testing: PCS                                    

accuracy: 7.13
LPL train data: ['LPL', 'EU', 'Demacia', 'LCO', 'World']
len:5
test data len: 119


[2 of 22] region LCK -> nan:

5.76 -> SuperLiga_Tier2 added                                       
5.74 -> LVP2 added                                        
5.73 -> LJL added                                       
5.7 -> LCS added                                       
5.69 -> Iberian added                                       
[53 of 53] testing: Demacia                           

accuracy: 5.69
LCK train data: ['LCK', 'SuperLiga_Tier2', 'LVP2', 'LJL', 'LCS', 'Iberian']
len:6
test data len: 248


[3 of 22] region PCS -> nan:

6.35 -> NLC added                       

7.99 -> LJL added                                          
7.97 -> Turkey_Tier2 added                                       
[53 of 53] testing: PCS                                 

accuracy: 7.97
NLC train data: ['NLC', 'BRCC', 'World', 'CBLOL', 'PG', 'LCS_Tier2', 'Elite_Tier2', 'LJL', 'Turkey_Tier2']
len:9
test data len: 70


[12 of 22] region CBLOL_Tier2 -> nan:

6.72 -> MSI added                                       
6.64 -> MCR added                                       
6.53 -> GLL added                                       
6.52 -> Elite_Tier2 added                                       
6.51 -> VCS added                                       
[53 of 53] testing: LVP                                   

accuracy: 6.51
CBLOL_Tier2 train data: ['CBLOL_Tier2', 'MSI', 'MCR', 'GLL', 'Elite_Tier2', 'VCS']
len:6
test data len: 90


[13 of 22] region EBL -> nan:

6.63 -> LJL_Tier2 added                                       
6.44 -> LLA added                                       
6

6.44 -> LCK added                                       
6.23 -> Turkey_Tier2 added                                       
6.21 -> LVP2 added                                       
6.2 -> NACL added                                       
6.18 -> LPL added                                       
6.16 -> Ultraliga added                                       
6.15 -> NA_Tier2 added                                       
6.14 -> LJL_Tier2 added                                       
6.1 -> CBLOL_Tier2 added                                       
6.09 -> LVP added                                       
6.04 -> Trinity added                                       
[53 of 53] testing: VCS                               

accuracy: 6.04
LCO train data: ['LCO', 'Prime', 'LCK', 'Turkey_Tier2', 'LVP2', 'NACL', 'LPL', 'Ultraliga', 'NA_Tier2', 'LJL_Tier2', 'CBLOL_Tier2', 'LVP', 'Trinity']
len:13
test data len: 23


mean accuracy: 6.576
avg df len: 98.36363636363636

CPU times: total: 8min 58s
Wall tim

### DROP NAN VALUES

In [33]:
%%time

for n,region in enumerate(regionsToPredict):
    print('=========\n')
    bestDropMetric = (regionsStats[regionsStats['region']==region])['accuracy_0'].iloc[0]
    bestDropType = regionsStats['drop_nan_type'][n]
    print(f'[{n+1} of {len(regionsToPredict)}] region {region} -> {round(regionFinalAcc,3)}:')
    tempTournamentId = region+currentSemesterYear
    
    for dropType in range(0,2):
        dfSklearnRegions = dfToSplit[dfToSplit['TournamentRegion'].isin(regionsTrainData[region])].copy()
        dfSklearnRegions = dfSklearnRegions.sort_values(by='Date',ascending=True).copy()
        dfSklearnRegions = filterNan(dfSklearnRegions, dropType)

        xtrain,ytrain,xtest,ytest = train_test_split2(dfSklearnRegions,tempTournamentId,currentTarget,verbose=False)
        
        region_model_number = (regionsStats[regionsStats['region']==region])['model'].iloc[0]
        region_model = base_models[region_model_number]
        region_model.fit(xtrain, ytrain)
        pred = region_model.predict(xtest)
        
        metric = round(skm.mean_absolute_error(ytest, pred),2)
        print(f'\ntesting dropType: {dropType}\naccuracy: {metric}')
        if metric < bestDropMetric or np.isnan(bestDropMetric):
            bestDropMetric = metric
            bestDropType = dropType
            print(f'dropType changed to {dropType}: {round(bestDropMetric,3)}')
    
    regionsStats['accuracy_0'][n] = bestDropMetric
    regionsStats['drop_nan_type'][n] = bestDropType
    print(f'\nfinal accuracy: {bestDropMetric}\nfinal droptype: {bestDropType}\n')
    print(f'test data len: {len(pred)}\n')
    
printFinalResults(regionsStats, 'accuracy_0')

hide_toggle()


[1 of 22] region LPL -> 6.04:

testing dropType: 0
accuracy: 7.13

testing dropType: 1
accuracy: 6.59
dropType changed to 1: 6.59

final accuracy: 6.59
final droptype: 1

test data len: 329


[2 of 22] region LCK -> 6.04:

testing dropType: 0
accuracy: 5.69

testing dropType: 1
accuracy: 5.7

final accuracy: 5.69
final droptype: 0

test data len: 248


[3 of 22] region PCS -> 6.04:

testing dropType: 0
accuracy: 6.13

testing dropType: 1
accuracy: 6.13

final accuracy: 6.13
final droptype: 0

test data len: 105


[4 of 22] region VCS -> 6.04:

testing dropType: 0
accuracy: 6.12

testing dropType: 1
accuracy: 6.12

final accuracy: 6.12
final droptype: 0

test data len: 92


[5 of 22] region Ultraliga -> 6.04:

testing dropType: 0
accuracy: 7.04

testing dropType: 1
accuracy: 7.05

final accuracy: 7.04
final droptype: 0

test data len: 108


[6 of 22] region LLA -> 6.04:

testing dropType: 0
accuracy: 6.44

testing dropType: 1
accuracy: 6.45

final accuracy: 6.44
final droptype: 0

test

### FEATURE SELECTION

In [35]:
%%time

regionsFeatureCols = dict(zip(regionsList,[0]*len(regionsList)))
for key in regionsFeatureCols:
    regionsFeatureCols[key] = featureCols.copy()

regionsStats['accuracy_1'] = np.nan

for n,region in enumerate(regionsToPredict):
    print('=========\n')
    regionFinalAcc = (regionsStats[regionsStats['region']==region])['accuracy_0'].iloc[0]
    bestDropType = regionsStats['drop_nan_type'][n]
    tempTournamentId = region+currentSemesterYear
    
    region_model_number = (regionsStats[regionsStats['region']==region])['model'].iloc[0]
    region_model = base_models[region_model_number]
    regionDataList = regionsTrainData[region]
    
    print(f'[{n+1} of {len(regionsToPredict)}] region {region} -> {round(regionFinalAcc,3)}:\n')
    print(f'model: {region_model_number}')
    
    initialFeatures = regionsFeatureCols[region].copy()
    for nn,feature in enumerate(initialFeatures):

        regionsFeatureCols[region].remove(feature)
        dfSklearnRegions = dfToSplit[regionsFeatureCols[region]+infoCols].copy()
        dfSklearnRegions = dfSklearnRegions[dfSklearnRegions['TournamentRegion'].isin(regionDataList)].copy()
        dfSklearnRegions = dfSklearnRegions.sort_values(by='Date',ascending=True).copy()
        dfSklearnRegions = filterNan(dfSklearnRegions, dropType)

        xtrain,ytrain,xtest,ytest = train_test_split2(dfSklearnRegions,tempTournamentId,currentTarget,verbose=False)
        region_model.fit(xtrain, ytrain)
        
        pred = region_model.predict(xtest)
        metric = round(skm.mean_absolute_error(ytest, pred),2)
        if metric < regionFinalAcc or np.isnan(regionFinalAcc):
            regionFinalAcc = metric
            print(f'{feature} removed for {metric}                                                ')
        else:
            regionsFeatureCols[region].append(feature)
            
        print(f'[{nn+1} of {len(initialFeatures)}] testing: {feature}                        ',end='\r')
    
    regionsStats['accuracy_1'][n] = regionFinalAcc
    regionsStats['size'][n] = len(dfSklearnRegions)
    
    print(f'\n\naccuracy: {round(regionFinalAcc,3)}')
    print(f'{region} feature count: {len(regionsFeatureCols[region])}')
    print(f'test data len: {len(pred)}\n')
    
printFinalResults(regionsStats, 'accuracy_1')

hide_toggle()


[1 of 22] region LPL -> 6.59:

model: 4
Team_Blue_DPM removed for 6.58                                                
Team_Red_DPM removed for 6.54                                                
[44 of 44] testing: Team_Red_Solo_Kills                         

accuracy: 6.54
LPL feature count: 42
test data len: 329


[2 of 22] region LCK -> 5.69:

model: 4
Team_Red_DPM removed for 5.68                                                
Team_Red_XPD@15 removed for 5.67                                                
[44 of 44] testing: Team_Red_Solo_Kills                         

accuracy: 5.67
LCK feature count: 42
test data len: 248


[3 of 22] region PCS -> 6.13:

model: 4
Team_Blue_Win_rate removed for 6.12                                                
Team_Red_DPM removed for 6.11                                                
Team_Red_GD@15 removed for 6.1                                                
[44 of 44] testing: Team_Red_Solo_Kills                         

accuracy

### MODEL SELECTION

In [36]:
%%time

regionsStats['accuracy_2'] = np.nan

for n,region in enumerate(regionsToPredict):
    print('=========\n')
    currAcc = (regionsStats[regionsStats['region']==region])['accuracy_1'].iloc[0]
    currModel = (regionsStats[regionsStats['region']==region])['model'].iloc[0]
    bestDropType = regionsStats['drop_nan_type'][n]
    print(f'[{n+1} of {len(regionsToPredict)}] region {region} -> {round(currAcc,3)}:\n')
    print(f'current model: {currModel}\n')
    
    regionDataList = regionsTrainData[region]
    tempTournamentId = region+currentSemesterYear
    dfSklearnRegions = dfToSplit[dfToSplit['TournamentRegion'].isin(regionDataList)].copy()
    dfSklearnRegions = dfSklearnRegions[regionsFeatureCols[region]+infoCols]
    dfSklearnRegions = dfSklearnRegions.sort_values(by='Date',ascending=True).copy()
    dfSklearnRegions = filterNan(dfSklearnRegions, dropType)
    
    xtrain,ytrain,xtest,ytest = train_test_split2(dfSklearnRegions,tempTournamentId,currentTarget)
    
    bestModelAbs = (regionsStats[regionsStats['region']==region])['accuracy_2'].iloc[0]
    for model in range(len(base_models)):
        base_model = base_models[model]
        base_model.fit(xtrain, ytrain)
        pred = base_model.predict(xtest)

        testdf = pd.DataFrame([list(ytest),list(pred)]).transpose()
        testdf.columns=['yteste','pred']

        metricModelAbs = round(skm.mean_absolute_error(testdf['yteste'], testdf['pred']),2)
        if metricModelAbs<bestModelAbs or np.isnan(bestModelAbs):
            bestModelAbs=metricModelAbs
            bestModel=model
        print(f'model {model} -> {metricModelAbs}')

    regionsStats['model'][n] = bestModel
    regionsStats['accuracy_2'][n] = bestModelAbs
    
    print(f'\naccuracy: {bestModelAbs}')
    print(f'best model: {bestModel}\n')
    
printFinalResults(regionsStats, 'accuracy_2')

hide_toggle()


[1 of 22] region LPL -> 6.54:

current model: 4

train len: 4556
train len no outliers: 3451
percent of len removed: 24.25%
test len: 329

model 0 -> 6.52
model 1 -> 6.57
model 2 -> 6.62
model 3 -> 6.53
model 4 -> 6.54
model 5 -> 6.53
model 6 -> 6.53
model 7 -> 6.76
model 8 -> 6.69

accuracy: 6.52
best model: 0


[2 of 22] region LCK -> 5.67:

current model: 4

train len: 4352
train len no outliers: 3236
percent of len removed: 25.64%
test len: 248

model 0 -> 5.71
model 1 -> 5.84
model 2 -> 5.84
model 3 -> 5.71
model 4 -> 5.67
model 5 -> 5.76
model 6 -> 5.72
model 7 -> 5.8
model 8 -> 5.72

accuracy: 5.67
best model: 4


[3 of 22] region PCS -> 6.1:

current model: 4

train len: 5344
train len no outliers: 4085
percent of len removed: 23.56%
test len: 105

model 0 -> 6.3
model 1 -> 6.13
model 2 -> 6.14
model 3 -> 6.2
model 4 -> 6.1
model 5 -> 6.21
model 6 -> 6.03
model 7 -> 6.39
model 8 -> 6.08

accuracy: 6.03
best model: 6


[4 of 22] region VCS -> 6.11:

current model: 4

train len:

# THRESHOLDS

In [44]:
fill=0.0

regionsStats['accuracy_3']=fill
regionsStats['ceiling']=fill
regionsStats['ceilingWin']=fill
regionsStats['ceilingPer']=fill
regionsStats['ceilingLimit']=fill
regionsStats['floor']=fill
regionsStats['floorWin']=fill
regionsStats['floorPer']=fill
regionsStats['floorLimit']=fill
regionsStats['size']=fill
regionsStats['meanPred']=fill

fB = 24.5
cB = 27.5
regionsStats['topBase'] = cB
regionsStats['botBase'] = fB

bases = ([
        
        ])

bases = pd.DataFrame(bases, columns=['Region','botBase','topBase'])

for x in range(len(bases)):
    regionsStats['botBase'][bases['Region'][x]] = bases['botBase'][x]
    regionsStats['topBase'][bases['Region'][x]] = bases['topBase'][x]

hide_toggle()

In [45]:
%%time

for reg,region in enumerate(regionsStats['region']):
    print('================================')
    print(f'{reg}/{region}')

    regionsStats['accuracy_3'][reg]=fill
    bestDropType = regionsStats['drop_nan_type'][reg]
    
    region_model_number = (regionsStats[regionsStats['region']==region])['model'].iloc[0]
    region_model = base_models[region_model_number]
    regionDataList = regionsTrainData[region]

    tempTournamentId = region+currentSemesterYear
    dfSklearnRegions = dfToSplit[dfToSplit['TournamentRegion'].isin(regionDataList)].copy()
    dfSklearnRegions = dfSklearnRegions[regionsFeatureCols[region]+infoCols]
    dfSklearnRegions = dfSklearnRegions.sort_values(by='Date',ascending=True).copy()
    dfSklearnRegions = filterNan(dfSklearnRegions, dropType)
    
    xtrain,ytrain,xtest,ytest = train_test_split2(dfSklearnRegions,tempTournamentId,currentTarget)
    xval=xtest
    yval=ytest

    for col in ['ceiling','floor']:
        
        print(col)
        regionsStats[col][reg] = 0.0
        regionsStats[f'{col}Win'][reg] = 0.0
        regionsStats[f'{col}Per'][reg] = 0.0
        regionsStats['meanPred'][reg] = 0.0
        
        for dataX,dataY in zip(['xt','xv'],['yt','yv']):
        
            if dataX=='xt': dataX=xtest.reset_index(drop=True)
            elif dataX=='xv': dataX=xval.reset_index(drop=True)

            if dataY=='yt': dataY=ytest.reset_index(drop=True)
            elif dataY=='yv': dataY=yval.reset_index(drop=True)
    
            base_model = region_model
            base_model.fit(xtrain, ytrain)

            ceilingtest = pd.DataFrame(columns=[0,1,2,3])
            if col=='ceiling': ceilingtest[0] = [i/100 for i in range(-400,200)]
            else: ceilingtest[0] = [i/100 for i in range(-300,100)]
            for x in [1,2,3]:
                ceilingtest[x] = 0
            
            meanErr=[]
            for contMean in list(range(0,10)):
                errors,predictions = evaluate1(base_model,dataX,dataY)
                
                meanErr.append(np.mean([abs(x) for x in errors]))

                errors=list(errors)
                predictions=list(predictions)

                for n in range(len(ceilingtest)):
                    tp,tests = evaluate2(predictions,dataY,ceilingtest[0][n],col,regionsStats['botBase'][reg],regionsStats['topBase'][reg],-1)
                    
                    ceilingtest[1][n] = (float(tp))+ceilingtest[1][n]
                    ceilingtest[2][n] = (tests*100)+ceilingtest[2][n]
                    ceilingtest[3][n] = ((ceilingtest[1][n]+ceilingtest[2][n])/2)+ceilingtest[3][n]
            
            meanErr=np.mean(meanErr)
            
            print('meanerr old')
            print(regionsStats['accuracy_2'][reg])
            print('meanerr')
            print(meanErr)
            print('========')
            regionsStats['accuracy_3'][reg] += meanErr
                    
            
            for x in [1,2,3]:
                ceilingtest[x] = ceilingtest[x]/10

            filterPerc = ceilingtest.loc[(ceilingtest[2]>20) & (ceilingtest[1]>=80)].copy()
            if len(filterPerc)<1: filterPerc = ceilingtest.loc[(ceilingtest[1]>=80)].copy()
            if len(filterPerc)<1: filterPerc = ceilingtest.loc[(ceilingtest[1]==max(ceilingtest[1]))].copy()


            regionsStats['meanPred'][reg] = round(np.mean(predictions)+regionsStats['meanPred'][reg],2)
            regionsStats[col][reg] = ((filterPerc[filterPerc[1]==max(filterPerc[1])].reset_index())[0][0])+regionsStats[col][reg]
            regionsStats[f'{col}Win'][reg] = round((filterPerc[filterPerc[3]==max(filterPerc[3])].reset_index())[1][0]+regionsStats[f'{col}Win'][reg],2)
            regionsStats[f'{col}Per'][reg] = round((filterPerc[filterPerc[3]==max(filterPerc[3])].reset_index())[2][0]+regionsStats[f'{col}Per'][reg],2)

            regionsStats['size'][reg] = len(predictions)
            if col=='ceiling':
                regionsStats[f'{col}Limit'][reg] = float(ceilingtest.loc[ceilingtest[1]==max(ceilingtest[1])][0][-1:])
            else:regionsStats[f'{col}Limit'][reg] = float(ceilingtest.loc[ceilingtest[1]==max(ceilingtest[1])][0].iloc[0])

        regionsStats[col][reg] = round(regionsStats[col][reg]/2,2)
        regionsStats[f'{col}Win'][reg] = round(regionsStats[f'{col}Win'][reg]/2,2)
        regionsStats[f'{col}Per'][reg] = round(regionsStats[f'{col}Per'][reg]/2,2)
        regionsStats['meanPred'][reg] = round(regionsStats['meanPred'][reg]/2,2)
        
    regionsStats['accuracy_3'][reg] = round(regionsStats['accuracy_3'][reg]/4,2)
    print(regionsStats['accuracy_3'][reg])
try:
    hyperParams.reset_index(inplace=True)
    hyperParams.index = hyperParams['n']
except:True

hide_toggle()

0/LPL
train len: 4556
train len no outliers: 3451
percent of len removed: 24.25%
test len: 329

ceiling
meanerr old
6.52
meanerr
6.502733612076303
meanerr old
6.52
meanerr
6.5353958476882585
floor
meanerr old
6.52
meanerr
6.527467130369733
meanerr old
6.52
meanerr
6.533013026576289
6.52
1/LCK
train len: 4352
train len no outliers: 3236
percent of len removed: 25.64%
test len: 248

ceiling
meanerr old
5.67
meanerr
5.6705709020658635
meanerr old
5.67
meanerr
5.6705709020658635
floor
meanerr old
5.67
meanerr
5.6705709020658635
meanerr old
5.67
meanerr
5.6705709020658635
5.67
2/PCS
train len: 5344
train len no outliers: 4085
percent of len removed: 23.56%
test len: 105

ceiling
meanerr old
6.03
meanerr
6.0340279366177985
meanerr old
6.03
meanerr
6.0340279366177985
floor
meanerr old
6.03
meanerr
6.0340279366177985
meanerr old
6.03
meanerr
6.0340279366177985
6.03
3/VCS
train len: 5800
train len no outliers: 4213
percent of len removed: 27.36%
test len: 92

ceiling
meanerr old
6.11
meanerr
6.

In [47]:
print(np.mean(regionsStats['accuracy_0']))
print(np.mean(regionsStats['accuracy_1']))
print(np.mean(regionsStats['accuracy_2']))
regionsStats

6.550454545454546
6.521363636363635
6.502727272727273


Unnamed: 0,region,model,size,accuracy_0,accuracy_1,accuracy_2,drop_nan_type,accuracy_3,ceiling,ceilingWin,ceilingPer,ceilingLimit,floor,floorWin,floorPer,floorLimit,meanPred,topBase,botBase
0,LPL,0,329.0,6.59,6.54,6.52,1,6.52,1.78,62.83,4.11,1.94,-2.16,81.09,3.65,-1.73,25.53,27.5,24.5
1,LCK,4,248.0,5.69,5.67,5.67,0,5.67,0.2,46.3,21.77,0.2,-0.18,80.65,100.0,-0.29,22.66,27.5,24.5
2,PCS,6,105.0,6.13,6.1,6.03,0,6.03,0.8,91.67,11.43,0.84,-1.65,100.0,3.81,-1.65,24.48,27.5,24.5
3,VCS,4,92.0,6.12,6.11,6.11,0,6.11,1.32,80.0,5.43,1.34,-1.91,80.0,5.43,-1.91,26.8,27.5,24.5
4,Ultraliga,4,108.0,7.04,7.04,7.05,0,7.05,0.3,80.56,66.67,1.9,-1.09,100.0,3.7,-1.09,28.53,27.5,24.5
5,LLA,4,79.0,6.44,6.38,6.38,0,6.38,0.59,80.0,6.33,0.6,-0.3,81.82,41.77,-0.46,24.23,27.5,24.5
6,TCL,0,69.0,6.39,6.33,6.29,0,6.3,1.27,100.0,1.45,1.45,-1.0,100.0,3.62,-0.99,24.82,27.5,24.5
7,LFL,4,204.0,5.8,5.8,5.8,0,5.8,0.19,80.36,27.45,1.7,-0.86,100.0,0.49,-0.86,26.25,27.5,24.5
8,Prime,4,110.0,5.57,5.55,5.55,0,5.55,0.43,80.0,45.45,1.04,-1.5,100.0,0.91,-1.5,26.26,27.5,24.5
9,LJL,6,133.0,5.43,5.43,5.38,0,5.38,1.11,100.0,1.5,1.81,-0.22,80.26,57.14,-1.05,24.55,27.5,24.5


# SAVE

In [14]:
regionsStats.to_pickle("./Data/raw_data/regionsStats.pkl")

for data,dataName in zip([regionsFeatureCols,regionsTrainData],['regionsFeatureCols','regionsTrainData']):
    with open(f'./Data/raw_data/{dataName}.json', 'w') as fp:
        json.dump(data, fp, indent=4)

# Notes

In [31]:
%%time

# regionsTrainData = dict(zip(regionsList,[0]*len(regionsList)))
# for key in regionsTrainData:
#     regionsTrainData[key] = regionsToFeed.copy()
    
# regionsStats['accuracy_0'] = np.nan

# for n,region in enumerate(regionsToPredict):
#     print('=========\n')
#     regionFinalAcc = (regionsStats[regionsStats['region']==region])['accuracy_0'].iloc[0]
#     regionsToTest = regionsTrainData[region].copy()
#     regionsToTest.remove(region)
#     tempTournamentId = region+currentSemesterYear
    
#     print(f'[{n} of {len(regionsToPredict)}] region {region} -> {round(regionFinalAcc,3)}:\n')
    
#     for nn,regionToTest in enumerate(regionsToTest):
#         regionsTrainData[region].remove(regionToTest)
        
#         dfSklearnRegions = dfToSplit[dfToSplit['TournamentRegion'].isin(regionsTrainData[region])].copy()
#         dfSklearnRegions = dfSklearnRegions.sort_values(by='Date',ascending=True).copy()
        
#         xtrain,ytrain,xtest,ytest = train_test_split2(dfSklearnRegions,tempTournamentId,verbose=False)
#         region_model_number = (regionsStats[regionsStats['region']==region])['model'].iloc[0]
#         region_model = base_models[region_model_number]
#         region_model.fit(xtrain, ytrain)
#         pred = region_model.predict(xtest)
        
#         metric = skm.mean_absolute_error(ytest, pred)
#         if metric < regionFinalAcc or np.isnan(regionFinalAcc):
#             regionFinalAcc = metric
#             print(f'{round(regionFinalAcc,3)} -> {regionToTest} removed                                       ')
#         else:
#             regionsTrainData[region].append(regionToTest)
            
#         print(f'[{nn+1} of {len(regionsToTest)}] testing: {regionToTest}                        ',end='\r')
    
#     regionsStats['accuracy_0'][n] = regionFinalAcc
#     regionsStats['size'][n] = len(dfSklearnRegions)
    
#     print(f'\n\naccuracy: {round(regionFinalAcc,3)}')
#     print(f'{region} train data: {regionsTrainData[region]}\nunique regions:{len(regionsTrainData[region])}')
#     print(f'test data len: {len(pred)}\n')
    
# printFinalResults(regionsStats, 'accuracy_0')

hide_toggle()

CPU times: total: 0 ns
Wall time: 0 ns
