# Setup

In [3]:
import json
import os
import random
import sys
import warnings

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRegressor, XGBClassifier

from utils import constants, scripts

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'constants'

# FUNCTIONS

In [2]:
def train_test_split_binary(dfToSplitFunc, tournamentId, CURRENT_TARGET, cut_off_var, SPLIT_TYPE, verbose=True):
    """Split a match frame into train/test features and targets.

    Parameters
    ----------
    dfToSplitFunc : pandas.DataFrame
        Must contain 'Date', the target column and (for SPLIT_TYPE 0)
        'tournamentId'. NaNs are replaced by 0 IN PLACE, so the caller's
        frame is modified.
    tournamentId : value compared against the 'tournamentId' column; rows
        that match form the test set when SPLIT_TYPE is 0.
    CURRENT_TARGET : str
        Name of the target column.
    cut_off_var, verbose :
        Currently unused. They only fed an outlier filter that was disabled
        (`if False:`) in the original code; kept so existing calls still work.
    SPLIT_TYPE : int
        0 -> hold out one tournament; 1 -> unshuffled 80/20 split in the
        frame's current row order.

    Returns
    -------
    (xtrain, ytrain, xtest, ytest)

    Raises
    ------
    ValueError
        If SPLIT_TYPE is neither 0 nor 1 (previously this surfaced as an
        unbound-local error at the return statement).
    """
    # Column-by-column so the caller's frame is updated in place.
    for col in dfToSplitFunc.columns:
        dfToSplitFunc[col] = dfToSplitFunc[col].fillna(0)

    if SPLIT_TYPE == 0:
        isHeldOut = dfToSplitFunc['tournamentId'] == tournamentId

        testData = dfToSplitFunc[isHeldOut].copy()
        xtest = testData.drop(['Date', CURRENT_TARGET], axis=1).copy()
        # OFF_COLS is a module-level list of non-feature columns defined elsewhere.
        xtest = xtest.drop(OFF_COLS, axis=1, errors='ignore')
        ytest = testData[CURRENT_TARGET]

        trainData = dfToSplitFunc[~isHeldOut].copy()
        xtrain = trainData.drop(['Date', CURRENT_TARGET], axis=1).copy()
        xtrain = xtrain.drop(OFF_COLS, axis=1, errors='ignore')
        ytrain = trainData[CURRENT_TARGET]

    elif SPLIT_TYPE == 1:
        xCols = dfToSplitFunc.drop(['Date', CURRENT_TARGET] + OFF_COLS, axis=1, errors='ignore')
        yCols = dfToSplitFunc[CURRENT_TARGET]
        # shuffle=False keeps row order, so the last 20% of rows become the test set.
        xtrain, xtest, ytrain, ytest = train_test_split(xCols, yCols, test_size=0.20, shuffle=False)

    else:
        raise ValueError(f'unsupported SPLIT_TYPE {SPLIT_TYPE!r}; expected 0 or 1')

    return xtrain, ytrain, xtest, ytest

In [3]:
def matchListToDfs(df):
    """Build the player-level and team-level match frames.

    Keeps matches dated 2019-07-01 or later, adds 'realSemesterYear'
    (year + semester as text) and 'tournamentId' (region + realSemesterYear),
    then returns two frames:

    * playerMatchList - the filtered frame with every per-role column intact;
    * teamMatchList   - the same rows where, per side, each per-role feature
      column is replaced by a 'Team_<side>_<feature>' row-wise mean and the
      per-role '<role>_<side>' columns are dropped.

    Relies on the module-level ROLES and PLAYER_SIMPLE_FEATURE_COLS lists.
    """
    cutoffDate = pd.to_datetime('2019-7-01',format='%Y-%m-%d')
    recentMatches = df[df['Date'] >= cutoffDate].reset_index(drop=True).copy()

    recentMatches['realSemesterYear'] = (
        recentMatches['realYear'].astype(str) + recentMatches['realSemester'].astype(str)
    )
    recentMatches['tournamentId'] = (
        recentMatches['regionAbrev'].astype(str) + recentMatches['realSemesterYear'].astype(str)
    )

    playerMatchList = recentMatches.copy()
    teamMatchList = recentMatches.copy()

    for color in ['Blue','Red']:
        for feature in PLAYER_SIMPLE_FEATURE_COLS:
            roleFeatureCols = [f"{position}_{color}_{feature}" for position in ROLES]
            # Average over the roles of one side; NaNs are skipped.
            teamMean = recentMatches[roleFeatureCols].mean(skipna=True,axis=1).copy()
            teamMatchList[f'Team_{color}_{feature}'] = teamMean
            teamMatchList = teamMatchList.drop(columns=roleFeatureCols)

        roleNameCols = [f"{position}_{color}" for position in ROLES]
        teamMatchList = teamMatchList.drop(columns=roleNameCols)

    return playerMatchList, teamMatchList

def regionLists(df, CURRENT_YEAR):
    """Return (regions, regionsToFeed, regionsToPredict) for a match frame.

    * regions          - unique 'regionAbrev' values (array from .unique()).
    * regionsToFeed    - the same values as a plain list.
    * regionsToPredict - regions that have rows in both CURRENT_YEAR and
      CURRENT_YEAR-1 (by 'realYear') and at least 30 rows in CURRENT_YEAR.
    """
    regions = df['regionAbrev'].unique()
    regionsToFeed = list(df['regionAbrev'].unique())

    regionsToPredict = []
    for region in regions:
        yearsPlayed = df[df['regionAbrev']==region]['realYear'].unique()
        # Both the current and the previous year must be present.
        if not (CURRENT_YEAR in yearsPlayed and CURRENT_YEAR-1 in yearsPlayed):
            continue

        currentYearRows = df[(df['realYear']==CURRENT_YEAR) & (df['regionAbrev']==region)]
        if len(currentYearRows) < 30:
            continue
        regionsToPredict.append(region)

    return regions, regionsToFeed, regionsToPredict

def generateRegionDf(df, regionDataListF, regionsFeatureColsF, cut_off_var, tempTournamentIdF, CURRENT_TARGET, SPLIT_TYPE):
    """Select the rows/columns for one region's experiment and split them.

    Rows are limited to the regions in regionDataListF. Columns are limited to
    those whose name, with any 'Team_Red_' / 'Team_Blue_' prefix removed, is
    in regionsFeatureColsF, followed by the module-level INFO_COLS. The frame
    is sorted by 'Date' (ascending) and handed to train_test_split_binary.

    Returns (dfTemp, xtrain, ytrain, xtest, ytest).
    """
    def _baseFeatureName(column):
        return column.replace('Team_Red_','').replace('Team_Blue_','')

    regionRows = df[df['regionAbrev'].isin(regionDataListF)].copy()
    keptFeatureCols = [column for column in list(regionRows.columns)
                       if _baseFeatureName(column) in regionsFeatureColsF]

    dfTemp = regionRows[keptFeatureCols+INFO_COLS]
    dfTemp = dfTemp.sort_values(by='Date',ascending=True).copy()

    splitParts = train_test_split_binary(dfTemp, tempTournamentIdF, CURRENT_TARGET, cut_off_var, SPLIT_TYPE, verbose=False)
    xtrain, ytrain, xtest, ytest = splitParts

    return dfTemp, xtrain, ytrain, xtest, ytest

def generateMetric(model_number, regionDataListF, regionsFeatureColsF, cut_off_var, tempTournamentIdF, CURRENT_TARGET, dfToSplit, SPLIT_TYPE):
    """Train BASE_MODELS[model_number] and return its error rate on the test split.

    The data is prepared by generateRegionDf. The model is fitted and scored
    `repetitions` times and the accuracies are averaged.

    Returns
    -------
    (metric, pred)
        metric - 1 minus the mean accuracy, rounded to 3 decimals, i.e. an
                 ERROR RATE (lower is better).
        pred   - predictions from the last repetition.

    Raises
    ------
    ValueError
        If the train or test split is empty (for example when no rows match
        tempTournamentIdF). sklearn would also raise ValueError here, but with
        a message that does not name the tournament.
    """
    dfTemp, xtrain, ytrain, xtest, ytest = generateRegionDf(dfToSplit, regionDataListF, regionsFeatureColsF
                                                            , cut_off_var, tempTournamentIdF, CURRENT_TARGET, SPLIT_TYPE)

    if len(xtrain) == 0 or len(xtest) == 0:
        raise ValueError(
            f'empty split for tournament {tempTournamentIdF!r}: '
            f'{len(xtrain)} train rows, {len(xtest)} test rows'
        )

    repetitions = 6
    accuracyTotal = 0
    for _ in range(repetitions):
        # Fit a fresh, unfitted copy so the shared estimator in BASE_MODELS is
        # never mutated and no fitted state carries over between repetitions.
        region_model = clone(BASE_MODELS[model_number])
        region_model.fit(xtrain, ytrain)
        pred = region_model.predict(xtest)
        accuracyTotal += accuracy_score(ytest, pred)
    meanAccuracy = accuracyTotal / repetitions

    metric = round(abs(meanAccuracy - 1), 3)

    return metric, pred

# Load

In [4]:
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
# Several names loaded here (teamMatchList, playerMatchList, regionsStats,
# regionsFeatureCols, regionsTrainData) are reassigned by later cells.

# --- per-team / per-player season tables
teamDataTable = pd.read_pickle("Data/raw_data/teamDataTable.pkl")
playerDataTable = pd.read_pickle("Data/raw_data/playerDataTable.pkl")

# --- match lists; the filled one uses 'regionAbrev' as its region column name
matchList = pd.read_pickle("Data/raw_data/matchList.pkl")
matchListFill = (
    pd.read_pickle("Data/raw_data/matchListFill.pkl")
    .rename(columns={'TournamentRegion':'regionAbrev'})
)

teamMatchList = pd.read_pickle("Data/raw_data/teamMatchList.pkl")
playerMatchList = pd.read_pickle("Data/raw_data/playerMatchList.pkl")

# --- previously saved selection results
regionsStats = pd.read_pickle("./Data/raw_data/regionsStats.pkl")

with open('./Data/raw_data/regionsFeatureCols.json', 'r') as fp:
    regionsFeatureCols = json.load(fp)
with open('./Data/raw_data/regionsTrainData.json', 'r') as fp:
    regionsTrainData = json.load(fp)
    

In [5]:
# Which frame to model: index into dfsContent below.
content=1
playerMatchList, teamMatchList = matchListToDfs(matchListFill)
#             0              1
dfsContent = [playerMatchList, teamMatchList]
dfToSplit = dfsContent[content].copy()

print(f'main df size: {len(dfToSplit)}')

# Only referenced by the commented-out XGBClassifier entry below.
params = {'objective': 'binary:logistic'}
# Candidate estimators; the trailing number is the index used in regionsStats['model'].
BASE_MODELS = [
              RandomForestClassifier(), #0
              #XGBClassifier(params=params,num_class=2), 
              KNeighborsClassifier(algorithm = 'brute'), #1
              LinearSVC(C=0.0001), #2
              BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5,max_features=1.0,n_estimators=10), #3
              AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=4),n_estimators=10,learning_rate=0.6), #4
              DecisionTreeClassifier(), #5
              LogisticRegression(), #6
              LogisticRegression(solver='newton-cg'), #7
              LogisticRegression(solver='newton-cg') #8  (same configuration as #7)
              ]

CURRENT_TARGET = 'Score'
CURRENT_YEAR = 2022
CURRENT_SEMESTER = 1
CURRENT_SEMESTER_YEAR = str(CURRENT_YEAR)+str(CURRENT_SEMESTER)
# Semesters are coded 0/1, so the previous semester is the other value; it
# belongs to the previous year only when that other value is 1.
LAST_SEMESTER = abs(CURRENT_SEMESTER-1)
LAST_YEAR = CURRENT_YEAR-1 if LAST_SEMESTER==1 else CURRENT_YEAR

DEFAULT_MODEL = 8
INFO_COLS = ['Date','tournamentId',CURRENT_TARGET,'regionAbrev']
SPLIT_TYPE = 0

# Regions that are never predicted, even when they pass the size threshold.
EXCLUDED_REGIONS = ['MSI', 'World']

# Rows with target == 2 are removed (presumably non-binary outcomes - TODO confirm).
# drop() without inplace avoids mutating a filtered slice.
dfToSplit = dfToSplit[dfToSplit[CURRENT_TARGET]!=2].drop(columns='totalKills')

regions, regionsToFeed, regionsToPredict = regionLists(dfToSplit, CURRENT_YEAR)

dfToSplit = dfToSplit[dfToSplit['realSemesterYear'].astype(int)<=int(CURRENT_SEMESTER_YEAR)]

# Filtering instead of list.remove(): remove() raises ValueError when a region
# did not make it into regionsToPredict in the first place.
regionsToPredict = [region for region in regionsToPredict if region not in EXCLUDED_REGIONS]
#regionsToPredict = regionsToPredict[:5]

main df size: 26314


# SKLEARN

In [6]:
# One row per region to predict; 'size' stays empty until the selection cells fill it.
regionsStats = pd.DataFrame(columns=['region','model','size'])
regionsStats['region'] = regionsToPredict
regionsStats['model'] = DEFAULT_MODEL
regionsList = regionsStats['region']

# Result columns of the three selection stages, all unset for now.
for accuracyCol in ['accuracy_0','accuracy_1','accuracy_2']:
    regionsStats[accuracyCol] = np.nan
regionsStats['cut_off_var'] = 1.5

# Every region starts with only its own data and the full feature list.
regionsTrainData = {region: [region] for region in regionsList}
regionsFeatureCols = {region: TEAM_SIMPLE_FEATURE_COLS.copy() for region in regionsList}

### TRAIN DATA SELECTION

In [7]:
# Greedy forward selection of extra training regions, one target region at a time.
# NOTE: generateMetric returns an error rate (1 - accuracy), so lower is better
# even though the columns and printouts are labelled "accuracy".
# NOTE(review): the metric is measured on the held-out tournament itself, so
# the selection is tuned on the same data it is evaluated on.
regionsTrainData = {region: [region] for region in regionsList}
regionsStats['accuracy_0'] = np.nan

for n,region in enumerate(regionsToPredict):
    print('=========\n')
    # .loc[row, col] instead of chained indexing, so the writes at the end of
    # the loop always reach regionsStats.
    regionFinalAcc = regionsStats.loc[n, 'accuracy_0']
    tempTournamentId = region+CURRENT_SEMESTER_YEAR
    cut_off_var = regionsStats.loc[n, 'cut_off_var']
    region_model_number = regionsStats.loc[n, 'model']
    print(f'[{n+1} of {len(regionsToPredict)}] region {region} -> {regionFinalAcc}:\n')
    
    regionsToTest = [x for x in regionsToFeed]
    regionsToTest.remove(region)
    # Unseeded shuffle: the candidate order, and therefore the result, changes between runs.
    random.shuffle(regionsToTest)
    for nn,regionToTest in enumerate(regionsToTest):
        regionsTrainData[region].append(regionToTest)
        
        metric, pred = generateMetric(region_model_number, regionsTrainData[region], regionsFeatureCols[region]
                                      , cut_off_var, tempTournamentId, CURRENT_TARGET, dfToSplit, SPLIT_TYPE)
        
        # Keep the candidate only if it strictly lowers the error (or nothing was scored yet).
        if metric < regionFinalAcc or np.isnan(regionFinalAcc):
            regionFinalAcc = metric
            
            print(f'{regionFinalAcc} -> {regionToTest} added                                           ')
        else:
            regionsTrainData[region].remove(regionToTest)
            
        #print(f'[{nn+1} of {len(regionsToTest)}] testing: {regionToTest}                        ',end='\r')
    
    regionsStats.loc[n, 'accuracy_0'] = regionFinalAcc
    regionsStats.loc[n, 'size'] = len(pred)
    
    print(f'\n\naccuracy: {regionFinalAcc}')
    print(f'{region} train data: {regionsTrainData[region]}\nlen:{len(regionsTrainData[region])}')
    print(f'test data len: {len(pred)}\n')
    
printFinalResults(regionsStats, 'accuracy_0')


[1 of 25] region LPL -> nan:

0.347 -> Asia added                                           
0.327 -> World added                                           
0.296 -> NA_Tier2 added                                           
0.29 -> LST added                                           
0.279 -> Elite_Tier2 added                                           
0.266 -> EU added                                           


accuracy: 0.266
LPL train data: ['LPL', 'Asia', 'World', 'NA_Tier2', 'LST', 'Elite_Tier2', 'EU']
len:7
test data len: 297


[2 of 25] region LCK -> nan:

0.373 -> LST added                                           
0.344 -> Hitpoint_Tier2 added                                           
0.341 -> LCS added                                           


accuracy: 0.341
LCK train data: ['LCK', 'LST', 'Hitpoint_Tier2', 'LCS']
len:4
test data len: 343


[3 of 25] region PCS -> nan:

0.285 -> EBL added                                           
0.277 -> NA_Tier2 added              

### FEATURE SELECTION

In [8]:
# Greedy backward elimination of features, one target region at a time.
# The baseline is the error rate stored in 'accuracy_0' (lower is better).
regionsStats['accuracy_1'] = np.nan
regionsFeatureCols = {region: TEAM_SIMPLE_FEATURE_COLS.copy() for region in regionsList}

for n,region in enumerate(regionsToPredict):
    print('=========\n')
    # .loc[row, col] instead of chained indexing, so the write at the end of
    # the loop always reaches regionsStats.
    regionFinalAcc = regionsStats.loc[n, 'accuracy_0']
    tempTournamentId = region+CURRENT_SEMESTER_YEAR
    cut_off_var = regionsStats.loc[n, 'cut_off_var']
    region_model_number = regionsStats.loc[n, 'model']
    print(f'[{n+1} of {len(regionsToPredict)}] region {region} -> {regionFinalAcc}:\n')
    print(f'model: {region_model_number}')
    
    # Iterate over a snapshot because the live list is edited inside the loop.
    initialFeatures = regionsFeatureCols[region].copy()
    for nn,feature in enumerate(initialFeatures):
        regionsFeatureCols[region].remove(feature)
        
        metric, pred = generateMetric(region_model_number, regionsTrainData[region], regionsFeatureCols[region]
                                      , cut_off_var, tempTournamentId, CURRENT_TARGET, dfToSplit, SPLIT_TYPE)
        # Drop the feature for good only if that strictly lowers the error.
        if metric < regionFinalAcc or np.isnan(regionFinalAcc):
            regionFinalAcc = metric
            print(f'{feature} removed for {metric}                                                ')
        else:
            regionsFeatureCols[region].append(feature)
            
        print(f'[{nn+1} of {len(initialFeatures)}] testing: {feature}                        ',end='\r')
    
    regionsStats.loc[n, 'accuracy_1'] = regionFinalAcc
    print(f'\n\naccuracy: {regionFinalAcc}')
    print(f'{region} feature count: {len(regionsFeatureCols[region])}')
    print(f'test data len: {len(pred)}\n')
    
printFinalResults(regionsStats, 'accuracy_1')


[1 of 25] region LPL -> 0.266:

model: 8
[22 of 22] testing: CSM                                 

accuracy: 0.266
LPL feature count: 22
test data len: 297


[2 of 25] region LCK -> 0.341:

model: 8
[22 of 22] testing: CSM                                 

accuracy: 0.341
LCK feature count: 22
test data len: 343


[3 of 25] region PCS -> 0.204:

model: 8
[22 of 22] testing: CSM                                 

accuracy: 0.204
PCS feature count: 22
test data len: 137


[4 of 25] region VCS -> 0.286:

model: 8
[22 of 22] testing: CSM                                 

accuracy: 0.286
VCS feature count: 22
test data len: 161


[5 of 25] region Ultraliga -> 0.161:

model: 8
[22 of 22] testing: CSM                                 

accuracy: 0.161
Ultraliga feature count: 22
test data len: 62


[6 of 25] region LLA -> 0.333:

model: 8
[22 of 22] testing: CSM                                 

accuracy: 0.333
LLA feature count: 22
test data len: 66


[7 of 25] region TCL -> 0.207:

model: 8


### MODEL SELECTION

In [9]:
# Try every estimator in BASE_MODELS per region and keep the one with the
# lowest error rate. Ties keep the first (lowest-index) model.
regionsStats['accuracy_2'] = np.nan

for n,region in enumerate(regionsToPredict):
    print('=========\n')
    # .loc[row, col] instead of chained indexing, so the writes at the end of
    # the loop always reach regionsStats.
    currAcc = regionsStats.loc[n, 'accuracy_0']
    currModel = regionsStats.loc[n, 'model']
    tempTournamentId = region+CURRENT_SEMESTER_YEAR
    cut_off_var = regionsStats.loc[n, 'cut_off_var']
    print(f'[{n+1} of {len(regionsToPredict)}] region {region} -> {currAcc}:\n')
    print(f'current model: {currModel}\n')
    
    # 'accuracy_2' was just reset to NaN, so the first model always becomes the initial best.
    bestModelAbs = (regionsStats[regionsStats['region']==region])['accuracy_2'].iloc[0]
    for model in range(len(BASE_MODELS)):
        metricModelAbs, pred = generateMetric(model, regionsTrainData[region], regionsFeatureCols[region]
                                      , cut_off_var, tempTournamentId, CURRENT_TARGET, dfToSplit, SPLIT_TYPE)
        if metricModelAbs<bestModelAbs or np.isnan(bestModelAbs):
            bestModelAbs=metricModelAbs
            bestModel=model
        print(f'model {model} -> {metricModelAbs}')

    regionsStats.loc[n, 'model'] = bestModel
    regionsStats.loc[n, 'accuracy_2'] = bestModelAbs
    
    print(f'\naccuracy: {bestModelAbs}')
    print(f'best model: {bestModel}\n')
    
printFinalResults(regionsStats, 'accuracy_2')

hide_toggle()


[1 of 25] region LPL -> 0.266:

current model: 8

model 0 -> 0.38
model 1 -> 0.559
model 2 -> 0.296
model 3 -> 0.412
model 4 -> 0.369
model 5 -> 0.509
model 6 -> 0.276
model 7 -> 0.266
model 8 -> 0.266

accuracy: 0.266
best model: 7


[2 of 25] region LCK -> 0.341:

current model: 8

model 0 -> 0.391
model 1 -> 0.394
model 2 -> 0.371
model 3 -> 0.403
model 4 -> 0.356
model 5 -> 0.42
model 6 -> 0.356
model 7 -> 0.341
model 8 -> 0.341

accuracy: 0.341
best model: 7


[3 of 25] region PCS -> 0.204:

current model: 8

model 0 -> 0.223
model 1 -> 0.365
model 2 -> 0.27
model 3 -> 0.285
model 4 -> 0.263
model 5 -> 0.3
model 6 -> 0.27
model 7 -> 0.204
model 8 -> 0.204

accuracy: 0.204
best model: 7


[4 of 25] region VCS -> 0.286:

current model: 8

model 0 -> 0.32
model 1 -> 0.509
model 2 -> 0.385
model 3 -> 0.438
model 4 -> 0.398
model 5 -> 0.381
model 6 -> 0.335
model 7 -> 0.286
model 8 -> 0.286

accuracy: 0.286
best model: 7


[5 of 25] region Ultraliga -> 0.161:

current model: 8

model 

# Testing

In [10]:
# Mean error rate over all regions after each selection stage, then the full table.
for stageCol in ['accuracy_0', 'accuracy_1', 'accuracy_2']:
    print(np.mean(regionsStats[stageCol]))
regionsStats

0.28244
0.27688
0.25983999999999996


Unnamed: 0,region,model,size,accuracy_0,accuracy_1,accuracy_2,cut_off_var
0,LPL,7,297,0.266,0.266,0.266,1.5
1,LCK,7,343,0.341,0.341,0.341,1.5
2,PCS,7,137,0.204,0.204,0.204,1.5
3,VCS,7,161,0.286,0.286,0.286,1.5
4,Ultraliga,6,62,0.161,0.161,0.161,1.5
5,LLA,7,66,0.333,0.333,0.333,1.5
6,TCL,7,82,0.207,0.207,0.207,1.5
7,LFL,7,143,0.399,0.399,0.399,1.5
8,Prime,7,64,0.219,0.188,0.188,1.5
9,LJL,2,95,0.189,0.189,0.189,1.5


In [12]:
# Attach each player's stats from the previous semester as '<col>_last_season'.
playerDataTableSwap = playerDataTable.copy()

if 'realYear' not in playerDataTableSwap.columns:

    # Keep the true period, then overwrite Year/Semester with the PREVIOUS
    # period so it can be used as the join key.
    playerDataTableSwap['realSemester'] = playerDataTableSwap['Semester']
    playerDataTableSwap['realYear'] = playerDataTableSwap['Year']

    # Semesters are coded 0/1: flip the semester, and step back one year
    # only when the flipped semester is 1.
    playerDataTableSwap['Semester'] = playerDataTableSwap['Semester'].apply(lambda x: 0 if x==1 else 1)
    playerDataTableSwap['Year'] = playerDataTableSwap['Year'] - playerDataTableSwap['Semester']

    playerDataTableMerge = playerDataTableSwap[['Player','realYear','realSemester']+PLAYER_SIMPLE_FEATURE_COLS]
    # suffixes names the overlapping columns directly: left side keeps its
    # name, right side gets '_last_season'. The previous approach renamed by
    # substring ('_x' -> '', '_y' -> '_last_season'), which would also mangle
    # any column that merely contains '_x' or '_y'.
    playerDataTableSwap = pd.merge(playerDataTableSwap,
                            playerDataTableMerge,
                            how='left',
                            left_on=['Player','Year','Semester'],
                            right_on = ['Player','realYear','realSemester'],
                            suffixes=('', '_last_season'))

In [16]:
def get_team_names(name):
    """Return the TOP/JNG/MID/ADC/SUP entries of team `name` for the current split.

    Looks the team up in the module-level teamDataTable for CURRENT_YEAR and
    CURRENT_SEMESTER (both columns are cast to int before comparing) and
    returns the first matching row as a Series. Raises IndexError when no row
    matches.
    """
    isTeam = teamDataTable['Name']==name
    isCurrentYear = teamDataTable['Year'].astype(int)==CURRENT_YEAR
    isCurrentSemester = teamDataTable['Semester'].astype(int)==CURRENT_SEMESTER

    roster = teamDataTable.loc[isTeam & isCurrentYear & isCurrentSemester,
                               ['TOP','JNG','MID','ADC','SUP']]
    return roster.iloc[0]

def get_feature_team_mean(namesList,feature):
    """Average one player statistic over a list of player names.

    `feature` may carry a 'Team_Red_' / 'Team_Blue_' prefix; it is stripped to
    get the column name in the module-level playerDataTable. Only rows for
    CURRENT_YEAR / CURRENT_SEMESTER are considered and the first match per
    player is used.

    Returns (mean, noDataList) where noDataList holds the names without a
    matching row. The mean is NaN when no player had data.
    """
    statColumn = feature.replace('Team_Red_','').replace('Team_Blue_','')

    found = []
    noDataList = []
    for name in namesList:
        isPlayerNow = ((playerDataTable['Player']==name)
                       & (playerDataTable['Year']==CURRENT_YEAR)
                       & (playerDataTable['Semester']==CURRENT_SEMESTER))
        playerStat = playerDataTable[isPlayerNow][statColumn]

        if len(playerStat) == 0:
            noDataList.append(name)
            continue
        found.append(playerStat.iloc[0])

    return np.mean(found),noDataList


In [17]:
# Number every team that played on either side in the chosen region,
# in alphabetical order, so a team can be picked by index below.
regionTest = 'CBLOL'
dfTemp = teamMatchList[teamMatchList['regionAbrev']==regionTest]
teamsSet = sorted(set(dfTemp['Blue'].unique()) | set(dfTemp['Red'].unique()))
teamsDict = dict(enumerate(teamsSet))
teamsDict

{0: 'CNB e-Sports Club',
 1: 'FURIA Esports',
 2: 'FURIA Uppercut',
 3: 'Flamengo Los Grandes',
 4: 'Flamengo eSports',
 5: 'Fluxo',
 6: 'INTZ e-Sports',
 7: 'INTZ eSports',
 8: 'KaBuM! e-Sports',
 9: 'LOUD',
 10: 'Liberty',
 11: 'Los Grandes',
 12: 'Netshoes Miners',
 13: 'Prodigy Esports',
 14: 'RED Canids',
 15: 'Redemption POA',
 16: 'Rensga eSports',
 17: 'Santos e-Sports',
 18: 'Team oNe eSports',
 19: 'Uppercut eSports',
 20: 'Vivo Keyd',
 21: 'Vivo Keyd Stars',
 22: 'Vorax Liberty',
 23: 'paiN Gaming'}

In [79]:
# Indices into teamsDict for the two teams of the match-up.
team0 = 1
team1 = 10

# Print each team's current-split roster as a ready-to-paste assignment line.
# An empty lookup is detected explicitly instead of through a bare `except:`,
# which would also have hidden unrelated errors.
for side, teamIndex in [('Blue', team0), ('Red', team1)]:
    playerNames = teamDataTable[(teamDataTable['Name']==teamsDict[teamIndex])
                                & (teamDataTable['Year'].astype(int)==CURRENT_YEAR)
                                & (teamDataTable['Semester'].astype(int)==CURRENT_SEMESTER)][['TOP','JNG','MID','ADC','SUP']]

    if playerNames.empty:
        print('no team found')
    else:
        print(f'team{side}Names = {list(playerNames.values[0])}')

teamBlueNames = ['fNb', 'Goot', 'Envy', 'Netuno', 'RedBert']
teamRedNames = ['Kiari', 'Disamis', 'Krastyel', 'Cavalo', 'Matsukaze']


In [80]:
# 1 -> use the hard-coded rosters below; 0 -> look rosters up with get_team_names.
manualNameInsert = 1

# --- training data: independent of the loops below, so it is built once
finalDfInput = dfToSplit[dfToSplit['regionAbrev'].isin(regionsTrainData[regionTest])].copy()
finalDfInput = finalDfInput.sort_values(by='Date',ascending=True)
# Reassign instead of `finalDfInput[col].fillna(0, inplace=True)`: the chained
# in-place form does not modify the frame under pandas Copy-on-Write.
finalDfInput = finalDfInput.fillna(0)

featureColsFiltered = [x for x in list(finalDfInput.columns) 
                       if x.replace('Team_Blue_','').replace('Team_Red_','') in regionsFeatureCols[regionTest]]

xdata= finalDfInput[featureColsFiltered]
ydata = finalDfInput[CURRENT_TARGET]

modelNum = (regionsStats[regionsStats['region']==regionTest])['model'].iloc[0]

# The fit/predict is repeated, and run once with the sides swapped, so the
# printed winners can be compared across runs and sides.
for i in range(5):
    print('=================')
    for swap in [0,1]:
        #teams
        teamBlueNames = ['fNb', 'Goot', 'Envy', 'Trigo', 'RedBert']
        teamRedNames = ['Kiari', 'accez', 'Piloto', 'Juliera', 'Cavalo']
        teamBlueTest = teamsDict[team0]
        teamRedTest = teamsDict[team1]
        if swap==1:
            teamBlueTest, teamRedTest = teamRedTest, teamBlueTest
            teamBlueNames, teamRedNames = teamRedNames, teamBlueNames

        #generate features to predict
        if manualNameInsert==0:
            teamBlueNames = get_team_names(teamBlueTest)
            teamRedNames = get_team_names(teamRedTest)

        namesBySide = {'Blue': teamBlueNames, 'Red': teamRedNames}
        featuresDict = {}
        # Collect missing players from BOTH sides; previously only the result
        # of the last feature processed was reported.
        noDataNames = []
        for feature in featureColsFiltered:
            side = feature.split('_')[1]
            if side not in namesBySide:
                continue
            featuresDict[feature], missingNames = get_feature_team_mean(namesBySide[side],feature)
            for missingName in missingNames:
                if missingName not in noDataNames:
                    noDataNames.append(missingName)
        print(f'no data names on: {noDataNames}')

        # One-row frame whose columns follow featureColsFiltered order.
        inputDf = pd.DataFrame([featuresDict]).fillna(0)

        #model and prediction
        model = BASE_MODELS[modelNum]
        model.fit(xdata,ydata)
        # Class 0 is reported as a blue-side win, anything else as red-side.
        prediction = teamBlueTest if model.predict(inputDf)==0 else teamRedTest
        print(prediction)

no data names on: []
FURIA Esports
no data names on: []
FURIA Esports
no data names on: []
FURIA Esports
no data names on: []
FURIA Esports
no data names on: []
FURIA Esports
no data names on: []
FURIA Esports
no data names on: []
FURIA Esports
no data names on: []
FURIA Esports
no data names on: []
FURIA Esports
no data names on: []
FURIA Esports


# Notes

In [8]:
### DROP OUTLIERS

%%time

# regionsFeatureCols = dict(zip(regionsList,[0]*len(regionsList)))
# for key in regionsFeatureCols:
#     regionsFeatureCols[key] = TEAM_SIMPLE_FEATURE_COLS.copy()

# for n,region in enumerate(regionsToPredict):
#     print('=========\n')
#     regionFinalAcc = regionsStats['accuracy_0'][n]
#     tempTournamentId = region+CURRENT_SEMESTER_YEAR
#     cut_off_var = regionsStats['cut_off_var'][n]
#     region_model_number = regionsStats['model'][n]
#     print(f'[{n+1} of {len(regionsToPredict)}] region {region} -> {regionFinalAcc}:\n')
#     print(f'current var: {cut_off_var}')
#     for var in np.arange(1.0,2.0,0.1):
        
#         metric, pred = generateMetric(region_model_number, regionsTrainData[region], regionsFeatureCols[region]
#                                       , var, tempTournamentId, CURRENT_TARGET, dfToSplit, SPLIT_TYPE)
        
#         print(f'var: {round(var,2)}, metric: {metric}')
#         if metric < regionFinalAcc or np.isnan(regionFinalAcc):
#             regionFinalAcc = metric
#             regionsStats['cut_off_var'][n] = var
#             print(f'changed to {round(var,2)} cut-off for {metric}                                                ')
            
#         #print(f'[{nn+1} of {len(initialFeatures)}] testing: {feature}                        ',end='\r')
    
#     regionsStats['accuracy_0'][n] = regionFinalAcc
#     print(f'\n\naccuracy: {regionFinalAcc}')
#     print(f'test data len: {len(pred)}\n')
    
# printFinalResults(regionsStats, 'accuracy_0')

hide_toggle()


[1 of 27] region LPL -> 0.332:

current var: 1.5
var: 1.0, metric: 0.428
var: 1.1, metric: 0.351
var: 1.2, metric: 0.37
var: 1.3, metric: 0.358
var: 1.4, metric: 0.37
var: 1.5, metric: 0.347
var: 1.6, metric: 0.373
var: 1.7, metric: 0.354
var: 1.8, metric: 0.34
var: 1.9, metric: 0.384


accuracy: 0.332
test data len: 297


[2 of 27] region LCK -> 0.372:

current var: 1.5
var: 1.0, metric: 0.39
var: 1.1, metric: 0.408
var: 1.2, metric: 0.405
var: 1.3, metric: 0.386
var: 1.4, metric: 0.404
var: 1.5, metric: 0.405
var: 1.6, metric: 0.416
var: 1.7, metric: 0.379
var: 1.8, metric: 0.401
var: 1.9, metric: 0.4


accuracy: 0.372
test data len: 343


[3 of 27] region PCS -> 0.246:

current var: 1.5
var: 1.0, metric: 0.336
var: 1.1, metric: 0.258
var: 1.2, metric: 0.276
var: 1.3, metric: 0.266
var: 1.4, metric: 0.277
var: 1.5, metric: 0.263
var: 1.6, metric: 0.283
var: 1.7, metric: 0.248
var: 1.8, metric: 0.273
var: 1.9, metric: 0.27


accuracy: 0.246
test data len: 137


[4 of 27] region VCS -

var: 1.2, metric: 0.287
var: 1.3, metric: 0.265
var: 1.4, metric: 0.243
var: 1.5, metric: 0.246
var: 1.6, metric: 0.283
var: 1.7, metric: 0.281
var: 1.8, metric: 0.276
var: 1.9, metric: 0.261


accuracy: 0.226
test data len: 90


[25 of 27] region LVP -> 0.448:

current var: 1.5
var: 1.0, metric: 0.688
var: 1.1, metric: 0.688
var: 1.2, metric: 0.5
var: 1.3, metric: 0.508
var: 1.4, metric: 0.544
var: 1.5, metric: 0.518
var: 1.6, metric: 0.536
var: 1.7, metric: 0.492
var: 1.8, metric: 0.508
var: 1.9, metric: 0.526


accuracy: 0.448
test data len: 64


[26 of 27] region NA_Tier2 -> 0.812:

current var: 1.5
var: 1.0, metric: 0.896
var: 1.1, metric: 0.904
var: 1.2, metric: 0.892
var: 1.3, metric: 0.9
var: 1.4, metric: 0.892
var: 1.5, metric: 0.892
var: 1.6, metric: 0.879
var: 1.7, metric: 0.875
var: 1.8, metric: 0.946
var: 1.9, metric: 0.862


accuracy: 0.812
test data len: 80


[27 of 27] region MSI -> nan:

current var: 1.5


ValueError: Found array with 0 sample(s) (shape=(0, 44)) while a minimum of 1 is required by BaggingClassifier.