In [22]:
import sys
from pathlib import Path

import pandas as pd

current_dir = Path('.')
current_dir = current_dir.absolute()
root_dir = current_dir.parent.parent
src_dir = root_dir / 'src'

sys.path.append(str(src_dir))

%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv
load_dotenv(str(src_dir / '.env'))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [23]:
from api.containers.teams_and_winner import LoadTeamsAndWinnerOnly


def get_original_df():
    service = LoadTeamsAndWinnerOnly().service()

    return service.collect_all()

df = get_original_df()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[C.Winner] = winners


In [24]:
from infra.data_frames.pipeline import C

# Normalize teams and maps as integers
team_1_unique = set(df[C.Team1].unique())
team_2_unique = set(df[C.Team2].unique())
id_to_team = dict(enumerate(team_1_unique.union(team_2_unique), start=1))
team_to_id = {v: k for k, v in id_to_team.items()}

maps = set(df[C.Map].unique())
id_to_map = dict(enumerate(maps, start=1))
map_to_id = {v: k for k, v in id_to_map.items()}

# Create function to decode and encode teams and maps in the future
def decode_team(identifier: int):
    return id_to_team[identifier]

def encode_team(team: str):
    return team_to_id[team]

def decode_map(identifier: int):
    return id_to_map[identifier]

def encode_map(map: str):
    return map_to_id[map]


def convert_string_into_int(df: pd.DataFrame):
    df = pd.DataFrame({
        'Team': df[C.Team1].map(team_to_id),
        'Opponent': df[C.Team2].map(team_to_id),
        C.Map: df[C.Map].map(map_to_id),
        'Win': (df[C.Winner] == df[C.Team1]),

    })
    df['Win'] = df['Win'].astype(float)
    return df

df = convert_string_into_int(df)

In [25]:
# Prepare data for training

X = df[['Team', 'Opponent', C.Map]]
y = df['Win']

In [26]:
# Let's try to use logistic regression

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X, y)

In [27]:
y_pred = model.predict(X)

df['Predicted Win'] = y_pred.astype(float)
df

Unnamed: 0,Team,Opponent,Map,Win,Predicted Win
0,893,906,3,0.0,0.490041
1,906,893,4,0.0,0.489094
2,893,906,11,0.0,0.481692
3,806,1242,3,0.0,0.485041
4,1027,281,11,0.0,0.491271
...,...,...,...,...,...
45,600,105,11,1.0,0.498631
46,600,105,9,1.0,0.500719
47,600,105,3,0.0,0.506980
48,752,698,9,0.0,0.488829


Not good results. Almost every match is predicted as a 50/50 match.

Let's check most played teams with each other.

In [28]:
df.groupby(['Team','Opponent']).size().sort_values(ascending=False)

Team  Opponent
990   1295        18
971   990         17
112   831         16
      528         14
1188  699         14
                  ..
325   1056         1
806   350          1
      296          1
325   1092         1
1320  1313         1
Length: 13541, dtype: int64

In [58]:
def get_all_matches_of_teams(team_id_1: int, team_id_2: int):
    straigt_variant = df[(df['Team'] == team_id_1) & (df['Opponent'] == team_id_2)]
    reversed_variant = df[(df['Team'] == team_id_2) & (df['Opponent'] == team_id_1)]
    return pd.concat([straigt_variant, reversed_variant])

most_played_teams = get_all_matches_of_teams(990, 1295)
most_played_teams

Unnamed: 0,Team,Opponent,Map,Win,Predicted Win
15,990,1295,4,0.0,0.481225
17,990,1295,11,0.0,0.47392
5,990,1295,5,0.0,0.480182
0,990,1295,1,0.0,0.484356
1,990,1295,5,0.0,0.480182
3,990,1295,11,1.0,0.47392
21,990,1295,5,1.0,0.480182
30,990,1295,5,0.0,0.480182
40,990,1295,1,1.0,0.484356
41,990,1295,5,0.0,0.480182


In [59]:
WON = 1.0
LOST = 0.0


def switch_team_into_team_column(df: pd.DataFrame, base_team_id: int):
    for line in df.itertuples():
        if line.Opponent == base_team_id:
            df.at[line.Index, 'Team'] = line.Opponent
            df.at[line.Index, 'Opponent'] = line.Team
            df.at[line.Index, 'Win'] = LOST if line.Win == WON else WON
    return df

switch_team_into_team_column(most_played_teams, 990)

Unnamed: 0,Team,Opponent,Map,Win,Predicted Win
15,990,1295,4,0.0,0.481225
17,990,1295,11,0.0,0.47392
5,990,1295,5,0.0,0.480182
0,990,1295,1,0.0,0.484356
1,990,1295,5,0.0,0.480182
3,990,1295,11,1.0,0.47392
21,990,1295,5,1.0,0.480182
30,990,1295,5,0.0,0.480182
40,990,1295,1,1.0,0.484356
41,990,1295,5,0.0,0.480182


In [60]:
most_played_teams.groupby([C.Map, 'Win']).size()

Map  Win
1    0.0    3
     1.0    4
4    0.0    2
     1.0    2
5    0.0    7
     1.0    1
9    0.0    3
     1.0    2
11   0.0    4
     1.0    2
dtype: int64

In [31]:
decode_team(990), decode_team(1295), decode_map(5)

('ORDER ', 'Looking For Org ', 'nuke  Nuke')

Looks like team "ORDER" not good on Nuke at all. 1 win and 7 losses. Lets other teams on Nuke with "ORDER".

In [44]:
def get_all_matches_for_team(team_id: int):
    new_df = df[(df['Opponent'] == team_id) | (df['Team'] == team_id)]

    switch_team_into_team_column(new_df, team_id)
    return new_df

def get_all_matches_for_team_on_map(team_id: int, map_id: int):
    all_matches = get_all_matches_for_team(team_id)

    return all_matches[get_all_matches_for_team(team_id)[C.Map] == map_id]

def get_all_matches_for_team_on_map_with_opponent(team_id: int, map_id: int, opponent_id: int):
    return get_all_matches_for_team_on_map(team_id, map_id)[get_all_matches_for_team_on_map(team_id, map_id)['Opponent'] == opponent_id]

In [61]:
get_all_matches_for_team_on_map(990, 5)

Unnamed: 0,Team,Opponent,Map,Win,Predicted Win
21,990,721,5,1.0,0.490214
38,990,1313,5,0.0,0.490214
35,990,971,5,0.0,0.490214
4,990,93,5,0.0,0.494511
38,990,1313,5,0.0,0.482272
6,990,93,5,1.0,0.502519
26,990,971,5,1.0,0.485845
18,990,918,5,1.0,0.486771
5,990,17,5,0.0,0.480182
5,990,17,5,0.0,0.486072


In [62]:
get_all_matches_for_team_on_map(990, 5).groupby(['Win']).size()

Win
0.0    20
1.0    17
dtype: int64

Looks right, but seems like weights are not good. This prediction does affect team that mush as map. In case between teams weight for team should be greater than weight for maps.

As test result we will be using team 990 ('ORDER') and team 1295 ('Looking For Org ') on map 5 ('Nuke').
Order won 1 time and lost 6 times on Nuke. They win probability expected to be around 14%.

In [40]:
team_id = 990
opponent_id = 1295
map_id = 5
expected_win_probability = 0.14

In [64]:
model = LinearRegression()
model.fit(X, y)

model.predict([[team_id, opponent_id, map_id]])



array([0.48018158])

In [66]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(class_weight={0: 1, 1: 1, 2: 1})
model.fit(X, y)

model.predict([[team_id, opponent_id, map_id]])



array([0.])

Almost about right, but not good enough.

In [68]:
df.groupby(['Team','Opponent']).size().sort_values(ascending=False)

Team  Opponent
990   1295        18
971   990         17
112   831         16
      528         14
1188  699         14
                  ..
325   1056         1
806   350          1
      296          1
325   1092         1
1320  1313         1
Length: 13541, dtype: int64