In [34]:
import os

In [1]:
from dataclasses import dataclass

In [32]:
import itertools

In [2]:
import pandas as pd
import numpy as np

from scipy.stats import hmean
from Levenshtein import ratio
from tqdm.notebook import tqdm

from src.data_loaders import T2Data

In [35]:
# Where the co-reference example table is written, and the directory holding it.
coref_example_table_fp = os.path.join("reports", "entity_coreference", "eg_table.html")
entity_coreference_report_dir = os.path.dirname(coref_example_table_fp)

In [4]:
def get_most_frequent_properties(df, ind, property_cols):
    """Return the most frequent value of each property column per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the grouping column(s) and the property columns.
    ind : label or list of labels
        Column(s) to group by.
    property_cols : list of labels
        Columns for which the per-group mode is computed.

    Returns
    -------
    pandas.DataFrame
        One row per group, with ``ind`` restored as regular column(s) followed
        by ``property_cols``. When several values tie, the first entry returned
        by ``Series.mode()`` is used. A group without any non-null value in a
        property column yields NaN for that column.
    """

    def _first_mode(values):
        # Series.mode() drops NaN by default, so it is empty for a group whose
        # values are all missing; `.iloc[0]` on it would raise IndexError.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    return df.groupby(ind)[property_cols].agg(_first_mode).reset_index()

In [5]:
# WhoScored matches, indexed by int64 match id. "score" is built as
# "<home_goals_ft>:<away_goals_ft>" and "date" is the `datetime` column cast
# to int.
wh_matches = (
    T2Data.get_match_df()
    .assign(
        match_id=lambda raw: raw["wh_match_id"].astype(np.int64),
        date=lambda raw: raw["datetime"].astype(int),
        score=lambda raw: (
            raw["home_goals_ft"].astype(int).astype(str)
            + ":"
            + raw["away_goals_ft"].astype(int).astype(str)
        ),
    )
    .set_index("match_id")[["score", "date"]]
)

# Transfermarkt matches with the same layout. "score" is taken as delivered by
# the loader; "date" is parsed to datetime and then cast to int.
tm_matches = (
    T2Data.get_tm_matches()
    .assign(
        match_id=lambda raw: raw["tm_match_id"].astype(np.int64),
        date=lambda raw: pd.to_datetime(raw["date"]).astype(int),
    )
    .set_index("match_id")[["score", "date"]]
)

In [6]:
# WhoScored players: indexed by player id, keeping only the "name" column.
wh_players = (
    T2Data.get_wh_player_df()
    .rename(columns={"playerId": "player_id"})
    .set_index("player_id")[["name"]]
)

# Transfermarkt players: the lineup rows are reduced to the most frequent
# "name" per int64 player id.
tm_players = (
    T2Data.get_tm_lineups()
    .assign(player_id=lambda raw: raw["tm_id"].astype(np.int64))
    .pipe(get_most_frequent_properties, "player_id", ["name"])
    .set_index("player_id")
)

In [7]:
# WhoScored teams: indexed by int64 team id, keeping only the "name" column.
wh_teams = (
    T2Data.get_wh_team_df()
    .rename(columns={"teamId": "team_id"})
    .astype({"team_id": np.int64})
    .set_index("team_id")[["name"]]
)

# Transfermarkt teams: the home and away (name, id) columns of the match table
# are stacked under the home column labels, then reduced to the most frequent
# name per team id.
tm_teams = (
    T2Data.get_tm_matches()
    .pipe(
        lambda raw: pd.concat(
            [
                raw[["home-name", "home-tm_id"]],
                raw[["away-name", "away-tm_id"]].rename(
                    columns={"away-name": "home-name", "away-tm_id": "home-tm_id"}
                ),
            ]
        )
    )
    .pipe(get_most_frequent_properties, "home-tm_id", ["home-name"])
    .rename(columns={"home-tm_id": "team_id", "home-name": "name"})
    .astype({"team_id": np.int64})
    .set_index("team_id")
)

In [8]:
# Transfermarkt lineups: one row per (player, match) with the side and a
# "T"/"F" starter flag ("T" only where the source value equals "starter").
tm_lineups = (
    T2Data.get_tm_lineups()
    .assign(
        player_id=lambda raw: raw["tm_id"].astype(np.int64),
        match_id=lambda raw: raw["tm_match_id"].astype(np.int64),
        starter=lambda raw: np.where(raw["starter"] != "starter", "F", "T"),
    )[["player_id", "match_id", "side", "starter"]]
    .reset_index(drop=True)
)

# WhoScored lineups, reshaped to the column layout of `tm_lineups`. Everyone
# whose position is not "Sub" is flagged as a starter; rows with any missing
# value are dropped.
wh_lineups = (
    T2Data.get_attendance_df()
    .rename(columns={"playerId": "player_id", "field": "side"})
    .assign(
        starter=lambda raw: np.where(raw["position"] != "Sub", "T", "F"),
        match_id=lambda raw: raw["wh_match_id"].astype(np.int64),
    )[list(tm_lineups.columns)]
    .dropna()
    .reset_index(drop=True)
)

In [9]:
def _stack_fixture_sides(matches, match_id_col, team_col_template):
    """Reshape a one-row-per-match frame into one row per (match, side).

    ``team_col_template`` is formatted with ``side`` ("home", then "away") to
    find the team id column of each side. The result has the columns
    ``match_id``, ``team_id`` (both int64) and ``side``, with a fresh index.
    """
    keyed = matches.assign(
        match_id=lambda raw: raw[match_id_col].astype(np.int64)
    )
    per_side = []
    for side in ["home", "away"]:
        team_col = team_col_template.format(side=side)
        per_side.append(
            keyed[["match_id", team_col]]
            .assign(side=side)
            .rename(columns={team_col: "team_id"})
            .astype({"team_id": np.int64})
        )
    return pd.concat(per_side).reset_index(drop=True)


# NOTE: `tm_fixtrues` (sic) keeps its original spelling because later cells
# reference it under that name.
tm_fixtrues = _stack_fixture_sides(
    T2Data.get_tm_matches(), "tm_match_id", "{side}-tm_id"
)

wh_fixtures = _stack_fixture_sides(
    T2Data.get_match_df(), "wh_match_id", "{side}_teamid"
)

In [10]:
from encoref import CoReferenceLock, RelationPair, SearchRoll

In [11]:
sides = ["home", "away"]
starter = ["T", "F"]


def _fixture_pair(side):
    """Pair the Transfermarkt and WhoScored fixture rows of one side."""
    tm_part, wh_part = (
        frame[frame["side"] == side].drop(columns="side")
        for frame in (tm_fixtrues, wh_fixtures)
    )
    return RelationPair(tm_part, wh_part, name=f"fixture-{side}")


def _lineup_pair(side, starter):
    """Pair the lineup rows of one side and one starter flag ("T" or "F")."""
    tm_part, wh_part = (
        frame[(frame["side"] == side) & (frame["starter"] == starter)].drop(
            columns=["side", "starter"]
        )
        for frame in (tm_lineups, wh_lineups)
    )
    return RelationPair(tm_part, wh_part, name=f"lineup-{side}-{starter}")


# Two fixture relations (one per side) followed by four lineup relations
# (side x starter flag), in that order.
home_away_pairs = [_fixture_pair(one_side) for one_side in sides] + [
    _lineup_pair(one_side, one_flag)
    for one_side, one_flag in itertools.product(sides, starter)
]

In [12]:
# Entity tables to align, each given as a (Transfermarkt, WhoScored) pair.
indiv_pairs = [
    (tm_teams, wh_teams),
    (tm_matches, wh_matches),
    (tm_players, wh_players),
]
# Relations are the per-side / per-starter-flag slices built above.
relation_df_pairs = home_away_pairs

In [13]:
# Build the co-reference lock from the per-entity table pairs (`indiv_pairs`)
# and the relation pairs (`relation_df_pairs`), with progress bars enabled.
# NOTE(review): the exact meaning of `max_depth` is defined by encoref —
# confirm against its documentation before tuning it.
corl = CoReferenceLock(
    indiv_pairs,
    relation_df_pairs,
    max_depth=2,
    progress_bar=True,
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='potential matches size', max=1.0, style…

HBox(children=(FloatProgress(value=0.0, description='team_id matches', max=40.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='match_id matches', max=760.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=0.0, description='player_id matches', max=1209.0, style=ProgressStyle(descr…

In [None]:
# Run two search rolls in a single call: one over team_id, then one over
# player_id, each with max_depth=2.
# NOTE(review): `roll_top_n` presumably limits how many candidates a roll
# keeps — confirm in encoref.
corl.run_searches(
    [
        SearchRoll(es_type="team_id", max_depth=2, roll_top_n=30),
        SearchRoll(es_type="player_id", max_depth=2, roll_top_n=300),
    ]
)

In [20]:
# Show only the index of the accepted matches as columns, naming the two id
# levels after their source (leaf_ind1 -> tm_id, leaf_ind2 -> wh_id).
(
    corl.matches[[]]
    .reset_index()
    .rename(columns=dict(leaf_ind1="tm_id", leaf_ind2="wh_id"))
)

Unnamed: 0,leaf_type,tm_id,wh_id
0,team_id,1132,184
1,team_id,11,13
2,team_id,29,31
3,team_id,31,26
4,team_id,148,30
...,...,...,...
2004,match_id,2872292,1190522
2005,match_id,2872161,1190200
2006,match_id,2872262,1190280
2007,match_id,2899700,1222107


In [30]:
ks = ["match_id", "player_id", "team_id"]

# URL path segment per entity type on each site. Built once here instead of
# being re-created on every iteration under the single, reused name `kinds`.
tm_url_kinds = dict(
    zip(ks, ["index/spielbericht", "profil/spieler", "startseite/verein"])
)
wh_url_kinds = dict(zip(ks, ["Matches", "Players", "Teams"]))

# One record per accepted match: both site links, both labels, and the
# Levenshtein similarity ratio between the labels.
_m = []
for eid, itm, iwh in tqdm(corl.matches.index):
    # eid is the entity type; itm / iwh are the ids on the first / second
    # side of the pair (Transfermarkt / WhoScored, per the URLs built below).
    turl = "https://www.transfermarkt.com/x/{kind}/{id}".format(
        kind=tm_url_kinds[eid], id=itm
    )
    wurl = "https://www.whoscored.com/{kind}/{id}/Show/x".format(
        kind=wh_url_kinds[eid], id=iwh
    )

    # The first column of each entity table is used as its label.
    # NOTE(review): presumably df1/df2 are the frames passed in `indiv_pairs`,
    # which would make the label "name" for teams and players but "score" for
    # matches — confirm.
    es_pair = corl.es_pair_dict[eid]
    tmname = es_pair.df1.loc[itm, :].iloc[0]
    whname = es_pair.df2.loc[iwh, :].iloc[0]

    _m.append(
        {
            "ratio": ratio(tmname, whname),
            "tmlink": turl,
            "whlink": wurl,
            "tmname": tmname,
            "whname": whname,
        }
    )
    


HBox(children=(FloatProgress(value=0.0, max=2009.0), HTML(value='')))




In [37]:
# Write the 10 pairs with the lowest name-similarity ratio to the report
# table. DataFrame.to_html does not create missing parent directories, so make
# sure the report directory exists first (no-op when it already does).
os.makedirs(os.path.dirname(coref_example_table_fp) or ".", exist_ok=True)
pd.DataFrame(_m).sort_values("ratio").head(10).to_html(coref_example_table_fp)

In [None]:
# NOTE(review): leftover scratch cell — this predicate (rows whose source_type
# is "team_id") is neither assigned nor applied to any frame here.
lambda df: (df["source_type"] == "team_id")

In [25]:
# Inspect the candidate-match table, ordered by leaf entity type.
# NOTE(review): `_potential_matches` is underscore-prefixed, so presumably a
# private attribute of the results object — verify it is stable to rely on.
corl.results._potential_matches.sort_values("leaf_type")

Unnamed: 0,distance,path_level,leaf_type,leaf_ind1,leaf_ind2,source_type,source_ind1,source_ind2,root_type,root_ind1,root_ind2,sid,path_level_bs,neighborhood_bs,depth,nh_name
44,-7.034641,1,match_id,2883717,1222046,team_id,3336,60,team_id,3336,60,team_id333660,1.0,1.0,1,team_id--fixture-away
118,-7.737934,1,match_id,2341201,738716,team_id,1084,69,team_id,1084,69,team_id108469,1.0,1.0,1,team_id--fixture-home
119,-7.737934,1,match_id,2594779,985729,team_id,1084,69,team_id,1084,69,team_id108469,1.0,1.0,1,team_id--fixture-home
120,-7.737928,1,match_id,2899793,1222293,team_id,1084,69,team_id,1084,69,team_id108469,1.0,1.0,1,team_id--fixture-home
121,-7.737913,1,match_id,1036381,434291,team_id,1084,69,team_id,1084,69,team_id108469,1.0,1.0,1,team_id--fixture-home
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,-11.296670,1,player_id,286005,323564,match_id,2698249,1080719,match_id,2698249,1080719,match_id26982491080719,1.0,1.0,1,match_id--lineup-away-F
6,-11.296670,1,player_id,258310,332836,match_id,2698249,1080719,match_id,2698249,1080719,match_id26982491080719,1.0,1.0,1,match_id--lineup-away-F
6,-10.735227,1,player_id,314175,227672,match_id,2822494,1163970,match_id,2822494,1163970,match_id28224941163970,1.0,1.0,1,match_id--lineup-home-F
663,-2.325290,0,team_id,2855,305,team_id,2855,305,team_id,2855,305,team_id2855305,1.0,1.0,0,team_id


In [None]:
ks = ["match_id", "player_id", "team_id"]
# URL path segment per entity type on each site, positionally aligned with `ks`.
tm_kinds = ["index/spielbericht", "profil/spieler", "startseite/verein"]
wh_kinds = ["Matches", "Players", "Teams"]

# The candidate table names its entity / id columns leaf_type, leaf_ind1 and
# leaf_ind2 (see its displayed header in this notebook); the previously used
# "entity_type" / "ind1" / "ind2" labels are not among its columns.
candidates = corl.results._potential_matches.loc[
    :, ["leaf_type", "leaf_ind1", "leaf_ind2"]
]

# Print up to 10 randomly sampled candidate pairs as links for manual checks;
# the cap avoids sample() raising when fewer than 10 candidates exist.
for eid, itm, iwh in candidates.sample(min(10, len(candidates))).values:
    kind_pos = ks.index(eid)
    print("https://www.transfermarkt.com/x/{kind}/{id}".format(kind=tm_kinds[kind_pos], id=itm))
    print("https://www.whoscored.com/{kind}/{id}/Show/x".format(kind=wh_kinds[kind_pos], id=iwh))
    print("---" * 10)

In [34]:
ks = ["match_id", "player_id", "team_id"]
# URL path segment per entity type on each site, positionally aligned with `ks`.
tm_kinds = ["index/spielbericht", "profil/spieler", "startseite/verein"]
wh_kinds = ["Matches", "Players", "Teams"]

# The candidate table names its entity / id columns leaf_type, leaf_ind1 and
# leaf_ind2 (see its displayed header in this notebook); the previously used
# "entity_type" / "ind1" / "ind2" labels are not among its columns.
candidates = corl.results._potential_matches.loc[
    :, ["leaf_type", "leaf_ind1", "leaf_ind2"]
]

# Print up to 10 randomly sampled candidate pairs as links for manual checks;
# the cap avoids sample() raising when fewer than 10 candidates exist.
for eid, itm, iwh in candidates.sample(min(10, len(candidates))).values:
    kind_pos = ks.index(eid)
    print("https://www.transfermarkt.com/x/{kind}/{id}".format(kind=tm_kinds[kind_pos], id=itm))
    print("https://www.whoscored.com/{kind}/{id}/Show/x".format(kind=wh_kinds[kind_pos], id=iwh))
    print("---" * 10)

https://www.transfermarkt.com/x/startseite/verein/12588
https://www.whoscored.com/Teams/2345/Show/x
------------------------------
https://www.transfermarkt.com/x/index/spielbericht/2331106
https://www.whoscored.com/Matches/717227/Show/x
------------------------------
https://www.transfermarkt.com/x/index/spielbericht/2223974
https://www.whoscored.com/Matches/608758/Show/x
------------------------------
https://www.transfermarkt.com/x/index/spielbericht/3154657
https://www.whoscored.com/Matches/1351555/Show/x
------------------------------
https://www.transfermarkt.com/x/index/spielbericht/2396425
https://www.whoscored.com/Matches/792736/Show/x
------------------------------
https://www.transfermarkt.com/x/index/spielbericht/2507140
https://www.whoscored.com/Matches/1432283/Show/x
------------------------------
https://www.transfermarkt.com/x/index/spielbericht/2893590
https://www.whoscored.com/Matches/1192452/Show/x
------------------------------
https://www.transfermarkt.com/x/index/

In [None]:
# First call: two team_id rolls that differ only in roll_top_n (30, then 40).
# NOTE(review): unclear from here whether the repeated team_id roll is
# intentional (widening the search) or a leftover experiment — confirm.
corl.run_searches(
    [
        SearchRoll(es_type="team_id", max_depth=2, roll_top_n=30),
        SearchRoll(es_type="team_id", max_depth=2, roll_top_n=40),
    ]
)

# Second call: a single player_id roll with roll_top_n=200.
corl.run_searches(
    [
        SearchRoll(es_type="player_id", max_depth=2, roll_top_n=200),
    ]
)