In [None]:
import pandas as pd
import numpy as np

In [None]:
import editdistance

In [None]:
from collections import Counter

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Load data

## a.) Load mention events

In [None]:
# Mention events are stored as a pipe-separated CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
mentions_path = "/mnt/idms/fberes/network/roland_garros/data/rg17_mentions_with_names_and_text.csv"
mentions_df = pd.read_csv(mentions_path, sep="|")

In [None]:
# Preview the first rows of the mention events.
mentions_df.head()

## b.) Load schedule events

In [None]:
# Schedule events are stored as a pipe-separated CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
schedule_path = "/mnt/idms/fberes/network/roland_garros/data/schedule_df.csv"
schedule_df = pd.read_csv(schedule_path, sep="|")

In [None]:
# Preview the first rows of the schedule events.
schedule_df.head()

# 2. Find tennis player matches

## a.) Collect players from mention and schedule events

In [None]:
# Occurrence count of every distinct "trg_str" value in the mention events
# (presumably the mentioned account's name — confirm against the data).
c_men = mentions_df.loc[:, "trg_str"].value_counts()
# The distinct values themselves, in the same order as their counts.
c_men_names = c_men.index

In [None]:
# Player names from both sides of every scheduled match, as plain Python lists.
active_list = schedule_df["playerName active"].tolist()
opponent_list = schedule_df["playerName opponent"].tolist()

In [None]:
# Count how often each player name appears in the schedule, on either side of a match.
c_sch = Counter(active_list)
c_sch.update(opponent_list)

c_sch.most_common()

## b.) Find best matches with edit distance

In [None]:
# Sizes of the two name sets being compared: (distinct mention names, distinct schedule names).
len(c_men), len(c_sch)

In [None]:
%%time
# Rows are schedule players, columns are mention names; cell (i, j) holds the
# case-insensitive edit distance between schedule player i and mention name j.
player_names = list(c_sch.keys())
row_size = len(player_names)
edit_distances = np.zeros((row_size, len(c_men)))
# Lowercase the mention names once instead of once per schedule player.
mention_names_lower = [name.lower() for name in c_men_names]
for i, reference in enumerate(player_names):
    reference_lower = reference.lower()
    edit_distances[i, :] = [
        editdistance.eval(reference_lower, candidate)
        for candidate in mention_names_lower
    ]

In [None]:
# Column index of the closest mention name for every schedule player
# (argmin returns the first index when several names tie).
min_indices = edit_distances.argmin(axis=1)
# The best distances are already stored in the matrix, so read them back
# instead of running the edit-distance computation a second time.
min_distances = edit_distances.min(axis=1)
# One record per schedule player:
# (schedule name, appearances in the schedule, closest mention name, distance).
player_matches = [
    (reference, c_sch[reference], c_men_names[best_idx], int(best_dist))
    for reference, best_idx, best_dist in zip(player_names, min_indices, min_distances)
]
player_matches_df = pd.DataFrame(
    player_matches,
    columns=["schedule_name", "schedule_count", "twitter_name", "distance"],
)

### Edit distance distribution for best matches

In [None]:
# How many schedule players ended up at each best-match distance.
player_matches_df["distance"].value_counts()

### Collect true matches

**Notes on name matches:** (approx: 317+29+13 matches = **359 out of 628**)
   * names with distance 1 are all true matches (accepted without exclusions)
   * with distance 2 some names must be excluded (e.g. Di Wu vs. Kai Wu)
   * with distance 3 there are some true matches (Matthew Ebden - Matt Ebden), but most of them are mismatches
   * with distance 4 I found only a few matches: (Tomas Machac - Tomasz Maćczak), (Harmony Tan - Harmony)
   * with distance 5 there could be a match: (Zarina Diyas - Team Zarina Diyas), BUT some accounts have to be found by hand, e.g. Leonardo Mayer is NOT "Leo Mayer" but @tennismayer (although that account has been inactive for ages)

In [None]:
# Accepted name pairs, filled in step by step below: schedule name -> mention ("twitter") name.
true_matches = {}

#### i.) Pure matches

In [None]:
# Schedule players whose closest mention name is identical (ignoring case).
zero_dist_matches = player_matches_df.loc[player_matches_df["distance"].eq(0)]

In [None]:
# Accept every exact match: schedule name -> mention name.
update_dict = {
    schedule_name: twitter_name
    for schedule_name, twitter_name in zip(
        zero_dist_matches["schedule_name"], zero_dist_matches["twitter_name"]
    )
}
true_matches.update(update_dict)

#### ii.) 1 distance matches

In [None]:
# Schedule players whose closest mention name is exactly one edit away.
is_one_edit_away = player_matches_df["distance"] == 1
one_dist_matches = player_matches_df[is_one_edit_away]

In [None]:
# Accept all distance-1 pairs without exclusions: schedule name -> mention name.
one_dist_pairs = one_dist_matches[["schedule_name", "twitter_name"]]
update_dict = dict(one_dist_pairs.itertuples(index=False, name=None))
true_matches.update(update_dict)

#### iii.) 2 distance matches

In [None]:
# Schedule players whose closest mention name is two edits away.
two_dist_matches = player_matches_df.loc[lambda frame: frame["distance"] == 2]

In [None]:
# Show the distance-2 candidates so the wrong pairs can be picked out by eye.
two_dist_matches

In [None]:
# Schedule names whose distance-2 candidate is rejected (hand-picked exclusions).
name_list = [
    "Sofia Kenin",
    "Michael Mmoh",
    "Francesco Forti",
    "Di Wu",
]
is_rejected = two_dist_matches["schedule_name"].isin(name_list)
two_dist_matches = two_dist_matches.loc[~is_rejected]

In [None]:
# Accept whatever rows are in two_dist_matches at this point: schedule name -> mention name.
accepted_schedule_names = two_dist_matches["schedule_name"]
accepted_twitter_names = two_dist_matches["twitter_name"]
update_dict = dict(zip(accepted_schedule_names, accepted_twitter_names))
true_matches.update(update_dict)

#### iv.) 3 distance matches

In [None]:
# Schedule players whose closest mention name is three edits away.
distance_is_three = player_matches_df["distance"].eq(3)
three_dist_matches = player_matches_df[distance_is_three]

In [None]:
# Show the distance-3 candidates so the correct pairs can be picked out by eye.
three_dist_matches

In [None]:
# At distance 3 only these hand-picked schedule names are kept.
name_list = [
    "Mackenzie McDonald",
    "Matthew Ebden",
    "Nicolas Mahut",
    "Bianca Andreescu",
    "Benoit Paire",
    "Ashleigh Barty",
    "Jerzy Janowicz",
    "Natalia Vikhlyantseva",
    "Darian King",
    "Daniel Evans",
    "Jessica Moore",
]
is_accepted = three_dist_matches["schedule_name"].isin(name_list)
three_dist_matches = three_dist_matches.loc[is_accepted]

In [None]:
# Accept whatever rows are in three_dist_matches at this point: schedule name -> mention name.
three_dist_pairs = three_dist_matches[["schedule_name", "twitter_name"]]
update_dict = {
    schedule_name: twitter_name
    for schedule_name, twitter_name in three_dist_pairs.itertuples(index=False, name=None)
}
true_matches.update(update_dict)

#### v.) most frequent players

In [None]:
# Snapshot of the schedule names that have an accepted match so far.
already_found = list(true_matches)

### TODO: These players are definitely worth searching for further (probably by hand)

In [None]:
# Players whose best candidate is more than 3 edits away.
big_distance = player_matches_df.loc[player_matches_df["distance"] > 3]
# Of those, keep players appearing more than twice in the schedule
# that do not have an accepted match yet.
is_frequent = big_distance["schedule_count"] > 2
is_unmatched = ~big_distance["schedule_name"].isin(already_found)
big_dist_frequent_player = big_distance[is_frequent & is_unmatched]
len(big_dist_frequent_player)

In [None]:
# Show up to 50 of the frequent, still-unmatched players for manual review.
big_dist_frequent_player.head(50)
#big_dist_frequent_player.tail(38)

In [None]:
# Hand-picked frequent players whose far-away candidate is still accepted.
name_list = [
    "Nick Kyrgios",
    "Victor Estrella Burgos",
    "Irina-Camelia Begu",
    "Pierre-Hugues Herbert",
    "Zarina Diyas",
    "Pablo Carreno Busta",
    "Yung-Jan Chan",
]
is_hand_picked = big_dist_frequent_player["schedule_name"].isin(name_list)
most_pop_players = big_dist_frequent_player[is_hand_picked]

In [None]:
# Accept the hand-picked frequent players: schedule name -> mention name.
update_dict = {}
for schedule_name, twitter_name in zip(
    most_pop_players["schedule_name"], most_pop_players["twitter_name"]
):
    update_dict[schedule_name] = twitter_name
true_matches.update(update_dict)

In [None]:
# Total number of schedule players with an accepted match.
len(true_matches)

player_matches_df[player_matches_df["distance"]<3].head(10)

big_distance.head(10)

In [None]:
# Joint density of schedule appearances vs. best-match distance for the far-away matches.
sns.jointplot(
    data=big_distance,
    x="schedule_count",
    y="distance",
    kind="kde",
)

In [None]:
# Same two variables with seaborn's default joint plot kind.
sns.jointplot(
    data=big_distance,
    x="schedule_count",
    y="distance",
)

# Visualize missing players

In [None]:
# Map every date to {player name: match id} for all players scheduled on that day.
# The match id is "<courtName>_<orderNumber>".
daily_players = {}
schedule_columns = zip(
    schedule_df["date"],
    schedule_df["playerName active"],
    schedule_df["playerName opponent"],
    schedule_df["courtName"],
    schedule_df["orderNumber"],
)
for date, active, opponent, court, order in schedule_columns:
    match_id = "%s_%i" % (court, order)
    # setdefault creates the per-date dict on first sight and still records this
    # row; the previous if/else only created the dict for the first row of a
    # date and silently dropped that row's two players.
    players_of_day = daily_players.setdefault(date, {})
    players_of_day[active] = match_id
    players_of_day[opponent] = match_id

In [None]:
# Preview the schedule frame again (its columns are the ones read when building daily_players).
schedule_df.head()

In [None]:
# Dates for which at least one schedule row was seen.
daily_players.keys()

In [None]:
# Spot-check one day: player name -> match id.
# Raises KeyError if daily_players has no entry under this exact key.
daily_players['2017-06-11']

In [None]:
# One (date, [player names]) pair per day.
daily_players_grouped = [
    (date, list(players_of_day))
    for date, players_of_day in daily_players.items()
]

In [None]:
# One row per day; "players" holds the list of player names for that day.
daily_players_df = pd.DataFrame(
    data=daily_players_grouped,
    columns=["date", "players"],
)

In [None]:
# Number of players per day. Mapping len over the column avoids a row-wise
# DataFrame.apply and always yields a Series, even when the frame is empty.
daily_players_df["count"] = daily_players_df["players"].map(len)

In [None]:
def count_matches(player_list, matches=None):
    """Count how many entries of ``player_list`` are present in ``matches``.

    Parameters
    ----------
    player_list : iterable
        Player names to look up. Repeated names are counted each time.
    matches : container, optional
        Collection to test membership against. When omitted, the
        notebook-level ``true_matches`` dict is used, so existing
        one-argument calls behave exactly as before.

    Returns
    -------
    int
        Number of entries of ``player_list`` found in ``matches``.
    """
    if matches is None:
        # Resolved at call time so later updates to true_matches are picked up.
        matches = true_matches
    return sum(1 for player in player_list if player in matches)

In [None]:
# Players per day that have an accepted name match. Mapping over the column
# replaces the row-wise DataFrame.apply and always yields a Series.
daily_players_df["match_count"] = daily_players_df["players"].map(count_matches)
# Players per day without an accepted match, computed column-wise.
daily_players_df["mismatch_count"] = daily_players_df["count"] - daily_players_df["match_count"]

In [None]:
# Order the days by their "date" value, ascending, so the chart reads top to bottom.
daily_players_df = daily_players_df.sort_values(by="date", ascending=True)

In [None]:
sns.set(style="whitegrid")

# One figure/axes pair shared by both bar layers.
f, ax = plt.subplots(figsize=(15, 10))

# Draw order matters: the total bars go first and the matched bars are drawn over them.
# Each layer is (color-code palette, column to plot, legend label).
bar_layers = [
    ("pastel", "count", "Total count"),
    ("muted", "match_count", "Match count"),
]
for palette, column, label in bar_layers:
    # The active color codes decide which shade the shorthand color "b" resolves to.
    sns.set_color_codes(palette)
    sns.barplot(x=column, y="date", data=daily_players_df,
                label=label, color="b", ax=ax)

ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(ylabel="Date", xlabel="Number of tennis players")
sns.despine(left=True, bottom=True)