# In this notebook, I'll start the process of building and refining our algorithm (read: function — it's really only a linear combination of weights and game statistics) for calculating the game-by-game value of a player.

In [1]:
% matplotlib inline
import json
import pandas as pd
import numpy as np
import copy
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

### First, we need to load all the data frames created in Init_DataCleaning.ipynb and merge them by season. We do this so that we'll have a superset of players.

In [2]:
# Pickled per-league frames, ordered as (2012-13, 2013-14) for each league.
filenames = ['BPL/BPL12_13.pkl', 'BPL/BPL13_14.pkl', 'French/French12_13.pkl', 'French/French13_14.pkl',
             'Germany/Bundes12_13.pkl', 'Germany/Bundes13_14.pkl',
             'Italy/Italy12_13.pkl', 'Italy/Italy13_14.pkl',
             'Spanish/Spanish12_13.pkl', 'Spanish/Spanish13_14.pkl']

# Read every pickle in list order, drop the column labelled with a single
# space, and bind the frames to their league/season names.
(BPL12, BPL13,
 FRA12, FRA13,
 GER12, GER13,
 ITA12, ITA13,
 SPA12, SPA13) = [pd.read_pickle(pkl_path).drop(' ', axis=1) for pkl_path in filenames]


In [3]:
# Join the five 2012-13 league frames side by side on their shared row index.
# Column labels that collide in a merge get pandas' default '_x' / '_y' suffixes.
players12 = BPL12
for league_frame in (FRA12, GER12, ITA12, SPA12):
    players12 = players12.merge(league_frame, left_index=True, right_index=True)

In [4]:
# Shape of the merged 2012-13 frame as (rows, columns).
players12.shape

(38, 1679)

#### Accounting for players who have transferred midseason, 2012-13 Season

In [5]:
# Column labels that picked up a merge suffix belong to names present in more
# than one league frame; strip the two-character suffix and de-duplicate.
playerlist12 = players12.columns
suffixed_names12 = [label[:-2] for label in playerlist12 if label.endswith(('_x', '_y'))]
dup_list12 = np.unique(suffixed_names12)

In [6]:
# Number of distinct names that received a merge suffix.
len(dup_list12)

16

In [7]:
# To account for players switching teams and leagues mid-season, collapse the
# suffixed columns produced by the merges into a single column stored under
# the player's plain name (no suffix).
for name in dup_list12:
    # Columns carrying this name. Besides the '_x'/'_y' pair, a plain `name`
    # column exists when the same name also appeared in a league merged later.
    # Fold it in as well; assigning players12[name] below would otherwise
    # silently overwrite that column's records.
    source_cols = [name + '_x', name + '_y']
    if name in players12.columns:
        source_cols.append(name)
    per_source = [players12[col].to_dict() for col in source_cols]
    fallback = per_source[1]  # the '_y' records, used when no source qualifies

    new_series = {}
    for ii in np.arange(1, 39):
        # The record whose 'was_sub' is a real bool is treated as the one to
        # keep for this match day; '_x' wins over '_y', the plain column is last.
        populated = [rec[ii] for rec in per_source if isinstance(rec[ii]['was_sub'], bool)]
        new_series[ii] = populated[0] if populated else fallback[ii]

    players12 = players12.drop(source_cols, axis=1)
    players12[name] = pd.Series(new_series)

In [8]:
# Shape after collapsing the suffixed columns; the column count should have dropped.
players12.shape

(38, 1663)

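Looks like it worked as it should, nice! Each of the 16 duplicated players had two suffixed columns collapsed into one, so the column count dropped by 16 (1679 → 1663).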

#### Now repeat for the next season...

In [9]:
# Join the five 2013-14 league frames side by side on their shared row index.
# Column labels that collide in a merge get pandas' default '_x' / '_y' suffixes.
players13 = BPL13
for league_frame in (FRA13, GER13, ITA13, SPA13):
    players13 = players13.merge(league_frame, left_index=True, right_index=True)

In [10]:
# Shape of the merged 2013-14 frame as (rows, columns).
players13.shape

(38, 1659)

In [11]:
# Column labels that picked up a merge suffix belong to names present in more
# than one league frame; strip the two-character suffix and de-duplicate.
playerlist13 = players13.columns
suffixed_names13 = [label[:-2] for label in playerlist13 if label.endswith(('_x', '_y'))]
dup_list13 = np.unique(suffixed_names13)

In [12]:
# Number of distinct names that received a merge suffix.
len(dup_list13)

16

In [13]:
# To account for players switching teams and leagues mid-season, collapse the
# suffixed columns produced by the merges into a single column stored under
# the player's plain name (no suffix).
for name in dup_list13:
    # Columns carrying this name. Besides the '_x'/'_y' pair, a plain `name`
    # column exists when the same name also appeared in a league merged later.
    # Fold it in as well; assigning players13[name] below would otherwise
    # silently overwrite that column's records. (The recorded shapes for this
    # season, 1659 columns and 16 names giving 1642, indicate one such column exists.)
    source_cols = [name + '_x', name + '_y']
    if name in players13.columns:
        source_cols.append(name)
    per_source = [players13[col].to_dict() for col in source_cols]
    fallback = per_source[1]  # the '_y' records, used when no source qualifies

    new_series = {}
    for ii in np.arange(1, 39):
        # The record whose 'was_sub' is a real bool is treated as the one to
        # keep for this match day; '_x' wins over '_y', the plain column is last.
        populated = [rec[ii] for rec in per_source if isinstance(rec[ii]['was_sub'], bool)]
        new_series[ii] = populated[0] if populated else fallback[ii]

    players13 = players13.drop(source_cols, axis=1)
    players13[name] = pd.Series(new_series)

In [14]:
# Shape after collapsing the suffixed columns; the column count should have dropped.
players13.shape

(38, 1642)

# Now for the fun part! The algorithm/metric/function... (whatever you want to call it)

In [15]:
"""
Function
--------
player_val

Inputs
------
playerdf: a dataframe keyed by player and containing rows for each individual match day.
     For each (player, match day) pair there is a dictionary with the following fields:
     
     goals: list of goals scored by that player (minute, was tiebreaker, was equalizer)
     assists: list of assists created by that player (minute, was tiebreaker, was equalizer)
     was_sub: Whether or not the player was a substitute
     was_home: Whether or not the player was on the home team or not
   
Returns
-------
Nothing is returned, the function adds the following information to the input dataframe
after scanning through the data contained therein:

    current match value score
    cumulative score for the season
    moving average over the past five games to smooth out discrete events

Notes
-----
Specific game state parameters will be updated.
Written 19 Nov. 2015 by TWK
"""

def _event_bonus(minute, is_tb, is_eq):
    """Return the game-state bonus (equalizer + tiebreaker weight) for one goal event."""
    if minute <= 20:     # goal scored early
        tb_weight, eq_weight = 2, 1.5
    elif minute > 80:    # late goal: tiebreakers are weighted most heavily
        tb_weight, eq_weight = 4, 2
    else:
        tb_weight, eq_weight = 2, 1.75
    return (eq_weight if is_eq else 0) + (tb_weight if is_tb else 0)


def _score_events(events, base_value, multiplier):
    """Return (summed value, number of events counted) for a list of goal/assist events.

    Each event is read as (minute, was tiebreaker, was equalizer). Events with
    a negative minute (own goals) are skipped and not counted.
    """
    total, counted = 0, 0
    for event in events:
        minute, is_tb, is_eq = event[0], event[1], event[2]
        if minute < 0:
            continue
        total += multiplier * (_event_bonus(minute, is_tb, is_eq) + base_value)
        counted += 1
    return total, counted


def player_val(playerdf):
    """Score every player's contribution for each of the 38 match days, in place.

    Inputs
    ------
    playerdf: a dataframe with one column per player and one row per match day
         (row labels 1..38). Each cell is a dictionary with the fields:

         goals: list of goals scored by that player (minute, was tiebreaker, was equalizer)
         assists: list of assists created by that player (minute, was tiebreaker, was equalizer)
         was_sub: whether or not the player was a substitute
         was_home: whether or not the player was on the home team

    Returns
    -------
    Nothing is returned. The following keys are added to each cell dictionary:

        match_value: value score for that match
        cum_val: cumulative score for the season up to and including that match
        avg_val_smooth: mean of match_value over the last (up to) five matches

    Notes
    -----
    A goal is worth 1 and an assist 0.75, plus a tiebreaker/equalizer bonus that
    depends on the minute. The total is scaled by (1 + 0.15) for away matches
    and by (1 + 0.15 * streak), where streak counts the consecutive preceding
    matches in which the player registered a goal or an assist.
    Written 19 Nov. 2015 by TWK
    """
    del_cont = 0.15   # Consistency factor applied per match of the current streak
    window = 5        # Moving-average window to help smooth out discrete events
    match_days = np.arange(1, 39)

    # Loop through each player, record their value added for each match.
    for player in playerdf.columns:
        player_matches = playerdf[player]
        cont = 0            # Current streak of consecutive matches with a goal or assist
        value_tracker = []  # Per-match values so far, in match order

        for i_match in match_days:
            record = player_matches[i_match]
            is_home = record['was_home']
            is_sub = record['was_sub']

            # Slight bump when the player is away from home. `and` is used
            # (not `&`) so a non-boolean placeholder just yields no bump
            # instead of being fed to a bitwise operator.
            away = 0.15 if (isinstance(is_home, bool) and not is_home) else 0

            # Bump for substitutes.
            # NOTE(review): this is added whenever was_sub is True, even if no
            # goal/assist was counted in the match -- confirm that is intended.
            sub = 2 if (isinstance(is_sub, bool) and is_sub) else 0

            multiplier = (1 + (del_cont * cont)) * (1 + away)
            val4goal, num_goals = _score_events(record['goals'], 1, multiplier)
            val4asst, num_assts = _score_events(record['assists'], .75, multiplier)

            # Sum up player impact on match
            match_value = val4goal + val4asst + sub
            value_tracker.append(match_value)

            # Place calculated impact into the match dictionary (mutated in place)
            record['match_value'] = match_value
            record['cum_val'] = sum(value_tracker)
            record['avg_val_smooth'] = (
                sum(value_tracker[max(0, (i_match - window)):i_match]) / float(min(i_match, window))
            )

            # `or`, not `|`: the bitwise operator binds tighter than `>`, which
            # turned the old test into the chained comparison
            # `num_goals > num_assts > 0`.
            if num_goals > 0 or num_assts > 0:
                cont += 1  # Extend the streak if the player registers a statistic
            else:
                cont = 0   # Reset the streak otherwise

### Now for the moment of truth...

In [16]:
%%time
# Score the 2012-13 season; player_val returns nothing and writes into the per-match dictionaries.
player_val(players12)

CPU times: user 4.57 s, sys: 70.3 ms, total: 4.64 s
Wall time: 4.67 s


In [17]:
# Spot-check a single record (match day 8) to confirm the new value keys were added.
players12['Wayne Rooney'][8]

{'assists': [(44, True, False), (46, False, False)],
 'avg_val_smooth': 2.3449999999999998,
 'cum_val': 11.725,
 'goals': [(27, False, True), (65, False, False), (-11, True, False)],
 'match_value': 7.25,
 'was_home': True,
 'was_sub': False}

#### SUCCESS!!!

In [18]:
%%time
# Score the 2013-14 season; player_val returns nothing and writes into the per-match dictionaries.
player_val(players13)

CPU times: user 4.41 s, sys: 68.6 ms, total: 4.48 s
Wall time: 4.47 s


## Save the data frames for further analysis

In [19]:
# Persist both scored season frames for the follow-up analysis.
for season_frame, out_path in ((players12, 'playerdf12-13.pkl'),
                               (players13, 'playerdf13-14.pkl')):
    season_frame.to_pickle(out_path)