## This notebook investigates the numpy function `np.percentile` and uses it to build a `find_player_percentile` function.

In [1]:
% matplotlib inline

import pandas as pd
import numpy as np

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

#### Import the feature data

In [2]:
# Load the pickled feature tables and flip them so that the features become
# columns (accessed below as data12.season_value, data12[quart_feats], ...).
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
data12 = pd.read_pickle('Data/features12-13.pkl').T
data13 = pd.read_pickle('Data/features13-14.pkl').T

In [3]:
# Integer positions that order season_value from largest to smallest
# (ascending argsort, then reversed).
sorted_values_12idx = np.argsort(data12.season_value.values)[::-1]

In [4]:
# sorted_values_12idx holds integer positions while the Series is labelled by
# player name, so select positionally with .iloc instead of relying on the
# deprecated positional fallback of Series[...].
values12 = data12.season_value.iloc[sorted_values_12idx]

In [5]:
# Percentile levels 5, 10, ..., 100 (20 evenly spaced points). Despite the
# name these are 5% steps, not quartiles.
quartiles = np.linspace(start=5, stop=100, num=20)

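Found that there's no reason to sort the value array: `np.percentile` does not need sorted input, so the thresholds below are computed directly from `data12.season_value`.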

In [6]:
# Season value found at each percentile level in `quartiles`.
thresholds = np.percentile(data12["season_value"], q=quartiles)

In [7]:
# True for every percentile threshold that Gareth Bale's season value exceeds.
quart_mask = data12.season_value['Gareth Bale'] > thresholds

In [8]:
# Position of the last True in the mask picks the highest level exceeded.
pcntile = quartiles[np.flatnonzero(quart_mask)[-1]]
pcntile

95.0

In [9]:
def find_player_percentile(value_array,player_name=None):
    '''Return the 5-point percentile band a player falls into for one statistic.

    Parameters
    ----------
    value_array : pandas Series
        Values of a single statistic, indexed by player name.
    player_name : str
        Index label of the player to look up in value_array.

    Returns
    -------
    float
        The highest level among 5, 10, ..., 100 whose percentile threshold is
        strictly below the player's value, or 0.0 when the player's value does
        not exceed even the 5th-percentile threshold.

    Raises
    ------
    KeyError
        Propagated from the pandas lookup when player_name is not a label of
        value_array.

    Notes
    -----
    np.percentile yields NaN thresholds when value_array contains NaN; every
    comparison is then False and the result is 0.0.
    '''

    # Extract the player value
    player_value = value_array[player_name]

    # Percentile levels in increments of 5 percent: 5, 10, ..., 100
    levels = np.linspace(5,100,20)

    # Value of the statistic at each of those levels
    thresholds = np.percentile(value_array, levels)

    # Levels whose threshold the player strictly exceeds
    above = thresholds < player_value

    # A player at or below the 5th-percentile threshold exceeds nothing;
    # report 0.0 instead of failing on an empty selection.
    if not above.any():
        return 0.0

    # Levels are ascending, so the last selected one is the highest exceeded
    return levels[above][-1]
    

In [10]:
# Sanity check: should reproduce the value computed step by step above.
find_player_percentile(data12["season_value"], player_name="Gareth Bale")

95.0

#### Now that the function finds where a player lies in relation to the others, we can apply it to only the most relevant features (as pulled from the ML-ipynb) and see how the player rates on each of them

In [11]:
# Features on which the player will be rated.
quart_feats = [
    "season_value",
    "nassists",
    "ngoals",
    "ntiebreaker_goals",
    "ntiebreaker_assists",
]

# Keep only those feature columns for each season.
subdf12 = data12.loc[:, quart_feats]
subdf13 = data13.loc[:, quart_feats]

In [12]:
player = "Gareth Bale"

# apply() passes each feature column to find_player_percentile, with the
# player name forwarded as the second argument.
bale_results12 = subdf12.apply(find_player_percentile, args=(player,))
bale_results13 = subdf13.apply(find_player_percentile, args=(player,))

In [13]:
# Function-call form of print works on both Python 2 and Python 3.
print(bale_results12)
print(bale_results13)

season_value           95
nassists               80
ngoals                 95
ntiebreaker_goals      95
ntiebreaker_assists    75
dtype: float64
season_value           95
nassists               95
ngoals                 95
ntiebreaker_goals      85
ntiebreaker_assists    90
dtype: float64
