In [2]:
import pandas as pd
import numpy as np
from lets_plot import *
LetsPlot.setup_html()

In [3]:
# Read one CSV per player; the order of the paths matches the order of the
# names on the left-hand side.
clark, copper, ogunbawale, stewart, wilson = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/clark.csv',
        'data/copper.csv',
        'data/ogunbawale.csv',
        'data/stewart.csv',
        'data/wilson.csv',
    )
)


In [4]:
# Tag each frame with a constant 'name' column so its rows can still be told
# apart once the frames are stacked.
clark['name'] = 'Clark'
copper['name'] = 'Copper'
ogunbawale['name'] = 'Ogunbawale'
stewart['name'] = 'Stewart'
wilson['name'] = 'Wilson'

# Stack all five frames row-wise (each frame keeps its own index values).
players = pd.concat(
    objs=[clark, copper, ogunbawale, stewart, wilson],
)

In [5]:
# Persist the combined table; the row index is written too (pandas default).
players.to_csv('data/wnba_players.csv', index=True)

# Plotting the distribution

In [6]:
# Bar chart of how often each AST value occurs in Clark's data.
(
    ggplot()
    + geom_bar(data=clark, mapping=aes(x='AST', fill='name'))
    + ggsize(1000, 400)
)

In [7]:
# Kernel density estimate of Clark's AST values with the default bandwidth.
(
    ggplot()
    + geom_density(
        data=clark,
        mapping=aes(x='AST', color='name', fill='name'),
        alpha=0.05,
    )
    + ggsize(1000, 400)
)

In [8]:
# Same density plot with a narrow bandwidth (bw=0.25).
(
    ggplot()
    + geom_density(
        data=clark,
        mapping=aes(x='AST', color='name', fill='name'),
        alpha=0.05,
        bw=0.25,
    )
    + ggsize(1000, 400)
)

In [9]:
# Same density plot with a wider bandwidth (bw=1).
(
    ggplot()
    + geom_density(
        data=clark,
        mapping=aes(x='AST', color='name', fill='name'),
        alpha=0.05,
        bw=1.,
    )
    + ggsize(1000, 400)
)

In [96]:
# Bar chart of Clark's AST values with a dashed vertical line at the sample mean.
(
    ggplot()
    + geom_bar(data=clark, mapping=aes(x='AST', fill='name'))
    + geom_vline(xintercept=clark['AST'].mean(), color='black', linetype='longdash')
    + ggsize(1000, 400)
)

# Fraction of games above 10 assists

In [98]:
# Bar chart of Clark's AST values with a thick dashed marker at AST = 10.
(
    ggplot()
    + geom_bar(data=clark, mapping=aes(x='AST', fill='name'))
    + geom_vline(xintercept=10, linetype='longdash', size=2)
    + ggsize(1000, 400)
)

In [191]:
def statistic(data, threshold=10):
    """Return the fraction of observations that are at least ``threshold``.

    Parameters
    ----------
    data : array-like
        Numeric observations (pandas Series, NumPy array, list, ...).
    threshold : number, default 10
        Inclusive cut-off; an observation counts when ``value >= threshold``.

    Returns
    -------
    numpy.float64
        Mean of the boolean mask ``data >= threshold``.
    """
    # np.asarray lets plain Python lists work too; a bare list does not
    # support an element-wise ``>=`` comparison.
    return (np.asarray(data) >= threshold).mean()

In [89]:
# Fraction computed by `statistic` on Clark's AST column.
statistic(data=clark['AST'])

np.float64(0.3)

In [48]:
import math 
def bootstrap_sample(data):
    """Draw one nonparametric bootstrap resample of ``data``.

    ``len(data)`` positions are drawn uniformly with replacement and the
    observations at those positions are returned.

    Parameters
    ----------
    data : pandas Series/DataFrame or array-like
        The observed sample.

    Returns
    -------
    Same pandas type as ``data`` when it supports ``.iloc``; otherwise a
    NumPy array of the resampled values.
    """
    N = len(data)                      # Number of observations
    inds = np.random.randint(0, N, N)  # N positions from {0,...,N-1}, with replacement
    if hasattr(data, 'iloc'):
        # Select by position: plain `data[inds]` on a Series looks the
        # integers up as index *labels* and breaks on any non-0..N-1 index.
        return data.iloc[inds]
    return np.asarray(data)[inds]      # Also covers lists and tuples

def bootstrap_distribution(data, statistic_fun, nsamples):
    """Collect ``nsamples`` bootstrap replicates of a statistic.

    Each replicate is ``statistic_fun`` applied to a fresh sample obtained
    from ``bootstrap_sample(data)``. Replicates are returned as a list in
    the order they were generated.
    """
    return [statistic_fun(bootstrap_sample(data)) for _ in range(nsamples)]

def simple_simulation_ci(data, statistic_fun, nsamples, confidence=0.95):
    """Percentile confidence interval from simulated statistic estimates.

    Parameters
    ----------
    data : array-like
        Observed sample, forwarded to ``bootstrap_distribution``.
    statistic_fun : callable
        Maps a sample to a scalar estimate.
    nsamples : int
        Number of simulated estimates to generate (must be >= 1).
    confidence : float, default 0.95
        Coverage of the interval, between 0 and 1 inclusive.

    Returns
    -------
    (estimates, lower, upper)
        The sorted list of estimates and the two interval endpoints, which
        are elements of that list.

    Raises
    ------
    ValueError
        If ``nsamples`` < 1 or ``confidence`` lies outside [0, 1].
    """
    if nsamples < 1:
        raise ValueError("nsamples must be at least 1")
    if not 0 <= confidence <= 1:
        raise ValueError("confidence must be between 0 and 1")

    # Distribution of statistic estimates
    estimates = sorted(bootstrap_distribution(data, statistic_fun, nsamples))
    margin = (1 - confidence) / 2
    # ceil(nsamples * (1 - margin)) can equal nsamples (e.g. nsamples < 40 at
    # 95%, or confidence == 1), which is one past the end; clamp both indices.
    last = nsamples - 1
    lower = estimates[min(math.ceil(nsamples * margin), last)]
    upper = estimates[min(math.ceil(nsamples * (1 - margin)), last)]
    return estimates, lower, upper

In [88]:
# Seed NumPy's global RNG so the simulated distribution and the interval are
# the same every time this cell is (re-)run.
np.random.seed(0)
estimates, lower, upper = simple_simulation_ci(clark['AST'], statistic, 10000)
estimates = pd.DataFrame({"Simulated fraction of games with >= 10 assists": estimates})
# Bar chart of the simulated estimates; red dashed lines mark the interval endpoints.
(
    ggplot()
    + geom_bar(data=estimates, mapping=aes(x="Simulated fraction of games with >= 10 assists"))
    + geom_vline(xintercept=lower, color='red', linetype='longdash')
    + geom_vline(xintercept=upper, color='red', linetype='longdash')
)


# Probability of getting >= 20 assists in a game?

In [90]:
from scipy.stats import poisson

In [99]:
# Bar chart of Clark's AST values with a thick dashed marker at AST = 20.
(
    ggplot()
    + geom_bar(data=clark, mapping=aes(x='AST', fill='name'))
    + geom_vline(xintercept=20, linetype='longdash', size=2)
    + ggsize(1000, 400)
)

In [112]:
from scipy.stats import poisson

# The Poisson rate is set to the sample mean of Clark's AST column.
lam = clark['AST'].mean()
# Evaluate the Poisson pmf on the integer grid 0..24.
x = np.arange(0, 25)
y = poisson.pmf(x, lam)

In [117]:
# Overlay the pmf stored in x / y on a density-scaled histogram of Clark's AST.
# Bins have width 1 and are centred on the integers so bar heights are
# per-value proportions and line up with the pmf; the previous bins=25 gave
# bins of a different width, and `bw` is a density-estimate setting that has
# no meaning for a histogram.
(
    ggplot()
    + geom_line(data=pd.DataFrame(dict(x=x, y=y)), mapping=aes(x='x', y='y'))
    + geom_histogram(
        data=clark,
        mapping=aes(x='AST', y="..density..", color='name', fill='name'),
        binwidth=1,
        center=0,
        alpha=0.05,
    )
)

In [111]:
# P(X >= 20) = P(X > 19) under Poisson(lam). The survival function computes
# the upper tail directly instead of subtracting a value close to 1 from 1.
poisson(lam).sf(19)

np.float64(0.0004801251376509441)

In [185]:
# Seed NumPy's global RNG so the simulated distribution and the interval are
# the same every time this cell is (re-)run.
np.random.seed(0)
estimates, lower, upper = simple_simulation_ci(clark['AST'], np.mean, 10000)
estimates = pd.DataFrame({"Simulated Poisson rate MLE": estimates})
# Bar chart of the simulated means; red dashed lines mark the interval endpoints.
(
    ggplot()
    + geom_bar(data=estimates, mapping=aes(x="Simulated Poisson rate MLE"))
    + geom_vline(xintercept=lower, color='red', linetype='longdash')
    + geom_vline(xintercept=upper, color='red', linetype='longdash')
)

# Negative Binomial model

In [119]:
from scipy.stats import nbinom
from statsmodels.discrete.discrete_model import NegativeBinomial

# Fit a negative binomial model to Clark's AST with a column of ones as the
# only regressor (i.e. just a constant term).
model = NegativeBinomial(
    endog=clark['AST'],
    exog=np.ones_like(clark['AST']),
).fit()
# Distribution object evaluated at regressor value 1, matching the constant above.
dist = model.get_distribution(1.)

In [163]:
# Show the fitted parameters of the negative binomial model.
model.params

const    2.131203
alpha    0.028236
dtype: float64

Optimization terminated successfully.
         Current function value: 2.560325
         Iterations: 4
         Function evaluations: 5
         Gradient evaluations: 5
         Hessian evaluations: 4


In [174]:
# Evaluate the fitted distribution's pmf on the integer grid 0..24.
# Note: this rebinds the notebook-level names x and y.
x = np.arange(0, 25)
y = dist.pmf(x)

In [178]:
# Show the positional shape parameters held by the distribution object.
# NOTE(review): presumably scipy's nbinom (n, p) pair — confirm against the
# statsmodels get_distribution documentation.
dist.args

(array([35.41599529]), array([0.80782827]))

In [175]:
# Overlay the pmf stored in x / y on a density-scaled histogram of Clark's AST.
# Bins have width 1 and are centred on the integers so bar heights are
# per-value proportions and line up with the pmf; the previous bins=25 gave
# bins of a different width, and `bw` is a density-estimate setting that has
# no meaning for a histogram.
(
    ggplot()
    + geom_line(data=pd.DataFrame(dict(x=x, y=y)), mapping=aes(x='x', y='y'))
    + geom_histogram(
        data=clark,
        mapping=aes(x='AST', y="..density..", color='name', fill='name'),
        binwidth=1,
        center=0,
        alpha=0.05,
    )
)

In [176]:
# P(X >= 20) = P(X > 19) under the fitted distribution. The survival function
# computes the upper tail directly instead of subtracting from 1.
dist.sf(19)

array([0.00189241])

In [181]:
# Total log-probability of Clark's AST values under the fitted distribution.
np.sum(dist.logpmf(clark['AST']))

np.float64(-102.41300969308647)

In [180]:
# Total log-probability of Clark's AST values under Poisson(lam).
poisson.logpmf(clark['AST'], lam).sum()

np.float64(-102.93508479835576)

# Poisson parametric bootstrap

In [113]:
# Build the Poisson pmf locally instead of reading the notebook-level x / y:
# by this point in the notebook those names were last assigned from the
# negative binomial distribution, so relying on them would draw the wrong curve.
poisson_grid = np.arange(25)
poisson_pmf = pd.DataFrame(dict(
    x=poisson_grid,
    y=poisson(clark['AST'].mean()).pmf(poisson_grid),
))
# Poisson pmf (line) over a kernel density estimate of Clark's AST values.
(
    ggplot()
    + geom_line(data=poisson_pmf, mapping=aes(x='x', y='y'))
    + geom_density(
        data=clark,
        mapping=aes(x='AST', color='name', fill='name'),
        alpha=0.05,
        bw=1.,
    )
)

In [213]:
import math 

def bootstrap_sample(data):
    """Draw one parametric (Poisson) bootstrap sample.

    Unlike index resampling, this simulates ``len(data)`` fresh counts from a
    Poisson distribution whose rate is the mean of ``data``.

    Parameters
    ----------
    data : array-like
        Observed counts (pandas Series, NumPy array, list, ...).

    Returns
    -------
    numpy.ndarray
        ``len(data)`` simulated Poisson draws.
    """
    N = len(data)          # Number of observations
    rate = np.mean(data)   # np.mean also accepts lists, which have no .mean()
    return poisson(rate).rvs(N)  # N draws from Poisson(rate)

def bootstrap_distribution(data, statistic_fun, nsamples):
    """Simulate ``nsamples`` values of a statistic.

    For every replicate, a sample is generated with ``bootstrap_sample(data)``
    and reduced to a single value by ``statistic_fun``. The values are
    returned as a list, in generation order.
    """
    replicates = (bootstrap_sample(data) for _ in range(nsamples))
    return [statistic_fun(replicate) for replicate in replicates]

def simple_simulation_ci(data, statistic_fun, nsamples, confidence=0.95):
    """Percentile confidence interval from simulated statistic estimates.

    Parameters
    ----------
    data : array-like
        Observed sample, forwarded to ``bootstrap_distribution``.
    statistic_fun : callable
        Maps a sample to a scalar estimate.
    nsamples : int
        Number of simulated estimates to generate (must be >= 1).
    confidence : float, default 0.95
        Coverage of the interval, between 0 and 1 inclusive.

    Returns
    -------
    (estimates, lower, upper)
        The sorted list of estimates and the two interval endpoints, which
        are elements of that list.

    Raises
    ------
    ValueError
        If ``nsamples`` < 1 or ``confidence`` lies outside [0, 1].
    """
    if nsamples < 1:
        raise ValueError("nsamples must be at least 1")
    if not 0 <= confidence <= 1:
        raise ValueError("confidence must be between 0 and 1")

    # Distribution of statistic estimates
    estimates = sorted(bootstrap_distribution(data, statistic_fun, nsamples))
    margin = (1 - confidence) / 2
    # ceil(nsamples * (1 - margin)) can equal nsamples (e.g. nsamples < 40 at
    # 95%, or confidence == 1), which is one past the end; clamp both indices.
    last = nsamples - 1
    lower = estimates[min(math.ceil(nsamples * margin), last)]
    upper = estimates[min(math.ceil(nsamples * (1 - margin)), last)]
    return estimates, lower, upper

In [214]:
# Seed NumPy's global RNG so the simulated distribution and the interval are
# the same every time this cell is (re-)run.
np.random.seed(0)
estimates, lower, upper = simple_simulation_ci(clark['AST'], statistic, 10000)
estimates = pd.DataFrame({"Simulated fraction of games with >= 10 assists": estimates})
# Bar chart of the simulated estimates; red dashed lines mark the interval endpoints.
(
    ggplot()
    + geom_bar(data=estimates, mapping=aes(x="Simulated fraction of games with >= 10 assists"))
    + geom_vline(xintercept=lower, color='red', linetype='longdash')
    + geom_vline(xintercept=upper, color='red', linetype='longdash')
)


In [12]:
# AST bar chart for Clark and Wilson together, using the default bar position.
(
    ggplot()
    + geom_bar(
        data=pd.concat([clark, wilson]),
        mapping=aes(x='AST', fill='name'),
    )
    + ggsize(1000, 400)
)

In [13]:
# Same Clark/Wilson bar chart with position='nudge'.
(
    ggplot()
    + geom_bar(
        data=pd.concat([clark, wilson]),
        mapping=aes(x='AST', fill='name'),
        position='nudge',
    )
    + ggsize(1000, 400)
)

In [14]:
# Same Clark/Wilson bar chart with position='dodge'.
(
    ggplot()
    + geom_bar(
        data=pd.concat([clark, wilson]),
        mapping=aes(x='AST', fill='name'),
        position='dodge',
    )
    + ggsize(1000, 400)
)

In [21]:
# Clark/Wilson bar chart split into one facet per value of 'name'.
(
    ggplot()
    + geom_bar(
        data=pd.concat([clark, wilson]),
        mapping=aes(x='AST', fill='name'),
        position='dodge',
    )
    + facet_grid(x='name')
    + ggsize(1000, 400)
)

In [48]:
# AST bar chart for all players in the combined table, default bar position.
(
    ggplot()
    + geom_bar(data=players, mapping=aes(x='AST', fill='name'))
    + ggsize(1000, 400)
)

In [49]:
# AST bar chart for all players with position='dodge'.
(
    ggplot()
    + geom_bar(
        data=players,
        mapping=aes(x='AST', fill='name'),
        position='dodge',
    )
    + ggsize(1000, 400)
)

In [59]:
# Kernel density estimate of AST, one curve per player name.
(
    ggplot()
    + geom_density(
        data=players,
        mapping=aes(x='AST', color='name', fill='name'),
        alpha=0.05,
    )
    + ggsize(1000, 400)
)

In [None]:
# NOTE(review): this cell was never executed and passes an empty string as
# `position`. That looks like an unfinished experiment — confirm the intended
# position adjustment or remove the cell before sharing the notebook.
ggplot() + geom_bar(data=players, mapping=aes(x='AST', fill='name'), position='') 

In [38]:
# Kernel density estimate of the PTS column, one curve per player name.
(
    ggplot()
    + geom_density(
        data=players,
        mapping=aes(x='PTS', color='name', fill='name'),
        alpha=0.1,
    )
    + ggsize(1000, 400)
)