In [1]:
import urllib.request, json 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline

import seaborn as sns
sns.set()

In [2]:
df = pd.read_csv("./IPL_Deliveries_Full.csv")
df.shape

(136598, 21)

In [4]:
df.dtypes

match_id             int64
inning               int64
batting_team        object
bowling_team        object
over                 int64
ball                 int64
batsman             object
non_striker         object
bowler              object
is_super_over        int64
wide_runs            int64
bye_runs             int64
legbye_runs          int64
noball_runs          int64
penalty_runs         int64
batsman_runs         int64
extra_runs           int64
total_runs           int64
player_dismissed    object
dismissal_kind      object
fielder             object
dtype: object

In [6]:
#Build a dictionary of Matches player by each batsman
played = {}
def BuildPlayedDict(x):
    #print(x.shape, x.shape[0], x.shape[1])
    for p in x.batsman.unique():
        if p in played:
            played[p] += 1
        else:
            played[p] = 1

df.groupby('match_id').apply(BuildPlayedDict)


In [7]:
#utility function used later
def trybuild(lookuplist, buildlist):
    alist = []
    for i in buildlist.index:
        try:
            #print(i)
            alist.append(lookuplist[i])
            #print(alist)
        except KeyError:
            #print('except')
            alist.append(0)
    return alist

In [8]:
#Build the Summarized dataset 'BatmanStats' to do further analysis
BatsmanStats = df.groupby('batsman').aggregate({'ball': 'count', 'batsman_runs': 'sum'})
BatsmanStats.rename(columns={'ball': 'balls', 'batsman_runs': 'runs'}, inplace=True)
BatsmanStats['strike_rate'] = BatsmanStats['runs']/BatsmanStats['balls'] * 100
BatsmanStats['matches_played'] = [played[p] for p in BatsmanStats.index]
BatsmanStats['average']= BatsmanStats['runs']/BatsmanStats['matches_played']

for dk in df.dismissal_kind.unique():
    if dk in ['nan','hit wicket', 'retired hurt', 'obstructing the field']:
        continue
    lookuplist = df[df.dismissal_kind == dk].groupby('player_dismissed')['player_dismissed'].count()
    BatsmanStats[dk] = trybuild(lookuplist, BatsmanStats)
for r in df.batsman_runs.unique():
    lookuplist = df[df.batsman_runs == r].groupby('batsman')['batsman'].count()
    BatsmanStats[str(r) + 's'] = trybuild(lookuplist, BatsmanStats)


BatsmanStats['6s/match'] = BatsmanStats['6s']/BatsmanStats['matches_played']    


In [18]:
#Filter Top 30 batsmen in the league (palyed atleast 15 game, with an average of atleast 25, ordered by
#srike rate)
bs = BatsmanStats
bs.head()
tb = bs[(bs.average > 25) & (bs.matches_played > 15)].sort_values(['strike_rate'], ascending = False)[:30]
tb

Unnamed: 0_level_0,balls,runs,strike_rate,matches_played,average,nan,caught,bowled,run out,lbw,stumped,caught and bowled,0s,4s,6s,1s,2s,5s,3s,6s/match
batsman,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
V Sehwag,1833,2728,148.827059,104,26.230769,0,70,15,7,4,1,2,737,334,106,558,96,0,2,1.019231
AB de Villiers,2231,3270,146.571044,109,30.0,0,48,17,8,6,3,1,702,275,142,918,182,0,12,1.302752
CH Gayle,2360,3451,146.228814,91,37.923077,0,47,22,1,6,2,2,1088,282,252,668,67,0,3,2.769231
DA Warner,2440,3373,138.237705,100,33.73,0,56,17,6,3,4,2,950,338,134,830,177,0,11,1.34
SR Watson,1869,2557,136.811129,91,28.098901,0,42,24,7,3,2,1,786,252,121,605,97,0,8,1.32967
MS Dhoni,2419,3270,135.179826,128,25.546875,0,60,10,6,2,2,3,810,236,140,991,231,0,11,1.09375
SK Raina,3059,4106,134.226872,143,28.713287,0,87,9,10,3,7,7,1081,360,161,1227,219,1,10,1.125874
ST Jayasuriya,573,768,134.031414,30,25.6,0,19,2,1,5,0,1,292,84,39,128,22,1,7,1.3
AC Gilchrist,1555,2069,133.054662,80,25.8625,0,53,12,2,4,3,2,737,239,92,417,66,0,4,1.15
SPD Smith,933,1241,133.01179,47,26.404255,0,21,7,3,1,0,2,291,113,33,405,87,0,4,0.702128


In [19]:
#Print('Dimension 1 for our KNN - Boundary Percentage')
tb['boundary_pct'] =  ((tb['4s'] * 4 ) + (tb['6s']  * 6))/tb['runs']
print('Complete Batsman - By Boundary Percentage')
print(tb[tb['boundary_pct'] <= tb['boundary_pct'].median()]['boundary_pct'].sort_values())
print('Power Hitters - By Boundary Percentage')
print(tb[tb['boundary_pct'] > tb['boundary_pct'].median()]['boundary_pct'].sort_values())

Complete Batsman - By Boundary Percentage
batsman
JP Duminy       0.481686
F du Plessis    0.515929
SPD Smith       0.523771
AM Rahane       0.540561
MS Dhoni        0.545566
G Gambhir       0.548707
A Symonds       0.556468
MEK Hussey      0.558422
V Kohli         0.568165
S Dhawan        0.572356
SR Tendulkar    0.581834
SK Raina        0.585972
RV Uthappa      0.586431
RG Sharma       0.587506
M Vijay         0.590203
Name: boundary_pct, dtype: float64
Power Hitters - By Boundary Percentage
batsman
AB de Villiers    0.596942
KP Pietersen      0.603397
SE Marsh          0.615730
AJ Finch          0.628834
DA Warner         0.639194
LMP Simmons       0.649682
BB McCullum       0.668583
Q de Kock         0.669421
ML Hayden         0.675700
SR Watson         0.678138
DR Smith          0.701771
V Sehwag          0.722874
AC Gilchrist      0.728855
ST Jayasuriya     0.742188
CH Gayle          0.764996
Name: boundary_pct, dtype: float64


In [20]:
#Print('Dimension 2 for our KNN - DotBall (0s) Percentage')
tb['dotball_pct'] =  tb['0s']/tb['balls']
print('Complete Batsman')
print(tb[tb['dotball_pct'] <= tb['dotball_pct'].median()]['dotball_pct'].sort_values())
print('Power Hitters')
print(tb[tb['dotball_pct'] > tb['dotball_pct'].median()]['dotball_pct'].sort_values())



Complete Batsman
batsman
SPD Smith         0.311897
AB de Villiers    0.314657
JP Duminy         0.327417
MS Dhoni          0.334849
F du Plessis      0.344595
SK Raina          0.353383
RG Sharma         0.361148
V Kohli           0.368242
KP Pietersen      0.382586
SE Marsh          0.384840
G Gambhir         0.385403
AM Rahane         0.387863
DA Warner         0.389344
MEK Hussey        0.390170
A Symonds         0.394366
Name: dotball_pct, dtype: float64
Power Hitters
batsman
RV Uthappa       0.400588
V Sehwag         0.402073
S Dhawan         0.410755
SR Watson        0.420546
SR Tendulkar     0.427593
Q de Kock        0.431095
ML Hayden        0.437426
AJ Finch         0.443519
M Vijay          0.444551
LMP Simmons      0.452756
BB McCullum      0.457011
CH Gayle         0.461017
DR Smith         0.468693
AC Gilchrist     0.473955
ST Jayasuriya    0.509599
Name: dotball_pct, dtype: float64


In [21]:
#Calculate conversions (percentage of innings greater than average) for the top 30
#batsmen
dfTop = df[df.batsman.isin(tb.index)]

convbyplyr = {}
def computeconversion(x):
    #print(x.batsman.unique())
    for p in x.batsman.unique():
        if(x.batsman_runs.sum() > ( tb.loc[p]['average'])):
            if p in convbyplyr:
                convbyplyr[p] += 1
            else:
                convbyplyr[p] = 1
    #print(x.batsman_runs.sum())

dfTop.groupby(['batsman','match_id']).apply(computeconversion)
#len(convbyplyr)
tb['conv_pct'] = [convbyplyr[p]/tb.loc[p]['matches_played'] for p in tb.index]



In [22]:
# Dimension 3 for KNN calculation - Conversion Rate
print('Power Hitters')
print(tb[tb['conv_pct'] <= tb['conv_pct'].median()]['conv_pct'].sort_values())
print('Complete Batsman')
print(tb[tb['conv_pct'] > tb['conv_pct'].median()]['conv_pct'].sort_values())

Power Hitters
batsman
V Sehwag          0.346154
AJ Finch          0.372549
AM Rahane         0.382022
SR Watson         0.384615
RV Uthappa        0.384615
AB de Villiers    0.385321
JP Duminy         0.397260
M Vijay           0.404040
SPD Smith         0.404255
SK Raina          0.405594
ML Hayden         0.406250
CH Gayle          0.406593
BB McCullum       0.408602
RG Sharma         0.408759
AC Gilchrist      0.412500
Name: conv_pct, dtype: float64
Complete Batsman
batsman
G Gambhir        0.415385
A Symonds        0.416667
S Dhawan         0.419643
Q de Kock        0.423077
V Kohli          0.424242
DR Smith         0.428571
DA Warner        0.430000
ST Jayasuriya    0.433333
KP Pietersen     0.444444
MS Dhoni         0.460938
MEK Hussey       0.465517
SR Tendulkar     0.474359
SE Marsh         0.475410
LMP Simmons      0.500000
F du Plessis     0.533333
Name: conv_pct, dtype: float64


In [39]:
tb_knn = tb.loc[:,('boundary_pct','dotball_pct','conv_pct')]
tb_knn['player_type'] = 'Complete Batsman'
tb_knn.loc[(tb_knn.boundary_pct > tb_knn.boundary_pct.median())
       & (tb_knn.dotball_pct > tb_knn.dotball_pct.median())
          & (tb_knn.conv_pct < tb_knn.conv_pct.median()),'player_type'] = 'Power Hitter'
tb_knn

Unnamed: 0_level_0,boundary_pct,dotball_pct,conv_pct,player_type
batsman,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
V Sehwag,0.722874,0.402073,0.346154,Power Hitter
AB de Villiers,0.596942,0.314657,0.385321,Complete Batsman
CH Gayle,0.764996,0.461017,0.406593,Power Hitter
DA Warner,0.639194,0.389344,0.43,Complete Batsman
SR Watson,0.678138,0.420546,0.384615,Power Hitter
MS Dhoni,0.545566,0.334849,0.460938,Complete Batsman
SK Raina,0.585972,0.353383,0.405594,Complete Batsman
ST Jayasuriya,0.742188,0.509599,0.433333,Complete Batsman
AC Gilchrist,0.728855,0.473955,0.4125,Power Hitter
SPD Smith,0.523771,0.311897,0.404255,Complete Batsman
