In [166]:
import math

import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io as pio
from scipy import stats
# Use plotly's 'notebook' renderer so figures are displayed inline in Jupyter.
pio.renderers.default = 'notebook'

In [167]:
# Every figure created below inherits the dark plotly theme.
pio.templates.default = 'plotly_dark'

In [168]:
# Load the two ODI CSV exports used throughout the notebook.
df_scorecard = pd.read_csv(filepath_or_buffer=r'./full/odi_scorecard.csv')
df_info = pd.read_csv(filepath_or_buffer=r'./full/odi_info.csv')

### Hypothesis

#### 1

* H(0): The mean number of batsmen bowled per match is equal to the mean number of batsmen dismissed lbw per match in ODIs
* H(A): The mean number of batsmen bowled per match is not equal to the mean number of batsmen dismissed lbw per match in ODIs

##### Data

In [198]:
# Keep only the 'bowled' and 'lbw' dismissals, together with their match id.
df_first = df_scorecard.loc[
    df_scorecard['wicket-method'].isin(['bowled', 'lbw']),
    ['match-id', 'wicket-method'],
]

In [199]:
# One 0/1 indicator column per dismissal type. `assign` builds a new frame
# instead of writing into the filtered slice of df_scorecard (which triggers
# SettingWithCopyWarning), and the comparisons are vectorised rather than
# evaluated row by row.
df_first = df_first.assign(
    lbw=lambda frame: frame['wicket-method'].ne('bowled').astype(int),
    bowled=lambda frame: frame['wicket-method'].eq('bowled').astype(int),
)

In [200]:
# Count lbw / bowled dismissals per match. Only the two indicator columns are
# summed: aggregating the whole frame would also "sum" the string column
# 'wicket-method', which is meaningless and discarded in the next step anyway.
df_first = df_first.groupby(['match-id'], as_index=False)[['lbw', 'bowled']].sum()

In [201]:
# Drop the match id; only the two per-match counts are analysed.
df_first = df_first.loc[:, ['lbw', 'bowled']]

In [202]:
# Preview the first rows instead of rendering the whole frame.
df_first.head(10)

Unnamed: 0,lbw,bowled
0,0,2
1,2,3
2,0,5
3,1,2
4,3,6
5,1,6
6,0,3
7,1,2
8,0,3
9,2,2


#### Visualizations

In [203]:
# Overlay the probability histograms of per-match lbw and bowled counts.
fig = go.Figure(data=[
    go.Histogram(x=df_first['lbw'], histnorm='probability', name='lbw'),
    go.Histogram(x=df_first['bowled'], histnorm='probability', name='bowled'),
])
fig.update_layout(
    title='Probability distribution for lbw and bowled',
    xaxis_title='Number of wickets',
    yaxis_title='Probability',
)
fig.show()

In [204]:
# Histogram plus fitted normal curve for the per-match lbw and bowled counts.
fig = ff.create_distplot(
    [df_first['lbw'], df_first['bowled']],
    ['LBW', 'Bowled'],
    bin_size=1,
    curve_type='normal',
)
fig.update_layout(
    title_text='Distribution of dismissal methods',  # fixed typo: was "dimissal"
    xaxis_title='Number of wickets',
    yaxis_title='Density',
)
fig.show()

#### Hypothesis Testing

##### Paired T test

In [205]:
# Summary statistics of the two per-match dismissal counts.
df_first.loc[:, ['lbw', 'bowled']].describe()

Unnamed: 0,lbw,bowled
count,1677.0,1677.0
mean,1.679785,2.774001
std,1.386744,1.684788
min,0.0,0.0
25%,1.0,2.0
50%,1.0,3.0
75%,2.0,4.0
max,8.0,9.0


In [206]:
# Paired t-test: both counts come from the same match, so rows are paired.
ttest, pval = stats.ttest_rel(a=df_first['lbw'], b=df_first['bowled'])

In [207]:
# Decide at the 5% significance level. A p-value at or above the threshold
# only means there is not enough evidence against H0; it never proves H0, so
# the outcome is worded "fail to reject" rather than "accept".
print(pval)
if pval < 0.05:
    print("reject null hypothesis")
else:
    print("fail to reject null hypothesis")

8.855848765261422e-83
reject null hypothesis


#### 2

* H(0):Wickets fallen in the first 70% of the first innings is equal to the wickets fallen in the last 30% of the first innings
* H(A):Wickets fallen in the first 70% of the first innings is not equal to the wickets fallen in the last 30% of the first innings
* H(0):Wickets fallen in the first 71% of the first innings is equal to the wickets fallen in the last 29% of the first innings
* H(A):Wickets fallen in the first 71% of the first innings is not equal to the wickets fallen in the last 29% of the first innings
* H(0):Wickets fallen in the first 50% of the first innings is equal to the wickets fallen in the last 50% of the first innings
* H(A):Wickets fallen in the first 50% of the first innings is not equal to the wickets fallen in the last 50% of the first innings
* H(0):Wickets fallen in the first 10 overs of the first innings is equal to the wickets fallen in the last 10 overs of the first innings
* H(A):Wickets fallen in the first 10 overs of the first innings is not equal to the wickets fallen in the last 10 overs of the first innings

##### Data

In [264]:
# Build one row per first-innings wicket, annotated with the innings length
# and the over cut-offs used by the hypotheses above.
innings1 = df_scorecard[df_scorecard['innings'] == 1]

# Innings length in overs, from the balls faced per match. Only 'balls-played'
# is summed: summing the whole frame would also aggregate the string columns.
innings1_balls = innings1.groupby('match-id', as_index=False)['balls-played'].sum()
innings1_overs = (innings1_balls['balls-played'] / 6).round()

# Built in a single constructor call so no column is assigned onto a slice.
temp = pd.DataFrame({
    'match-id': innings1_balls['match-id'],
    'total-overs': innings1_overs,
    'first-seventy': (innings1_overs * 0.70).round(),
    'first-seventyone': (innings1_overs * 0.71).round(),
    'first-fifty': (innings1_overs * 0.5).round(),
    'last-ten': (innings1_overs - 10).round(),
})

# Attach every first-innings row that records a wicket (fall-of-wicket over > 0).
innings1_wickets = innings1[innings1['fall-of-wicket-overs'] > 0.0]
temp = temp.merge(innings1_wickets, on=['match-id'])

# Convert the fractional over value (e.g. 36.5) into a 1-based over number (37).
temp['fall-of-wicket-overs'] = temp['fall-of-wicket-overs'].astype(int) + 1

In [265]:
# For every wicket, flag (0/1) whether it fell in the early or the late window
# of each split, then sum the flags per match.
fow_over = temp['fall-of-wicket-overs']

# (early column, late column, last over of the early window,
#  over after which the late window starts)
window_specs = [
    ('first-seventy-wickets', 'last-thirty-wickets',
     temp['first-seventy'], temp['first-seventy']),
    ('first-seventyone-wickets', 'last-twentynine-wickets',
     temp['first-seventyone'], temp['first-seventyone']),
    ('first-fifty-wickets', 'last-fifty-wickets',
     temp['first-fifty'], temp['first-fifty']),
    ('first-ten-overs-wickets', 'last-ten-overs-wickets',
     10, temp['last-ten']),
]

df_second = pd.DataFrame({'match-id': temp['match-id']})
for early_col, late_col, early_end, late_start in window_specs:
    # Boolean masks cast to int replace the previous "assign a filtered Series,
    # then map NaN -> 0 / value -> 1 row by row" pattern; the result is identical.
    df_second[early_col] = ((fow_over <= early_end) & (fow_over > 0)).astype(int)
    df_second[late_col] = (fow_over > late_start).astype(int)

# Wickets per match in each window; the match id is not needed afterwards.
df_second = df_second.groupby(['match-id'], as_index=False).sum()
df_second = df_second.drop(['match-id'], axis=1)

In [266]:
# Preview the first rows instead of rendering the whole frame.
df_second.head(10)

Unnamed: 0,first-seventy-wickets,last-thirty-wickets,first-seventyone-wickets,last-twentynine-wickets,first-fifty-wickets,last-fifty-wickets,first-ten-overs-wickets,last-ten-overs-wickets
0,3,4,3,4,3,4,1,4
1,5,5,5,5,4,6,3,4
2,4,5,4,5,4,5,1,5
3,6,2,6,2,3,5,2,1
4,7,2,7,2,5,4,0,1
5,4,5,4,5,2,7,1,3
6,2,6,2,6,1,7,1,6
7,4,3,4,3,3,4,0,3
8,4,3,5,2,2,5,1,1
9,2,5,2,5,1,6,0,3


#### Visualizations

In [267]:
# Probability histograms of wickets per match: first 70% vs last 30% of overs.
fig = go.Figure(data=[
    go.Histogram(x=df_second['first-seventy-wickets'], histnorm='probability', name='First 70%'),
    go.Histogram(x=df_second['last-thirty-wickets'], histnorm='probability', name='Last 30%'),
])
fig.update_layout(
    title='Probability distribution for wickets fallen in first 70% and last 30% of first innings',
    xaxis_title='Number of wickets',
    yaxis_title='Probability',
)
fig.show()

In [268]:
# Probability histograms of wickets per match: first 71% vs last 29% of overs.
fig = go.Figure(data=[
    go.Histogram(x=df_second['first-seventyone-wickets'], histnorm='probability', name='First 71%'),
    go.Histogram(x=df_second['last-twentynine-wickets'], histnorm='probability', name='Last 29%'),
])
fig.update_layout(
    title='Probability distribution for wickets fallen in first 71% and last 29% of first innings',
    xaxis_title='Number of wickets',
    yaxis_title='Probability',
)
fig.show()

In [269]:
# Probability histograms of wickets per match: first half vs second half.
fig = go.Figure(data=[
    go.Histogram(x=df_second['first-fifty-wickets'], histnorm='probability', name='First 50%'),
    go.Histogram(x=df_second['last-fifty-wickets'], histnorm='probability', name='Last 50%'),
])
fig.update_layout(
    title='Probability distribution for wickets fallen in first 50% and last 50% of first innings',
    xaxis_title='Number of wickets',
    yaxis_title='Probability',
)
fig.show()

In [270]:
# Probability histograms of wickets per match: first 10 overs vs last 10 overs.
fig = go.Figure()
fig.add_trace(go.Histogram(x=df_second['first-ten-overs-wickets'], histnorm='probability', name='First 10'))
fig.add_trace(go.Histogram(x=df_second['last-ten-overs-wickets'], histnorm='probability', name='Last 10'))
fig.update_layout(
    # fixed typo: the title previously read "oversof"
    title='Probability distribution for wickets fallen in first 10 overs and last 10 overs of first innings',
    xaxis_title='Number of wickets',
    yaxis_title='Probability',
)
fig.show()

In [271]:
# Histogram plus fitted normal curve: first 70% vs last 30% of the innings.
fig = ff.create_distplot(
    hist_data=[df_second['first-seventy-wickets'], df_second['last-thirty-wickets']],
    group_labels=['First 70%', 'Last 30%'],
    curve_type='normal',
)
fig.update_layout(
    title='Distribution for wickets fallen in first 70% and last 30% of first innings',
    xaxis_title='Number of wickets',
    yaxis_title='Density',
)
fig.show()

In [272]:
# Histogram plus fitted normal curve: first half vs second half of the innings.
fig = ff.create_distplot(
    hist_data=[df_second['first-fifty-wickets'], df_second['last-fifty-wickets']],
    group_labels=['First 50%', 'Last 50%'],
    curve_type='normal',
)
fig.update_layout(
    title='Distribution for wickets fallen in first 50% and last 50% of first innings',
    xaxis_title='Number of wickets',
    yaxis_title='Density',
)
fig.show()

In [273]:
# Histogram plus fitted normal curve: first 10 overs vs last 10 overs.
fig = ff.create_distplot(
    hist_data=[df_second['first-ten-overs-wickets'], df_second['last-ten-overs-wickets']],
    group_labels=['First 10 overs', 'Last 10 overs'],
    curve_type='normal',
)
fig.update_layout(
    title='Distribution for wickets fallen in first 10 overs and last 10 overs of first innings',
    xaxis_title='Number of wickets',
    yaxis_title='Density',
)
fig.show()

#### Hypothesis Testing

##### Paired T Test

In [274]:
# Summary statistics (with the standard quartiles) for every window column.
df_second.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,first-seventy-wickets,last-thirty-wickets,first-seventyone-wickets,last-twentynine-wickets,first-fifty-wickets,last-fifty-wickets,first-ten-overs-wickets,last-ten-overs-wickets
count,1707.0,1707.0,1707.0,1707.0,1707.0,1707.0,1707.0,1707.0
mean,3.950791,4.100762,4.062097,3.989455,2.844757,5.206796,1.325718,3.445226
std,1.650583,1.6525,1.655468,1.647374,1.447369,1.790623,1.113937,1.594545
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,3.0,3.0,3.0,2.0,4.0,0.0,2.0
50%,4.0,4.0,4.0,4.0,3.0,5.0,1.0,3.0
75%,5.0,5.0,5.0,5.0,4.0,6.0,2.0,5.0
max,9.0,9.0,9.0,9.0,8.0,10.0,6.0,9.0


In [281]:
# Paired t-test on per-match wickets: first 70% vs last 30% of the innings.
ttest, pval = stats.ttest_rel(
    a=df_second['first-seventy-wickets'],
    b=df_second['last-thirty-wickets'],
)

In [282]:
# Decide at the 5% significance level. A p-value at or above the threshold
# only means there is not enough evidence against H0; it never proves H0, so
# the outcome is worded "fail to reject" rather than "accept".
print(pval)
if pval < 0.05:
    print("reject null hypothesis")
else:
    print("fail to reject null hypothesis")

0.020019531021699132
reject null hypothesis


In [283]:
# Paired t-test on per-match wickets: first 71% vs last 29% of the innings.
ttest, pval = stats.ttest_rel(
    a=df_second['first-seventyone-wickets'],
    b=df_second['last-twentynine-wickets'],
)

In [284]:
# Decide at the 5% significance level. A p-value at or above the threshold
# only means there is not enough evidence against H0; it never proves H0, so
# the outcome is worded "fail to reject" rather than "accept".
print(pval)
if pval < 0.05:
    print("reject null hypothesis")
else:
    print("fail to reject null hypothesis")

0.25955086942355377
accept null hypothesis


In [285]:
# Paired t-test on per-match wickets: first half vs second half of the innings.
ttest, pval = stats.ttest_rel(
    a=df_second['first-fifty-wickets'],
    b=df_second['last-fifty-wickets'],
)

In [286]:
# Decide at the 5% significance level. A p-value at or above the threshold
# only means there is not enough evidence against H0; it never proves H0, so
# the outcome is worded "fail to reject" rather than "accept".
print(pval)
if pval < 0.05:
    print("reject null hypothesis")
else:
    print("fail to reject null hypothesis")

6.436197034363128e-225
reject null hypothesis


In [287]:
# Paired t-test on per-match wickets: first 10 overs vs last 10 overs.
ttest, pval = stats.ttest_rel(
    a=df_second['first-ten-overs-wickets'],
    b=df_second['last-ten-overs-wickets'],
)

In [288]:
# Decide at the 5% significance level. A p-value at or above the threshold
# only means there is not enough evidence against H0; it never proves H0, so
# the outcome is worded "fail to reject" rather than "accept".
print(pval)
if pval < 0.05:
    print("reject null hypothesis")
else:
    print("fail to reject null hypothesis")

8.92620766849228e-289
reject null hypothesis


#### 3

* H(0):There is an equal probability of wicket by the first category of dismissal and second category of dismissal
* H(A):There is not an equal probability of wicket by the first category of dismissal and second category of dismissal

##### Data

In [289]:
# Keep every scorecard row whose 'wicket-method' is not the placeholder '0'.
df_third = df_scorecard.loc[df_scorecard['wicket-method'].ne('0')]

In [290]:
# Dismissal methods grouped into the two categories compared by hypothesis 3.
first_cat = [
    'run out',
    'hit wicket',
    'obstructing the field',
    'retired out',
    'stumped',
]
second_cat = [
    'caught',
    'bowled',
    'lbw',
    'caught and bowled',
]

In [291]:
# 0/1 indicator per dismissal category. `assign` returns a new frame instead
# of writing into the filtered slice of df_scorecard (which triggers
# SettingWithCopyWarning), and `isin` replaces the row-by-row membership test.
df_third = df_third.assign(**{
    'first-category': df_third['wicket-method'].isin(first_cat).astype(int),
    'sec-category': df_third['wicket-method'].isin(second_cat).astype(int),
})

In [292]:
# Sum the category indicators per match, then add the combined wicket count.
df_third = (
    df_third.loc[:, ['match-id', 'first-category', 'sec-category']]
    .groupby(['match-id'], as_index=False)
    .sum()
)
df_third['wickets'] = df_third['first-category'] + df_third['sec-category']

In [293]:
# Keep only the two per-match category counts compared by the paired test.
# (The disabled normalisation that used to sit here referenced columns
# 'wic_batsman' / 'wic_bowler', which do not exist in df_third, so it was removed.)
df_third = df_third[['first-category', 'sec-category']]

In [294]:
# Preview the first rows instead of rendering the whole frame.
df_third.head(10)

Unnamed: 0,first-category,sec-category
0,1,15
1,1,12
2,2,15
3,0,10
4,1,15
5,1,17
6,6,12
7,3,6
8,4,13
9,0,9


#### Visualizations

In [295]:
# Probability histograms of per-match wickets for the two dismissal categories.
fig = go.Figure(data=[
    go.Histogram(x=df_third['first-category'], histnorm='probability', name='First Category'),
    go.Histogram(x=df_third['sec-category'], histnorm='probability', name='Second Category'),
])
fig.update_layout(
    title='Probability distribution for wickets fallen by First category of dismissal methods and second category',
    xaxis_title='Number of wickets',
    yaxis_title='Probability',
)
fig.show()

In [296]:
# Histogram plus fitted normal curve for the two dismissal categories.
fig = ff.create_distplot(
    hist_data=[df_third['first-category'], df_third['sec-category']],
    group_labels=['First Category', 'Second Category'],
    curve_type='normal',
)
fig.update_layout(
    title='Density of wickets fallen by first and second category of dismissal methods',
    xaxis_title='Number of wickets',
    yaxis_title='Density',
)
fig.show()

#### Hypothesis Testing

##### Paired T Test

In [297]:
# Summary statistics (with the standard quartiles) for both category counts.
df_third.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,first-category,sec-category
count,1707.0,1707.0
mean,1.64792,13.127709
std,1.351782,2.980216
min,0.0,4.0
25%,1.0,11.0
50%,1.0,13.0
75%,2.0,15.0
max,7.0,20.0


In [298]:
# Paired t-test on per-match wickets: first vs second dismissal category.
ttest, pval = stats.ttest_rel(
    a=df_third['first-category'],
    b=df_third['sec-category'],
)

In [299]:
# Decide at the 5% significance level. A p-value at or above the threshold
# only means there is not enough evidence against H0; it never proves H0, so
# the outcome is worded "fail to reject" rather than "accept".
print(pval)
if pval < 0.05:
    print("reject null hypothesis")
else:
    print("fail to reject null hypothesis")

0.0
reject null hypothesis


#### Current

In [300]:
# NOTE(review): disabled experiment, kept for reference only. If re-enabled it
# would draw 1000 random samples of 100 rows from df_third and plot the
# distribution of the two column means. No random seed is set, so the result
# would differ between runs.
# df_sample_mean_bats=[]
# df_sample_mean_bowler=[]
# for i in range(1000):
#     df_sample=df_third.sample(n=100)
#     mean_v=df_sample.mean(axis=0)
#     df_sample_mean_bats.append(mean_v[0])
#     df_sample_mean_bowler.append(mean_v[1])
# fig=ff.create_distplot([df_sample_mean_bats,df_sample_mean_bowler],['Batsman','Bowler'])
# fig.show()

### ML

#### Correlation

In [301]:
# Scorecard rows belonging to the player named 'V Kohli'.
df_kohli = df_scorecard.loc[df_scorecard['name'].eq('V Kohli')]

In [302]:
# Pairwise correlation of the numeric scorecard columns for the selected player.
# Non-numeric columns (e.g. 'name', 'wicket-method') are removed explicitly:
# pandas >= 2.0 no longer drops them silently and DataFrame.corr() raises on
# string data. 'bool' is included to mirror pandas' own notion of numeric data.
corr_val = (
    df_kohli.drop(['match-id'], axis=1)
    .select_dtypes(include=['number', 'bool'])
    .corr()
)

# One Series per column of the correlation matrix, used as the heatmap rows.
corr_list = [corr_val.iloc[:, i] for i in range(corr_val.shape[0])]

fig = go.Figure(data=go.Heatmap(
    z=corr_list,
    x=corr_val.columns,
    y=corr_val.columns,
))
fig.show()

In [196]:
# Boolean keep-mask over corr_val's columns: column j is dropped when its
# correlation with any earlier column i < j is at least 0.9. Only positive
# correlations are considered, exactly as in the original nested loop.
# np.triu(..., k=1) keeps just the pairs strictly above the diagonal (i < j);
# NaN correlations compare as False and therefore never drop a column.
highly_correlated = np.triu(corr_val.to_numpy() >= 0.9, k=1)
columns = ~highly_correlated.any(axis=0)

In [197]:
# Column names that survived the correlation filter, applied to the full
# scorecard (the filter itself was derived from df_kohli's correlations).
selected_columns = corr_val.columns[columns]
data = df_scorecard.loc[:, selected_columns]

In [194]:
# Preview the first rows ordered by innings instead of rendering the whole frame.
data.sort_values(by='innings').head(10)

Unnamed: 0,innings,batting-position,over-batsman,runs-scored,twos,threes,sixes,balls-bowled,maiden-overs,runs-given,wickets,extras,fall-of-wicket-score,fall-of-wicket-no
0,1,1,0.1,7,0,0,0,0,0,0,0,0,13,1
29165,1,0,0.0,0,0,0,0,60,0,38,2,4,0,0
15262,1,3,0.2,76,7,0,0,0,0,0,0,0,196,4
15261,1,6,36.5,6,0,1,0,0,0,0,0,0,208,5
15260,1,5,22.5,73,5,2,0,0,0,0,0,0,0,0
15259,1,4,7.6,43,4,0,0,0,0,0,0,0,113,3
15257,1,2,0.1,35,1,2,0,0,0,0,0,0,45,2
15256,1,1,0.1,0,0,0,0,0,0,0,0,0,0,1
15265,1,7,39.3,5,0,0,0,20,0,16,2,0,214,6
29171,1,3,1.2,10,1,0,0,6,0,5,0,0,33,3
