In [31]:
import pandas as pd 
import numpy as np

import plotly.express as px
import ipywidgets as widgets

In [2]:
# Read the match-level table and the ball-by-ball deliveries table from the local data folder.
matches_df, deliveries_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/matches.csv", "data/deliveries.csv")
)

In [3]:
# Note: Bengaluru/Bangalore and Mohali/Chandigarh show up as separate categories,
# and 7 rows have no city (they are filled with "Dubai" below).
matches_df.replace("Bengaluru", "Bangalore", inplace=True)
matches_df.replace("Mohali", "Chandigarh", inplace=True)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# matches_df["city"] selection: that form is chained in-place assignment, which
# pandas 2.x warns about and which leaves matches_df untouched under Copy-on-Write.
matches_df["city"] = matches_df["city"].fillna("Dubai").astype("category")

In [4]:
# The date column can be converted to datetime format.
# NOTE(review): the umpire-fix comments further down list the 2019 Eliminator as
# 2019-08-05, which suggests some day-first dates may be parsed month-first here.
# Confirm the CSV's date format (an explicit format= or dayfirst= may be needed).
matches_df["date"] = pd.to_datetime(matches_df["date"])

In [5]:
# Collapse the two spellings of the Pune franchise into one before converting dtypes.
matches_df.replace("Rising Pune Supergiants", "Rising Pune Supergiant", inplace=True)

# Store the team, toss, result, player-of-match and winner columns as categoricals.
for match_col in (
    "team1",
    "team2",
    "toss_winner",
    "toss_decision",
    "result",
    "player_of_match",
    "winner",
):
    matches_df[match_col] = matches_df[match_col].astype("category")

In [6]:
# (old spelling, canonical spelling) pairs for venues that appear under two names.
venue_renames = (
    ("Rajiv Gandhi Intl. Cricket Stadium", "Rajiv Gandhi International Stadium, Uppal"),
    ("Punjab Cricket Association Stadium, Mohali", "Punjab Cricket Association IS Bindra Stadium, Mohali"),
    ("M. A. Chidambaram Stadium", "MA Chidambaram Stadium, Chepauk"),
    ("Feroz Shah Kotla Ground", "Feroz Shah Kotla"),
)
for old_venue, new_venue in venue_renames:
    matches_df.replace(old_venue, new_venue, inplace=True)

# Convert once every spelling has been unified.
matches_df["venue"] = matches_df["venue"].astype("category")

In [7]:
# Manually set the on-field umpires for two matches, using the linked ESPNcricinfo scorecards.
#   index 4   (date shown as 2017-04-08): u1 = Virender Sharma, u2 = Sundaram Ravi
#     https://www.espncricinfo.com/series/8048/scorecard/1082595/royal-challengers-bangalore-vs-delhi-daredevils-5th-match-indian-premier-league-2017
#   index 753 (date shown as 2019-08-05): u1 = Bruce Oxenford, u2 = Sundaram Ravi
#     https://www.espncricinfo.com/series/8048/scorecard/1181766/delhi-capitals-vs-sunrisers-hyderabad-eliminator-indian-premier-league-2019
manual_umpires = {
    4: {"umpire1": "VK Sharma", "umpire2": "S Ravi"},
    753: {"umpire1": "Bruce Oxenford", "umpire2": "S Ravi"},
}
for row_label, umpire_names in manual_umpires.items():
    for umpire_col, umpire_name in umpire_names.items():
        matches_df.loc[row_label, umpire_col] = umpire_name

# Convert to categoricals only after the manual values have been written.
for umpire_col in ("umpire1", "umpire2"):
    matches_df[umpire_col] = matches_df[umpire_col].astype("category")

In [79]:
# umpire3 is of little interest here; it is only converted to a categorical dtype.
matches_df["umpire3"] = matches_df["umpire3"].astype(pd.CategoricalDtype())

In [9]:
# Deliveries DF memory utilization: unify the Pune franchise spelling, then store
# the team, player and dismissal text columns as categoricals.
deliveries_df.replace("Rising Pune Supergiants", "Rising Pune Supergiant", inplace=True)
for delivery_col in (
    "batting_team",
    "bowling_team",
    "batsman",
    "non_striker",
    "player_dismissed",
    "dismissal_kind",
    "fielder",
):
    deliveries_df[delivery_col] = deliveries_df[delivery_col].astype("category")

# Plotting batsman statistics

In [47]:
# Most Runs
def calc_most_runs(team):
    """Return the 15 batsmen with the highest total of ``batsman_runs``.

    Parameters
    ----------
    team : str
        A ``batting_team`` value to restrict the deliveries to, or ``"ALL"``
        to use every delivery in ``deliveries_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``batsman`` and ``batsman_runs`` (summed), sorted by runs in
        descending order and limited to 15 rows.
    """
    runs_df = deliveries_df[["batsman", "batsman_runs", "batting_team"]]
    if team != "ALL":
        runs_df = runs_df.loc[runs_df["batting_team"] == team]

    # observed=True: when `batsman` is categorical, only batsmen present in the
    # (possibly team-filtered) rows are returned, instead of padding the result
    # with zero-run rows for every other category.
    return (
        runs_df.groupby("batsman", observed=True)["batsman_runs"]
        .sum()
        .sort_values(ascending=False)
        .reset_index()
        .head(15)
    )


# Leading run scorers across all teams, one coloured bar per batsman.
most_runs_df = calc_most_runs("ALL")
px.bar(
    most_runs_df,
    x="batsman",
    y="batsman_runs",
    color="batsman",
    barmode="relative",
    labels={"batsman": "Batsman", "batsman_runs": "Runs"},
    title="Batsman with Most Runs Scored",
)


In [49]:
# Most Runs in an over 
def calc_most_runs_in_over(team):
    """Return the 15 highest ``batsman_runs`` totals by one batsman in one over.

    Deliveries are grouped by ``match_id``, ``over`` and ``batsman``.

    Parameters
    ----------
    team : str
        A ``batting_team`` value to restrict the deliveries to, or ``"ALL"``
        to use every delivery in ``deliveries_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``match_id``, ``over``, ``batsman`` and ``batsman_runs``
        (summed), sorted by runs in descending order and limited to 15 rows.
    """
    over_df = deliveries_df[["batsman", "over", "match_id", "batsman_runs", "batting_team"]]
    if team != "ALL":
        over_df = over_df.loc[over_df["batting_team"] == team]

    # observed=True: with a categorical `batsman` key the default grouping
    # materialises every match x over x batsman combination (mostly zero rows);
    # only the combinations that actually occur are needed here.
    return (
        over_df.groupby(["match_id", "over", "batsman"], observed=True)["batsman_runs"]
        .sum()
        .sort_values(ascending=False)
        .reset_index()
        .head(15)
    )
    
# Highest single-over totals across all teams; marker size follows the runs scored.
most_runs_in_over_df = calc_most_runs_in_over("ALL")
px.scatter(
    most_runs_in_over_df,
    x="batsman",
    y="batsman_runs",
    size="batsman_runs",
    color="batsman",
    labels={"batsman": "Batsman", "batsman_runs": "Runs"},
    title="Most runs in an over",
)

In [75]:
# most fours
def calc_most_fours(team):
    """Return the 15 batsmen with the most deliveries scored for exactly 4 runs.

    Parameters
    ----------
    team : str
        A ``batting_team`` value to restrict the deliveries to, or ``"ALL"``
        to use every delivery in ``deliveries_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``batsman_name`` and ``counts``, sorted by count in descending
        order and limited to 15 rows.
    """
    fours_df = deliveries_df[["batsman", "batsman_runs", "batting_team"]]
    fours_df = fours_df.loc[fours_df["batsman_runs"] == 4]
    if team != "ALL":
        fours_df = fours_df.loc[fours_df["batting_team"] == team]

    four_counts = fours_df["batsman"].value_counts()
    # value_counts on a categorical column also lists categories that never
    # occur (count 0); drop them so batsmen without a four are not reported.
    four_counts = four_counts[four_counts > 0]
    return four_counts.rename_axis("batsman_name").reset_index(name="counts").head(15)
        
# Batsmen with the most fours across all teams, drawn as horizontal bars.
most_fours_df = calc_most_fours("ALL")
px.bar(
    most_fours_df,
    x="counts",
    y="batsman_name",
    orientation="h",
    color="batsman_name",
    labels={"batsman_name": "Batsman", "counts": "Number of Fours"},
    title="Most Fours",
)

In [84]:
# Most fours in an innings
def calc_most_fours_in_innings(team):
    """Return the 15 highest counts of fours hit by one batsman in one match.

    Only deliveries with ``batsman_runs == 4`` are counted, grouped by
    ``match_id`` and ``batsman``.

    Parameters
    ----------
    team : str
        A ``batting_team`` value to restrict the deliveries to, or ``"ALL"``
        to use every delivery in ``deliveries_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``match_id``, ``batsman`` and ``batsman_runs``, where
        ``batsman_runs`` holds the number of fours (not runs), sorted in
        descending order and limited to 15 rows.
    """
    fours_df = deliveries_df[["match_id", "batsman", "batsman_runs", "batting_team"]]
    fours_df = fours_df.loc[fours_df["batsman_runs"] == 4]
    if team != "ALL":
        fours_df = fours_df.loc[fours_df["batting_team"] == team]

    # observed=True keeps only (match, batsman) pairs that actually occur when
    # `batsman` is categorical. The team branch previously ended in `.heat(15)`,
    # which raised AttributeError; both paths now share this `.head(15)`.
    return (
        fours_df.groupby(["match_id", "batsman"], observed=True)["batsman_runs"]
        .count()
        .sort_values(ascending=False)
        .reset_index()
        .head(15)
    )

# Highest per-match four counts across all teams; marker size follows the count.
most_fours_in_innings_df = calc_most_fours_in_innings("ALL")
px.scatter(
    most_fours_in_innings_df,
    x="batsman",
    y="batsman_runs",
    size="batsman_runs",
    color="batsman",
    labels={"batsman": "Batsman", "batsman_runs": "Number of Fours"},
    title="Most Fours in an Innings",
)

In [78]:
# Most Sixes
def calc_most_sixes(team):
    """Return the 15 batsmen with the most deliveries scored for exactly 6 runs.

    Parameters
    ----------
    team : str
        A ``batting_team`` value to restrict the deliveries to, or ``"ALL"``
        to use every delivery in ``deliveries_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``batsman_name`` and ``counts``, sorted by count in descending
        order and limited to 15 rows.
    """
    sixes_df = deliveries_df[["batsman", "batsman_runs", "batting_team"]]
    sixes_df = sixes_df.loc[sixes_df["batsman_runs"] == 6]
    if team != "ALL":
        sixes_df = sixes_df.loc[sixes_df["batting_team"] == team]

    six_counts = sixes_df["batsman"].value_counts()
    # value_counts on a categorical column also lists categories that never
    # occur (count 0); drop them so batsmen without a six are not reported.
    six_counts = six_counts[six_counts > 0]
    return six_counts.rename_axis("batsman_name").reset_index(name="counts").head(15)
    
# Batsmen with the most sixes across all teams, drawn as horizontal bars.
most_sixes_df = calc_most_sixes("ALL")
px.bar(
    data_frame=most_sixes_df,
    x="counts",
    y="batsman_name",
    orientation="h",
    color="batsman_name",
    title="Most Sixes",
    # The count axis was previously labelled "Number of Fours" on this sixes chart.
    labels={"batsman_name": "Batsman", "counts": "Number of Sixes"},
)

In [85]:
# Most Sixes in an innings
def calc_most_sixes_in_innings(team):
    """Return the 15 highest counts of sixes hit by one batsman in one match.

    Only deliveries with ``batsman_runs == 6`` are counted, grouped by
    ``match_id`` and ``batsman``.

    Parameters
    ----------
    team : str
        A ``batting_team`` value to restrict the deliveries to, or ``"ALL"``
        to use every delivery in ``deliveries_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``match_id``, ``batsman`` and ``batsman_runs``, where
        ``batsman_runs`` holds the number of sixes (not runs), sorted in
        descending order and limited to 15 rows.
    """
    sixes_df = deliveries_df[["match_id", "batsman", "batsman_runs", "batting_team"]]
    sixes_df = sixes_df.loc[sixes_df["batsman_runs"] == 6]
    if team != "ALL":
        sixes_df = sixes_df.loc[sixes_df["batting_team"] == team]

    # observed=True keeps only (match, batsman) pairs that actually occur when
    # `batsman` is categorical, rather than every match x batsman combination.
    return (
        sixes_df.groupby(["match_id", "batsman"], observed=True)["batsman_runs"]
        .count()
        .sort_values(ascending=False)
        .reset_index()
        .head(15)
    )

# Highest per-match six counts across all teams; marker size follows the count.
most_sixes_in_innings_df = calc_most_sixes_in_innings("ALL")
px.scatter(
    data_frame=most_sixes_in_innings_df,
    x="batsman",
    y="batsman_runs",
    size="batsman_runs",
    color="batsman",
    title="Most Sixes in an Innings",
    # The count axis was previously labelled "Number of Fours" on this sixes chart.
    labels={"batsman": "Batsman", "batsman_runs": "Number of Sixes"},
)