In [1]:
import pandas as pd 
import numpy as np

## Importing the Matches and Deliveries Data
- Analyze each CSV file separately
- Understand each feature and fill in missing values with appropriate information
- Optimize memory usage by casting each feature to a suitable dtype

In [3]:
# Load the two raw CSV files into DataFrames. The paths are relative, so the
# notebook must run from a directory that contains the "data/" folder.
# No dtype or date-parsing options are passed, so pandas infers every column type.
matches_df = pd.read_csv("data/matches.csv")
deliveries_df = pd.read_csv("data/deliveries.csv")

In [4]:
# Print the column dtypes and non-null counts of matches_df to spot missing values.
matches_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               756 non-null    int64 
 1   season           756 non-null    int64 
 2   city             749 non-null    object
 3   date             756 non-null    object
 4   team1            756 non-null    object
 5   team2            756 non-null    object
 6   toss_winner      756 non-null    object
 7   toss_decision    756 non-null    object
 8   result           756 non-null    object
 9   dl_applied       756 non-null    int64 
 10  winner           752 non-null    object
 11  win_by_runs      756 non-null    int64 
 12  win_by_wickets   756 non-null    int64 
 13  player_of_match  752 non-null    object
 14  venue            756 non-null    object
 15  umpire1          754 non-null    object
 16  umpire2          754 non-null    object
 17  umpire3          119 non-null    ob

In [5]:
# Print the column dtypes and non-null counts of deliveries_df to spot missing values.
deliveries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179078 entries, 0 to 179077
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          179078 non-null  int64 
 1   inning            179078 non-null  int64 
 2   batting_team      179078 non-null  object
 3   bowling_team      179078 non-null  object
 4   over              179078 non-null  int64 
 5   ball              179078 non-null  int64 
 6   batsman           179078 non-null  object
 7   non_striker       179078 non-null  object
 8   bowler            179078 non-null  object
 9   is_super_over     179078 non-null  int64 
 10  wide_runs         179078 non-null  int64 
 11  bye_runs          179078 non-null  int64 
 12  legbye_runs       179078 non-null  int64 
 13  noball_runs       179078 non-null  int64 
 14  penalty_runs      179078 non-null  int64 
 15  batsman_runs      179078 non-null  int64 
 16  extra_runs        179078 non-null  int

In [6]:
# Frequency of each dismissal type. value_counts() excludes NaN by default, so
# rows with a missing dismissal_kind (presumably deliveries without a wicket --
# TODO confirm) are not counted here.
deliveries_df["dismissal_kind"].value_counts()

caught                   5348
bowled                   1581
run out                   852
lbw                       540
stumped                   278
caught and bowled         211
retired hurt               12
hit wicket                 10
obstructing the field       2
Name: dismissal_kind, dtype: int64

### Observations so far!!
- At first glance, the deliveries data looks complete; however, I still need to understand the logic of the `fielder` feature.
- The matches data is incomplete; the features below have missing values:
    - city (7)
    - winner (4 matches with no result)
    - player_of_match (4)
    - umpire1 (2)
    - umpire2 (2)
    - umpire3 (637)
- I believe most of the object columns can be converted to categories to reduce memory usage.

### Matches Dataframe Analysis
- Understand each feature
- Optimize memory usage
- Fill in missing values

In [7]:
# Number of matches per city (missing city values are excluded by value_counts()).
# Note: "Bangalore" and "Bengaluru" are counted as separate categories; they look
# like two spellings of the same city and should be unified before the column
# is cast to category.
matches_df["city"].value_counts()

Mumbai            101
Kolkata            77
Delhi              74
Bangalore          66
Hyderabad          64
Chennai            57
Jaipur             47
Chandigarh         46
Pune               38
Durban             15
Bengaluru          14
Visakhapatnam      13
Centurion          12
Ahmedabad          12
Rajkot             10
Mohali             10
Indore              9
Dharamsala          9
Johannesburg        8
Cape Town           7
Port Elizabeth      7
Abu Dhabi           7
Cuttack             7
Ranchi              7
Raipur              6
Sharjah             6
Kochi               5
Kanpur              4
Nagpur              3
East London         3
Kimberley           3
Bloemfontein        2
Name: city, dtype: int64

In [13]:
# Convert the "date" column from strings (dtype object) to datetime values.
# NOTE(review): no explicit format is passed, so pandas infers the layout of the
# date strings; confirm the CSV uses a single, unambiguous date format.
matches_df["date"] = pd.to_datetime(matches_df["date"])

In [14]:
# Number of matches per team in the team1 column.
# Note: one team appears under two spellings, "Rising Pune Supergiant" and
# "Rising Pune Supergiants"; unify them before casting the column to category.
matches_df["team1"].value_counts()

Mumbai Indians                 101
Kings XI Punjab                 91
Chennai Super Kings             89
Royal Challengers Bangalore     85
Kolkata Knight Riders           83
Delhi Daredevils                72
Rajasthan Royals                67
Sunrisers Hyderabad             63
Deccan Chargers                 43
Pune Warriors                   20
Gujarat Lions                   14
Rising Pune Supergiant           8
Rising Pune Supergiants          7
Kochi Tuskers Kerala             7
Delhi Capitals                   6
Name: team1, dtype: int64

In [15]:
# Number of matches per team in the team2 column.
# Note: one team appears under two spellings, "Rising Pune Supergiant" and
# "Rising Pune Supergiants"; unify them before casting the column to category.
matches_df["team2"].value_counts()

Kolkata Knight Riders          95
Royal Challengers Bangalore    95
Delhi Daredevils               89
Mumbai Indians                 86
Kings XI Punjab                85
Rajasthan Royals               80
Chennai Super Kings            75
Sunrisers Hyderabad            45
Deccan Chargers                32
Pune Warriors                  26
Gujarat Lions                  16
Delhi Capitals                 10
Rising Pune Supergiant          8
Rising Pune Supergiants         7
Kochi Tuskers Kerala            7
Name: team2, dtype: int64

In [19]:
# Number of tosses won per team.
# Note: one team appears under two spellings, "Rising Pune Supergiant" and
# "Rising Pune Supergiants"; unify them before casting the column to category.
matches_df["toss_winner"].value_counts()

Mumbai Indians                 98
Kolkata Knight Riders          92
Chennai Super Kings            89
Kings XI Punjab                81
Royal Challengers Bangalore    81
Rajasthan Royals               80
Delhi Daredevils               80
Sunrisers Hyderabad            46
Deccan Chargers                43
Pune Warriors                  20
Gujarat Lions                  15
Delhi Capitals                 10
Kochi Tuskers Kerala            8
Rising Pune Supergiants         7
Rising Pune Supergiant          6
Name: toss_winner, dtype: int64

In [20]:
# Distribution of toss decisions. Only two distinct values ("field", "bat")
# occur, so this column should be converted to category.
matches_df["toss_decision"].value_counts()

field    463
bat      293
Name: toss_decision, dtype: int64

In [21]:
# Distribution of match results. Only three distinct values ("normal", "tie",
# "no result") occur, so this column should be converted to category.
matches_df["result"].value_counts()

normal       743
tie            9
no result      4
Name: result, dtype: int64

In [30]:
# Number of player-of-the-match awards per player.
# Only 752 of the 756 rows have a value; candidate for conversion to category.
matches_df["player_of_match"].value_counts()

CH Gayle             21
AB de Villiers       20
RG Sharma            17
MS Dhoni             17
DA Warner            17
                     ..
M Ur Rahman           1
Washington Sundar     1
S Sohal               1
CRD Fernando          1
J Botha               1
Name: player_of_match, Length: 226, dtype: int64

In [32]:
# Number of wins per team. Only 752 of the 756 rows have a winner.
# Note: one team appears under two spellings, "Rising Pune Supergiant" and
# "Rising Pune Supergiants"; unify them before casting the column to category.
matches_df["winner"].value_counts()

Mumbai Indians                 109
Chennai Super Kings            100
Kolkata Knight Riders           92
Royal Challengers Bangalore     84
Kings XI Punjab                 82
Rajasthan Royals                75
Delhi Daredevils                67
Sunrisers Hyderabad             58
Deccan Chargers                 29
Gujarat Lions                   13
Pune Warriors                   12
Rising Pune Supergiant          10
Delhi Capitals                  10
Kochi Tuskers Kerala             6
Rising Pune Supergiants          5
Name: winner, dtype: int64

In [35]:
# Number of matches per venue.
# Note: several venues appear to be listed under more than one spelling (e.g.
# "Punjab Cricket Association Stadium, Mohali" and "Punjab Cricket Association
# IS Bindra Stadium, Mohali"); clean these up before casting to category.
matches_df["venue"].value_counts()

Eden Gardens                                            77
M Chinnaswamy Stadium                                   73
Wankhede Stadium                                        73
Feroz Shah Kotla                                        67
Rajiv Gandhi International Stadium, Uppal               56
MA Chidambaram Stadium, Chepauk                         49
Sawai Mansingh Stadium                                  47
Punjab Cricket Association Stadium, Mohali              35
Maharashtra Cricket Association Stadium                 21
Dr DY Patil Sports Academy                              17
Subrata Roy Sahara Stadium                              17
Kingsmead                                               15
Punjab Cricket Association IS Bindra Stadium, Mohali    14
Sardar Patel Stadium, Motera                            12
SuperSport Park                                         12
Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium     11
Brabourne Stadium                                       

In [36]:
# Number of matches per umpire in the umpire1 column.
# Only 754 of the 756 rows have a value; candidate for conversion to category.
# NOTE(review): "S Ravi" and "Sundaram Ravi" both occur and may be the same
# person -- confirm before merging.
matches_df["umpire1"].value_counts()

HDPK Dharmasena    73
Asad Rauf          51
S Ravi             49
AK Chaudhary       43
Aleem Dar          38
                   ..
Nanda Kishore       1
SJA Taufel          1
Sundaram Ravi       1
SL Shastri          1
Ulhas Gandhe        1
Name: umpire1, Length: 61, dtype: int64

In [37]:
# Number of matches per umpire in the umpire2 column.
# Only 754 of the 756 rows have a value; candidate for conversion to category.
matches_df["umpire2"].value_counts()

S Ravi                   57
C Shamshuddin            57
SJA Taufel               54
RJ Tucker                38
CK Nandan                36
                         ..
JD Cloete                 1
Ian Gould                 1
KN Ananthapadmanabhan     1
SD Fry                    1
SJ Davis                  1
Name: umpire2, Length: 65, dtype: int64

In [38]:
# Number of matches per umpire in the umpire3 column.
# Low priority: only 119 of the 756 rows have a value.
# NOTE(review): several names look like spelling variants of one person (e.g.
# "Nanda Kishore" / "A Nanda Kishore", "KN Anantapadmanabhan" /
# "KN Ananthapadmanabhan") -- confirm if this column is ever used.
matches_df["umpire3"].value_counts()

Nitin Menon                10
O Nandan                   10
C Shamshuddin              10
Anil Chaudhary              9
Vineet Kulkarni             8
S Ravi                      8
Anil Dandekar               7
Bruce Oxenford              7
Yeshwant Barde              7
Chris Gaffaney              6
Marais Erasmus              5
Rod Tucker                  5
Nigel Llong                 4
A Nanda Kishore             3
Nanda Kishore               3
Ian Gould                   3
A.D Deshmukh                2
Kumar Dharmasena            2
Ulhas Gandhe                2
K Ananthapadmanabhan        2
Virender Kumar Sharma       2
KN Anantapadmanabhan        1
KN Ananthapadmanabhan       1
Chettithody Shamshuddin     1
Sundaram Ravi               1
Name: umpire3, dtype: int64