# Can we determine the best-performing foreign player in the 2020 Indian Premier League season?

In [61]:
#IMPORT LIBRARIES (NumPy, pandas, Matplotlib, math)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

######################################################################################################

#Basic Statistical Functions:

def standardDeviation(xList, xAvg):
    """Return the population standard deviation of xList around xAvg.

    NaN entries are ignored: they contribute neither to the sum of squared
    deviations nor to the number of observations the sum is divided by.

    Parameters:
        xList: sequence of numbers (may contain NaN).
        xAvg: the mean to measure deviations from.

    Returns:
        The square root of the mean squared deviation, or NaN when xList
        holds no non-NaN values.
    """
    squaredDiffs = [(x - xAvg) ** 2 for x in xList if not math.isnan(x)]
    if not squaredDiffs:
        # Nothing to measure; avoid dividing by zero.
        return float("nan")
    # Divide by the number of values actually summed, not len(xList),
    # so skipped NaN entries do not shrink the result.
    return np.sqrt((1.0 * sum(squaredDiffs)) / len(squaredDiffs))
    
def square(x):
    """Return x multiplied by itself."""
    squared = x * x
    return squared

def divideTwoLists(xList, yList):
    """Divide xList by yList element by element.

    Parameters:
        xList: sequence of numerators.
        yList: sequence of denominators, same length as xList.

    Returns:
        A new list whose i-th entry is xList[i] / yList[i].

    Raises:
        ValueError: if the two lists differ in length.
        ZeroDivisionError: if a plain-Python denominator is zero.
    """
    if len(xList) != len(yList):
        # Fail with a clear message instead of falling through to an
        # unbound result variable.
        raise ValueError(
            "divideTwoLists requires lists of equal length "
            "(got %d and %d)" % (len(xList), len(yList))
        )
    return [x / y for x, y in zip(xList, yList)]

def findMean(lst):
    """Return the arithmetic mean of the non-NaN values in lst.

    NaN entries are left out of both the sum and the count, so a missing
    value does not drag the mean towards zero.

    Parameters:
        lst: sequence of numbers (may contain NaN).

    Returns:
        The mean as a float, or NaN when lst holds no non-NaN values.
    """
    valid = [value for value in lst if not math.isnan(value)]
    if not valid:
        # Nothing to average; avoid dividing by zero.
        return float("nan")
    return (1.0 * sum(valid)) / len(valid)

def findZScore(lst, avg, sd):
    """Return the z-score of every entry in lst.

    Each z-score is (value - avg) / sd, i.e. the entry's distance from avg
    expressed in units of sd. The result has the same order as lst.
    """
    return [(value - avg) / sd for value in lst]
    
def pearsonNum(xList, yList):
    """Return the numerator of the Pearson correlation coefficient.

    Computes n * sum(x*y) - sum(x) * sum(y), where n is the common length
    of the two lists.

    Parameters:
        xList, yList: sequences of numbers of equal length.

    Returns:
        The numerator value, or the string "There's Been An Error!" when
        the lists differ in length.
    """
    if len(xList) != len(yList):
        return "There's Been An Error!"

    n = len(xList)
    sumXY = np.sum([x * y for x, y in zip(xList, yList)])
    # The second term is the PRODUCT of the two sums, not their separate
    # subtraction.
    return (n * sumXY) - (np.sum(xList) * np.sum(yList))
    
def pearsonDenom(xList, yList):
    """Return the denominator of the Pearson correlation coefficient.

    Computes sqrt((n * sum(x^2) - sum(x)^2) * (n * sum(y^2) - sum(y)^2)),
    where n is the common length of the two lists.

    Parameters:
        xList, yList: sequences of numbers of equal length.

    Returns:
        The denominator value, or the string "There's Been An Error!" when
        the lists differ in length.
    """
    if len(xList) != len(yList):
        return "There's Been An Error!"

    n = len(xList)
    sumX = np.sum(xList)
    sumY = np.sum(yList)

    # n * (sum of squares) - (square of sum), once per variable
    xTerm = n * sum(x * x for x in xList) - sumX * sumX
    yTerm = n * sum(y * y for y in yList) - sumY * sumY

    # The Pearson denominator is the square root of the product of the
    # two terms; without the root the ratio is not bounded by [-1, 1].
    return np.sqrt(xTerm * yTerm)
    
def pearsonCoeff(xList, yList):
    """Divide the result of pearsonNum by the result of pearsonDenom.

    Parameters:
        xList, yList: sequences of numbers of equal length.

    Returns:
        pearsonNum(xList, yList) / pearsonDenom(xList, yList).

    Raises:
        ValueError: if the two lists differ in length.
    """
    if len(xList) != len(yList):
        # Both helpers return an error string on a length mismatch; check
        # here so the caller gets a clear message instead of a TypeError
        # from dividing two strings.
        raise ValueError("pearsonCoeff requires lists of equal length")
    return pearsonNum(xList, yList) / pearsonDenom(xList, yList)
    

# Adding Table of Foreign Players 

The input file is a CSV file containing statistics on every foreign player involved in the 2020 IPL season. Due to the COVID-19 global pandemic, and other extraneous factors, many players pulled out, so I assigned a new table to contain the statistics of players that played in more than 5 games.

In [62]:
# Raw GitHub link to the statistics CSV. No access token is embedded in the
# URL: tokens are credentials, and they expire.
# NOTE(review): assumes the repository is publicly readable - confirm.
csvURL = 'https://raw.githubusercontent.com/gprasad125/cricketProject/main/fullStatistics.csv'
foreignPlayers = pd.read_csv(csvURL)

# Keep only players with more than 5 matches. Take an explicit copy so that
# columns can be added to activePlayers without touching foreignPlayers.
activePlayers = foreignPlayers[foreignPlayers["Matches Played"] > 5].copy()
activePlayers

HTTPError: HTTP Error 404: Not Found

Here are the statistics measured for each player, and what type of data type they are measured in.

Some important cricket-specific notes:
    - A player's Batting Average is calculated as (total # of runs scored / total # of innings in which the player got out)
    - A player's Strike Rate is calculated as (total # of runs scored / total # of balls faced) × 100, i.e. the runs scored per 100 balls faced
    - An "over" is a set of six balls, delivered by one bowler at a time.
    - A player's Bowling Average is calculated as (total # of runs conceded / total # of wickets taken)
    - A player's Economy Rate is calculated as (total # of runs conceded / total # of overs delivered)
    - A player's Bowling SR is calculated as (total # of balls delivered / total # of wickets taken)
    
Some of these statistics, such as balls delivered, or innings in which a player got out, were not included in the dataset, because they are already used in the calculation of other statistics.

Below we can see the different statistics measured, and the types of data values they carry. 

In [None]:
# Show the data type pandas assigned to each column of activePlayers
activePlayers.dtypes

# Defining Batsmen, Bowlers, and All-Rounders

Cricketers perform one of three distinct roles in the game. They can be batsmen (those who score runs for the team), bowlers (those who take wickets and try and limit the other team's score), or all-rounders (those who can both bat and bowl serviceably). 

Below, we use the table's data to divide the players into the categories they perform:
- Batsmen are defined as "Players who spent over 70% of their total matches batting, and less than 30% of their total matches bowling." We will also drop the bowling statistics for these players, as we are only interested in their batting statistics.


- Bowlers are defined as "Players who spent over 70% of their total matches bowling, and less than 50% of their total matches batting." We are taking this 50% cutoff because of the greater likelihood that any bowler will bat, as compared to any batter bowling. We will also drop the batting statistics for these players, as we are only interested in their bowling statistics. 


- All-Rounders are defined as "Players who spent at least 50% of their total matches bowling, and at least 50% of their total matches batting." We will not drop any stats, as we are interested in both types of statistics for these types of players.

Something that will help us define the above categories will be finding the percentage of matches played a particular player spent batting or bowling.

In [None]:
matchesPlayed = activePlayers["Matches Played"].tolist()
battingInnings = activePlayers["Batting Innings"].tolist()
bowlingInnings = activePlayers["Bowling Innings"].tolist()

#Share of a player's matches in which he batted / bowled
percentageBatting = divideTwoLists(battingInnings, matchesPlayed)
percentageBowling = divideTwoLists(bowlingInnings, matchesPlayed)

#DataFrame.insert raises a ValueError when the column already exists, so only
#insert columns that are missing; this keeps the cell safe to re-run.
for position, label, shares in (
    (5, "Percent of Matches Batting", percentageBatting),
    (10, "Percent of Matches Bowling", percentageBowling),
):
    if label not in activePlayers.columns:
        activePlayers.insert(position, label, shares)

activePlayers

Defining Batsmen

In [68]:
#Batsmen: batted in more than 70% of their matches and bowled in fewer than 30%
mostlyBats = activePlayers["Percent of Matches Batting"] > 0.7
rarelyBowls = activePlayers["Percent of Matches Bowling"] < 0.3

#Bowling columns are not needed for pure batsmen
bowlingColumns = [
    "Bowling Innings",
    "Percent of Matches Bowling",
    "Overs Delivered",
    "Wickets",
    "Bowling Average",
    "Economy Rate",
    "Bowling SR",
]

batsmen = activePlayers.loc[mostlyBats & rarelyBowls].drop(bowlingColumns, axis = 1)
batsmen

Unnamed: 0,Player Name,Nationality,Team,Matches Played,Batting Innings,Percent of Matches Batting,Runs Scored,Batting Average,Strike Rate
0,AB de Villiers,South Africa,Royal Challengers Bangalore,15,14,0.933333,454,45.4,158.74
1,Aaron Finch,Australia,Royal Challengers Bangalore,15,12,0.8,268,22.33,111.2
9,Eoin Morgan,England,Kolkata Knight Riders,14,14,1.0,418,41.8,138.41
18,Steve Smith,Australia,Rajasthan Royals,14,14,1.0,311,25.91,131.22
21,Jos Buttler,England,Rajasthan Royals,13,12,0.923077,328,32.8,144.49
26,Chris Gayle,West Indies,Kings XI Punjab,7,7,1.0,288,41.14,137.14
29,Nicholas Pooran,West Indies,Kings XI Punjab,14,14,1.0,353,35.3,169.71
33,Shane Watson,Australia,Chennai Super Kings,11,11,1.0,299,29.9,121.05
34,Faf du Plessis,South Africa,Chennai Super Kings,13,13,1.0,449,40.81,140.75
44,Quinton de Kock,South Africa,Mumbai Indians,16,16,1.0,503,35.92,140.5


Defining Bowlers

In [None]:
#Bowlers: bowled in more than 70% of their matches and batted in fewer than 50%
mostlyBowls = activePlayers["Percent of Matches Bowling"] > 0.7
seldomBats = activePlayers["Percent of Matches Batting"] < 0.5

#Batting columns are not needed for pure bowlers
battingColumns = [
    "Batting Innings",
    "Percent of Matches Batting",
    "Runs Scored",
    "Batting Average",
    "Strike Rate",
]

bowlers = activePlayers.loc[mostlyBowls & seldomBats].drop(battingColumns, axis = 1)
bowlers

Defining All-Rounders

In [None]:
#All-rounders: batted AND bowled in at least 50% of their matches; all columns are kept
batsOften = activePlayers["Percent of Matches Batting"] >= 0.5
bowlsOften = activePlayers["Percent of Matches Bowling"] >= 0.5

allRounders = activePlayers.loc[batsOften & bowlsOften]
allRounders

Checking that all players are accounted for

In [None]:
#Getting the individual names for each category: Batsmen, Bowlers, All-Rounders
batNames = batsmen["Player Name"].tolist()
bowlNames = bowlers["Player Name"].tolist()
allRNames = allRounders["Player Name"].tolist()

#Getting all the names from the full table
names = activePlayers["Player Name"].tolist()

#Sets give fast membership tests and make the overlap check below possible
batSet = set(batNames)
bowlSet = set(bowlNames)
allRSet = set(allRNames)

#Players from the full table that did not land in any category
categorized = batSet | bowlSet | allRSet
unassigned = [name for name in names if name not in categorized]
count = len(unassigned) #the number of active players without a category

#Players that landed in more than one category
overlapping = (batSet & bowlSet) | (batSet & allRSet) | (bowlSet & allRSet)

#If all names from the full table are accounted and no player appears in two tables, this should yield True
count == 0 and not overlapping

# Batsmen Calculations

Let's begin with the calculations for the batsmen. There are three categories we measure:
    - Runs Scored, a measure of impact: Did this player contribute a large amount to the team's overall season?
    - Batting Average, a measure of consistency: Did this player perform well in most matches?
    - Strike Rate, a measure of efficiency: How quickly did this player score his runs?
    
A good place to start would be to calculate the means and medians of each value.

In [63]:
#Mean of each batting statistic, in the order: runs, batting average, strike rate
meanRuns, meanBatAvg, meanSR = [
    findMean(batsmen[column].tolist())
    for column in ("Runs Scored", "Batting Average", "Strike Rate")
]

means = [meanRuns, meanBatAvg, meanSR]
means

[361.85714285714283, 34.87214285714286, 138.31642857142856]

In [64]:
#Median of each batting statistic, in the order: runs, batting average, strike rate.
#np.nanmedian is used for all three so that missing values are ignored consistently.
medianRuns = np.nanmedian(batsmen["Runs Scored"].tolist())
medianBatAvg = np.nanmedian(batsmen["Batting Average"].tolist())
medianSR = np.nanmedian(batsmen["Strike Rate"].tolist())

medians = [medianRuns, medianBatAvg, medianSR]
medians

[336.5, 35.61, 137.77499999999998]

We can use the mean to calculate the Standard Deviation and find which players performed well-above average in all three recorded statistics.

In [65]:
#Standard deviation of each batting statistic around the matching entry of `means`
batStatColumns = ["Runs Scored", "Batting Average", "Strike Rate"]
stdDevs = [
    standardDeviation(batsmen[column].tolist(), means[position])
    for position, column in enumerate(batStatColumns)
]
sdRuns, sdBatAvg, sdSR = stdDevs

stdDevs

[96.14264487346553, 7.696438117329182, 14.133905843317168]

Now that we have the Standard Deviations, we can add a column to the table that tells roughly how many SDs above the mean someone's particular score is using a z-Score measure. 

In [71]:
zScoreRuns = findZScore(batsmen["Runs Scored"].tolist(), means[0], stdDevs[0])
zScoreBA = findZScore(batsmen["Batting Average"].tolist(), means[1], stdDevs[1])
zScoreSR = findZScore(batsmen["Strike Rate"].tolist(), means[2], stdDevs[2])

#DataFrame.insert raises a ValueError when the column already exists, so only
#insert columns that are missing; this keeps the cell safe to re-run.
#Columns that are already present are left untouched.
for position, label, scores in (
    (9, "Runs Scored Z Scores", zScoreRuns),
    (10, "Batting Avg. Z Scores", zScoreBA),
    (11, "Strike Rates Z Scores", zScoreSR),
):
    if label not in batsmen.columns:
        batsmen.insert(position, label, scores)

batsmen

Unnamed: 0,Player Name,Nationality,Team,Matches Played,Batting Innings,Percent of Matches Batting,Runs Scored,Batting Average,Strike Rate,Runs Scored Z Scores,Batting Avg. Z Scores,Strike Rates Z Scores
0,AB de Villiers,South Africa,Royal Challengers Bangalore,15,14,0.933333,454,45.4,158.74,0.958397,1.367887,1.445005
1,Aaron Finch,Australia,Royal Challengers Bangalore,15,12,0.8,268,22.33,111.2,-0.976228,-1.629604,-1.918538
9,Eoin Morgan,England,Kolkata Knight Riders,14,14,1.0,418,41.8,138.41,0.583954,0.900138,0.00662
18,Steve Smith,Australia,Rajasthan Royals,14,14,1.0,311,25.91,131.22,-0.528976,-1.164453,-0.502085
21,Jos Buttler,England,Rajasthan Royals,13,12,0.923077,328,32.8,144.49,-0.352155,-0.269234,0.436792
26,Chris Gayle,West Indies,Kings XI Punjab,7,7,1.0,288,41.14,137.14,-0.768204,0.814384,-0.083234
29,Nicholas Pooran,West Indies,Kings XI Punjab,14,14,1.0,353,35.3,169.71,-0.092125,0.055592,2.221153
33,Shane Watson,Australia,Chennai Super Kings,11,11,1.0,299,29.9,121.05,-0.65379,-0.646032,-1.221632
34,Faf du Plessis,South Africa,Chennai Super Kings,13,13,1.0,449,40.81,140.75,0.906391,0.771507,0.17218
44,Quinton de Kock,South Africa,Mumbai Indians,16,16,1.0,503,35.92,140.5,1.468057,0.136148,0.154492


Now that we know the zScores for each of our three categories, we can eliminate batsmen who have negative zScores in any of the categories, since they performed below average in the 2020 season.

In [73]:
#Keep only the batsmen whose z-score is positive in all three categories
zScoreColumns = ["Runs Scored Z Scores", "Batting Avg. Z Scores", "Strike Rates Z Scores"]
aboveAverageEverywhere = (batsmen[zScoreColumns] > 0).all(axis = 1)

batsmen = batsmen.loc[aboveAverageEverywhere]
batsmen

Unnamed: 0,Player Name,Nationality,Team,Matches Played,Batting Innings,Percent of Matches Batting,Runs Scored,Batting Average,Strike Rate,Runs Scored Z Scores,Batting Avg. Z Scores,Strike Rates Z Scores
0,AB de Villiers,South Africa,Royal Challengers Bangalore,15,14,0.933333,454,45.4,158.74,0.958397,1.367887,1.445005
9,Eoin Morgan,England,Kolkata Knight Riders,14,14,1.0,418,41.8,138.41,0.583954,0.900138,0.00662
34,Faf du Plessis,South Africa,Chennai Super Kings,13,13,1.0,449,40.81,140.75,0.906391,0.771507,0.17218
44,Quinton de Kock,South Africa,Mumbai Indians,16,16,1.0,503,35.92,140.5,1.468057,0.136148,0.154492
