In [1]:
# Python Project Template
# 1. Prepare Problem
# a) Load libraries
# b) Load dataset
# 2. Summarize Data
# a) Descriptive statistics
# b) Data visualizations
# 3. Prepare Data
# a) Data Cleaning
# b) Feature Selection
# c) Data Transforms
# 4. Evaluate Algorithms
# a) Split-out validation dataset
# b) Test options and evaluation metric
# c) Spot Check Algorithms
# d) Compare Algorithms
# 5. Improve Accuracy
# a) Algorithm Tuning
# b) Ensembles
# 6. Finalize Model
# a) Predictions on validation dataset
# b) Create standalone model on entire training dataset
# c) Save model for later use

In [2]:
# Load Libraries
# data Manipulation

import pandas as pd 
import numpy as np

# Data Visualization

import seaborn as sns
import matplotlib.pyplot as plt
# import sklearn

In [3]:
# Read the IPL 2022 auction CSV into the working DataFrame
ipl = pd.read_csv(filepath_or_buffer='IPL_dataset/ipl_dataset_2022.csv')

In [4]:
# Peek at five random rows; the fixed seed makes the displayed rows reproducible
# across kernel restarts.
ipl.sample(5, random_state=42)

Unnamed: 0.1,Unnamed: 0,Player,Base Price,TYPE,COST IN ₹ (CR.),Cost IN $ (000),2021 Squad,Team
71,71,Shreyas Iyer,2 Cr,BATTER,12.25,1592.5,DC,Kolkata Knight Riders
551,551,Sagar Trivedi,20 Lakh,ALL-ROUNDER,,,,Unsold
203,203,Karun Nair,50 Lakh,BATTER,1.4,182.0,KKR,Rajasthan Royals
604,604,Ryan John,20 Lakh,ALL-ROUNDER,,,,Unsold
524,524,Gurnoor Singh Brar,20 Lakh,BOWLER,,,,Unsold


In [5]:
# (number of rows, number of columns) of the frame as loaded
ipl.shape

(633, 8)

In [6]:
# Column dtypes and non-null counts, used to spot columns with missing values
ipl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 633 entries, 0 to 632
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       633 non-null    int64  
 1   Player           633 non-null    object 
 2   Base Price       633 non-null    object 
 3   TYPE             633 non-null    object 
 4   COST IN ₹ (CR.)  237 non-null    float64
 5   Cost IN $ (000)  237 non-null    float64
 6   2021 Squad       199 non-null    object 
 7   Team             633 non-null    object 
dtypes: float64(2), int64(1), object(5)
memory usage: 39.7+ KB


In [7]:
# List the column labels
ipl.columns

Index(['Unnamed: 0', 'Player', 'Base Price', 'TYPE', 'COST IN ₹ (CR.)',
       'Cost IN $ (000)', '2021 Squad', 'Team'],
      dtype='object')

In [8]:
# Remove the 'Unnamed: 0' column. Rebinding the result (rather than
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution is a no-op instead of raising KeyError.
ipl = ipl.drop(columns='Unnamed: 0', errors='ignore')

In [9]:
# Show the first 110 rows (the notebook display truncates the middle of long frames)
ipl.head(110)

Unnamed: 0,Player,Base Price,TYPE,COST IN ₹ (CR.),Cost IN $ (000),2021 Squad,Team
0,Rashid Khan,Draft Pick,BOWLER,15.00,1950.0,SRH,Gujarat Titans
1,Hardik Pandya,Draft Pick,ALL-ROUNDER,15.00,1950.0,MI,Gujarat Titans
2,Lockie Ferguson,2 Cr,BOWLER,10.00,1300.0,KKR,Gujarat Titans
3,Rahul Tewatia,40 Lakh,ALL-ROUNDER,9.00,1170.0,RR,Gujarat Titans
4,Shubman Gill,Draft Pick,BATTER,8.00,1040.0,KKR,Gujarat Titans
...,...,...,...,...,...,...,...
105,Harpreet Brar,20 Lakh,ALL-ROUNDER,3.80,494.0,PBKS,Punjab Kings
106,Vaibhav Arora,20 Lakh,BOWLER,2.00,260.0,KKR,Punjab Kings
107,Raj Angad Bawa,20 Lakh,ALL-ROUNDER,2.00,260.0,,Punjab Kings
108,Nathan Ellis,75 Lakh,BOWLER,0.75,97.5,PBKS,Punjab Kings


In [10]:
# Missing values per column
ipl.isna().sum()

Player               0
Base Price           0
TYPE                 0
COST IN ₹ (CR.)    396
Cost IN $ (000)    396
2021 Squad         434
Team                 0
dtype: int64

In [11]:
# Replace missing prices with 0 in both currency columns
ipl = ipl.fillna({'COST IN ₹ (CR.)': 0, 'Cost IN $ (000)': 0})

In [12]:
# Rows that have no 2021 squad recorded
ipl.loc[ipl['2021 Squad'].isna()]

Unnamed: 0,Player,Base Price,TYPE,COST IN ₹ (CR.),Cost IN $ (000),2021 Squad,Team
6,Yash Dayal,20 Lakh,BOWLER,3.2,416.0,,Gujarat Titans
9,Abhinav Sadarangani,20 Lakh,BATTER,2.6,338.0,,Gujarat Titans
10,Matthew Wade,2 Cr,WICKETKEEPER,2.4,312.0,,Gujarat Titans
11,Alzarri Joseph,75 Lakh,BOWLER,2.4,312.0,,Gujarat Titans
17,Varun Aaron,50 Lakh,BOWLER,0.5,65.0,,Gujarat Titans
...,...,...,...,...,...,...,...
628,Sairaj Patil,20 Lakh,BATTER,0.0,0.0,,Unsold
629,Monu Singh,20 Lakh,BOWLER,0.0,0.0,,Unsold
630,Nivethan Radhakrishnan,20 Lakh,BOWLER,0.0,0.0,,Unsold
631,Lance Morris,20 Lakh,BOWLER,0.0,0.0,,Unsold


In [13]:
# Label a missing 2021 squad explicitly instead of leaving NaN
ipl = ipl.fillna({'2021 Squad': 'Not Participated'})

In [14]:
# Re-check: every column should now report zero missing values
ipl.isna().sum()

Player             0
Base Price         0
TYPE               0
COST IN ₹ (CR.)    0
Cost IN $ (000)    0
2021 Squad         0
Team               0
dtype: int64

In [15]:
# Distinct Team values among rows with a positive rupee cost
teams = ipl.loc[ipl['COST IN ₹ (CR.)'] > 0, 'Team'].unique()
print(teams)

['Gujarat Titans' 'Chennai Super Kings' 'Delhi Capitals'
 'Kolkata Knight Riders' 'Punjab Kings' 'Lucknow Super Giants'
 'Mumbai Indians' 'Royal Challengers Bangalore' 'Rajasthan Royals'
 'Sunrisers Hyderabad']


In [16]:
# Status: 'sold' wherever Team is one of `teams`; any other Team value is kept as-is
ipl['Status'] = ipl['Team'].mask(ipl['Team'].isin(teams), 'sold')

In [17]:
# Spot-check one random row to confirm the new Status column; seeded so the
# displayed row is reproducible.
ipl.sample(random_state=42)

Unnamed: 0,Player,Base Price,TYPE,COST IN ₹ (CR.),Cost IN $ (000),2021 Squad,Team,Status
155,Jaydev Unadkat,75 Lakh,BOWLER,1.3,169.0,RR,Mumbai Indians,sold


In [18]:
# Drop fully identical rows, keeping the first occurrence and the original index labels
ipl.drop_duplicates(inplace=True)

In [19]:
# Every row whose Player name appears more than once (all occurrences shown)
ipl.loc[ipl['Player'].duplicated(keep=False)]

Unnamed: 0,Player,Base Price,TYPE,COST IN ₹ (CR.),Cost IN $ (000),2021 Squad,Team,Status
62,Lalit Yadav,20 Lakh,ALL-ROUNDER,0.65,84.5,DC,Delhi Capitals,sold
240,Amit Mishra,1.5 Cr,BOWLER,0.0,0.0,DC,Unsold,Unsold
499,Amit Mishra,20 Lakh,BOWLER,0.0,0.0,Not Participated,Unsold,Unsold
530,Lalit Yadav,20 Lakh,BOWLER,0.0,0.0,Not Participated,Unsold,Unsold
537,Shubham Singh,20 Lakh,ALL-ROUNDER,0.0,0.0,Not Participated,Unsold,Unsold
544,Shubham Singh,20 Lakh,BOWLER,0.0,0.0,Not Participated,Unsold,Unsold


In [20]:
# How many players participated in the 2022 IPL auction?
len(ipl)  # number of rows in the frame

632

In [21]:
# How many players of each TYPE participated?
# `types` holds the per-TYPE counts; reset_index() shows them as a table.
types=ipl['TYPE'].value_counts()
types.reset_index()

Unnamed: 0,TYPE,count
0,ALL-ROUNDER,241
1,BOWLER,215
2,BATTER,112
3,WICKETKEEPER,64


In [None]:
# Automated profiling report for the auction data.
# `title` is the heading shown inside the report, not an output path, so it
# gets a readable name; call `.to_file('IPL_Report.html')` on the report
# object if an HTML file on disk is wanted.
from ydata_profiling import ProfileReport
ProfileReport(ipl,title='IPL Report')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Per-TYPE player counts; `types` feeds the pie chart in the next cell
types=ipl['TYPE'].value_counts()
types.reset_index()

In [None]:
# Pie chart of the share of each player TYPE, with percentage labels
plt.pie(types.values, labels=types.index, labeldistance=1.2, autopct='%1.2f%%',shadow=True, startangle=60 )
plt.title('Role of Players Participated',fontsize=15)
# plt.show() renders the figure; the previous bare plt.plot() drew nothing and
# only left an empty-list repr as the cell output.
plt.show()

In [None]:
# Number of rows per Status value, displayed as a table
status=ipl['Status'].value_counts()
status.reset_index()

In [None]:
# Players sold vs. unsold as a bar chart, with the count written above each bar
plt.figure(figsize=(10,5))
# NOTE: sns.countplot returns a matplotlib Axes; the name `fig` is kept as-is.
fig=sns.countplot(ipl,x='Status',palette=['Orange','Pink'])
plt.xlabel('Sold or Unsold')
plt.ylabel('Number of Players')
plt.title('Sold vs Unsold', fontsize=15)
for p in fig.patches:
    # Anchor the label at the top-centre of the bar, nudged 4 points upward
    fig.annotate(format(p.get_height(),'.0f'),(p.get_x()+ p.get_width()/2,p.get_height()),ha='center', va='center',xytext=(0,4),textcoords='offset points')
# Render only after the annotations exist (replaces a no-op plt.plot() that
# was called before the loop).
plt.show()


In [None]:
# Number of Player entries in each Status group
ipl.groupby('Status')['Player'].count()

In [None]:
# Total number of players bought by each team
plt.figure(figsize=(20,10))
# Keep only rows with a buying team. The former extra `& (ipl.Team)` operand
# AND-ed the mask with the team-name strings themselves, which adds nothing
# and relies on implicit truthiness coercion, so it is removed.
# NOTE: sns.countplot returns a matplotlib Axes; the name `fig` is kept as-is.
fig = sns.countplot(x='Team',data=ipl[ipl.Team !='Unsold'])
plt.xlabel('Team Names')
plt.ylabel('Players Bought by Each Team', fontsize=12)
plt.xticks(rotation=70)
for p in fig.patches:
    # Write the bar's count just above its top-centre
    fig.annotate(format(p.get_height(),'.0f'),(p.get_x()+ p.get_width()/2,p.get_height()),ha='center', va='center',xytext=(0,4),textcoords='offset points')
# Render after annotating (replaces a no-op plt.plot() placed before the loop)
plt.show()

In [None]:
# Start the `retention` column as a copy of Base Price
ipl = ipl.assign(retention=ipl['Base Price'])

In [None]:
# Every listed auction base price becomes the label 'From Auction'; values
# not in the list are left unchanged.
# The result is assigned back: replace(..., inplace=True) on a column selection
# is chained assignment, which warns in recent pandas and does not update `ipl`
# under Copy-on-Write.
ipl['retention']=ipl['retention'].replace(['2 Cr','40 Lakh','20 Lakh','1 Cr','75 Lakh','50 Lakh','30 Lakh','1.5 Cr'],'From Auction')

In [None]:
# Treat Base Price: encode 'Draft Pick' as 0.
# Assigned back instead of inplace=True on the column selection, which is
# chained assignment and does not update `ipl` under pandas Copy-on-Write.
ipl['Base Price']=ipl['Base Price'].replace('Draft Pick',0)

In [None]:
# Split the Base Price text on spaces: `base_price` keeps the part before the
# first space, `base_price_unit` the part after the last one. A value with no
# space ends up whole in both columns.
ipl['base_price'] = [str(price).partition(' ')[0] for price in ipl['Base Price']]
ipl['base_price_unit'] = [str(price).rpartition(' ')[-1] for price in ipl['Base Price']]

In [None]:
# Encode 'Retained' as 0 in the split-out base price.
# Assigned back instead of inplace=True on the column selection, which is
# chained assignment and does not update `ipl` under pandas Copy-on-Write.
ipl['base_price']=ipl['base_price'].replace('Retained',0)

In [None]:
# Inspect the first ten rows, including the newly added columns
ipl.head(10)

In [None]:
# Number of players per (Team, retention) pair.
# Unsold rows are filtered out by name; the earlier trailing [:-1] dropped
# whichever group happened to sort last, which only works while 'Unsold' is
# alphabetically last and forms a single group.
ipl[ipl['Team']!='Unsold'].groupby(['Team','retention'])['retention'].count()

In [None]:
# Players per team, split by player TYPE
plt.figure(figsize=(20,10))
# `hue` is given as a column name so it is read from the same filtered frame as
# `x`; passing the unfiltered ipl['TYPE'] Series mixed two differently sized inputs.
# NOTE: sns.countplot returns a matplotlib Axes; the name `fig` is kept as-is.
fig = sns.countplot(x='Team',data=ipl[(ipl.Team !='Unsold')],hue='TYPE')
plt.title('Players in Each Team')
plt.xlabel('Team Names')
plt.ylabel('Number of Player')
plt.show()

In [None]:
# Highest amount spent on a single auction player by each team.
# 'Unsold' is excluded by name instead of slicing the last group off with [:-1],
# which depended on 'Unsold' sorting last among the team names.
ipl[(ipl['retention']=='From Auction') & (ipl['Team']!='Unsold')].groupby(['Team'])['COST IN ₹ (CR.)'].max().sort_values(ascending=False)

In [None]:
# Player retained at the maximum price
(
    ipl.loc[ipl['retention'].eq('Retained')]
       .sort_values(by='COST IN ₹ (CR.)', ascending=False)
       .head(1)
)

In [None]:
# Five costliest bowlers among the 'From Auction' rows
(
    ipl.loc[lambda frame: frame['retention'].eq('From Auction') & frame['TYPE'].eq('BOWLER')]
       .sort_values(by='COST IN ₹ (CR.)', ascending=False)
       .head(5)
)

In [None]:
# Five costliest batters among the 'From Auction' rows
(
    ipl.loc[lambda frame: frame['retention'].eq('From Auction') & frame['TYPE'].eq('BATTER')]
       .sort_values(by='COST IN ₹ (CR.)', ascending=False)
       .head(5)
)

In [None]:
# Five costliest all-rounders among the 'From Auction' rows
(
    ipl.loc[lambda frame: frame['retention'].eq('From Auction') & frame['TYPE'].eq('ALL-ROUNDER')]
       .sort_values(by='COST IN ₹ (CR.)', ascending=False)
       .head(5)
)


In [None]:
# Give the 2021 squad column an identifier-friendly name
ipl = ipl.rename({'2021 Squad': 'Prev_team'}, axis='columns')

In [None]:
# Players with a recorded previous team who went unsold
unsold_players = ipl.loc[
    ipl['Prev_team'].ne('Not Participated') & ipl['Team'].eq('Unsold'),
    ['Player', 'Prev_team'],
]
print(unsold_players)

In [None]:
# Display the first five rows of the current frame
ipl.head()

In [None]:
# Build a profiling report object for the current frame; it is only assigned
# here, not rendered or saved.
from ydata_profiling import ProfileReport
IPL_Profile = ProfileReport(ipl)