In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)


In [None]:
# Read the batsmen table from the Kaggle input directory and preview it.
DATA_PATH = '/kaggle/input/top-100-batsman/top100batsman.csv'
df = pd.read_csv(DATA_PATH)
df.head()

## Checking the null values present in the dataset 

In [None]:
# Count of missing values in each column.
df.isna().sum()

## Exploratory Data Analysis

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

### Getting rid of unnecessary columns

In [None]:
# Remove the columns that are not used in the rest of the analysis.
# `errors='ignore'` keeps this cell idempotent: with `del df[...]` a second run
# in the same kernel raised KeyError because the columns were already gone.
df = df.drop(columns=['rating', 'player_id'], errors='ignore')

### Top 10 batsmen who have scored the most runs in Tests

In [None]:
# Ten rows with the largest 'runs' values, highest first.
df.sort_values('runs', ascending=False).iloc[:10]

### Top 10 batsmen with the highest score (`high_score`) in Tests

In [None]:
# Ten rows with the largest 'high_score' values, highest first.
df.sort_values('high_score', ascending=False).iloc[:10]

### Which players have played the most matches

In [None]:
# Five rows with the largest 'matches' values, highest first.
df.sort_values('matches', ascending=False).iloc[:5]

### Number of players who have scored more runs than average

In [None]:
# Count the players whose runs are at or above the mean of the 'runs' column.
runs_mean = df['runs'].mean()
print(int((df['runs'] >= runs_mean).sum()))

### Names of players who have scored more runs than average

In [None]:
# First five batsman names among the players at or above the mean of 'runs'.
df.loc[df['runs'] >= df['runs'].mean(), 'batsman'].head()

### Names of players who have played more than the average number of matches and have an above-average strike rate

In [None]:
# Preview the first two rows.
# NOTE(review): presumably here to look up the column positions used in the next cell.
df.head(2)

In [None]:
# Select the players whose match count and strike rate are both at or above
# the respective averages over all rows of `df`.
#
# Columns are addressed by position, exactly as before:
# 1 = name, 2 = team, 3 = matches, 8 = strike rate.
# NOTE(review): these positions presumably hold only after 'rating' and
# 'player_id' have been removed from `df` - confirm against `df.head(2)`.
matches_played = df.iloc[:, 3]
strike_rates = df.iloc[:, 8]

# Use the true mean. The earlier `sum(...) // len(...)` floored the average,
# which lowered both thresholds and let below-average players through.
avg_matches = matches_played.mean()
avg_strike_rate = strike_rates.mean()

# Vectorised filter instead of looping over `df.values` row by row.
above_average = (matches_played >= avg_matches) & (strike_rates >= avg_strike_rate)

top_players = df.loc[above_average].iloc[:, [1, 2, 3, 8]].reset_index(drop=True)
top_players.columns = ['name', 'team', 'matches', 'strike_rate']
top_players

### sixes per innings 

In [None]:
# Sixes hit per innings, rounded to three decimals, stored as a new column.
six_per_inning = (df['sixes'] / df['innings']).round(3).to_numpy()
df['six_per_inning'] = six_per_inning
df.head()

### Fours per innings 

In [None]:
# Fours hit per innings, rounded to three decimals, stored as a new column.
fours_per_inning = (df['fours'] / df['innings']).round(3).to_numpy()
df['fours_per_inning'] = fours_per_inning
df.head()

### Top 5 players with the most fours per innings

In [None]:
# Five rows with the largest 'fours_per_inning' values, highest first.
df.sort_values('fours_per_inning', ascending=False).iloc[:5]

### Top 5 players with the most sixes per innings

In [None]:
# Five rows with the largest 'six_per_inning' values, highest first.
df.sort_values('six_per_inning', ascending=False).iloc[:5]

### Group by team (country)

In [None]:
# Distinct team names, in order of first appearance in `df`.
# `list(set(...))` produced a different order on every kernel start (string
# hashing is randomised per process), so everything derived from `teams`
# changed order between runs. `unique()` is deterministic.
teams = df['team'].unique().tolist()
print(teams)

In [None]:
# Per-team summary: number of players, total matches played and the floored
# average number of matches per player.
#
# Rows are grouped on exact team equality. The earlier loop tested
# `team in row_team`, which is a substring check on strings, so a team whose
# name is contained in another team's name was also credited with that other
# team's players and matches.
# NOTE(review): the earlier loop read the match count from column position 3;
# this is presumably the 'matches' column - confirm.
matches_by_team = df.groupby('team')['matches']

df_ = pd.DataFrame({
    'number_of_player': matches_by_team.size(),
    'total_matches': matches_by_team.sum(),
})
# Integer (floor) average, same as before.
df_['avg_match_per_player'] = df_['total_matches'] // df_['number_of_player']
df_ = df_.reset_index()  # turn the 'team' group key back into a column

df_.sort_values(by='number_of_player', ascending=False)

# Data Visualisation 

### Players with most runs 

In [None]:
# Bar chart of the ten rows with the most runs.
df_runs = df.sort_values(by='runs', ascending=False).iloc[:10]
player = df_runs['batsman'].to_numpy()
runs = df_runs['runs'].to_numpy()

fig, ax = plt.subplots(figsize=(16, 4))
ax.bar(player, runs)
ax.set_xlabel('batsmen')
ax.set_ylabel('runs')
plt.show()

#### Distribution of players by team

In [None]:
# Pie chart of the number of players per team, taken from the `df_` summary.
tot_player = df_['number_of_player']
team = df_['team']

fig, ax = plt.subplots()
ax.pie(tot_player, labels=team, autopct='%1.2f')
plt.show()


### Most double centuries

In [None]:
# Five rows with the most double centuries, highest first.
df_double_century = df.sort_values(by='double_century', ascending=False).head()

# Select the plotted columns by name. The earlier positional lookups
# (`.values[:, 10]` and `.values[:, 1]`) silently picked different columns
# whenever the column layout of `df` differed, e.g. when the column-dropping
# cell had not been run.
# NOTE(review): position 1 is presumably the 'batsman' column - confirm.
d_c = df_double_century['double_century'].to_numpy()
player = df_double_century['batsman'].to_numpy()

plt.plot(player, d_c)
plt.xlabel('player')
plt.ylabel('double century')
plt.show()