# Table of Contents
* [Splitting](#split)
* [Selecting](#select)
* [Iterating](#iterate)
* [Aggregation](#aggregate)
* [Transformation](#)
* [Filtration](#)

In [1]:
import sys #only needed to determine Python version number
import pandas as pd #this is how I usually import pandas
import numpy as np
import matplotlib.pyplot as plt
import matplotlib #only needed to determine Matplotlib version number

# Enable inline plotting
%matplotlib inline

In [10]:
def readData(location):
    """Load a tabular data file into a pandas DataFrame.

    The reader is chosen from the file extension, compared
    case-insensitively: ``.txt`` / ``.csv`` files are parsed with
    ``pd.read_csv`` and ``.xlsx`` files with ``pd.read_excel``.

    Parameters
    ----------
    location : str or path-like
        Path of the file to load.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` when the extension is not supported
        or the file could not be read. A short message is printed in both
        failure cases, so callers should check for ``None`` before using
        the result.
    """
    # Normalise once so 'STATS.CSV' and pathlib.Path inputs are accepted too.
    name = str(location).lower()

    if name.endswith(('.txt', '.csv')):
        reader = pd.read_csv
    elif name.endswith('.xlsx'):
        reader = pd.read_excel
    else:
        # Decide on the extension up front instead of raising and catching
        # TypeError, so a TypeError raised inside pandas is not misreported
        # as an unsupported file type.
        print("invalid file type")
        return None

    try:
        return reader(location)
    except Exception as err:
        # `Exception` rather than a bare `except:` lets KeyboardInterrupt and
        # SystemExit propagate; the cause is printed so the failure can be
        # diagnosed instead of being silently swallowed.
        print("other exceptions:", repr(err))
        return None

# Load the season stats file; the path is relative to the notebook.
location = './dataset/Seasons_Stats.csv'
players_all = readData(location)  # original note: "since 1995" -- TODO confirm against the data

# Remove the columns that are not needed. NOTE: 'blanl' must match the CSV
# header exactly (drop() raises KeyError for unknown labels), so do not
# "correct" its spelling here.
unused_columns = ['Unnamed: 0', 'blanl', 'blank2']
players_all = players_all.drop(columns=unused_columns)
print(players_all.columns)
print(players_all.index)

# Keep only the columns used in the rest of the notebook, in this order.
kept_columns = [
    'Year', 'Player', 'Tm', 'Pos', 'G', 'GS', 'PTS',
    'FG%', '3P%', '2P%', 'FT%',
    'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'PF',
]
players_all = players_all.loc[:, kept_columns]

# Restrict to the 2017 season, then take the 50 rows with the most points.
is_season_2017 = players_all['Year'] == 2017
players_season_2017 = players_all[is_season_2017]
top_scorers = players_season_2017.sort_values(by='PTS', ascending=False).head(50)
top_scorers

Index(['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
       'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
       'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
       'PF', 'PTS'],
      dtype='object')
RangeIndex(start=0, stop=24691, step=1)


Unnamed: 0,Year,Player,Tm,Pos,G,GS,PTS,FG%,3P%,2P%,FT%,TRB%,AST%,STL%,BLK%,TOV%,PF
24654,2017.0,Russell Westbrook,OKC,PG,81.0,81.0,2558.0,0.425,0.343,0.459,0.845,17.1,57.3,2.3,0.9,15.9,190.0
24306,2017.0,James Harden,HOU,PG,81.0,81.0,2356.0,0.44,0.347,0.53,0.847,12.2,50.7,2.0,1.0,19.5,215.0
24612,2017.0,Isaiah Thomas,BOS,PG,76.0,76.0,2199.0,0.463,0.379,0.528,0.909,4.4,32.6,1.4,0.4,10.7,167.0
24218,2017.0,Anthony Davis,NOP,C,75.0,75.0,2099.0,0.504,0.299,0.524,0.802,17.3,11.1,1.7,5.1,9.1,168.0
24625,2017.0,Karl-Anthony Towns,MIN,C,82.0,82.0,2061.0,0.542,0.367,0.582,0.832,19.4,13.2,1.0,2.9,11.3,241.0
24421,2017.0,Damian Lillard,POR,PG,75.0,75.0,2024.0,0.444,0.37,0.491,0.895,7.6,28.7,1.3,0.6,10.2,152.0
24226,2017.0,DeMar DeRozan,TOR,SG,74.0,74.0,2020.0,0.467,0.266,0.484,0.842,8.3,20.6,1.5,0.4,9.0,134.0
24216,2017.0,Stephen Curry,GSW,PG,79.0,79.0,1999.0,0.468,0.411,0.537,0.898,7.3,31.1,2.6,0.5,13.0,183.0
24365,2017.0,LeBron James,CLE,SF,74.0,74.0,1954.0,0.548,0.363,0.611,0.674,12.6,41.3,1.6,1.3,16.1,134.0
24206,2017.0,DeMarcus Cousins,TOT,C,72.0,72.0,1942.0,0.452,0.36,0.483,0.772,18.2,25.8,2.0,3.3,13.5,278.0


By “group by” we are referring to a process involving one or more of the following steps:

* Splitting the data into groups based on some criteria
* Applying a function to each group independently
* Combining the results into a data structure

In the apply step, we might wish to do one of the following:
Aggregation: computing a summary statistic (or statistics) about each group. Some examples:

Compute group sums or means
Compute group sizes / counts

Transformation: perform some group-specific computations and return a like-indexed object. Some examples:

Standardizing data (zscore) within group
Filling NAs within groups with a value derived from each group

Filtration: discard some groups, according to a group-wise computation that evaluates True or False. Some examples:

Discarding data that belongs to groups with only a few members
Filtering out data based on the group sum or mean

# Splitting <a name="split"></a>

In [18]:
# Split the top scorers into groups: one grouping per team, and one per
# (team, position) pair.
# The single-key grouping uses a plain column label rather than a one-element
# list: newer pandas releases treat a length-1 list as a tuple key, which
# changes the keys seen by get_group('WAS') and by iteration. The explicit
# `axis=0` is dropped because grouping rows is the default and the `axis`
# argument is deprecated in recent pandas.
gb_team = top_scorers.groupby('Tm')
gb_team_pos = top_scorers.groupby(['Tm', 'Pos'])

In [19]:
# Show the first entry of each team's group. top_scorers is sorted by PTS in
# descending order, so this is presumably each team's highest scorer among
# the top 50 -- assumes no missing values in the leading rows; verify.
gb_team.first()

Unnamed: 0_level_0,Year,Player,Pos,G,GS,PTS,FG%,3P%,2P%,FT%,TRB%,AST%,STL%,BLK%,TOV%,PF
Tm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ATL,2017.0,Dennis Schroder,PG,79.0,78.0,1414.0,0.451,0.34,0.486,0.855,5.5,35.5,1.5,0.6,16.3,149.0
BOS,2017.0,Isaiah Thomas,PG,76.0,76.0,2199.0,0.463,0.379,0.528,0.909,4.4,32.6,1.4,0.4,10.7,167.0
BRK,2017.0,Brook Lopez,C,75.0,75.0,1539.0,0.474,0.346,0.536,0.81,9.6,14.8,0.8,4.2,12.1,192.0
CHI,2017.0,Jimmy Butler,SF,76.0,75.0,1816.0,0.455,0.367,0.477,0.865,9.0,24.8,2.6,0.9,9.3,112.0
CHO,2017.0,Kemba Walker,PG,79.0,79.0,1830.0,0.444,0.399,0.476,0.847,6.2,29.1,1.6,0.7,9.5,119.0
CLE,2017.0,LeBron James,SF,74.0,74.0,1954.0,0.548,0.363,0.611,0.674,12.6,41.3,1.6,1.3,16.1,134.0
DAL,2017.0,Harrison Barnes,PF,79.0,79.0,1518.0,0.468,0.351,0.492,0.861,8.2,7.8,1.2,0.5,6.8,128.0
DEN,2017.0,Nikola Jokic,C,73.0,59.0,1221.0,0.578,0.324,0.628,0.825,19.5,28.8,1.5,2.1,15.2,214.0
DET,2017.0,Tobias Harris,PF,82.0,48.0,1321.0,0.481,0.347,0.537,0.841,8.8,8.8,1.2,1.3,7.6,133.0
GSW,2017.0,Stephen Curry,PG,79.0,79.0,1999.0,0.468,0.411,0.537,0.898,7.3,31.1,2.6,0.5,13.0,183.0


In [20]:
# Show the last entry of each team's group. Teams with a single player in
# top_scorers give the same row as gb_team.first().
gb_team.last()

Unnamed: 0_level_0,Year,Player,Pos,G,GS,PTS,FG%,3P%,2P%,FT%,TRB%,AST%,STL%,BLK%,TOV%,PF
Tm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ATL,2017.0,Paul Millsap,PF,69.0,67.0,1246.0,0.442,0.311,0.486,0.768,12.5,17.9,1.9,2.3,12.1,186.0
BOS,2017.0,Isaiah Thomas,PG,76.0,76.0,2199.0,0.463,0.379,0.528,0.909,4.4,32.6,1.4,0.4,10.7,167.0
BRK,2017.0,Brook Lopez,C,75.0,75.0,1539.0,0.474,0.346,0.536,0.81,9.6,14.8,0.8,4.2,12.1,192.0
CHI,2017.0,Jimmy Butler,SF,76.0,75.0,1816.0,0.455,0.367,0.477,0.865,9.0,24.8,2.6,0.9,9.3,112.0
CHO,2017.0,Nicolas Batum,SG,77.0,77.0,1164.0,0.403,0.333,0.453,0.856,10.1,27.6,1.6,1.0,15.0,109.0
CLE,2017.0,Kyrie Irving,PG,72.0,72.0,1816.0,0.473,0.401,0.505,0.905,5.0,29.7,1.6,0.8,10.3,157.0
DAL,2017.0,Harrison Barnes,PF,79.0,79.0,1518.0,0.468,0.351,0.492,0.861,8.2,7.8,1.2,0.5,6.8,128.0
DEN,2017.0,Nikola Jokic,C,73.0,59.0,1221.0,0.578,0.324,0.628,0.825,19.5,28.8,1.5,2.1,15.2,214.0
DET,2017.0,Tobias Harris,PF,82.0,48.0,1321.0,0.481,0.347,0.537,0.841,8.8,8.8,1.2,1.3,7.6,133.0
GSW,2017.0,Kevin Durant,SF,62.0,62.0,1555.0,0.537,0.375,0.608,0.875,13.6,23.1,1.5,3.8,10.4,117.0


# Selecting <a name="select"></a>

In [13]:
# Select the rows of a single group by its key (here the team code 'WAS').
# NOTE(review): the recorded output of this cell has a lower execution count
# than the cell that builds gb_team and a different column order -- re-run
# the notebook top to bottom to refresh it.
gb_team.get_group('WAS')

Unnamed: 0,2P%,3P%,AST%,BLK%,FG%,FT%,G,GS,PF,PTS,Player,Pos,STL%,TOV%,TRB%,Year
24646,0.48,0.327,46.9,1.4,0.451,0.801,78.0,78.0,151.0,1805.0,John Wall,PG,2.7,16.2,6.5,2017.0
24137,0.538,0.404,16.2,0.7,0.482,0.825,77.0,77.0,169.0,1779.0,Bradley Beal,SG,1.5,9.6,5.0,2017.0


In [22]:
# `.groups` maps each group key to the index labels of the rows in that group.
# Only the last expression of a cell is rendered, so the per-team mapping on
# the first line is evaluated but not shown; the displayed output is the
# (team, position) mapping.
gb_team.groups
gb_team_pos.groups

{('ATL', 'PF'): Int64Index([24467], dtype='int64'),
 ('ATL', 'PG'): Int64Index([24575], dtype='int64'),
 ('BOS', 'PG'): Int64Index([24612], dtype='int64'),
 ('BRK', 'C'): Int64Index([24426], dtype='int64'),
 ('CHI', 'SF'): Int64Index([24175], dtype='int64'),
 ('CHO', 'PG'): Int64Index([24645], dtype='int64'),
 ('CHO', 'SG'): Int64Index([24133], dtype='int64'),
 ('CLE', 'PG'): Int64Index([24360], dtype='int64'),
 ('CLE', 'SF'): Int64Index([24365], dtype='int64'),
 ('DAL', 'PF'): Int64Index([24127], dtype='int64'),
 ('DEN', 'C'): Int64Index([24380], dtype='int64'),
 ('DET', 'PF'): Int64Index([24314], dtype='int64'),
 ('GSW', 'PG'): Int64Index([24216], dtype='int64'),
 ('GSW', 'SF'): Int64Index([24239], dtype='int64'),
 ('GSW', 'SG'): Int64Index([24617], dtype='int64'),
 ('HOU', 'PG'): Int64Index([24306], dtype='int64'),
 ('HOU', 'SG'): Int64Index([24290], dtype='int64'),
 ('IND', 'C'): Int64Index([24630], dtype='int64'),
 ('IND', 'PG'): Int64Index([24608], dtype='int64'),
 ('IND', 'SF'):

# Iterating <a name="iterate"></a>

In [23]:
# Walk over the grouping: each step yields the group key and the sub-frame
# holding that group's rows. Print a header line, then the rows themselves.
for key, group in gb_team:
    print("Group '%s'" % key, group, sep="\n")

Group 'ATL'
         Year           Player   Tm Pos     G    GS     PTS    FG%    3P%  \
24575  2017.0  Dennis Schroder  ATL  PG  79.0  78.0  1414.0  0.451  0.340   
24467  2017.0     Paul Millsap  ATL  PF  69.0  67.0  1246.0  0.442  0.311   

         2P%    FT%  TRB%  AST%  STL%  BLK%  TOV%     PF  
24575  0.486  0.855   5.5  35.5   1.5   0.6  16.3  149.0  
24467  0.486  0.768  12.5  17.9   1.9   2.3  12.1  186.0  
Group 'BOS'
         Year         Player   Tm Pos     G    GS     PTS    FG%    3P%  \
24612  2017.0  Isaiah Thomas  BOS  PG  76.0  76.0  2199.0  0.463  0.379   

         2P%    FT%  TRB%  AST%  STL%  BLK%  TOV%     PF  
24612  0.528  0.909   4.4  32.6   1.4   0.4  10.7  167.0  
Group 'BRK'
         Year       Player   Tm Pos     G    GS     PTS    FG%    3P%    2P%  \
24426  2017.0  Brook Lopez  BRK   C  75.0  75.0  1539.0  0.474  0.346  0.536   

        FT%  TRB%  AST%  STL%  BLK%  TOV%     PF  
24426  0.81   9.6  14.8   0.8   4.2  12.1  192.0  
Group 'CHI'
         Ye

# Aggregation <a name="aggregate"></a>

In [33]:
# Built-in per-group aggregations. These are listed to illustrate the API:
# only the last expression of the cell is rendered, so the results of the
# next five calls are computed and then discarded.
# NOTE(review): the grouped frame still contains text columns (Player, Pos);
# newer pandas versions may no longer silently skip them in mean()/std() --
# confirm on the pandas version in use.
gb_team.size()
gb_team.sum()
gb_team.mean()
gb_team.std()
gb_team.describe()
#gb_team.max()
# Per-column aggregation: mean points, best field-goal percentage, and the
# spread of three-point percentage within each team.
# The lambda wrapper around np.std is kept as written: the recorded output
# matches the population standard deviation (ddof=0), e.g. single-player
# teams give 0.0 rather than NaN.
gb_team.agg({'PTS': 'mean', 'FG%': 'max', '3P%': lambda x: np.std(x)})

Unnamed: 0_level_0,FG%,PTS,3P%
Tm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATL,0.451,1330.0,0.0145
BOS,0.463,2199.0,0.0
BRK,0.474,1539.0,0.0
CHI,0.455,1816.0,0.0
CHO,0.444,1497.0,0.033
CLE,0.548,1885.0,0.019
DAL,0.468,1518.0,0.0
DEN,0.578,1221.0,0.0
DET,0.481,1321.0,0.0
GSW,0.537,1765.333333,0.01772
