# Data Group By Aggregation

- Aggregation over categorical groups
- https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html

In [1]:
import pandas as pd
df = pd.read_csv("healthcare-dataset-stroke-data.csv") 
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Groupby to get statistics per group

In [2]:
df.groupby('gender').mean()

Unnamed: 0_level_0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,36479.685037,43.757395,0.092184,0.037742,104.057809,29.065758,0.047094
Male,36562.541371,42.483385,0.104965,0.077069,109.08852,28.647936,0.051064
Other,56156.0,26.0,0.0,0.0,143.33,22.4,0.0


## To get a single column

In [4]:
df.groupby('gender')['age'].max()

gender
Female    82.0
Male      82.0
Other     26.0
Name: age, dtype: float64

## Multiple columns in the group by

In [100]:
df_count = df.groupby(['gender','ever_married'])['age'].count()
df_count

gender  ever_married
Female  No               993
        Yes             2001
Male    No               763
        Yes             1352
Other   No                 1
Name: age, dtype: int64

## Change the multilevel index to flat index

In [14]:
df_mean.reset_index()

Unnamed: 0,gender,ever_married,age
0,Female,No,20.0
1,Female,Yes,53.0
2,Male,No,15.0
3,Male,Yes,56.0
4,Other,No,26.0


## Multiple aggregation functions using `.agg`
- `agg` can use the string names for common operations
- or pass the function itself.

In [102]:
df.groupby(['gender','ever_married'])['age']\
.agg(max_age='max', min_age=min, count_age='count') 

Unnamed: 0_level_0,Unnamed: 1_level_0,max_age,min_age,count_age
gender,ever_married,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,82.0,0.08,993
Female,Yes,82.0,19.0,2001
Male,No,82.0,0.08,763
Male,Yes,82.0,18.0,1352
Other,No,26.0,26.0,1


In [93]:
df.groupby(['gender','ever_married']).agg({'age': ['mean', 'min', 'max']})

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,age
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max
gender,ever_married,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,No,23.969426,0.08,82.0
Female,Yes,53.577211,19.0,82.0
Male,No,19.46443,0.08,82.0
Male,Yes,55.474112,18.0,82.0
Other,No,26.0,26.0,26.0


## Use named aggregations with different functions on different columns

In [94]:
df.groupby(['gender','ever_married']).agg(
             bmi_sum=('bmi', 'sum'),
             avg_glucose_level_mean=('avg_glucose_level', 'mean'),
             age_range=('age', lambda x: x.max() - x.min()))

Unnamed: 0_level_0,Unnamed: 1_level_0,bmi_sum,avg_glucose_level_mean,age_range
gender,ever_married,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,24771.4,95.939043,81.92
Female,Yes,59432.1,108.086762,63.0
Male,No,18197.3,97.049279,81.92
Male,Yes,39413.7,115.882855,64.0
Other,No,22.4,143.33,0.0


## Bespoke functions for aggregation using `apply`
- calculate the correlation of age with bmi for each category in the ever_married column

In [105]:
df.groupby('ever_married').apply(lambda x: x['age'].corr(x['bmi']))

ever_married
No     0.471514
Yes   -0.075455
dtype: float64

# Pivot tables
- Pivot tables by default give the mean values.
- The groupy by operation gives the same values as the pivot table operation
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot.html?highlight=pivot#pandas.pivot

In [98]:
print(df.groupby('gender')['age'].mean())
df.pivot_table(values='age',index='gender') # same as groupby

gender
Female    43.757395
Male      42.483385
Other     26.000000
Name: age, dtype: float64


Unnamed: 0_level_0,age
gender,Unnamed: 1_level_1
Female,43.757395
Male,42.483385
Other,26.0


## Pass multiple functions to pivot table
- len same as count

In [22]:
df.pivot_table(values='age',index='gender', aggfunc=[len,'std','max'])

Unnamed: 0_level_0,len,std,max
Unnamed: 0_level_1,age,age,age
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Female,2994.0,21.966561,82.0
Male,2115.0,23.484066,82.0
Other,1.0,,26.0


## Pivot on multiple columns
- calculate the mean of ages, tabulated for gender and ever_married

In [19]:
df.pivot_table(values='age',index='gender',columns='ever_married')

ever_married,No,Yes
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,23.969426,53.577211
Male,19.46443,55.474112
Other,26.0,


## Fill null values

In [106]:
df.pivot_table(values='age',index='gender',fill_value=0)

Unnamed: 0_level_0,age
gender,Unnamed: 1_level_1
Female,43.757395
Male,42.483385
Other,26.0


## `margins=True` keyword gives the overrall statistics

In [21]:
df.pivot_table(values='age',index='gender',columns='ever_married'\
               ,fill_value=0,margins=True)

ever_married,No,Yes,All
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,23.969426,53.577211,43.757395
Male,19.46443,55.474112,42.483385
Other,26.0,0.0,26.0
All,22.014229,54.342082,43.226614


## Crosstab
- A cross-tabulation (or crosstab for short) is a special case of a pivot table that computes group frequencies. 
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.crosstab.html?highlight=crosstab

In [23]:
pd.crosstab(df.gender, df.work_type, margins=True)

work_type,Govt_job,Never_worked,Private,Self-employed,children,All
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,399,11,1754,504,326,2994
Male,258,11,1170,315,361,2115
Other,0,0,1,0,0,1
All,657,22,2925,819,687,5110


In [24]:
pd.crosstab([df.gender,df.ever_married], df.work_type, margins=True)

Unnamed: 0_level_0,work_type,Govt_job,Never_worked,Private,Self-employed,children,All
gender,ever_married,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,No,70,11,519,67,326,993
Female,Yes,329,0,1235,437,0,2001
Male,No,47,11,301,43,361,763
Male,Yes,211,0,869,272,0,1352
Other,No,0,0,1,0,0,1
All,,657,22,2925,819,687,5110


# Get percentage of total
- this requires two operations, the grouped sums and the total sums
- the total sums could be over gender or work_type


In [172]:
grouped_counts = df.groupby(['gender','work_type']).size()
grouped_counts

gender  work_type    
Female  Govt_job          399
        Never_worked       11
        Private          1754
        Self-employed     504
        children          326
Male    Govt_job          258
        Never_worked       11
        Private          1170
        Self-employed     315
        children          361
Other   Private             1
dtype: int64

## Get the sum by gender


In [176]:
counts_unstacked = grouped_counts.unstack()
counts_unstacked

work_type,Govt_job,Never_worked,Private,Self-employed,children
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,399.0,11.0,1754.0,504.0,326.0
Male,258.0,11.0,1170.0,315.0,361.0
Other,,,1.0,,


## Row wise sum to get counts by gender

In [177]:
counts_gender = counts_unstacked.sum(axis=1)
counts_gender

gender
Female    2994.0
Male      2115.0
Other        1.0
dtype: float64

## Get percentage total by gender
- division is done intelligently by matching the appropriate gender

In [174]:
grouped_counts/counts_gender

gender  work_type    
Female  Govt_job         0.133267
        Never_worked     0.003674
        Private          0.585838
        Self-employed    0.168337
        children         0.108884
Male    Govt_job         0.121986
        Never_worked     0.005201
        Private          0.553191
        Self-employed    0.148936
        children         0.170686
Other   Private          1.000000
dtype: float64

##  Re-sum using a different axis 
- column_wise sum to get counts by work_type
- check the final df to make sure the axis

In [178]:
counts_work_type = grouped_counts.unstack().sum(axis=0)
counts_work_type

work_type
Govt_job          657.0
Never_worked       22.0
Private          2925.0
Self-employed     819.0
children          687.0
dtype: float64

## Get the percentage by work type instead

In [179]:
grouped_counts/counts_work_type

gender  work_type    
Female  Govt_job         0.607306
        Never_worked     0.500000
        Private          0.599658
        Self-employed    0.615385
        children         0.474527
Male    Govt_job         0.392694
        Never_worked     0.500000
        Private          0.400000
        Self-employed    0.384615
        children         0.525473
Other   Private          0.000342
dtype: float64

## To see more clearly, we unstack to change indexing
- level indicates which index to be the outer index
- level=0 means use the first index as the outer index
- level=1 means use the second index as the outer index
- https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

In [180]:
(grouped_counts/counts_work_type).unstack(level=0)

gender,Female,Male,Other
work_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Govt_job,0.607306,0.392694,
Never_worked,0.5,0.5,
Private,0.599658,0.4,0.000342
Self-employed,0.615385,0.384615,
children,0.474527,0.525473,


In [181]:
(grouped_counts/counts_work_type).unstack(level=1)

work_type,Govt_job,Never_worked,Private,Self-employed,children
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,0.607306,0.5,0.599658,0.615385,0.474527
Male,0.392694,0.5,0.4,0.384615,0.525473
Other,,,0.000342,,
