# Groupby in Python
- How to use pandas GroupBy operations on real-world data
- How the split-apply-combine chain of operations works
- How to decompose the split-apply-combine chain into steps
- How to categorize methods of a pandas GroupBy object based on their intent and result

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pydataset import data
import seaborn as sns

In [5]:
# Join the `title` attribute of whatever data() returns into one
# comma-separated string so it can be read at a glance.
# NOTE(review): presumably data() with no arguments is pydataset's catalogue
# of bundled datasets — confirm against the pydataset docs.
', '.join(data().title)



In [9]:
# Legislator CSVs; project README:
# https://github.com/unitedstates/congress-legislators?tab=readme-ov-file
linkdata1 = 'https://theunitedstates.io/congress-legislators/legislators-current.csv'  # current legislators; not loaded in this cell
linkdata2 = 'https://theunitedstates.io/congress-legislators/legislators-historical.csv'  # historical legislators
df = pd.read_csv(linkdata2)  # reads straight from the URL, so this cell needs network access
df.head()

Unnamed: 0,last_name,first_name,middle_name,suffix,nickname,full_name,birthday,gender,type,state,...,opensecrets_id,lis_id,fec_ids,cspan_id,govtrack_id,votesmart_id,ballotpedia_id,washington_post_id,icpsr_id,wikipedia_id
0,Bassett,Richard,,,,,1745-04-02,M,sen,DE,...,,,,,401222,,,,507.0,Richard Bassett (Delaware politician)
1,Bland,Theodorick,,,,,1742-03-21,M,rep,VA,...,,,,,401521,,,,786.0,Theodorick Bland (congressman)
2,Burke,Aedanus,,,,,1743-06-16,M,rep,SC,...,,,,,402032,,,,1260.0,Aedanus Burke
3,Carroll,Daniel,,,,,1730-07-22,M,rep,MD,...,,,,,402334,,,,1538.0,Daniel Carroll
4,Clymer,George,,,,,1739-03-16,M,rep,PA,...,,,,,402671,,,,1859.0,George Clymer


In [10]:
# Show every column label of the loaded frame.
df.columns

Index(['last_name', 'first_name', 'middle_name', 'suffix', 'nickname',
       'full_name', 'birthday', 'gender', 'type', 'state', 'district',
       'senate_class', 'party', 'url', 'address', 'phone', 'contact_form',
       'rss_url', 'twitter', 'twitter_id', 'facebook', 'youtube', 'youtube_id',
       'mastodon', 'bioguide_id', 'thomas_id', 'opensecrets_id', 'lis_id',
       'fec_ids', 'cspan_id', 'govtrack_id', 'votesmart_id', 'ballotpedia_id',
       'washington_post_id', 'icpsr_id', 'wikipedia_id'],
      dtype='object')

In [11]:
# Narrow the frame to a handful of columns for the GroupBy examples.
cols1 = ['last_name','first_name', 'birthday', 'gender', 'type', 'state', 'party']
df1 = df[cols1]  # selecting with a list of labels yields a DataFrame with just those columns
df1.head()

Unnamed: 0,last_name,first_name,birthday,gender,type,state,party
0,Bassett,Richard,1745-04-02,M,sen,DE,Anti-Administration
1,Bland,Theodorick,1742-03-21,M,rep,VA,
2,Burke,Aedanus,1743-06-16,M,rep,SC,
3,Carroll,Daniel,1730-07-22,M,rep,MD,
4,Clymer,George,1739-03-16,M,rep,PA,


In [17]:
# Top 5 states by row count: split on `state`, count the rows in each group,
# rank the counts from largest to smallest, and keep the first five.
(
    df1.groupby('state')
    .size()
    .sort_values(ascending=False)
    .head(5)
)

state
NY    1479
PA    1057
OH     682
IL     493
VA     435
dtype: int64

In [22]:
# By default the group keys (`state`) become the index of the result.
# Pass as_index=False when the key is needed as an ordinary column instead;
# the counts then land in a column named `size`, e.g.
#   df1.groupby('state', as_index=False).size()
(
    df1.groupby('state', as_index=False)
    .size()
    .sort_values(by='size', ascending=False)
    .head(5)
)
# The rows now carry integer labels (their positions before sorting) rather
# than state labels.

Unnamed: 0,state,size
37,NY,1479
42,PA,1057
38,OH,682
17,IL,493
51,VA,435


In [24]:
# sort=False skips sorting the group keys, so the groups come out in the
# order each state first appears in df1 rather than alphabetically.
(
    df1.groupby('state', sort=False)
    .size()
    .head(10)
)

state
DE      97
VA     435
SC     252
MD     306
PA    1057
MA     427
NJ     361
GA     319
NY    1479
NC     360
dtype: int64

In [25]:
# Load the `airquality` dataset through pydataset's data() and preview the
# first rows.
airquality = data('airquality')
airquality.head()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
1,41.0,190.0,7.4,67,5,1
2,36.0,118.0,8.0,72,5,2
3,12.0,149.0,12.6,74,5,3
4,18.0,313.0,11.5,62,5,4
5,,,14.3,56,5,5
