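# Pandas `groupby` is a method that splits a Series or DataFrame into groups based on the values of one or more keys (for example, a column), so that an aggregation such as a sum, count, or mean can be computed per group. The aggregated result is a Series (a single column) or a DataFrame (a collection of Series). In simpler terms, `groupby` makes datasets easier to manage because related records are put into the same group.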

In [1]:
import pandas as pd
import numpy as np

# Small in-memory sample table used by the groupby examples below.
# Each tuple is one row: (Employee, City, Product, Hours).
# "Charles" appears twice (once per city), so grouping by Employee
# collapses those two rows into one group.
employee_records = [
    ("Susan", "London", 20, 24),
    ("elina", "London", 40, 40),
    ("jordan", "Philadelphia", 18, 50),
    ("Charles", "London", 24, 36),
    ("David", "London", 37, 54),
    ("Charles", "Philadelphia", 40, 44),
    ("Julia", "London", 44, 41),
    ("Bart", "Philadelphia", 20, 35),
]
df = pd.DataFrame(
    employee_records,
    columns=["Employee", "City", "Product", "Hours"],
)
df

Unnamed: 0,Employee,City,Product,Hours
0,Susan,London,20,24
1,elina,London,40,40
2,jordan,Philadelphia,18,50
3,Charles,London,24,36
4,David,London,37,54
5,Charles,Philadelphia,40,44
6,Julia,London,44,41
7,Bart,Philadelphia,20,35


In [6]:
# Total Product and Hours per employee.
# The numeric columns are selected explicitly: since pandas 2.0, sum() no
# longer drops non-numeric columns, so the string column "City" would be
# concatenated per group (e.g. "LondonPhiladelphia") instead of being left out.
df.groupby("Employee")[["Product", "Hours"]].sum()

Unnamed: 0_level_0,Product,Hours
Employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bart,20,35
Charles,64,80
David,37,54
Julia,44,41
Susan,20,24
elina,40,40
jordan,18,50


In [7]:
# Count how many rows (employee records) belong to each city.
# count() reports the number of non-null entries for every remaining column.
df.groupby(by="City").count()

Unnamed: 0_level_0,Employee,Product,Hours
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
London,5,5,5
Philadelphia,3,3,3


In [8]:
# Total hours per employee, returned as a Series indexed by Employee.
df.groupby("Employee")["Hours"].agg("sum")

Employee
Bart       35
Charles    80
David      54
Julia      41
Susan      24
elina      40
jordan     50
Name: Hours, dtype: int64

In [9]:
# Same totals as a flat DataFrame: as_index=False keeps "Employee" as a
# regular column instead of turning it into the index.
df.groupby("Employee", as_index=False)["Hours"].sum()

Unnamed: 0,Employee,Hours
0,Bart,35
1,Charles,80
2,David,54
3,Julia,41
4,Susan,24
5,elina,40
6,jordan,50


In [10]:
# Total hours per employee, ordered from fewest to most hours.
(
    df.groupby("Employee")["Hours"]
    .sum()
    .reset_index()
    .sort_values(by="Hours")
)

Unnamed: 0,Employee,Hours
4,Susan,24
0,Bart,35
5,elina,40
3,Julia,41
6,jordan,50
2,David,54
1,Charles,80


In [12]:
# Average Product and Hours per employee, with Employee as a regular column.
# The numeric columns are selected explicitly: since pandas 2.0, mean() no
# longer drops non-numeric columns and raises TypeError on the string
# column "City".
df.groupby("Employee")[["Product", "Hours"]].mean().reset_index()

Unnamed: 0,Employee,Product,Hours
0,Bart,20,35
1,Charles,32,40
2,David,37,54
3,Julia,44,41
4,Susan,20,24
5,elina,40,40
6,jordan,18,50


In [21]:
# Second sample dataset: one entry per employee number, with a salary and a
# role code. Columns are given as equal-length tuples.
# NOTE(review): "MGF" in the first EMPCODE entry looks like a typo for "MGR";
# left unchanged because it produces its own group in the outputs below — confirm.
Employee = {
    'EMPNO': (111, 112, 114, 115, 223, 226, 228, 300, 333, 345, 356, 320),
    'Salary': (4000, 6000, 2000, 8000, 2000, 1000, 3000, 500, 700, 300, 200, 700),
    'EMPCODE': (
        'MGF', 'MGR', 'MGR', 'MGR',
        'CLERK', 'CLERK', 'CLERK',
        'PEON', 'PEON', 'PEON', 'PEON', 'PEON',
    ),
}
emp_df = pd.DataFrame(Employee)
emp_df

Unnamed: 0,EMPNO,Salary,EMPCODE
0,111,4000,MGF
1,112,6000,MGR
2,114,2000,MGR
3,115,8000,MGR
4,223,2000,CLERK
5,226,1000,CLERK
6,228,3000,CLERK
7,300,500,PEON
8,333,700,PEON
9,345,300,PEON


In [42]:
# For each role code, count how many times each distinct salary occurs.
# The result is a Series with a two-level (EMPCODE, Salary) index.
(
    emp_df
    .groupby(by="EMPCODE", sort=True)["Salary"]
    .value_counts()
)

EMPCODE  Salary
CLERK    1000      1
         2000      1
         3000      1
MGF      4000      1
MGR      2000      1
         6000      1
         8000      1
PEON     700       2
         200       1
         300       1
         500       1
Name: Salary, dtype: int64