## Group by

In [3]:
import pandas as pd 
# Print the installed pandas version so the outputs below can be matched to a release.
print(pd.__version__)

1.3.4


In [4]:
# Load the supermarket sales data from a path relative to the notebook's
# working directory; `sales` is the frame every later cell operates on.
sales = pd.read_csv("data/supermarket_sales.csv")
# Preview the first three rows as plain text.
print(sales.head(3))

    Invoice ID Branch       City Customer type  Gender  \
0  750-67-8428      A     Yangon        Member  Female   
1  226-31-3081      C  Naypyitaw        Normal  Female   
2  631-41-3108      A     Yangon        Normal    Male   

             Product line  Unit price  Quantity      Date   Time      Payment  
0       Health and beauty       74.69         7  1/5/2019  13:08      Ewallet  
1  Electronic accessories       15.28         5  3/8/2019  10:29         Cash  
2      Home and lifestyle       46.33         7  3/3/2019  13:23  Credit card  


In [5]:
# Count the non-null Quantity entries for each product line.
# Group keys come back sorted alphabetically (the groupby default).
(
    sales
    .groupby(by='Product line')['Quantity']
    .count()
)

Product line
Electronic accessories    170
Fashion accessories       178
Food and beverages        174
Health and beauty         152
Home and lifestyle        160
Sports and travel         166
Name: Quantity, dtype: int64

In [6]:
# sort=True is the groupby default, so this spells out the default explicitly
# and returns the same result as grouping without the argument.
# Pass sort=False to keep groups in order of first appearance instead.
sales.groupby('Product line', sort=True)['Quantity'].count()

Product line
Electronic accessories    170
Fashion accessories       178
Food and beverages        174
Health and beauty         152
Home and lifestyle        160
Sports and travel         166
Name: Quantity, dtype: int64

In [7]:
# Passing a list of columns groups by every (Branch, Payment) combination.
# The result is a Series indexed by a two-level MultiIndex.
(
    sales
    .groupby(by=['Branch', 'Payment'])['Quantity']
    .count()
)

Branch  Payment    
A       Cash           110
        Credit card    104
        Ewallet        126
B       Cash           110
        Credit card    109
        Ewallet        113
C       Cash           124
        Credit card     98
        Ewallet        106
Name: Quantity, dtype: int64

In [8]:
# as_index=False keeps the group keys as ordinary columns, so the result is a
# flat DataFrame with a default integer index instead of a MultiIndex Series.
(
    sales
    .groupby(by=['Branch', 'Payment'], as_index=False)['Quantity']
    .count()
)

Unnamed: 0,Branch,Payment,Quantity
0,A,Cash,110
1,A,Credit card,104
2,A,Ewallet,126
3,B,Cash,110
4,B,Credit card,109
5,B,Ewallet,113
6,C,Cash,124
7,C,Credit card,98
8,C,Ewallet,106


## Groupby 작동 원리

In [9]:
# groupby returns a DataFrameGroupBy object rather than a table of results.
# It is kept in `by_payment` so the following cells can inspect its groups.
by_payment = sales.groupby("Payment")
# Printing it shows only the object's repr, not any data.
print(by_payment)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe9106a65b0>


In [10]:
# Iterating over a GroupBy object yields (group key, sub-DataFrame) pairs,
# one pair per distinct Payment value.
for payment, frame in by_payment:
    # One print call emits the header, the separator and a two-row preview,
    # each on its own line, followed by a blank line between groups.
    print(
        f"head(2) for {payment!r}",
        "-----",
        frame.head(2)[['Invoice ID', 'Branch', 'City', 'Payment']],
        sep="\n",
        end="\n\n",
    )

head(2) for 'Cash'
-----
     Invoice ID Branch       City Payment
1   226-31-3081      C  Naypyitaw    Cash
11  529-56-3974      B   Mandalay    Cash

head(2) for 'Credit card'
-----
    Invoice ID Branch    City      Payment
2  631-41-3108      A  Yangon  Credit card
8  665-32-9167      A  Yangon  Credit card

head(2) for 'Ewallet'
-----
    Invoice ID Branch    City  Payment
0  750-67-8428      A  Yangon  Ewallet
3  123-19-1176      A  Yangon  Ewallet



In [11]:
# get_group returns the sub-DataFrame holding every row whose group key is 'Cash'.
# NOTE(review): this prints the whole group; add .head() if only a preview is needed.
print(by_payment.get_group('Cash'))

      Invoice ID Branch       City Customer type  Gender  \
1    226-31-3081      C  Naypyitaw        Normal  Female   
11   529-56-3974      B   Mandalay        Member    Male   
14   829-34-3910      A     Yangon        Normal  Female   
15   299-46-1805      B   Mandalay        Member  Female   
26   649-29-6775      B   Mandalay        Normal    Male   
..           ...    ...        ...           ...     ...   
983  148-41-7930      C  Naypyitaw        Normal    Male   
984  189-40-5216      C  Naypyitaw        Normal    Male   
997  727-02-1313      A     Yangon        Member    Male   
998  347-56-2442      A     Yangon        Normal    Male   
999  849-09-3807      A     Yangon        Member  Female   

               Product line  Unit price  Quantity       Date   Time Payment  
1    Electronic accessories       15.28         5   3/8/2019  10:29    Cash  
11   Electronic accessories       25.51         4   3/9/2019  17:03    Cash  
14        Health and beauty       71.38      

In [12]:
# Boolean-mask selection of the same rows: keep only those whose Payment is "Cash".
# The printed rows match what get_group('Cash') returns on the grouped object.
print(sales.loc[sales['Payment'] == "Cash"])

      Invoice ID Branch       City Customer type  Gender  \
1    226-31-3081      C  Naypyitaw        Normal  Female   
11   529-56-3974      B   Mandalay        Member    Male   
14   829-34-3910      A     Yangon        Normal  Female   
15   299-46-1805      B   Mandalay        Member  Female   
26   649-29-6775      B   Mandalay        Normal    Male   
..           ...    ...        ...           ...     ...   
983  148-41-7930      C  Naypyitaw        Normal    Male   
984  189-40-5216      C  Naypyitaw        Normal    Male   
997  727-02-1313      A     Yangon        Member    Male   
998  347-56-2442      A     Yangon        Normal    Male   
999  849-09-3807      A     Yangon        Member  Female   

               Product line  Unit price  Quantity       Date   Time Payment  
1    Electronic accessories       15.28         5   3/8/2019  10:29    Cash  
11   Electronic accessories       25.51         4   3/9/2019  17:03    Cash  
14        Health and beauty       71.38      

In [16]:
# value_counts tallies rows per product line like groupby(...).count(), but
# orders the result by descending frequency instead of by key.
sales['Product line'].value_counts()

Fashion accessories       178
Food and beverages        174
Electronic accessories    170
Sports and travel         166
Home and lifestyle        160
Health and beauty         152
Name: Product line, dtype: int64

In [17]:
# For each payment method, count the rows whose product line contains "Home".
# sort=False keeps the groups in order of first appearance in the data
# rather than sorting them by key.
sales.groupby("Payment", sort=False)['Product line'].apply(
    # str.contains gives a boolean Series; summing it counts the True values.
    lambda product_lines: product_lines.str.contains("Home").sum()
)

Payment
Ewallet        64
Cash           51
Credit card    45
Name: Product line, dtype: int64

## 참조
- https://towardsdatascience.com/pandas-groupby-a-simple-but-detailed-tutorial-314b8f37005d

In [22]:
# Build one row per (Branch, Payment) pair holding the number of sales in that
# pair, then express each count as a percentage of its branch's total.
tbl = sales.groupby(['Branch', 'Payment']).agg(group_total=('Payment', 'count')).reset_index()
# transform broadcasts each branch's total back onto every row of that branch,
# so the result lines up with `tbl` row for row.
# The string alias 'sum' is used instead of the Python builtin `sum`: passing
# the builtin is deprecated in newer pandas releases (FutureWarning), while the
# alias dispatches to the same optimized groupby sum and gives the same values.
tbl['total count in each branch'] = tbl.groupby('Branch')['group_total'].transform('sum')
# Multiply by 100.0 to report the share as a percentage.
tbl['% in each branch'] = (tbl['group_total'] * 100.0 / tbl['total count in each branch'])
tbl

Unnamed: 0,Branch,Payment,group_total,total count in each branch,% in each branch
0,A,Cash,110,340,32.352941
1,A,Credit card,104,340,30.588235
2,A,Ewallet,126,340,37.058824
3,B,Cash,110,332,33.13253
4,B,Credit card,109,332,32.831325
5,B,Ewallet,113,332,34.036145
6,C,Cash,124,328,37.804878
7,C,Credit card,98,328,29.878049
8,C,Ewallet,106,328,32.317073
