In [52]:
import pandas as pd
import numpy as np

## idxmax()

In [53]:
test_df = pd.DataFrame(
    {
        'A': [1,2,3],
        'B': [4,5,6],
    },
    index=['a','b','c']
)
test_df

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [54]:
a_max_idx = test_df['A'].idxmax()
a_max_idx

'c'

In [55]:
test_df.loc[a_max_idx]

A    3
B    6
Name: c, dtype: int64

## Groupby example

In [56]:
# Sample DataFrame
data = {
    'Category': ['A', 'A', 'B', 'B', 'C', 'C'],
    'Values': [1, 2, 3, 4, 5, 6]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Category,Values
0,A,1
1,A,2
2,B,3
3,B,4
4,C,5
5,C,6


### TASK: find the max value for each Category:

In [None]:
### Solution without groupby (DO NOT DO THAT!!!)
# get df for 'Category'=='A'
df_A = df[df['Category']=='A']
# get df for 'Category'=='B'
df_B = df[df['Category']=='B']
# get df for 'Category'=='C'
df_C = df[df['Category']=='C']

print( df_A['Values'].max() )
print( df_B['Values'].max() )
print( df_C['Values'].max() )

2
4
6


In [79]:
### Solution with group by (BEST VARIANT)
grouped = df.groupby('Category')
grouped['Values'].max().values

array([2, 4, 6])

### Groupby Object methods

In [58]:
a_df = grouped.get_group('A')
a_df

Unnamed: 0,Category,Values
0,A,1
1,A,2


In [59]:
grouped.size()

Category
A    2
B    2
C    2
dtype: int64

In [60]:
grouped.count()

Unnamed: 0_level_0,Values
Category,Unnamed: 1_level_1
A,2
B,2
C,2


In [61]:
grouped.std()

Unnamed: 0_level_0,Values
Category,Unnamed: 1_level_1
A,0.707107
B,0.707107
C,0.707107


In [62]:
grouped.describe()

Unnamed: 0_level_0,Values,Values,Values,Values,Values,Values,Values,Values
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0
B,2.0,3.5,0.707107,3.0,3.25,3.5,3.75,4.0
C,2.0,5.5,0.707107,5.0,5.25,5.5,5.75,6.0


In [63]:
grouped.first()

Unnamed: 0_level_0,Values
Category,Unnamed: 1_level_1
A,1
B,3
C,5


## Example 1: Sales Data

Scenario: You have a dataset of sales records for a store, including the date, product category, and sales amount. 

You want to analyze the total sales by category and month.

In [64]:
# Generate sample sales data

rng = np.random.default_rng(seed=42)

dates = pd.date_range('2023-01-01', periods=100, freq='D')
categories = ['Electronics', 'Clothing', 'Furniture']

sales_data = {
    'Date': rng.choice(dates, 100),
    'Category': rng.choice(categories, 100),
    'Sales_Amount': rng.uniform(20, 500, 100)
}
df_sales = pd.DataFrame(sales_data)
df_sales.head()

Unnamed: 0,Date,Category,Sales_Amount
0,2023-01-09,Furniture,456.118732
1,2023-03-19,Electronics,355.859424
2,2023-03-07,Furniture,147.617582
3,2023-02-13,Electronics,485.204661
4,2023-02-13,Furniture,393.800434


In [92]:
### create Month column from each Date
df_sales['Month'] = df_sales['Date'].dt.to_period('M')
df_sales.head()

Unnamed: 0,Date,Category,Sales_Amount,Month
0,2023-01-09,Furniture,456.118732,2023-01
1,2023-03-19,Electronics,355.859424,2023-03
2,2023-03-07,Furniture,147.617582,2023-03
3,2023-02-13,Electronics,485.204661,2023-02
4,2023-02-13,Furniture,393.800434,2023-02


In [93]:
#  calculate total sales for each category
grouped_sales = df_sales.groupby('Category')
grouped_sales['Sales_Amount'].sum()

Category
Clothing       9155.499867
Electronics    9179.056372
Furniture      7714.434980
Name: Sales_Amount, dtype: float64

In [66]:
# find max sale in all data - variant 1:
max_sale = df_sales['Sales_Amount'].max()
df_sales[df_sales['Sales_Amount']==max_sale]

Unnamed: 0,Date,Category,Sales_Amount
77,2023-01-23,Electronics,496.340271


In [67]:
max_sale_idx = df_sales['Sales_Amount'].idxmax()
df_sales.loc[max_sale_idx]

Date            2023-01-23 00:00:00
Category                Electronics
Sales_Amount             496.340271
Name: 77, dtype: object

In [84]:
# find the name of categories with max total sales
grouped_sales['Sales_Amount'].sum().idxmax()


'Electronics'

In [97]:
###  total sales by category and month
grouped_sales_by_cat_month = df_sales.groupby(['Category','Month'])
grouped_sales_by_cat_month['Sales_Amount'].sum()

Category     Month  
Clothing     2023-01    2414.752033
             2023-02    2269.664878
             2023-03    4213.537377
             2023-04     257.545580
Electronics  2023-01    2214.326785
             2023-02    3561.797767
             2023-03    2984.632743
             2023-04     418.299077
Furniture    2023-01    2284.188591
             2023-02    2300.474294
             2023-03    2345.193312
             2023-04     784.578783
Name: Sales_Amount, dtype: float64

## Example 2: Customer Transactions
Scenario: You have a dataset of customer transactions, including customer ID, transaction date, and transaction amount. You want to perform the following tasks:

1. Calculate the total transaction amount for each customer.
1. Find the average transaction amount per customer.
1. Identify customers with transactions above a certain threshold.
1. Analyze monthly transactions for each customer.

In [98]:
# Generate sample customer transactions data
np.random.seed(42)
customer_ids = np.arange(1, 11)
dates = pd.date_range('2023-01-01', periods=100, freq='D')
transaction_data = {
    'Customer_ID': np.random.choice(customer_ids, 100),
    'Transaction_Date': np.random.choice(dates, 100),
    'Transaction_Amount': np.random.uniform(10, 1000, 100)
}
df_transactions = pd.DataFrame(transaction_data)
df_transactions

Unnamed: 0,Customer_ID,Transaction_Date,Transaction_Amount
0,7,2023-01-12,285.860000
1,4,2023-02-03,909.183227
2,8,2023-02-02,247.166272
3,5,2023-02-17,153.445923
4,7,2023-01-23,494.558233
...,...,...,...
95,10,2023-01-02,25.302050
96,9,2023-01-02,929.035377
97,7,2023-04-02,433.902307
98,9,2023-02-23,966.988271


### Calculate the total transaction amount for each customer.

In [None]:
df_transactions.groupby('Customer_ID')['Transaction_Amount'].sum()

Customer_ID
1     4184.441341
2     3593.600771
3     5195.621393
4     4958.270198
5     3544.582543
6     2582.612421
7     6638.668663
8     7900.080139
9     7658.588096
10    5328.930893
Name: Transaction_Amount, dtype: float64

### Find the average transaction amount per customer.

In [103]:
df_transactions.groupby('Customer_ID')['Transaction_Amount'].mean()

Customer_ID
1     597.777334
2     359.360077
3     577.291266
4     550.918911
5     354.458254
6     430.435403
7     603.515333
8     526.672009
9     638.215675
10    484.448263
Name: Transaction_Amount, dtype: float64

### Identify customers with transactions above a average of all Transaction_Amount.

In [106]:
# average of all Transaction_Amount.
transaction_amount_mean = df_transactions['Transaction_Amount'].mean()
transaction_amount_mean

515.8539645793327

In [111]:
transactions_above_mean =  df_transactions.loc[ df_transactions['Transaction_Amount']>transaction_amount_mean ]
transactions_above_mean['Customer_ID'].unique()

array([ 4, 10,  7,  8,  3,  6,  2,  5,  1,  9])

## Example 3: Employee salaries per department
Scenario: You have a DataFrame with employee data (id, gender, department, salary)

You task is to answer next questions:

1. Find the employee with max salaray
1. How many males and females works in each departments
1. What is the average salary for males and females per each department.
1. In which department the male/female have higher average salary?

In [112]:
# Generate sample data:
data_rows = 10

# Define possible values for each column
employee_ids = range(1, data_rows+1)
genders = rng.choice(['Male', 'Female'], size=data_rows)
departments = rng.choice(['HR', 'IT', 'Finance', 'Marketing'], size=data_rows)
salaries = np.random.uniform(30000, 120000, size=data_rows)

# Create the DataFrame
employee_data = pd.DataFrame({
    'employee_id': employee_ids,
    'gender': genders,
    'department': departments,
    'salary': salaries
})
employee_data

Unnamed: 0,employee_id,gender,department,salary
0,1,Female,HR,106770.850992
1,2,Female,HR,56500.400286
2,3,Male,IT,64658.795574
3,4,Female,HR,106602.300437
4,5,Male,HR,58522.980464
5,6,Female,Finance,45254.347202
6,7,Male,Marketing,80112.113621
7,8,Male,HR,114253.929674
8,9,Female,IT,92642.681701
9,10,Male,Finance,81305.505308


### Find the employee with max salaray

In [117]:
max_salalry_idx = employee_data['salary'].idxmax()
max_salalry_idx

employee_data.loc[max_salalry_idx,'employee_id']

8

### How many males and females works in each departments

In [121]:
employee_data.groupby(['department','gender']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,employee_id,salary
department,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Finance,Female,1,1
Finance,Male,1,1
HR,Female,3,3
HR,Male,2,2
IT,Female,1,1
IT,Male,1,1
Marketing,Male,1,1


In [122]:
employee_data.groupby(['department','gender']).size()

department  gender
Finance     Female    1
            Male      1
HR          Female    3
            Male      2
IT          Female    1
            Male      1
Marketing   Male      1
dtype: int64

### What is the average salary for males and females per each department.

In [125]:
average_salary_by_gender =  employee_data.groupby(['department','gender'])['salary'].mean()
average_salary_by_gender

department  gender
Finance     Female    45254.347202
            Male      81305.505308
HR          Female    89957.850572
            Male      86388.455069
IT          Female    92642.681701
            Male      64658.795574
Marketing   Male      80112.113621
Name: salary, dtype: float64

### In which department the male/female have higher average salary?

In [138]:
average_salary_by_gender_df = average_salary_by_gender.reset_index()
average_salary_by_gender_df

Unnamed: 0,department,gender,salary
0,Finance,Female,45254.347202
1,Finance,Male,81305.505308
2,HR,Female,89957.850572
3,HR,Male,86388.455069
4,IT,Female,92642.681701
5,IT,Male,64658.795574
6,Marketing,Male,80112.113621


In [136]:
# average_salary_by_gender_df.groupby('gender')['salary'].idxmax()

gender
Female    4
Male      3
Name: salary, dtype: int64

In [137]:
average_salary_by_gender.unstack()

gender,Female,Male
department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,45254.347202,81305.505308
HR,89957.850572,86388.455069
IT,92642.681701,64658.795574
Marketing,,80112.113621


In [140]:
### In which department the male/female have higher average salary?

average_salary_by_gender.apply

department  gender
Finance     Female    45254.347202
            Male      81305.505308
HR          Female    89957.850572
            Male      86388.455069
IT          Female    92642.681701
            Male      64658.795574
Marketing   Male      80112.113621
Name: salary, dtype: float64