# Setup

Please refer to the following tutorial for context to the examples below

Reference https://youtu.be/YAvTX-OR8QE

In [1]:
import pandas as pd
import numpy as np

## Helper functions

### get_data()

In [2]:
def get_data(number_of_rows = 200, year=2020, seed=42):
    '''
    Generate reproducible sales product test data for a given year:
    location, product, month, target_sales, target_profit, actual_sales, actual_profit.

    Example
    -------
    df = get_data()


    Parameters
    ----------
    number_of_rows - int - number of records/rows required.

    year - int - sales year to generate data for.

    seed - int - seed for the random number generator; the same seed
        always produces the same data.


    Returns
    -------
    Pandas Dataframe sorted by month, location and product. The row labels
    assigned before sorting are kept as the index.

    '''

    locations = ['London', 'Paris', 'Milan']
    products = ['Tops & Blouses', 'Jeans', 'Footwear', 'Beachwear', 'Sportswear']

    # A private legacy generator yields exactly the same stream as
    # np.random.seed(seed) + np.random.* calls, but leaves numpy's global
    # random state untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)

    # NOTE: the draw order below defines the generated values - do not reorder.
    df = pd.DataFrame({
        'location': rng.choice(locations, size=number_of_rows),
        'product': rng.choice(products, size=number_of_rows),
        'month': rng.choice(range(1, 13), size=number_of_rows),
        'target_sales': rng.randint(14000, 40000, size=number_of_rows),
        # Profit margin: 0%, 2%, ..., 18% of sales (helper column, dropped below).
        'target%_profit': rng.randint(10, size=number_of_rows) * .02
    })

    # Month number -> timestamp of the first day of that month in `year`.
    df['month'] = pd.to_datetime(
        pd.DataFrame({'year': year, 'month': df['month'], 'day': 1}))

    df['target_profit'] = df['target_sales'] * df['target%_profit']

    # One independent draw per row, in row order: actual sales deviate from
    # target by a whole percentage taken from range(-10, 10), i.e. -10 .. +9.
    variance_pct = np.array(
        [rng.choice(range(-10, 10)) for _ in range(number_of_rows)],
        dtype='int64')
    df['actual_sales'] = df['target_sales'] + (df['target_sales'] * variance_pct / 100)

    df['actual_profit'] = (df['actual_sales'] * df['target%_profit']).round(2)

    df = (df.drop(columns=['target%_profit'])
            .sort_values(['month', 'location', 'product']))

    rows, cols = df.shape
    print(f'{rows} rows, {cols} cols.')

    return df

# Test data

In [3]:
# Build the demo dataset; the generator's default arguments are spelled out
# so the settings behind the figures below are visible in the cell.
df = get_data(number_of_rows=200, year=2020, seed=42)
df.head(n=3)

200 rows, 7 cols.


Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit
4,London,Beachwear,2020-01-01,31749,1904.94,29209.08,1752.54
125,London,Beachwear,2020-01-01,37833,6053.28,34049.7,5447.95
21,London,Jeans,2020-01-01,29485,4127.9,31548.95,4416.85


In [4]:
# numeric_only=True restricts the totals to the numeric columns. Older pandas
# silently dropped 'product' (str) and 'month' (datetime); pandas 2.x no longer
# does and raises a TypeError when asked to sum the datetime column.
df.groupby('location').sum(numeric_only=True)

Unnamed: 0_level_0,target_sales,target_profit,actual_sales,actual_profit
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
London,1719825,165248.62,1698673.45,162687.57
Milan,2035834,169661.34,2019589.41,168178.4
Paris,1668056,152566.46,1657943.47,151267.95


## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=30s">00:30 Group by single column and summing all columns or selected columns</a>

In [5]:
# Total actual profit per location (one column selected -> Series result).
actual_profit_by_location = df.groupby('location')['actual_profit']
actual_profit_by_location.sum()

location
London    162687.57
Milan     168178.40
Paris     151267.95
Name: actual_profit, dtype: float64

## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=49s">00:49 Group by single column and sum  selected columns</a>

Note on https://youtu.be/YAvTX-OR8QE?t=67: you __CAN__ call sum() at the end of a multiple-column selection, as the next cell shows.

In [6]:
# A list of column names selects several columns -> DataFrame result.
profit_columns = ['target_profit', 'actual_profit']
df.groupby('location')[profit_columns].sum()

Unnamed: 0_level_0,target_profit,actual_profit
location,Unnamed: 1_level_1,Unnamed: 2_level_1
London,165248.62,162687.57
Milan,169661.34,168178.4
Paris,152566.46,151267.95


## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=75s">01:15 Sum across columns - aggregate (sum) the columns of each row (axis=1)</a>

In [7]:
# axis=1 (same as axis='columns') adds the selected columns together row by row.
df.loc[:, ['target_profit', 'actual_profit']].sum(axis=1)

4       3657.48
125    11501.23
21      8544.75
148     7842.52
99      9013.94
         ...   
188     4493.41
33      2687.66
72      6472.27
85      9658.52
163     4227.60
Length: 200, dtype: float64

### apply() sum() across columns

In [8]:
# Row totals of the numeric columns, reported per location group.
# The numeric columns are named explicitly and the row-wise sum is written as a
# lambda: passing the builtin `sum` together with axis= only worked through
# pandas' implicit builtin -> numpy substitution and its silent dropping of
# non-numeric columns, neither of which newer pandas versions provide.
numeric_columns = ['target_sales', 'target_profit', 'actual_sales', 'actual_profit']
(
    df.groupby('location', group_keys=True)[numeric_columns]
    .apply(lambda group: group.sum(axis='columns'))
    .to_frame(name='Total')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
location,Unnamed: 1_level_1,Unnamed: 2_level_1
London,4,64615.56
London,125,83383.93
London,21,69578.70
London,148,86267.68
London,99,65351.06
...,...,...
Paris,188,60661.09
Paris,33,47482.06
Paris,72,52702.75
Paris,85,63316.98


### transform()

reference -> https://pbpython.com/pandas_transform.html

Calculate the 'contribution' (__percentage ratio__) of each location to the total target profit

In [9]:
# Each location's share (in %) of the overall target profit.
overall_target_profit = df['target_profit'].sum()
gx = (
    df.groupby(['location'])['target_profit'].sum()
    .mul(100)
    .div(overall_target_profit)
    .round(2)
)
gx

location
London    33.9
Milan     34.8
Paris     31.3
Name: target_profit, dtype: float64

Make this ratio available at the __'detail'__ (row) level, so that every row carries its location's percentage

In [10]:
# transform('sum') computes each location's total and broadcasts it back onto
# every row of that location, so the result lines up with df row for row.
# The built-in 'sum' replaces the Python-level lambda, and selecting a single
# column gives a Series that can be assigned to the new column directly.
group_target_profit = df.groupby('location')['target_profit'].transform('sum')
df['group_target_profit%'] = (group_target_profit * 100 / df['target_profit'].sum()).round(2)
df.head(10)

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%
4,London,Beachwear,2020-01-01,31749,1904.94,29209.08,1752.54,33.9
125,London,Beachwear,2020-01-01,37833,6053.28,34049.7,5447.95,33.9
21,London,Jeans,2020-01-01,29485,4127.9,31548.95,4416.85,33.9
148,London,Jeans,2020-01-01,37524,3752.4,40901.16,4090.12,33.9
99,London,Sportswear,2020-01-01,27216,4354.56,29121.12,4659.38,33.9
81,London,Tops & Blouses,2020-01-01,37959,4555.08,40995.72,4919.49,33.9
87,London,Tops & Blouses,2020-01-01,17719,708.76,17719.0,708.76,33.9
27,Milan,Footwear,2020-01-01,17709,708.36,19125.72,765.03,34.8
41,Milan,Footwear,2020-01-01,15531,2795.58,16773.48,3019.23,34.8
80,Milan,Footwear,2020-01-01,38933,1557.32,42047.64,1681.91,34.8


## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=222s">03:42 No index - Dataframe to_string(index=False)</a>

In [11]:
# as_index=False keeps 'location' as an ordinary column instead of the index.
# numeric_only=True keeps the totals to numeric columns (pandas 2.x would
# otherwise fail on the datetime 'month' column). The grouped totals are
# computed once and reused for both renderings.
location_totals = df.groupby('location', as_index=False).sum(numeric_only=True)
display(location_totals)

# Plain-text rendering; pass index=False to to_string() to also omit the
# 0/1/2 row labels from the printed table.
dd = location_totals.to_string()
print(dd)

Unnamed: 0,location,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%
0,London,1719825,165248.62,1698673.45,162687.57,2237.4
1,Milan,2035834,169661.34,2019589.41,168178.4,2540.4
2,Paris,1668056,152566.46,1657943.47,151267.95,1909.3


  location  target_sales  target_profit  actual_sales  actual_profit  group_target_profit%
0   London       1719825      165248.62    1698673.45      162687.57                2237.4
1    Milan       2035834      169661.34    2019589.41      168178.40                2540.4
2    Paris       1668056      152566.46    1657943.47      151267.95                1909.3


## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=248s">04:08 Group selecting values in the grouping - Groupby.get_group()</a>

In [12]:
# get_group() returns the rows belonging to a single group key.
by_location = df.groupby('location')
by_location.get_group('Paris').head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%
64,Paris,Beachwear,2020-01-01,19645,0.0,19448.55,0.0,31.3
129,Paris,Beachwear,2020-01-01,20938,1675.04,21775.52,1742.04,31.3
71,Paris,Footwear,2020-01-01,33087,1985.22,36064.83,2163.89,31.3
142,Paris,Footwear,2020-01-01,32384,1943.04,29469.44,1768.17,31.3
88,Paris,Sportswear,2020-01-01,33129,1325.16,32797.71,1311.91,31.3


### alternative => df.loc

In [13]:
# Same rows selected with a boolean mask.
is_paris = df['location'] == 'Paris'
df.loc[is_paris].head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%
64,Paris,Beachwear,2020-01-01,19645,0.0,19448.55,0.0,31.3
129,Paris,Beachwear,2020-01-01,20938,1675.04,21775.52,1742.04,31.3
71,Paris,Footwear,2020-01-01,33087,1985.22,36064.83,2163.89,31.3
142,Paris,Footwear,2020-01-01,32384,1943.04,29469.44,1768.17,31.3
88,Paris,Sportswear,2020-01-01,33129,1325.16,32797.71,1311.91,31.3


### alternative => df.query

In [14]:
# Same rows selected with a query expression.
paris_rows = df.query("location == 'Paris'")
paris_rows.head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%
64,Paris,Beachwear,2020-01-01,19645,0.0,19448.55,0.0,31.3
129,Paris,Beachwear,2020-01-01,20938,1675.04,21775.52,1742.04,31.3
71,Paris,Footwear,2020-01-01,33087,1985.22,36064.83,2163.89,31.3
142,Paris,Footwear,2020-01-01,32384,1943.04,29469.44,1768.17,31.3
88,Paris,Sportswear,2020-01-01,33129,1325.16,32797.71,1311.91,31.3


## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=273s">04:33 Group by multiple columns selecting values in each group - Groupby.get_group() tuple</a>

In [15]:
# With several grouping columns the group key is a tuple, one value per column.
by_location_product = df.groupby(['location', 'product'])
by_location_product.get_group(('Paris', 'Jeans')).head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%
162,Paris,Jeans,2020-03-01,38323,3832.3,40622.38,4062.24,31.3
143,Paris,Jeans,2020-04-01,23860,3340.4,25291.6,3540.82,31.3
66,Paris,Jeans,2020-05-01,33968,2038.08,32269.6,1936.18,31.3
111,Paris,Jeans,2020-05-01,18804,2632.56,18239.88,2553.58,31.3
14,Paris,Jeans,2020-07-01,15810,948.6,14861.4,891.68,31.3


In [16]:
# Equivalent selection with two boolean masks combined with &.
in_paris = df['location'] == 'Paris'
is_jeans = df['product'] == 'Jeans'
df.loc[in_paris & is_jeans].head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%
162,Paris,Jeans,2020-03-01,38323,3832.3,40622.38,4062.24,31.3
143,Paris,Jeans,2020-04-01,23860,3340.4,25291.6,3540.82,31.3
66,Paris,Jeans,2020-05-01,33968,2038.08,32269.6,1936.18,31.3
111,Paris,Jeans,2020-05-01,18804,2632.56,18239.88,2553.58,31.3
14,Paris,Jeans,2020-07-01,15810,948.6,14861.4,891.68,31.3


### iterating keys and values => GroupBy.groups

In [17]:
# .groups maps each group key to the index labels of the rows in that group.
location_product_groups = df.groupby(['location', 'product']).groups

for keys, values in location_product_groups.items():
    print(f'keys:{keys}\n')
    print(f'values: {values}')
    break  # show the first group only

keys:('London', 'Beachwear')

values: Int64Index([4, 125, 95, 165, 168, 40, 47, 199, 5, 25, 114, 192, 120, 193, 83,
            184],
           dtype='int64')


## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=330s">05:30 Add new columns to data frame and then group by the new column</a>

In [18]:
# Shortfall (positive) or surplus (negative) of actual profit versus target.
df['difference'] = df['target_profit'].sub(df['actual_profit'])
df.head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%,difference
4,London,Beachwear,2020-01-01,31749,1904.94,29209.08,1752.54,33.9,152.4
125,London,Beachwear,2020-01-01,37833,6053.28,34049.7,5447.95,33.9,605.33
21,London,Jeans,2020-01-01,29485,4127.9,31548.95,4416.85,33.9,-288.95
148,London,Jeans,2020-01-01,37524,3752.4,40901.16,4090.12,33.9,-337.72
99,London,Sportswear,2020-01-01,27216,4354.56,29121.12,4659.38,33.9,-304.82


### alternative using df.assign()

In [19]:
# assign() returns a new frame with the extra column; the lambda receives
# the frame being assigned to.
df = df.assign(
    difference2=lambda frame: frame['target_profit'] - frame['actual_profit']
)
df.head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%,difference,difference2
4,London,Beachwear,2020-01-01,31749,1904.94,29209.08,1752.54,33.9,152.4,152.4
125,London,Beachwear,2020-01-01,37833,6053.28,34049.7,5447.95,33.9,605.33,605.33
21,London,Jeans,2020-01-01,29485,4127.9,31548.95,4416.85,33.9,-288.95,-288.95
148,London,Jeans,2020-01-01,37524,3752.4,40901.16,4090.12,33.9,-337.72,-337.72
99,London,Sportswear,2020-01-01,27216,4354.56,29121.12,4659.38,33.9,-304.82,-304.82


## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=352s">05:52 How to create and groupby a new column by applying a function - dataframe.apply(functionname)</a>

In [20]:
def my_target(value, threshold=20000):
    '''
    Label a sales value relative to a threshold.

    Parameters
    ----------
    value - number - the sales value to classify.

    threshold - number - cut-off the value is compared with (default 20000).

    Returns
    -------
    str - 'over <threshold>' when value >= threshold (a value equal to the
    threshold counts as 'over'), otherwise 'under <threshold>'.
    '''

    if value >= threshold:
        return f'over {threshold}'

    return f'under {threshold}'

In [21]:
# apply() calls my_target once for every value in the column.
actual_sales = df['actual_sales']
df['target_achieved'] = actual_sales.apply(my_target)
df.head(n=5)

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%,difference,difference2,target_achieved
4,London,Beachwear,2020-01-01,31749,1904.94,29209.08,1752.54,33.9,152.4,152.4,over 20000
125,London,Beachwear,2020-01-01,37833,6053.28,34049.7,5447.95,33.9,605.33,605.33,over 20000
21,London,Jeans,2020-01-01,29485,4127.9,31548.95,4416.85,33.9,-288.95,-288.95,over 20000
148,London,Jeans,2020-01-01,37524,3752.4,40901.16,4090.12,33.9,-337.72,-337.72,over 20000
99,London,Sportswear,2020-01-01,27216,4354.56,29121.12,4659.38,33.9,-304.82,-304.82,over 20000


### alternative using df.loc

In [22]:
# Label the rows through two boolean masks instead of a per-row function.
reached_20000 = df['actual_sales'] >= 20000
below_20000 = df['actual_sales'] < 20000

df.loc[reached_20000, 'target_achieved(loc)'] = 'over 20000'
df.loc[below_20000, 'target_achieved(loc)'] = 'under 20000'
df.head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%,difference,difference2,target_achieved,target_achieved(loc)
4,London,Beachwear,2020-01-01,31749,1904.94,29209.08,1752.54,33.9,152.4,152.4,over 20000,over 20000
125,London,Beachwear,2020-01-01,37833,6053.28,34049.7,5447.95,33.9,605.33,605.33,over 20000,over 20000
21,London,Jeans,2020-01-01,29485,4127.9,31548.95,4416.85,33.9,-288.95,-288.95,over 20000,over 20000
148,London,Jeans,2020-01-01,37524,3752.4,40901.16,4090.12,33.9,-337.72,-337.72,over 20000,over 20000
99,London,Sportswear,2020-01-01,27216,4354.56,29121.12,4659.38,33.9,-304.82,-304.82,over 20000,over 20000


### alternative using Series.apply() with a lambda

In [23]:
# Same labelling as my_target, written inline as a lambda.
df['target_achieved_assign'] = df['actual_sales'].apply(
    lambda sales: 'over 20000' if sales >= 20000 else 'under 20000'
)
df.head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%,difference,difference2,target_achieved,target_achieved(loc),target_achieved_assign
4,London,Beachwear,2020-01-01,31749,1904.94,29209.08,1752.54,33.9,152.4,152.4,over 20000,over 20000,over 20000
125,London,Beachwear,2020-01-01,37833,6053.28,34049.7,5447.95,33.9,605.33,605.33,over 20000,over 20000,over 20000
21,London,Jeans,2020-01-01,29485,4127.9,31548.95,4416.85,33.9,-288.95,-288.95,over 20000,over 20000,over 20000
148,London,Jeans,2020-01-01,37524,3752.4,40901.16,4090.12,33.9,-337.72,-337.72,over 20000,over 20000,over 20000
99,London,Sportswear,2020-01-01,27216,4354.56,29121.12,4659.38,33.9,-304.82,-304.82,over 20000,over 20000,over 20000


## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=460s">07:40 Groupby bin size - how to use pandas.cut (pd.cut)</a>

In [24]:
# Remove the 'target_achieved' columns from the previous section by name.
# Dropping by name (rather than "the last three columns") keeps the cell safe
# to re-run: a second run finds nothing left to drop instead of removing three
# more columns. drop() also returns a new frame, so the assignment below does
# not write into a slice of the old one.
df = df.drop(
    columns=['target_achieved', 'target_achieved(loc)', 'target_achieved_assign'],
    errors='ignore',
)
# bins=4 splits the range of actual_profit into four equal-width intervals.
df['bin category'] = pd.cut(x=df['actual_profit'], bins=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
df.head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%,difference,difference2,bin category
4,London,Beachwear,2020-01-01,31749,1904.94,29209.08,1752.54,33.9,152.4,152.4,Q2
125,London,Beachwear,2020-01-01,37833,6053.28,34049.7,5447.95,33.9,605.33,605.33,Q4
21,London,Jeans,2020-01-01,29485,4127.9,31548.95,4416.85,33.9,-288.95,-288.95,Q3
148,London,Jeans,2020-01-01,37524,3752.4,40901.16,4090.12,33.9,-337.72,-337.72,Q3
99,London,Sportswear,2020-01-01,27216,4354.56,29121.12,4659.38,33.9,-304.82,-304.82,Q3


In [25]:
# 'bin category' is categorical: observed=False states explicitly that every
# category is reported (even one with no rows), rather than relying on the
# default, which newer pandas versions warn about and change.
df.groupby('bin category', observed=False).size()

bin category
Q1    79
Q2    63
Q3    45
Q4    13
dtype: int64

In [26]:
# Explicit bin edges; right=False makes every interval closed on the left,
# e.g. [0, 3000) contains 0 but not 3000.
profit_bin_edges = [0, 3000, 6000, 9000, 12000]
df['bin category2'] = pd.cut(x=df['actual_profit'], bins=profit_bin_edges, right=False)
df.head()

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%,difference,difference2,bin category,bin category2
4,London,Beachwear,2020-01-01,31749,1904.94,29209.08,1752.54,33.9,152.4,152.4,Q2,"[0, 3000)"
125,London,Beachwear,2020-01-01,37833,6053.28,34049.7,5447.95,33.9,605.33,605.33,Q4,"[3000, 6000)"
21,London,Jeans,2020-01-01,29485,4127.9,31548.95,4416.85,33.9,-288.95,-288.95,Q3,"[3000, 6000)"
148,London,Jeans,2020-01-01,37524,3752.4,40901.16,4090.12,33.9,-337.72,-337.72,Q3,"[3000, 6000)"
99,London,Sportswear,2020-01-01,27216,4354.56,29121.12,4659.38,33.9,-304.82,-304.82,Q3,"[3000, 6000)"


In [27]:
# observed=False keeps empty bins in the result (the zero-count interval shown
# below); it is passed explicitly because newer pandas versions warn about and
# change the default for categorical groupers.
df.groupby('bin category2', observed=False).size()

bin category2
[0, 3000)        127
[3000, 6000)      68
[6000, 9000)       5
[9000, 12000)      0
dtype: int64

## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=573s">09:33 How to group by a date column - how to group by year and also by quarter - pandas.grouper(key,freq)</a>

Frequency aliases: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases

In [28]:
# Group the 'month' column into quarters of a year ending in December; each
# group is labelled with the quarter-end date.
# NOTE(review): pandas 2.2+ deprecates the 'Q-DEC' alias in favour of 'QE-DEC';
# kept as-is for the pandas version this notebook was run with.
grouper = pd.Grouper(key='month', freq='Q-DEC')

(
    df.groupby([grouper, 'location'])['actual_profit']
    .size()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,actual_profit
month,location,Unnamed: 2_level_1
2020-03-31,London,16
2020-03-31,Milan,21
2020-03-31,Paris,15
2020-06-30,London,16
2020-06-30,Milan,14
2020-06-30,Paris,15
2020-09-30,London,19
2020-09-30,Milan,15
2020-09-30,Paris,12
2020-12-31,London,15


### Group by using multi-index (including pd.Grouper object)

In [29]:
# Group the 'month' column by year; each group is labelled with the year-end date.
# NOTE(review): pandas 2.2+ deprecates the 'A' alias in favour of 'YE';
# kept as-is for the pandas version this notebook was run with.
grouper = pd.Grouper(key='month', freq='A')

(
    df.groupby([grouper, 'location'])['actual_profit']
    .sum()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,actual_profit
month,location,Unnamed: 2_level_1
2020-12-31,London,162687.57
2020-12-31,Milan,168178.4
2020-12-31,Paris,151267.95


## <a href="https://www.youtube.com/watch?v=YAvTX-OR8QE&t=660s">11:00 Group dataframe and filter groups according to group values - Groupby.filter</a>

The cell below extracts the detail rows of every __location/product__ group whose __mean actual sales is greater than 20000__

In [30]:
# filter() keeps or discards whole groups: every row of a location/product
# group is returned when the predicate is True for that group.
by_location_product = df.groupby(['location', 'product'])
by_location_product.filter(
    lambda group: group['actual_sales'].mean() > 20000
).head(10)

Unnamed: 0,location,product,month,target_sales,target_profit,actual_sales,actual_profit,group_target_profit%,difference,difference2,bin category,bin category2
4,London,Beachwear,2020-01-01,31749,1904.94,29209.08,1752.54,33.9,152.4,152.4,Q2,"[0, 3000)"
125,London,Beachwear,2020-01-01,37833,6053.28,34049.7,5447.95,33.9,605.33,605.33,Q4,"[3000, 6000)"
21,London,Jeans,2020-01-01,29485,4127.9,31548.95,4416.85,33.9,-288.95,-288.95,Q3,"[3000, 6000)"
148,London,Jeans,2020-01-01,37524,3752.4,40901.16,4090.12,33.9,-337.72,-337.72,Q3,"[3000, 6000)"
99,London,Sportswear,2020-01-01,27216,4354.56,29121.12,4659.38,33.9,-304.82,-304.82,Q3,"[3000, 6000)"
81,London,Tops & Blouses,2020-01-01,37959,4555.08,40995.72,4919.49,33.9,-364.41,-364.41,Q3,"[3000, 6000)"
87,London,Tops & Blouses,2020-01-01,17719,708.76,17719.0,708.76,33.9,0.0,0.0,Q1,"[0, 3000)"
27,Milan,Footwear,2020-01-01,17709,708.36,19125.72,765.03,34.8,-56.67,-56.67,Q1,"[0, 3000)"
41,Milan,Footwear,2020-01-01,15531,2795.58,16773.48,3019.23,34.8,-223.65,-223.65,Q2,"[3000, 6000)"
80,Milan,Footwear,2020-01-01,38933,1557.32,42047.64,1681.91,34.8,-124.59,-124.59,Q1,"[0, 3000)"
