In [54]:
import pandas as pd
import numpy as np

In [55]:
"""the groupby().transform() method is used to perform group-specific computations
and return a result that has the same shape as the original DataFrame or Series"""

"""This differs from groupby().apply() which can return an aggregated result,
potentially changing the shape of the data."""
data = pd.read_csv('myntra_dataset_ByScraping.csv')
data.head(10)

Unnamed: 0,brand_name,pants_description,price,MRP,discount_percent,ratings,number_of_ratings
0,WROGN,Men Loose Fit Cotton Jeans,1374.0,2499.0,0.45,4.2,57.0
1,Flying Machine,Men Slim Fit Jeans,1829.0,2999.0,0.39,4.6,5.0
2,Roadster,Men Pure Cotton Jeans,974.0,2499.0,0.61,3.6,1100.0
3,Bene Kleed,Relaxed Fit Denim Jeans,873.0,2299.0,0.62,4.0,4800.0
4,Levis,Men 511 Slim Fit Jeans,1478.0,2899.0,0.49,4.3,264.0
5,HERE&NOW,Men Stretchable Jeans,798.0,1699.0,0.53,4.0,33.0
6,Urbano Fashion,Men Relaxed Fit Jeans,944.0,2099.0,0.55,4.0,4200.0
7,WROGN,Men Anti Fit Jeans,1623.0,2799.0,0.42,4.2,42.0
8,Bene Kleed,Men Wide Leg Heavy Fade Jeans,839.0,2799.0,0.7,3.6,114.0
9,Bene Kleed,Relaxed Fit Denim Jeans,873.0,2299.0,0.62,4.1,5200.0


### Common use cases for `groupby().transform()`
1) Filling missing values within groups using group-specific statistics (e.g., the group mean).
2) Normalizing or standardizing data within groups.
3) Creating new features based on group-level information.

In [56]:
# Build the GroupBy object keyed on brand.
grouped = data.groupby(by='brand_name')
# head() on a GroupBy keeps the first rows (5 by default) of every brand,
# so the output still contains individual rows rather than one row per group.
grouped.head(n=5)

Unnamed: 0,brand_name,pants_description,price,MRP,discount_percent,ratings,number_of_ratings
0,WROGN,Men Loose Fit Cotton Jeans,1374.0,2499.0,0.45,4.2,57.0
1,Flying Machine,Men Slim Fit Jeans,1829.0,2999.0,0.39,4.6,5.0
2,Roadster,Men Pure Cotton Jeans,974.0,2499.0,0.61,3.6,1100.0
3,Bene Kleed,Relaxed Fit Denim Jeans,873.0,2299.0,0.62,4.0,4800.0
4,Levis,Men 511 Slim Fit Jeans,1478.0,2899.0,0.49,4.3,264.0
...,...,...,...,...,...,...,...
51432,ARBIA FUNKI,Men Mid Rise Jeans,1393.0,3399.0,0.59,3.8,22.0
51534,M&H Our Water,Men Slim Tapered Fit Jeans,643.0,2299.0,0.72,4.1,183.0
51584,M&H Our Water,Men Slim Tapered Fit Jeans,643.0,2299.0,0.72,4.1,183.0
51855,RUF & TUF,Men Skinny Fit Jeans,1829.0,2999.0,0.39,4.6,5.0


In [57]:
# Number of distinct brands, i.e. how many groups a groupby on 'brand_name' produces.
data['brand_name'].nunique()

417

In [58]:
# transform('mean') computes the mean price per brand and broadcasts it back
# onto every row of that brand, so the result aligns row-for-row with `data`.
brand_mean_price = grouped['price'].transform('mean')
data['price_Mean'] = brand_mean_price
data['price_Mean'].head(20)

Unnamed: 0,price_Mean
0,1410.67377
1,1813.166442
2,779.847005
3,1004.176829
4,1946.544821
5,1205.267552
6,1033.402878
7,1410.67377
8,1004.176829
9,1004.176829


In [62]:
# Shape of the transformed column: one value per row of `data`, not one per brand.
data['price_Mean'].shape

(52120,)

In [64]:
# Shape of the full frame, for comparison with the length of the price_Mean column.
data.shape

(52120, 8)

In [65]:
# Preview the frame; price_Mean now sits alongside the original columns.
data.head()

Unnamed: 0,brand_name,pants_description,price,MRP,discount_percent,ratings,number_of_ratings,price_Mean
0,WROGN,Men Loose Fit Cotton Jeans,1374.0,2499.0,0.45,4.2,57.0,1410.67377
1,Flying Machine,Men Slim Fit Jeans,1829.0,2999.0,0.39,4.6,5.0,1813.166442
2,Roadster,Men Pure Cotton Jeans,974.0,2499.0,0.61,3.6,1100.0,779.847005
3,Bene Kleed,Relaxed Fit Denim Jeans,873.0,2299.0,0.62,4.0,4800.0,1004.176829
4,Levis,Men 511 Slim Fit Jeans,1478.0,2899.0,0.49,4.3,264.0,1946.544821


In [67]:
# Number of distinct values in the 'ratings' column.
data['ratings'].nunique()

41

In [68]:
"""The groupby().apply() combination in Pandas allows for flexible, custom operations on grouped data,
 where a function is applied to each group as a DataFrame or Series."""

 # Define a custom function to apply to each group
def calculate_custom_metric(group):
    """
    Calculates a custom metric for each group: (sum of Value1) / (max of Value2)
    """
    sum_value1 = group['number_of_ratings'].sum()
    max_value2 = group['ratings'].max()
    if max_value2 == 0:  # Avoid division by zero
        return 0
    return sum_value1 / max_value2

# Group by 'price' and apply the custom function to each group.
# Only the two columns the metric reads are selected, so the grouping column
# is not handed to the function; recent pandas versions emit a deprecation
# warning when apply operates on the grouping columns.
result = (
    data.groupby('price')[['number_of_ratings', 'ratings']]
    .apply(calculate_custom_metric)
)
result.head(20)

  result = data.groupby('price').apply(calculate_custom_metric)


Unnamed: 0_level_0,0
price,Unnamed: 1_level_1
337.0,65.714286
359.0,68.372093
369.0,86.5
376.0,114.186047
377.0,1377.608696
378.0,1481.458333
379.0,2702.608696
380.0,2692.608696
389.0,67.826087
397.0,5.806452


In [12]:
# `result` holds one metric value per group, i.e. per distinct price.
result.shape

(1485,)

In [69]:
"""You can create a Pandas Series or DataFrame column with the category dtype."""
# Convert 'brand_name' column to categorical
data['brand_name'] = data['brand_name'].astype('category')
data['brand_name'].dtype

CategoricalDtype(categories=['7 For All Mankind', '7OUNCE', 'AD By Arvind', 'ADBUCKS',
                  'ADIDAS Originals', 'AFLASH', 'ALCOTT',
                  'ALTINYILDIZ CLASSICS AC Co.', 'ALTIVA',
                  'AMERICAN EAGLE OUTFITTERS',
                  ...
                  'ether', 'fineblu', 'glitchez', 'max', 'me Queen',
                  'mode de base', 'nostrum', 'prochain', 'styzon',
                  'the Misnomer'],
, ordered=False, categories_dtype=object)

In [70]:
# List the distinct categories stored for the categorical 'brand_name' column.
brand_categories = data['brand_name'].cat.categories
print("\nCategories of brand_name", brand_categories)


Categories of brand_name Index(['7 For All Mankind', '7OUNCE', 'AD By Arvind', 'ADBUCKS',
       'ADIDAS Originals', 'AFLASH', 'ALCOTT', 'ALTINYILDIZ CLASSICS AC Co.',
       'ALTIVA', 'AMERICAN EAGLE OUTFITTERS',
       ...
       'ether', 'fineblu', 'glitchez', 'max', 'me Queen', 'mode de base',
       'nostrum', 'prochain', 'styzon', 'the Misnomer'],
      dtype='object', length=417)


In [71]:
# Column dtypes and memory usage; 'brand_name' should now be reported as category.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52120 entries, 0 to 52119
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   brand_name         52120 non-null  category
 1   pants_description  52120 non-null  object  
 2   price              52120 non-null  float64 
 3   MRP                52120 non-null  float64 
 4   discount_percent   52120 non-null  float64 
 5   ratings            52120 non-null  float64 
 6   number_of_ratings  52120 non-null  float64 
 7   price_Mean         52120 non-null  float64 
dtypes: category(1), float64(6), object(1)
memory usage: 2.9+ MB


In [79]:
# add_categories returns a new Series; data['brand_name'] itself is not modified.
new_brand = data['brand_name'].cat.add_categories("Fashor")
# Append one row at the next free label instead of hard-coding 52120, so the
# cell keeps working if the number of rows in the dataset changes.
# Assumes the default 0..n-1 index, where len(new_brand) is the first unused label.
new_brand.loc[len(new_brand)] = 'Fashor'
new_brand.tail()

Unnamed: 0,brand_name
52116,HERE&NOW
52117,Pepe Jeans
52118,Celio
52119,Pepe Jeans
52120,Fashor


In [80]:
# Categories of the original column; 'Fashor' was added to the separate
# new_brand Series, not to data['brand_name'].
data['brand_name'].cat.categories

Index(['7 For All Mankind', '7OUNCE', 'AD By Arvind', 'ADBUCKS',
       'ADIDAS Originals', 'AFLASH', 'ALCOTT', 'ALTINYILDIZ CLASSICS AC Co.',
       'ALTIVA', 'AMERICAN EAGLE OUTFITTERS',
       ...
       'ether', 'fineblu', 'glitchez', 'max', 'me Queen', 'mode de base',
       'nostrum', 'prochain', 'styzon', 'the Misnomer'],
      dtype='object', length=417)

In [84]:
# Relabel a category: every value stored as 'Fashor' is shown as 'Aurelia'
# in the returned Series.
category_renames = {'Fashor': 'Aurelia'}
renamed_brand = new_brand.cat.rename_categories(category_renames)
renamed_brand.tail()

Unnamed: 0,brand_name
52116,HERE&NOW
52117,Pepe Jeans
52118,Celio
52119,Pepe Jeans
52120,Aurelia


In [51]:
# Length of new_brand, which includes the appended 'Fashor' row.
new_brand.shape

(52121,)

In [86]:
# Drop categories that do not occur in the values.
# NOTE(review): every category in new_brand looks to be in use here (those
# inferred from the data plus the appended 'Fashor'), so this probably removes
# nothing — confirm, or demonstrate on a Series that has an unused category.
brands = new_brand.cat.remove_unused_categories()
brands.tail()

Unnamed: 0,brand_name
52116,HERE&NOW
52117,Pepe Jeans
52118,Celio
52119,Pepe Jeans
52120,Fashor


In [87]:
# Number of rows in `brands`, the Series returned by remove_unused_categories().
brands.shape

(52121,)