In [1]:
#Updates/Tasks to make Code Neater

#1. Make code calculate annualized GMV given a general N-month solution,
#rather than the hard-coded 3 months we have. 

#2. Find smart/efficient way to calculate product concentration
#i.e. %sales of a brand from top 3 products

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import math 

# Methodology

There are 3 main parts to our code below. 
    - First, we create a filtered subset of the products_daily data and filter out brands from there. 
    - Second, from this intially filtered list of brands, we look at the ENTIRE products_daily and check for other stuff, filtering out more brands still. 
    - Lastly, we have our final list of brands and want to give more color to the brand's performance by calculating some extra info. 

# Inputs

In [4]:
#Input the values you would like to filter by:

#Filter 1: Categories
(cat_1_on, cat_2_on, cat_3_on) = True, False, False
cat_lst_1 = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]
cat_lst_2 = []
cat_lst_3 = []

#Filter 2: Times (Update-status, how established product is)
string = "1/1/2021"
element = datetime.strptime(string,"%d/%m/%Y") 
unix_time_now = datetime.timestamp(element) 

#unix_time_now = 1609682748 #Change based on most recent month

established_on = False
min_years = 1

#Filter 3: Annual Brand GMV
min_gmv_on = True
min_gmv = 70000

#Filter 4: Brand Avg Price 
min_unit_price_on = True
min_unit_price = 300

#Filter 5: GMV Concentration
min_gmv_conc_on = False
top_n = 3 #concetration num of products 
min_concentration = 0.2

#Filter 6: Avg Product Rating
min_avg_star_on = True
min_avg_star = 4.5

#Filter 7: Bad Customer Ratings 
max_bad_rating_on = True
max_bad_rating = 0.5

#Filter 8: Brand TOTAL GMV 
min_total_brand_gmv_on = True
min_total_brand_gmv = 2000000 

# Import Data

In [5]:
products_daily = pd.read_csv("products_daily_1609850907.csv")

In [6]:
#Change data types in dataframes to conserve memory:
def convert_to_cats(df, to_convert):
    '''
    Converts cols in to_convert_list to categories. 
    
    Inputs:
        df: a DataFrame
        to_convert: a list of column names (as strings)
                    we'd like to convert
    Returns:
        Nothing; modifies dataframe in place
    '''
    for col in df.columns:
        if col in to_convert:
            df[col] = df[col].astype("category")
            
to_convert_products_daily = ["added_at", 'category_one', 'category_two',
                             'category_three', 'category_one_en', 
                             'category_two_en', 'category_three_en', 
                             'category_one_th', 'category_two_th',
                             'category_three_th']

convert_to_cats(products_daily, to_convert_products_daily)

def downcast_numbers(df):
    '''
    Downcasts floats and ints.
    
    Inputs:
        df: a DataFrame objecet. 
    Returns:
        Nothing; modifies list in place
    '''
    for col in df.columns:
        if df[col].dtype == "float":
            df[col] = pd.to_numeric(df[col], downcast="float")
        if df[col].dtype == "int":
            df[col] = pd.to_numeric(df[col], downcast="unsigned")
            
downcast_numbers(products_daily)

In [7]:
#Take out nonselling products we don't care about 
mask = products_daily["gmv"] > 1
products_daily = products_daily[mask]

In [8]:
#Make New Columns to account that our data is a 3-mo. snapshot
products_daily["weighted_star"] = products_daily["rating_star"] * products_daily["sold"]
products_daily["4_month_weighted_star"] = products_daily["weighted_star"] * 4
products_daily["4_month_gmv"] = products_daily["gmv"] * 4 
products_daily["4_month_sold"] = products_daily["sold"] * 4 
products_daily["4_month_view_count"] = products_daily["view_count"] * 4

#Make untouched copy of products data for later
untouched_products_daily = products_daily.copy(deep=True)

#### Logic for why we have columns like "4_month_gmv": 

We want to calculate annual gmv. If a product is present for all Oct, Nov, Dec, i.e. it has 3 rows in the products_daily, then we'd take the average product_gmv for each row and x12. This is equivalent to taking the row-specific product_gmv, multiplying by 4, then adding up these numbers. 

Our method has assumption: if a product sells for only 1 of 3 months, then it will sell for 4 of 12 months.


# Product Filter 1: Category Filter

In [9]:
#Filter out products in irrelevant categories. 
def filter_by_category(df, category_n, ok_cat_lst):
    '''
    Keeps rows in the df that have cats in ok_cat_lst. 
    
    Inputs:
        df: a DataFrame
        category_n: (str) eg. "category_one"
        ok_cat_lst: (lst) of acceptable cat for that 
          cat as numbers eg [50, 26]
    Returns:
        The modified df 
    '''
    
    mask = df[category_n].isin(ok_cat_lst)
    return df[mask]

In [10]:
if cat_1_on:
    products_daily = filter_by_category(products_daily, "category_one_en", cat_lst_1) 

In [11]:
if cat_2_on:
    products_daily = filter_by_category(products_daily, "category_one_en", cat_lst_1) 

In [12]:
if cat_3_on:
    products_daily = filter_by_category(products_daily, "category_one_en", cat_lst_1) 

# Product Filter 2: Keep only Established Products and Products that Sellers are Updating

In [13]:
#Filter out products that have been sold for less than a specified time. 
if established_on:
    min_s = min_years * 365 * 24 * 60 * 60 
    mask = unix_time_now - products_daily["ctime"] > min_s
    products_daily = products_daily[mask]

# Brand Definition

In [14]:
#Get rid of products w/o brand info
mask = products_daily["brand"] != "No Brand(ไม่มียี่ห้อ)"
products_daily = products_daily[mask]

#Make a DataFrame of brands from which we'll start narrowing down. 
brands_list = products_daily["brand"].unique()
brands_list = np.delete(brands_list, np.where(brands_list == ('No Brand(ไม่มียี่ห้อ)')))

In [15]:
#Clean up brands_df
brands_df = pd.DataFrame(brands_list, columns = ["Brand Name"])

brands_df.dropna(inplace=True)

s1 = brands_df["Brand Name"] != "None"
brands_df = brands_df[s1]

s2 = brands_df["Brand Name"] != "0"
brands_df = brands_df[s2] 

In [16]:
#Set up columns for our brands_df

brands_df["Filtered_Brand_Annual_GMV"] = 0 
brands_df["Filtered_Average_Unit_Price"] = 0
brands_df["%_Filtered_GMV_from_Top_3_Products"] = 0
brands_df["Filtered_Weighted_Star_Rating"] = 0
brands_df["Filtered_Bad_Rating_Percent"] = 0
brands_df.set_index("Brand Name", inplace = True)

In [17]:
for brand in brands_df.index:
    
    #make a sub-df containing only rows with the correct brand
    my_brand_df = products_daily[products_daily["brand"] == brand]
    my_brand_df = my_brand_df.sort_values(by=["gmv"], ascending=False) 
    
    #Approximated ANNUAL gmv calculation
    brand_gmv = my_brand_df["4_month_gmv"].sum()
    brands_df.loc[brand, "Filtered_Brand_Annual_GMV"] = brand_gmv
    
    #ANNUAL volume calculation
    brand_volume = my_brand_df["4_month_sold"].sum()
    brands_df.loc[brand, "Filtered_Average_Unit_Price"] = brand_gmv / brand_volume
    
    '''
    Think about this
    #GMV Concentration Calculation
    if len(my_brand_df) >= top_n: 
        top_n_df = my_brand_df.head(top_n)
        top_n_gmv = top_n_df["gmv"].sum()
        brands_df.loc[brand, "%_Filtered_GMV_from_Top_3_Products"] = top_n_gmv / brand_gmv
        
    else:
        #brand sells less than like 5 products --> highly conc.
        brands_df.loc[brand, "%_Filtered_GMV_from_Top_3_Products"] = 1.0  
    '''

    #ANNUAL Weighted Star Rating
    total_star = my_brand_df["4_month_weighted_star"].sum()
    brands_df.loc[brand, "Filtered_Weighted_Star_Rating"] = total_star / brand_volume
    
    #ANNUAL Bad Rating Count
    bad_rating_count = my_brand_df["rating_count_one"].sum() + my_brand_df["rating_count_two"].sum()
    bad_rating_count *= 4 #to be fair and see over a year's data
    total_rating_count = my_brand_df["rating_count_total"].sum() * 4
    
    if total_rating_count != 0:
        val = bad_rating_count / total_rating_count 
    else: 
        val = -1
    
    brands_df.loc[brand, "Filtered_Bad_Rating_Percent"] = val

brands_df["Filtered_Bad_Rating_Percent"] = brands_df["Filtered_Bad_Rating_Percent"].round(decimals=3)

In [18]:
print(len(brands_df))

9894


# Brand Filter 1: Brand Filtered GMV

In [19]:
if min_gmv_on:
    brands_df = brands_df[brands_df["Filtered_Brand_Annual_GMV"] >= min_gmv]

In [20]:
print(len(brands_df))

3088


# Brand Filter 2: Brand (Filtered) Average Price

In [21]:
if min_unit_price_on:
    brands_df = brands_df[brands_df["Filtered_Average_Unit_Price"] >= min_unit_price]

In [22]:
print(len(brands_df))

1819


# Brand Filter 3: GMV Concentration of Top (3) Products

In [23]:
if min_gmv_conc_on:
    brands_df = brands_df[brands_df["Filtered_GMV_Concentration"] >= min_concentration]

In [24]:
print(len(brands_df))

1819


# Brand Filter 4: Average (Filtered) Product Rating

In [25]:
if min_avg_star_on:
    brands_df = brands_df[brands_df["Filtered_Weighted_Star_Rating"] >= min_avg_star]

In [26]:
print(len(brands_df))

1460


# Brand Filter 5: (Filtered) Bad Customer Ratings

In [27]:
if max_bad_rating_on:
    brands_df = brands_df[brands_df["Filtered_Bad_Rating_Percent"] <= max_bad_rating]

In [28]:
print(len(brands_df))

1460


# (GMV Growth?) - Revisit Later

# Brand Filter 6: Brand GMV (Across All Categories) 

In [29]:
#Using the untouched products data, calculate info pertainting to 
#unfiltered products including: brand gmv across all cats, its 
#predominant cat_2 focus


if min_total_brand_gmv_on: #Always kept on        
    for brand in brands_df.index:

        #make a sub-df containing only rows with the correct brand
        mask_brand = untouched_products_daily["brand"] == brand
        my_brand_df = untouched_products_daily[mask_brand]

        #TOTAL ANNUAL Brand gmv calculation
        total_brand_gmv = my_brand_df["4_month_gmv"].sum() 
        brands_df.loc[brand, "Brand_GMV_All_Cats"] = total_brand_gmv
        
        #Total brand view count 
        total_brand_views = my_brand_df["4_month_view_count"].sum()
        conv_rate = total_brand_gmv / total_brand_views
        brands_df.loc[brand, "Conversion_Rate_All_Cats_(gmv/views)"] = conv_rate
        
        #Finding brand's predominant cat_2
        winning_cat_2 = None
        winning_cat_2_gmv = 0.0

        for cat_2 in my_brand_df["category_two_en"].unique():
            mask_cat_2 = my_brand_df["category_two_en"] == cat_2
            cat_2_df = my_brand_df[mask_cat_2]

            cat_2_gmv = cat_2_df["4_month_gmv"].sum()
            if cat_2_gmv > winning_cat_2_gmv:
                winning_cat_2 = cat_2
                winning_cat_2_gmv = cat_2_gmv
                    
        percent = (winning_cat_2_gmv / total_brand_gmv).round(decimals=3)
        brands_df.loc[brand, "Cat_2_Percent_of_Total_GMV"] = percent 
        brands_df.loc[brand, "Brand_Category_2"] = winning_cat_2

        #print("for brand: ", brand, "Winning cat 2 is: ", winning_cat_2)
        #print("with a cat_2_gmv value of: ", winning_cat_2_gmv)    

In [30]:
#Apply the total brand gmv filter
brands_df = brands_df[brands_df["Brand_GMV_All_Cats"] >= min_total_brand_gmv]

# Calculate Additional Info on Recency of Sales for Shortlisted Brands

In [31]:
#From the final list of brands we have, add info on recency ratio 
#eg. percentage of total gmv coming from products established at 
#least 3, 6, 12, 24 months. 

In [32]:
#Calculate times
three_mo = 3 * 30 * 24 * 60 * 60
six_mo = 6 * 30 * 24 * 60 * 60
twelve_mo = 12 * 30 * 24 * 60 * 60
twenty_four_mo = 24 * 30 * 24 * 60 * 60

In [33]:
#Make four dataframes representing slices of the untouched products
#that have sold for more than 3, 6, 12, 24 months. 

mask_3 = unix_time_now - untouched_products_daily["ctime"] > three_mo
timed_df_3 = untouched_products_daily[mask_3]

mask_6 = unix_time_now - untouched_products_daily["ctime"] > six_mo
timed_df_6 = untouched_products_daily[mask_6]

mask_12 = unix_time_now - untouched_products_daily["ctime"] > twelve_mo
timed_df_12 = untouched_products_daily[mask_12]

mask_24 = unix_time_now - untouched_products_daily["ctime"] > twenty_four_mo
timed_df_24 = untouched_products_daily[mask_24]

In [34]:
#Calculate GMV generated from products that are N months and older 

for brand in brands_df.index:
    #make a sub-df containing only rows with the correct brand
    my_brand_df = timed_df_3[timed_df_3["brand"] == brand]
    
    #gmv calculation
    brand_gmv_3 = my_brand_df["4_month_gmv"].sum()
    brands_df.loc[brand, "%Total GMV from products 3 months+"] = brand_gmv_3

for brand in brands_df.index:
    #make a sub-df containing only rows with the correct brand
    my_brand_df = timed_df_6[timed_df_6["brand"] == brand]
    
    #gmv calculation
    brand_gmv_6 = my_brand_df["4_month_gmv"].sum()
    brands_df.loc[brand, "%Total GMV from products 6 months+"] = brand_gmv_6
    
for brand in brands_df.index:
    #make a sub-df containing only rows with the correct brand
    my_brand_df = timed_df_12[timed_df_12["brand"] == brand]
    
    #gmv calculation
    brand_gmv_12 = my_brand_df["4_month_gmv"].sum()
    brands_df.loc[brand, "%Total GMV from products 12 months+"] = brand_gmv_12
    
for brand in brands_df.index:
    #make a sub-df containing only rows with the correct brand
    my_brand_df = timed_df_24[timed_df_24["brand"] == brand]
    
    #gmv calculation
    brand_gmv_24 = my_brand_df["4_month_gmv"].sum()
    brands_df.loc[brand, "%Total GMV from products 24 months+"] = brand_gmv_24


In [35]:
#For each brand, we have now calculated GMV from products that
#are at least 3, 6, 12, 24 months old. Now, change it to % of total. 


for brand in brands_df.index:
    percentage = brands_df.loc[brand, "%Total GMV from products 3 months+"] / brands_df.loc[brand, 
                                                                                   "Brand_GMV_All_Cats"]
    brands_df.loc[brand, "%Total GMV from products 3 months+"] = round(percentage, 3)
    
for brand in brands_df.index:
    percentage = brands_df.loc[brand, "%Total GMV from products 6 months+"] / brands_df.loc[brand, 
                                                                                   "Brand_GMV_All_Cats"]
    brands_df.loc[brand, "%Total GMV from products 6 months+"] = round(percentage, 3)
    
for brand in brands_df.index:
    percentage = brands_df.loc[brand, "%Total GMV from products 12 months+"] / brands_df.loc[brand, 
                                                                                   "Brand_GMV_All_Cats"]
    brands_df.loc[brand, "%Total GMV from products 12 months+"] = round(percentage, 3)
    
for brand in brands_df.index:
    percentage = brands_df.loc[brand, "%Total GMV from products 24 months+"] / brands_df.loc[brand, 
                                                                                   "Brand_GMV_All_Cats"]
    brands_df.loc[brand, "%Total GMV from products 24 months+"] = round(percentage, 3)

# Calculate Additional Info on Discounts for Shortlisted Brands

In [36]:
#From the final list of brands we have, add info on how 
#much they discount on ALL their products. 

In [37]:
#First, fill in 0 values in "price_before_discount" with "price" value
#This means there was no discount
conditions = [untouched_products_daily['price_before_discount'].eq(0)]
choices = [untouched_products_daily["price"]]
untouched_products_daily['price_before_discount'] = np.select(conditions,choices,default=untouched_products_daily['price_before_discount'])

In [38]:
#Now, create a 4-month-full-priced-gmv 
untouched_products_daily["full_priced_gmv"] = untouched_products_daily["price_before_discount"] * untouched_products_daily["sold"]
untouched_products_daily["4_mo_full_priced_gmv"] = untouched_products_daily["full_priced_gmv"] * 4

In [39]:
#Now, for each brand calculate the would-be full_priced annual gmv. (go for each row)

for brand in brands_df.index:
    
    #make a sub-df containing only rows with the correct brand
    my_brand_df = untouched_products_daily[untouched_products_daily["brand"] == brand]
    
    #Approximated ANNUAL gmv calculation
    brand_full_priced_gmv = my_brand_df["4_mo_full_priced_gmv"].sum()
    brands_df.loc[brand, "Brand_Full_Priced_GMV"] = brand_full_priced_gmv #this number is inherently an annual

In [40]:
#Divide the Brand_GMV_All_Cats (how much brand actually sold) by 
#Brand_Full_Priced_GMV (what it would've sold in an ideal, non-discount world). 

brands_df["Average_Disc"] = 1 - brands_df["Brand_GMV_All_Cats"] / brands_df["Brand_Full_Priced_GMV"]

# Output: Brands that Meet Our Criterion

In [41]:
brands_df = brands_df.rename(columns={"Cat_2_Percent_of_Total_GMV" : "Cat_2_Contr_to_Total_GMV"})
brands_df.sort_index(inplace=True)

In [42]:
#Reorganize the outputted df
brands_df = brands_df[['Filtered_Brand_Annual_GMV', 'Brand_Full_Priced_GMV', 
                       'Filtered_Average_Unit_Price', 
                       'Filtered_Weighted_Star_Rating',
                       'Filtered_Bad_Rating_Percent', 'Brand_GMV_All_Cats',
                       'Conversion_Rate_All_Cats_(gmv/views)', 'Cat_2_Contr_to_Total_GMV',
                       'Brand_Category_2', '%Total GMV from products 3 months+',
                       '%Total GMV from products 6 months+',
                       '%Total GMV from products 12 months+',
                       '%Total GMV from products 24 months+',
                       'Average_Disc']]

In [43]:
display(brands_df)
#NOTE: Anything before the Brand_GMV_All_Cats uses FILTERED product data 
#i.e. data where we've the products to count for by category, and time. 

Unnamed: 0_level_0,Filtered_Brand_Annual_GMV,Brand_Full_Priced_GMV,Filtered_Average_Unit_Price,Filtered_Weighted_Star_Rating,Filtered_Bad_Rating_Percent,Brand_GMV_All_Cats,Conversion_Rate_All_Cats_(gmv/views),Cat_2_Contr_to_Total_GMV,Brand_Category_2,%Total GMV from products 3 months+,%Total GMV from products 6 months+,%Total GMV from products 12 months+,%Total GMV from products 24 months+,Average_Disc
Brand Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ARROW (แอร์โร่ว์),6817428,12753880.0,1262.486667,4.823056,0.010,6847380.0,26.421846,0.956,Shorts,1.000,1.000,0.010,0.010,0.463114
Adidas(อดิดาส),19429908,56767404.0,358.326719,4.681903,0.022,38775572.0,5.367515,0.268,Men's sportswear,0.947,0.606,0.181,0.042,0.316939
Adidas(อะดิดาส),4167856,6544432.0,697.899531,4.719574,0.015,4954936.0,4.438316,0.841,Slippers,0.993,0.851,0.194,0.027,0.242878
Adidas(อาดิดาส),126951076,263103288.0,760.805662,4.703256,0.013,144042048.0,5.229604,0.523,Lace-up sneakers,0.823,0.544,0.289,0.069,0.452527
Aerosoft(เอโรซอฟท์),4549640,7976476.0,602.122816,4.768984,0.004,4549640.0,15.627413,0.539,Casual shoes,0.991,0.697,0.141,0.076,0.429618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xiaomi,86480,31144656.0,407.924528,4.820886,0.006,14643204.0,12.825385,0.272,Bathroom,0.999,0.875,0.355,0.004,0.529833
ห้างขายทองสุวรรณภูมิ3,11190996,14750300.0,1667.311681,4.730639,0.021,11190996.0,6.924110,1.000,Gold,1.000,1.000,0.656,0.000,0.241304
ห้างทองรวมสินไทยขอนแก่น,80549776,104646124.0,3641.490778,4.931899,0.001,80549776.0,50.640872,0.996,Gold,1.000,1.000,0.994,0.000,0.230265
ห้างทองแม่ทองสุก,9732240,12008800.0,8567.112676,4.840712,0.000,9732240.0,45.146961,1.000,Gold,0.896,0.861,0.000,0.000,0.189574


In [71]:
#Convert to csv if desired. 
#brands_df.to_csv("fashion_brands_shortlist.csv")

***
# Miscellaneous: Nakrin's Task of Finding Cat 1 Annual GMV Values from the Data

# Calculate Cat 1 Annual GMV Values for the categories that McGroup is in

In [10]:
#Approach 1: Use static Data (less accurate b/c using 1-month snapshot)

cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

#Create new df to store cat 1 info
results_df = pd.DataFrame()
i = 0
for cat_1 in cat_1_focus:
    
    #take slice of products_static df include only the right cat
    mask = products_static["category_one_en"] == cat_1
    cat_1_df = products_static[mask]
    
    cat_1_gmv = cat_1_df["gmv"].sum()
    
    results_df.loc[i, "Category_One"] = cat_1
    results_df.loc[i, "Category_One_Dec_GMV"] = cat_1_gmv
    results_df.loc[i, "Category_One_Annual_GMV"] = cat_1_gmv * 12
    
    i += 1

results_df 

#Approach 2: Use daily data 

'''
#Interpretation: Use Nick's Interpretation i.e. if a product
#is sold only 1 of 3 months, we assume it is sold only 8 of 12
#months. 

#Implementation: create new column for products_daily called
#4-month GMV. Then just sum up 4-month GMV column to get annual. 
#Products that were present in all Oct, Nov, Dec will have 12 months
#worth of GMV then, weighted equally across Oct, Nov, Dec GMVs. 

#Would be interesting to test via a lower bound methhod i.e. include
#only products that have been on for all Oct, Nov, Dec, but am unsure 
#how to do that efficiently 
'''
joined_df["4_month_gmv"] = joined_df["gmv"] * 4 

cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

#Create new df to store cat 1 info
results_df = pd.DataFrame()
i = 0
for cat_1 in cat_1_focus:
    
    #take slice of products_static df include only the right cat
    mask = joined_df["category_one_en"] == cat_1
    cat_1_df = joined_df[mask]
    
    cat_1_annual_gmv = cat_1_df["4_month_gmv"].sum()
    
    results_df.loc[i, "Category_One"] = cat_1
    results_df.loc[i, "Category_One_Annual_GMV"] = cat_1_annual_gmv
    i += 1

results_df 

#results_df.to_csv("cat_1_gmvs.csv")

Unnamed: 0,Category_One,Category_One_Dec_GMV,Category_One_Annual_GMV
0,Accessories,253648878.0,3043787000.0
1,Bag,280760974.0,3369132000.0
2,Men's fashion,185695784.0,2228349000.0
3,Men's Shoes,115676793.0,1388122000.0
4,Women's fashion,548695736.0,6584349000.0
5,Women's Shoes,131829899.0,1581959000.0


# Calculating Cat 1 Growth

In [47]:
Oct = 1604102400
Nov = 1606694400
Dec = 1609372800

mask_oct = joined_df["added_at"] == Oct
oct_data = joined_df[mask_oct]

mask_nov = joined_df["added_at"] == Nov
nov_data = joined_df[mask_nov]

mask_dec = joined_df["added_at"] == Dec
dec_data = joined_df[mask_dec]

In [73]:
cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

#Create new df to store cat 1 info
results_df = pd.DataFrame()
i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat
    mask = oct_data["category_one_en"] == cat_1
    cat_1_df = oct_data[mask]
    
    cat_1_gmv = cat_1_df["gmv"].sum()
    
    results_df.loc[i, "Category_One"] = cat_1
    results_df.loc[i, "Category_One_Oct_GMV"] = cat_1_gmv
    
    i += 1

i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat
    mask = nov_data["category_one_en"] == cat_1
    cat_1_df = nov_data[mask]
    
    cat_1_gmv = cat_1_df["gmv"].sum()
    
    results_df.loc[i, "Category_One"] = cat_1
    results_df.loc[i, "Category_One_Nov_GMV"] = cat_1_gmv
    
    i += 1

i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat
    mask = dec_data["category_one_en"] == cat_1
    cat_1_df = dec_data[mask]
    
    cat_1_gmv = cat_1_df["gmv"].sum()
    
    results_df.loc[i, "Category_One"] = cat_1
    results_df.loc[i, "Category_One_Dec_GMV"] = cat_1_gmv
    
    i += 1
    
results_df["Oct_to_Nov"] = results_df["Category_One_Nov_GMV"] / results_df["Category_One_Oct_GMV"] - 1
results_df["Nov_to_Dec"] = results_df["Category_One_Dec_GMV"] / results_df["Category_One_Nov_GMV"] - 1

results_df

Unnamed: 0,Category_One,Category_One_Oct_GMV,Category_One_Nov_GMV,Category_One_Dec_GMV,Oct_to_Nov,Nov_to_Dec
0,Accessories,187904865.0,177479220.0,246703309.0,-0.055484,0.390041
1,Bag,239114870.0,228619246.0,263781015.0,-0.043894,0.153801
2,Men's fashion,193851253.0,165768971.0,188135247.0,-0.144865,0.134924
3,Men's Shoes,101629221.0,90250304.0,104093270.0,-0.111965,0.153384
4,Women's fashion,537097328.0,528241014.0,557742972.0,-0.016489,0.055849
5,Women's Shoes,126427631.0,115749937.0,137222235.0,-0.084457,0.185506


In [74]:
results_df.to_csv("Cat_1_Growth.csv")

# Calculate Cat 2 Annual GMV Values for the categories that McGroup is in

In [49]:
'''
#Make a df for each cat 1 then run through cat 2's. 

#Approach 1: Use static Data (less accurate b/c using 1-month snapshot)
cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

#Create new df to store cat 1 info
results_df = pd.DataFrame()
i = 0
for cat_1 in cat_1_focus:
    
    #take slice of products_static df include only the right cat
    mask = products_static["category_one_en"] == cat_1
    cat_1_df = products_static[mask]
    
    #now iterate over cat_2's that are sub to this cat_1
    for cat_2 in cat_1_df["category_two_en"].unique():
        mask2 = cat_1_df["category_two_en"] == cat_2
        cat_2_df = cat_1_df[mask2]
        
        cat_2_gmv = cat_2_df["product_gmv"].sum()
        
        results_df.loc[i, "Category_One"] = cat_1
        results_df.loc[i, "Category_Two"] = cat_2
        results_df.loc[i, "Category_Two_Dec_GMV"] = cat_2_gmv
        results_df.loc[i, "Category_Two_Annual_GMV"] = cat_2_gmv * 12
        
        i += 1

results_df 
'''

Unnamed: 0,Category_One,Category_Two,Category_Two_Dec_GMV,Category_Two_Annual_GMV
0,Accessories,Belt,9927493.0,119129916.0
1,Accessories,Necklace,16302723.0,195632676.0
2,Accessories,Rings,7785881.0,93430572.0
3,Accessories,Bracelet,13683288.0,164199456.0
4,Accessories,Earrings,8461123.0,101533476.0
...,...,...,...,...
72,Women's Shoes,Canvas shoes,34359645.0,412315740.0
73,Women's Shoes,Accessories for shoes,2090580.0,25086960.0
74,Women's Shoes,Boots and ankle boots,2840056.0,34080672.0
75,Women's Shoes,Socks and stockings,14493562.0,173922744.0


In [67]:
#Approach 2: Use Daily Data


#Create new df to store results
results_df = pd.DataFrame()
i = 0

cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

for cat_1 in cat_1_focus: 
    
    #take slice of products_static df include only the right cat_1
    mask = joined_df["category_one_en"] == cat_1
    cat_1_df = joined_df[mask]
    
    #now iterate over cat_2's that are sub to this cat_1
    for cat_2 in cat_1_df["category_two_en"].unique():
        mask2 = cat_1_df["category_two_en"] == cat_2
        cat_2_df = cat_1_df[mask2]
        
        cat_2_annual_gmv = cat_2_df["4_month_gmv"].sum()
        
        results_df.loc[i, "Category_One"] = cat_1
        results_df.loc[i, "Category_Two"] = cat_2
        results_df.loc[i, "Category_Two_Annual_GMV"] = cat_2_annual_gmv
        
        i += 1

results_df 

Unnamed: 0,Category_One,Category_Two,Category_Two_Annual_GMV
0,Accessories,Belt,106183840.0
1,Accessories,Necklace,165866056.0
2,Accessories,Rings,79873036.0
3,Accessories,Key ring,26672396.0
4,Accessories,Earrings,91987748.0
...,...,...,...
72,Women's Shoes,Casual shoes,69383012.0
73,Women's Shoes,Accessories for shoes,25177912.0
74,Women's Shoes,Boots and ankle boots,30312832.0
75,Women's Shoes,Socks and stockings,147970364.0


In [69]:
results_df = results_df.sort_values(by=["Category_One", "Category_Two"])

In [70]:
results_df.to_csv("cat_2_gmv.csv")

# Calculate Cat 2 Growth

In [71]:
cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

#Create new df to store cat 1 info
results_df = pd.DataFrame()
i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat_1
    mask = oct_data["category_one_en"] == cat_1
    cat_1_df = oct_data[mask]
    
    #now iterate over cat_2's that are sub to this cat_1
    for cat_2 in cat_1_df["category_two_en"].unique():
        mask2 = cat_1_df["category_two_en"] == cat_2
        cat_2_df = cat_1_df[mask2]
        
        cat_2_gmv = cat_2_df["gmv"].sum()
        
        results_df.loc[i, "Category_One"] = cat_1
        results_df.loc[i, "Category_Two"] = cat_2
        results_df.loc[i, "Category_Two_Oct_GMV"] = cat_2_gmv
        
        i += 1

i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat
    mask = nov_data["category_one_en"] == cat_1
    cat_1_df = nov_data[mask]
    
    #now iterate over cat_2's that are sub to this cat_1
    for cat_2 in cat_1_df["category_two_en"].unique():
        mask2 = cat_1_df["category_two_en"] == cat_2
        cat_2_df = cat_1_df[mask2]
        
        cat_2_gmv = cat_2_df["gmv"].sum()
        
        results_df.loc[i, "Category_One"] = cat_1
        results_df.loc[i, "Category_Two"] = cat_2
        results_df.loc[i, "Category_Two_Nov_GMV"] = cat_2_gmv
        
        i += 1

i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat
    mask = dec_data["category_one_en"] == cat_1
    cat_1_df = dec_data[mask]
    
    #now iterate over cat_2's that are sub to this cat_1
    for cat_2 in cat_1_df["category_two_en"].unique():
        mask2 = cat_1_df["category_two_en"] == cat_2
        cat_2_df = cat_1_df[mask2]
        
        cat_2_gmv = cat_2_df["gmv"].sum()
        
        results_df.loc[i, "Category_One"] = cat_1
        results_df.loc[i, "Category_Two"] = cat_2
        results_df.loc[i, "Category_Two_Dec_GMV"] = cat_2_gmv
        
        i += 1
    
results_df["Oct_to_Nov"] = results_df["Category_Two_Nov_GMV"] / results_df["Category_Two_Oct_GMV"] - 1
results_df["Nov_to_Dec"] = results_df["Category_Two_Dec_GMV"] / results_df["Category_Two_Nov_GMV"] - 1

results_df

Unnamed: 0,Category_One,Category_Two,Category_Two_Oct_GMV,Category_Two_Nov_GMV,Category_Two_Dec_GMV,Oct_to_Nov,Nov_to_Dec
0,Accessories,Belt,9672038.0,7501793.0,9372129.0,-0.224383,0.249319
1,Accessories,Necklace,15060666.0,13256299.0,13149549.0,-0.119807,-0.008053
2,Accessories,Bracelet,6908711.0,1791204.0,12462170.0,-0.740733,5.957426
3,Accessories,Earrings,2600376.0,9631080.0,7340383.0,2.703726,-0.237844
4,Accessories,Hat,8364018.0,7292536.0,24775045.0,-0.128106,2.397315
...,...,...,...,...,...,...,...
72,Women's Shoes,Canvas shoes,6593395.0,5330739.0,32343549.0,-0.191503,5.067367
73,Women's Shoes,Accessories for shoes,2265966.0,1976549.0,2051963.0,-0.127723,0.038154
74,Women's Shoes,Boots and ankle boots,2762158.0,2138025.0,2678025.0,-0.225958,0.252570
75,Women's Shoes,Socks and stockings,11416117.0,11390633.0,14185841.0,-0.002232,0.245395


In [72]:
results_df = results_df.sort_values(by=["Category_One", "Category_Two"])
results_df.to_csv("cat_2_growth.csv")

### Export joined df into Tableau for effective visualization. 

In [88]:
#Include only needed info like product_gmv, categories, month to make Tableau fast. 
tableau_df = joined_df[["added_at", "gmv", "category_one_en", "category_two_en"]]

cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]
mask = tableau_df["category_one_en"].isin(cat_1_focus)
tableau_df = tableau_df[mask]

tableau_df = tableau_df.sort_values(by=["category_one_en", "category_two_en"])

In [97]:
tableau_df["Month"] = 0
tableau_df.sample(n=10)

Unnamed: 0,added_at,gmv,category_one_en,category_two_en,Month
1658019,1609372800,9880,Men's fashion,Uniforms,0
3754345,1604102400,119,Bag,Cloth bag,0
2092491,1604102400,13497,Women's fashion,Jackets and Coats,0
5502153,1609372800,185,Bag,Handbag,0
7001196,1609372800,452,Men's fashion,Shirt,0
5165513,1606694400,7500,Women's Shoes,Socks and stockings,0
1773128,1604102400,290,Bag,Backpack,0
4829248,1604102400,882,Men's fashion,Shorts,0
4997940,1606694400,654,Women's fashion,Sets,0
6331011,1606694400,139,Women's fashion,Sleepwear,0


In [96]:
tableau_df["added_at"].unique()


[1604102400, 1606694400, 1609372800]
Categories (3, int64): [1604102400, 1606694400, 1609372800]

In [100]:
tableau_df.loc[tableau_df["added_at"] == 1604102400, "Month"] = "October"
tableau_df.loc[tableau_df["added_at"] == 1606694400, "Month"] = "November"
tableau_df.loc[tableau_df["added_at"] == 1609372800, "Month"] = "December"

In [105]:
tableau_df["Month"] = tableau_df["Month"].astype("category")

In [107]:
tableau_df.to_csv("cat_sizes_for_McGroup_Presentation_Jan_8.csv")

# End of Nakrin's Task
***