In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import math 

# Inputs: 
Input the values you would like to filter by:

In [2]:
#Note: When next batch comes in will have to do x3 instead since we'll 
#have Oct, Nov, Dec, Jan --> x3 for annual guess

#could get rid of *4, *3 and just write *year_expansion

In [3]:
#Filter 1: Categories
(cat_1_on, cat_2_on, cat_3_on) = True, False, False
cat_lst_1 = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]
cat_lst_2 = []
cat_lst_3 = []

#Filter 2: Times (Update-status, how established product is)
unix_time_now = 1609682748 #Change based on Nick's input

established_on = False
min_years = 1

update_on = True
updated_previously_weeks = 2


#Filter 3: Annual Brand GMV
min_gmv_on = True
min_gmv = 70000

#Filter 4: Brand Avg Price 
min_unit_price_on = True
min_unit_price = 300

#Filter 5: GMV Concentration
min_gmv_conc_on = False
top_n = 3 #concetration num of products #IF CHANGE THIS NUMBER CHANGE COLUMN NAME TOO 
min_concentration = 0.2

#Filter 6: Avg Product Rating
min_avg_star_on = True
min_avg_star = 4.5

#Filter 7: Bad Customer Ratings 
max_bad_rating_on = True
max_bad_rating = 0.5

#Filter 8: Brand TOTAL GMV 
min_total_brand_gmv_on = True
min_total_brand_gmv = 2000000 #should not be dividing by 12 since we want annual

# Import Data

In [4]:
products_static = pd.read_csv("products_static_1609779145.csv")
products_daily = pd.read_csv("products_daily_1609682748.csv")

Change data types in dataframes to conserve memory:

In [5]:
def convert_to_cats(df, to_convert):
    '''
    Converts cols in to_convert_list to categories. 
    
    Inputs:
        df: a DataFrame
        to_convert: a list of column names (as strings)
                    we'd like to convert
    Returns:
        Nothing; modifies dataframe in place
    '''
    for col in df.columns:
        if col in to_convert:
            df[col] = df[col].astype("category")

In [6]:
to_convert_products_static = ['category_one', 'category_one_en',
                              'category_one_th', 'category_two', 
                              'category_two_en', 'category_two_th',
                              'category_three', 'category_three_en', 
                              'category_three_th', 'reviews_count_context', 
                              'reviews_count_image', 'shopee_verified', 
                              'show_discount']
convert_to_cats(products_static, to_convert_products_static)

to_convert_products_daily = ["added_at"]

convert_to_cats(products_daily, to_convert_products_daily)

In [7]:
def downcast_numbers(df):
    '''
    Downcasts floats and ints.
    
    Inputs:
        df: a DataFrame objecet. 
    Returns:
        Nothing; modifies list in place
    '''
    for col in df.columns:
        if df[col].dtype == "float":
            df[col] = pd.to_numeric(df[col], downcast="float")
        if df[col].dtype == "int":
            df[col] = pd.to_numeric(df[col], downcast="unsigned")

In [8]:
downcast_numbers(products_static)
downcast_numbers(products_daily)

# Merge products_static w/ products_daily for joined_df

In [9]:
#First, prep the right df (products_static) by making a copy of 
#products_static keeping only relevant info

small_static_df = products_static[["itemid", "brand", "category_one", 
                                   "category_one_en", "category_two", 
                                   "category_two_en", "category_three",
                                   "category_three_en", "ctime",
                                   "modified_at"]]

#Next, join this to the main products_daily dataframe
joined_df = products_daily.merge(small_static_df, on="itemid", how="left")

#Now, remove products that haven't sold from our joined_df
mask = joined_df["gmv"] > 1
joined_df = joined_df[mask]

In [10]:
#Make New Columns to account that our data is a 3-mo. snapshot

#Make new column in joined_df for weighted rating
joined_df["weighted_star"] = joined_df["rating_star"] * joined_df["sold"]
joined_df["4_month_weighted_star"] = joined_df["weighted_star"] * 4
joined_df["4_month_gmv"] = joined_df["gmv"] * 4 
joined_df["4_month_sold"] = joined_df["sold"] * 4 
joined_df["4_month_view_count"] = joined_df["view_count"] * 4

#Make untouched copy of joined_df for later
untouched_joined_df = joined_df.copy(deep=True)

#### Logic for why we have columns like "4_month_gmv": 

We want to calculate annual gmv. If a product is present for all Oct, Nov, Dec, i.e. it has 3 rows in the products_daily, then we'd take the average product_gmv for each row and x12. This is equivalent to taking the row-specific product_gmv, multiplying by 4, then adding up these numbers. 

Our method has assumption: if a product sells for only 1 of 3 months, then it will sell for 4 of 12 months.


# Product Filter 1: Category Filter

Filter our DataFrames to only keep rows with Categories we care for. 


In [11]:
#Helper Functions

def filter_by_category(df, category_n, ok_cat_lst):
    '''
    Keeps rows in the df that have cats in ok_cat_lst. 
    
    Inputs:
        df: a DataFrame
        category_n: (str) eg. "category_one"
        ok_cat_lst: (lst) of acceptable cat for that 
          cat as numbers eg [50, 26]
    Returns:
        The modified df 
    '''
    
    mask = df[category_n].isin(ok_cat_lst)
    return df[mask]

In [12]:
if cat_1_on:
    joined_df = filter_by_category(joined_df, "category_one_en", cat_lst_1) 
    #Care: I'm using "category_one_en" not "category_one". Flexible but needs moidfying

In [13]:
if cat_2_on:
    joined_df = filter_by_category(joined_df, "category_one_en", cat_lst_1) 
    #Care: I'm using "category_one_en" not "category_one". Flexible but needs moidfying

In [14]:
if cat_3_on:
    joined_df = filter_by_category(joined_df, "category_one_en", cat_lst_1) 
    #Care: I'm using "category_one_en" not "category_one". Flexible but needs moidfying

# Product Filter 2: Keep only Established Products and Products that Sellers are Updating

Filter out products that have been sold for less than a specified time. 

In [15]:
if established_on:
    min_s = min_years * 365 * 24 * 60 * 60 
    mask = unix_time_now - joined_df["ctime"] > min_s
    joined_df = joined_df[mask]
    
if update_on:
    max_s = (updated_previously_weeks * 7 * 24 * 60 * 60)
    mask = (unix_time_now - joined_df["modified_at"] < max_s)
    joined_df = joined_df[mask]

# Brand Definition

Extract unique brand names from the remaining products

In [16]:
mask = joined_df["brand"] != "No Brand(ไม่มียี่ห้อ)"
joined_df = joined_df[mask]

brands_list = joined_df["brand"].unique()
brands_list = np.delete(brands_list, np.where(brands_list == ('No Brand(ไม่มียี่ห้อ)')))

In [17]:
#Clean up brands_df
brands_df = pd.DataFrame(brands_list, columns = ["Brand Name"])

brands_df.dropna(inplace=True)

s1 = brands_df["Brand Name"] != "None"
brands_df = brands_df[s1]

s2 = brands_df["Brand Name"] != "0"
brands_df = brands_df[s2] 


In [18]:
#Set up columns for our brands_df

#NOTE: these values are all 1 month-based since the products data was only one month
brands_df["Filtered_Brand_Annual_GMV"] = 0 
brands_df["Filtered_Average_Unit_Price"] = 0
brands_df["%_Filtered_GMV_from_Top_3_Products"] = 0
brands_df["Filtered_Weighted_Star_Rating"] = 0
brands_df["Filtered_Bad_Rating_Percent"] = 0
brands_df.set_index("Brand Name", inplace = True)

In [19]:
for brand in brands_df.index:
    
    #make a sub-df containing only rows with the correct brand
    my_brand_df = joined_df[joined_df["brand"] == brand]
    my_brand_df = my_brand_df.sort_values(by=["gmv"], ascending=False) 
    
    #Approximated ANNUAL gmv calculation
    brand_gmv = my_brand_df["4_month_gmv"].sum()
    brands_df.loc[brand, "Filtered_Brand_Annual_GMV"] = brand_gmv
    
    #ANNUAL volume calculation
    brand_volume = my_brand_df["4_month_sold"].sum()
    brands_df.loc[brand, "Filtered_Average_Unit_Price"] = brand_gmv / brand_volume
    
    '''
    Think about this
    #GMV Concentration Calculation
    if len(my_brand_df) >= top_n: 
        top_n_df = my_brand_df.head(top_n)
        top_n_gmv = top_n_df["gmv"].sum()
        brands_df.loc[brand, "%_Filtered_GMV_from_Top_3_Products"] = top_n_gmv / brand_gmv
        
    else:
        #brand sells less than like 5 products --> highly conc.
        brands_df.loc[brand, "%_Filtered_GMV_from_Top_3_Products"] = 1.0  
    '''

    #ANNUAL Weighted Star Rating
    total_star = my_brand_df["4_month_weighted_star"].sum()
    brands_df.loc[brand, "Filtered_Weighted_Star_Rating"] = total_star / brand_volume
    
    #ANNUAL Bad Rating Count
    bad_rating_count = my_brand_df["rating_count_one"].sum() + my_brand_df["rating_count_two"].sum()
    bad_rating_count *= 4 #to be fair and see over a year's data
    total_rating_count = my_brand_df["rating_count_total"].sum() * 4
    
    if total_rating_count != 0:
        val = bad_rating_count / total_rating_count 
    else: 
        val = -1
    
    brands_df.loc[brand, "Filtered_Bad_Rating_Percent"] = val

brands_df["Filtered_Bad_Rating_Percent"] = brands_df["Filtered_Bad_Rating_Percent"].round(decimals=3)

In [20]:
print(len(brands_df))

11700


# Brand Filter 1: Brand Filtered GMV

In [21]:
if min_gmv_on:
    brands_df = brands_df[brands_df["Filtered_Brand_Annual_GMV"] >= min_gmv]

In [22]:
print(len(brands_df))

3347


# Brand Filter 2: Brand Average Price

In [23]:
if min_unit_price_on:
    brands_df = brands_df[brands_df["Filtered_Average_Unit_Price"] >= min_unit_price]

In [24]:
print(len(brands_df))

2033


# Brand Filter 3: GMV Concentration of Top (3) Products

In [25]:
if min_gmv_conc_on:
    brands_df = brands_df[brands_df["Filtered_GMV_Concentration"] >= min_concentration]

In [26]:
print(len(brands_df))

2033


# Brand Filter 4: Average Product Rating

In [27]:
if min_avg_star_on:
    brands_df = brands_df[brands_df["Filtered_Weighted_Star_Rating"] >= min_avg_star]

In [28]:
print(len(brands_df))

1574


# Brand Filter 5: Bad Customer Ratings

In [29]:
if max_bad_rating_on:
    brands_df = brands_df[brands_df["Filtered_Bad_Rating_Percent"] <= max_bad_rating]

In [30]:
print(len(brands_df))

1574


# (GMV Growth?) - Revisit Later

# Brand Filter 6: Brand GMV (Across All Categories) 

Use a fresh copy of the products data to get more info on our final list of brands. We use a fresh set because now that we've narrowed down our brand list, we want to see how our Brands are doing as a whole (all cats, all products). 

In [33]:
#Assume this is always kept on for simplicity
if min_total_brand_gmv_on:
        
    for brand in brands_df.index:

        #make a sub-df containing only rows with the correct brand
        mask_brand = untouched_joined_df["brand"] == brand
        my_brand_df = untouched_joined_df[mask_brand]

        #TOTAL ANNUAL Brand gmv calculation
        total_brand_gmv = my_brand_df["4_month_gmv"].sum() #DO NOT MULTIPLY BY 3 HERE. 
                                                           #THIS RESTS ON ASS. that repeated 
                                                           #products have all Oct, nov, dec entries
        brands_df.loc[brand, "Brand_GMV_All_Cats"] = total_brand_gmv
        
        #Total brand view count 

        total_brand_views = my_brand_df["4_month_view_count"].sum()
        conv_rate = total_brand_gmv / total_brand_views
        brands_df.loc[brand, "Conversion_Rate_All_Cats_(gmv/views)"] = conv_rate
        
        #--------------------------------------------------------------
        #Take this out and put later if the min_total_brand_gmv 
        #becomes higher, otherwise inclue this in the for loop
        winning_cat_2 = None
        winning_cat_2_gmv = 0.0

        for cat_2 in my_brand_df["category_two_en"].unique():
            mask_cat_2 = my_brand_df["category_two_en"] == cat_2
            cat_2_df = my_brand_df[mask_cat_2]

            cat_2_gmv = cat_2_df["4_month_gmv"].sum()
            if cat_2_gmv > winning_cat_2_gmv:
                winning_cat_2 = cat_2
                winning_cat_2_gmv = cat_2_gmv
                    
        percent = (winning_cat_2_gmv / total_brand_gmv).round(decimals=3)
        brands_df.loc[brand, "Cat_2_Percent_of_Total_GMV"] = percent 
        brands_df.loc[brand, "Brand_Category_2"] = winning_cat_2

        print("for brand: ", brand, "Winning cat 2 is: ", winning_cat_2)
        print("with a cat_2_gmv value of: ", winning_cat_2_gmv)
        
        #--------------------------------------------------------------
            
        
        
    #Apply the total brand gmv filter
    brands_df = brands_df[brands_df["Brand_GMV_All_Cats"] >= min_total_brand_gmv]

for brand:  Guy Laroche(กีลาโรช) Winning cat 2 is:  Men's underwear
with a cat_2_gmv value of:  2470736
for brand:  Guy Laroche(กีย์ ลาโรช) Winning cat 2 is:  Belt
with a cat_2_gmv value of:  147980
for brand:  Bata(บาจา) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  5147664
for brand:  Footin(ฟุตอิน) Winning cat 2 is:  High heels
with a cat_2_gmv value of:  646912
for brand:  ﻿Bata Men Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  49364
for brand:  Bata(บาต้า) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  216200
for brand:  BATA(บาจา)​ Winning cat 2 is:  Slip-on sneakers
with a cat_2_gmv value of:  147160
for brand:  Footin(ฟูทิน) Winning cat 2 is:  Canvas shoes
with a cat_2_gmv value of:  152640
for brand:  Bata(เบต้า) Winning cat 2 is:  Canvas shoes
with a cat_2_gmv value of:  135484
for brand:  ﻿Bata Ladies Winning cat 2 is:  High heels
with a cat_2_gmv value of:  154892
for brand:  ﻿Bata Winning cat 2 is:  High heels
with a cat_2_gmv value of:  107148

for brand:  Champion(แชมป์เปี่ยน) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  762080
for brand:  Ep(อีีพี) Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  87264
for brand:  Ep(อีพี) Winning cat 2 is:  Jackets and Coats
with a cat_2_gmv value of:  121528
for brand:  EP(อีพี) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  70252
for brand:  Samsonite(แซมโซไนท์) Winning cat 2 is:  Luggage
with a cat_2_gmv value of:  6779560
for brand:  Baoji(บาโอจิ) Winning cat 2 is:  Canvas shoes
with a cat_2_gmv value of:  23305192
for brand:  BAOJI(บาโอจิ) Winning cat 2 is:  Lace-up sneakers
with a cat_2_gmv value of:  2682432
for brand:  Baoji(บาโอจิ)​ Winning cat 2 is:  Lace-up sneakers
with a cat_2_gmv value of:  7935504
for brand:  Fila(ฟีล่า) Winning cat 2 is:  Lace-up sneakers
with a cat_2_gmv value of:  812356
for brand:  Scholl(สกอลล์) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  22306832
for brand:  G2000(จีทูเทาว์ซัน) Winning cat 2 is:  Polo shirt
with a cat_2_

for brand:  Merch(เมิช) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  76540
for brand:  Olymp(โอลิมป์) Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  549520
for brand:  Flynow(ฟลายนาว) Winning cat 2 is:  Shoulder bag
with a cat_2_gmv value of:  1117972
for brand:  INCO (อินโค) Winning cat 2 is:  Underwear
with a cat_2_gmv value of:  874720
for brand:  Under Amour(อันเดอร์อาร์เมอร์) Winning cat 2 is:  Brand name
with a cat_2_gmv value of:  293760
for brand:  NIPON Winning cat 2 is:  Men's underwear
with a cat_2_gmv value of:  257600
for brand:  Rollica (โรลลิกา) Winning cat 2 is:  Luggage
with a cat_2_gmv value of:  1007240
for brand:  SWISS GEAR(สวิสเกียร์) Winning cat 2 is:  Luggage
with a cat_2_gmv value of:  1032852
for brand:  SWISSGEAR(สวิตซ์เกียร์) Winning cat 2 is:  Shoulder bag
with a cat_2_gmv value of:  50264
for brand:  Swiss gear(สวิสเกียร์) Winning cat 2 is:  Flower bag
with a cat_2_gmv value of:  50116
for brand:  Hey Dude(เฮ้ ดูด) Winning cat 2 is:  Slip-o

for brand:  Sabina(ซาบีน่า) Winning cat 2 is:  Underwear
with a cat_2_gmv value of:  269136936
for brand:  Sabina Winning cat 2 is:  Underwear
with a cat_2_gmv value of:  5445880
for brand:  Sabina(ซาบินา) Winning cat 2 is:  Underwear
with a cat_2_gmv value of:  47016904
for brand:  Peak Design(พีค ดีไซน์) Winning cat 2 is:  Camera accessory
with a cat_2_gmv value of:  14527196
for brand:  Twentysecond(ทเวนตี้เซคเคิล) Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  1179404
for brand:  Korin Design(โครินเดสซิ่ง) Winning cat 2 is:  Backpack
with a cat_2_gmv value of:  196080
for brand:  Twentysecond(ทเวนตี้ซีคอนด์) Winning cat 2 is:  Polo shirt
with a cat_2_gmv value of:  223664
for brand:  Hapitas(ฮาพิเทส) Winning cat 2 is:  Luggage
with a cat_2_gmv value of:  641600
for brand:  6ixty8ight(ซิกซ์ตี้เอธ) Winning cat 2 is:  Underwear
with a cat_2_gmv value of:  77132
for brand:  Pacsafe(พาคเซฟ) Winning cat 2 is:  Flower bag
with a cat_2_gmv value of:  75736
for brand:  Levi's(ลีวายส์)

for brand:  Life Blessing(พรีชีวิต) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  281660
for brand:  Bangkok Golds(บางกอกโกลด์ส) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  70279164
for brand:  Carboy(คาร์บอย) Winning cat 2 is:  Lace up shoes
with a cat_2_gmv value of:  94600
for brand:  Onisuka Tiger(โอนิซึกะ ไทเกอร์) Winning cat 2 is:  Lace up shoes
with a cat_2_gmv value of:  138068
for brand:  era-won Winning cat 2 is:  Lace up shoes
with a cat_2_gmv value of:  121440
for brand:  Binsin(บินซิน) Winning cat 2 is:  Slip-on leather shoes
with a cat_2_gmv value of:  953432
for brand:  Srimuangsupan Gold(ศรีเมืองสุพรรณโกลด์) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  11474036
for brand:  Goldseller(โกลด์เซลเลอร์) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  3567840
for brand:  Swpgold(เพชรทองสุวรรณภูมิ) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  5643988
for brand:  Calorie Fashion(แคลอรี่แฟชั่น) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  

for brand:  Guess(เกสส์) Winning cat 2 is:  Flower bag
with a cat_2_gmv value of:  1418164
for brand:  Jacob International(จาค็อบ อินเตอร์เนชั่นแนล) Winning cat 2 is:  Handbag
with a cat_2_gmv value of:  3940456
for brand:  Jacob (จาค็อบ) Winning cat 2 is:  Wallet
with a cat_2_gmv value of:  136796
for brand:  Shoe Story (ชู สตอรี่) Winning cat 2 is:  Flat shoes
with a cat_2_gmv value of:  249128
for brand:  Seira Elves(เซร่า เอลฟ์) Winning cat 2 is:  Flat shoes
with a cat_2_gmv value of:  618632
for brand:  Saramanda  (ซาราแมนด้า) Winning cat 2 is:  Flat shoes
with a cat_2_gmv value of:  90240
for brand:  เก๋ปลาทอง Winning cat 2 is:  Plus size clothing
with a cat_2_gmv value of:  352000
for brand:  Flat 2112(แฟลต 2112) Winning cat 2 is:  Boots and ankle boots
with a cat_2_gmv value of:  615032
for brand:  Sirigold(ศิริโกลด์) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  7144024
for brand:  TAYWIN (เทวินทร์) Winning cat 2 is:  Lace up shoes
with a cat_2_gmv value of:  31680
for b

for brand:  Body Glove(บอดี้โกรบ) Winning cat 2 is:  Jackets and Coats
with a cat_2_gmv value of:  2468968
for brand:  Body Glove (บอดี้ โกลฟ) Winning cat 2 is:  Jackets and Coats
with a cat_2_gmv value of:  634952
for brand:  Bodyglove Winning cat 2 is:  Polo shirt
with a cat_2_gmv value of:  214160
for brand:  PLAY Comme des Garçons(เพลย์คอมมี่เดสกาคอน) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  2321468
for brand:  Ozuko(โอซูโก) Winning cat 2 is:  Flower bag
with a cat_2_gmv value of:  694372
for brand:  Musculo(มัสคูโล่) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  390000
for brand:  Kyda(ไกดา) Winning cat 2 is:  High heels
with a cat_2_gmv value of:  2676280
for brand:  Bodyglove(บอดี้โกรฟ) Winning cat 2 is:  Polo shirt
with a cat_2_gmv value of:  128560
for brand:  Body Glove (บอดี้โกลฟ) Winning cat 2 is:  Jackets and Coats
with a cat_2_gmv value of:  651784
for brand:  ห้างทองทรัพย์เจริญ 1(ห้างทองทรัพย์เจริญ1) Winning cat 2 is:  Gold
with a cat_2_gmv value o

for brand:  Hi Fashion(ไฮ แฟชั่น) Winning cat 2 is:  Skirt
with a cat_2_gmv value of:  81040
for brand:  Donna & Co(ดอนน่าแอนด์โค) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  69884
for brand:  TNP-Thanapand Winning cat 2 is:  Safety shoes
with a cat_2_gmv value of:  186640
for brand:  Jeab Seng Hong(ห้างทองเจี๊ยบเซ้งฮง) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  167440
for brand:  Hush puppie(ฮัชพัพพีส์) Winning cat 2 is:  Lace-up sneakers
with a cat_2_gmv value of:  171892
for brand:  Buckleme_brand(บัคเคิลมี) Winning cat 2 is:  High heels
with a cat_2_gmv value of:  215760
for brand:  ไทยธานี35 จำกัด Winning cat 2 is:  Gold
with a cat_2_gmv value of:  1606632
for brand:  ห้างทองรุ่งโรจน์ Winning cat 2 is:  Gold
with a cat_2_gmv value of:  71320
for brand:  SSW Gold(เอสเอสดับบลิว โกลด์) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  4924024
for brand:  Windy(วินดี้) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  77640
for brand:  Golden Oldy(โกลเด้น 

for brand:  JIMMY CHOO(จิมมี่ ชู) Winning cat 2 is:  High heels
with a cat_2_gmv value of:  165580
for brand:  LYN(ลินน์) Winning cat 2 is:  Shoulder bag
with a cat_2_gmv value of:  9071104
for brand:  Starkela (สตาร์เคล่า) Winning cat 2 is:  Boots and ankle boots
with a cat_2_gmv value of:  51360
for brand:  Bape(เบพ) Winning cat 2 is:  Shoulder bag
with a cat_2_gmv value of:  36672
for brand:  Kloset Etcetera(โคลเซ็ท เอทเซเทร่า) Winning cat 2 is:  Handbag
with a cat_2_gmv value of:  232500
for brand:  Collonil(โคโลนิล) Winning cat 2 is:  Bag accessories
with a cat_2_gmv value of:  41748
for brand:  Gusmob(กัสม็อป) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  169600
for brand:  Thaitanee35(ไทยธานี35) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  971980
for brand:  Awarin(อวาริน) Winning cat 2 is:  Shoulder bag
with a cat_2_gmv value of:  1333688
for brand:  Velika(เวลิก้า) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  632040
for brand:  HOKA ONE ONE(โฮ้กก้าโอเ

for brand:  Anyagoldsmiths(อัญญาโกลด์สมิธ) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  83760
for brand:  Patagonia(พาธาโกเนีย) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  1144020
for brand:  babyahjuma Winning cat 2 is:  High heels
with a cat_2_gmv value of:  80856
for brand:  Sweat16(สเว็ตซิกซ์ทีน) Winning cat 2 is:  Other
with a cat_2_gmv value of:  587000
for brand:  Bape(เบ็ป) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  57212
for brand:  Luxx(ลัคซ์) Winning cat 2 is:  Bag accessories
with a cat_2_gmv value of:  113000
for brand:  Anti social club(แอนตี้ โซเชียล คลับ) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  168432
for brand:  LIQUID BLUE(ลิควิดบลู) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  355276
for brand:  Hybrid Outfitters(ไฮบริด เอาท์ฟิทเตอร์) Winning cat 2 is:  Outerwear
with a cat_2_gmv value of:  99240
for brand:  Hybrid Outfitters(ไฮบริดเอ๊าท์ฟิตเตอร์) Winning cat 2 is:  Polo shirt
with a cat_2_gmv value of:  118360
f

for brand:  Tooch Winning cat 2 is:  Pants
with a cat_2_gmv value of:  528000
for brand:  T Girls(ที เกิร์ล) Winning cat 2 is:  Pants
with a cat_2_gmv value of:  225472
for brand:  YYNA(วายวายนา) Winning cat 2 is:  Shorts
with a cat_2_gmv value of:  193552
for brand:  urmalebrand Winning cat 2 is:  Shorts
with a cat_2_gmv value of:  76440
for brand:  Goodwin(กู๊ดวิน) Winning cat 2 is:  Shorts
with a cat_2_gmv value of:  318576
for brand:  Sommai(สมหมาย) Winning cat 2 is:  Shorts
with a cat_2_gmv value of:  1549396
for brand:  Fanture(แฟนเจอร์) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  500420
for brand:  Safety Goods(เซฟตี้ กู๊ดส์) Winning cat 2 is:  Safety shoes
with a cat_2_gmv value of:  1664944
for brand:  Darpa(ดาร์ปา) Winning cat 2 is:  Safety shoes
with a cat_2_gmv value of:  457420
for brand:  One Shoes(วัน ชูส) Winning cat 2 is:  Safety shoes
with a cat_2_gmv value of:  266732
for brand:  Safety Jogger(เซฟตี้ จ๊อกเกอร์) Winning cat 2 is:  Safety shoes
with a cat_2

for brand:  Sara Beauty London(ซาร่า บิวตี้ ลอนดอน) Winning cat 2 is:  Underwear
with a cat_2_gmv value of:  1974408
for brand:  Basics By Sita(เบสิคบายสิตา) Winning cat 2 is:  Pants
with a cat_2_gmv value of:  847640
for brand:  Roomry.n(รูมรี่) Winning cat 2 is:  Pants
with a cat_2_gmv value of:  309960
for brand:  ESTHETA(เอสเดอะต้าร์) Winning cat 2 is:  Wallet
with a cat_2_gmv value of:  322384
for brand:  Doughnut Macaroon(โดห์นัท มาการูน) Winning cat 2 is:  Backpack
with a cat_2_gmv value of:  1749628
for brand:  AppleSheep Winning cat 2 is:  Other
with a cat_2_gmv value of:  87120
for brand:  Whiteoak Factory(ไวท์โอ๊ค) Winning cat 2 is:  Shoulder bag
with a cat_2_gmv value of:  224000
for brand:  MUSEONTHEMOVE Winning cat 2 is:  High heels
with a cat_2_gmv value of:  129300
for brand:  5 KG(5 เคจี) Winning cat 2 is:  Pants
with a cat_2_gmv value of:  136616
for brand:  Fiftysix(ฟิฟตี้ซิกซ์) Winning cat 2 is:  High heels
with a cat_2_gmv value of:  109456
for brand:  Beming(บีมิ่

for brand:  OZUKO(โอซูโกะ) Winning cat 2 is:  Shoulder bag
with a cat_2_gmv value of:  414056
for brand:  Golden Wolf(โกลเดนวู๊ฟ) Winning cat 2 is:  Backpack
with a cat_2_gmv value of:  37136
for brand:  I cheer(ไอ เชียร์) Winning cat 2 is:  Underwear
with a cat_2_gmv value of:  1287200
for brand:  Kiddi.s Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  52000
for brand:  SAADO Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  190316
for brand:  Magofootwear(มาโกะฟุตแวร์) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  765960
for brand:  Stradivarius(สตราดิวาริอุส) Winning cat 2 is:  Denim
with a cat_2_gmv value of:  79996
for brand:  Mora Jewelry Diamond(โมรา จิลเวอรี่ ไดม่อน) Winning cat 2 is:  Rings
with a cat_2_gmv value of:  58648
for brand:  Chaps(แชปส์) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  304092
for brand:  Bunkernroi Gold(บุญเกินร้อยโกล์ด) Winning cat 2 is:  Uniforms
with a cat_2_gmv value of:  754464
for brand:  MP UNIFORM(เอ็มพียูนิฟอร์ม)

for brand:  Puri & me Winning cat 2 is:  Jackets and Coats
with a cat_2_gmv value of:  389200
for brand:  About Boy(อะเบ้าท์บอย) Winning cat 2 is:  Long pants
with a cat_2_gmv value of:  470540
for brand:  Proman(โปแมน) Winning cat 2 is:  Long pants
with a cat_2_gmv value of:  177808
for brand:  SMILE(สไมล์) Winning cat 2 is:  Long pants
with a cat_2_gmv value of:  267840
for brand:  GIOVANNI(จีโอวานี) Winning cat 2 is:  Jackets and Coats
with a cat_2_gmv value of:  81600
for brand:  Tppjeans Winning cat 2 is:  Long pants
with a cat_2_gmv value of:  81684
for brand:  Frontnine(ฟร้อนไนล์) Winning cat 2 is:  Jackets and Coats
with a cat_2_gmv value of:  214224
for brand:  Goodwin76(กู๊ดวิน76) Winning cat 2 is:  Long pants
with a cat_2_gmv value of:  143192
for brand:  The North Face(เดอะ นอร์ธ เฟส) Winning cat 2 is:  Jackets and Coats
with a cat_2_gmv value of:  326388
for brand:  Weftloom(เวลลูม) Winning cat 2 is:  Jackets and Coats
with a cat_2_gmv value of:  87120
for brand:  DOCKER(ด

for brand:  Evalyn(เอ-วา-ลีน) Winning cat 2 is:  Jumpsuit
with a cat_2_gmv value of:  520304
for brand:  Chamniii(แชมนิ) Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  4481516
for brand:  Lism Brand(ลิสซึม แบรนด์) Winning cat 2 is:  Jumpsuit
with a cat_2_gmv value of:  476640
for brand:  Krissify(คริสสิฟาย) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  3143440
for brand:  Madam peony(มาดาม พิโอนี่) Winning cat 2 is:  Wallet
with a cat_2_gmv value of:  434320
for brand:  Bonnes frazier(บอนเนส ฟาเซียร์) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  2936336
for brand:  N์apatvogue.bkk (นภัทโว๊คดอทบีเคเค) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  266200
for brand:  Basic by sita(เบสิค บาย ซิต้า) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  2954400
for brand:  Ketysmile(เคธี่สไมล์) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  273760
for brand:  Krissify(คริสซี่ฟาย) Winning cat 2 is:  Jumpsuit
with a cat_2_gmv value of:  106880
for brand:  Lille

for brand:  Truffle(ทรัฟเฟิล) Winning cat 2 is:  Shorts
with a cat_2_gmv value of:  149040
for brand:  Alin Brand(เอลิน แบรนด์) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  174860
for brand:  AMY.BRAND Winning cat 2 is:  Dress
with a cat_2_gmv value of:  73336
for brand:  Quick Step(ควิกสเตป) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  144224
for brand:  Quick Step(ควิก สเต็บ) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  208468
for brand:  ฺBellabona(เบลล่าโบนา) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  236280
for brand:  Janen Closet(จาเนน โคลเซต) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  225240
for brand:  Goodwin 76(กู๊ดวิน 76) Winning cat 2 is:  Shorts
with a cat_2_gmv value of:  135600
for brand:  Tory burch(ทอรี เบิร์ช) Winning cat 2 is:  Slippers
with a cat_2_gmv value of:  189720
for brand:  Kimber(คิมเบอร์) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  126280
for brand:  Sharya(ชาร์ยา) Winning cat 2 is:  Dress
with

for brand:  Medi(เมดิ) Winning cat 2 is:  Socks and stockings
with a cat_2_gmv value of:  1554020
for brand:  PAZZO(พาสโซ่) Winning cat 2 is:  Men's underwear
with a cat_2_gmv value of:  899960
for brand:  Giordano(จีออดาโน) Winning cat 2 is:  Men's underwear
with a cat_2_gmv value of:  150904
for brand:  Musculo Winning cat 2 is:  Shorts
with a cat_2_gmv value of:  106200
for brand:  Groovin(กรูฟวิน) Winning cat 2 is:  Men's underwear
with a cat_2_gmv value of:  93100
for brand:  Columbia(คอลัมเบีย) Winning cat 2 is:  Men's underwear
with a cat_2_gmv value of:  131964
for brand:  Feetures (ฟีเจอร์) Winning cat 2 is:  Socks
with a cat_2_gmv value of:  157880
for brand:  Dexshell(เด็กซ์เชล) Winning cat 2 is:  Socks
with a cat_2_gmv value of:  248376
for brand:  Huak Brand(ฮวกแบรนด์) Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  970160
for brand:  Polo ralph lauren(โปโลราล์ฟลอเรน) Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  6405464
for brand:  Dosmart(ดอสมาร์ท) Winning ca

for brand:  CharinBrand(ชรินทร์แบรนด์) Winning cat 2 is:  Handbag
with a cat_2_gmv value of:  75760
for brand:  Vince everyday(วินซ์ เอเวอรี่เดย์) Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  72600
for brand:  Everyday Apparels(เอวี่เดย์ แอพพาเรล) Winning cat 2 is:  Cloth bag
with a cat_2_gmv value of:  166600
for brand:  Bagtash(แบ็กแทช) Winning cat 2 is:  Slip-on leather shoes
with a cat_2_gmv value of:  101440
for brand:  Loonny(ลูนนี่) Winning cat 2 is:  Dress
with a cat_2_gmv value of:  2366320
for brand:  LamuneStudio(ละมุนสตูดิโอ) Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  370320
for brand:  Crosstwelfth(ครอสทเวลท์) Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  127800
for brand:  SAINT LAURENT(แซนลาเร้นท์) Winning cat 2 is:  T-shirt
with a cat_2_gmv value of:  1191576
for brand:  OldBlue(โอบลู) Winning cat 2 is:  Outerwear
with a cat_2_gmv value of:  94400
for brand:  ห้างเพชรทองสว่างพร(ห้างเพชรทองสว่างพร) Winning cat 2 is:  Gold
with a cat_2_gmv value o

for brand:  VS License Trading Winning cat 2 is:  Cloth bag
with a cat_2_gmv value of:  250424
for brand:  Tathata(ทาทาต้า) Winning cat 2 is:  Shoulder bag
with a cat_2_gmv value of:  176200
for brand:  20Again Winning cat 2 is:  Dress
with a cat_2_gmv value of:  509212
for brand:  Carnivalbkk(คาร์นิเวลบีเคเค) Winning cat 2 is:  Shirt
with a cat_2_gmv value of:  107880
for brand:  Niseko Winning cat 2 is:  Pants
with a cat_2_gmv value of:  99900
for brand:  Adison Winning cat 2 is:  Shorts
with a cat_2_gmv value of:  107892
for brand:  Wtpgold(ห้างทองวังทองโพธาราม) Winning cat 2 is:  Gold
with a cat_2_gmv value of:  165772
for brand:  Memo(เมโม่) Winning cat 2 is:  Outerwear
with a cat_2_gmv value of:  630720
for brand:  Memo Co(เมโม่โค) Winning cat 2 is:  Long pants
with a cat_2_gmv value of:  84280
for brand:  Cruzy(ครูซี่) Winning cat 2 is:  Heel strap shoes
with a cat_2_gmv value of:  75452
for brand:  Slip To Sleep(สลิปทูสลีป) Winning cat 2 is:  Sleepwear
with a cat_2_gmv value of

# Calculate Additional Info on Recency of Sales for Shortlisted Brands

From the final list of brands we have, add info on recency ratio eg. percentage of total gmv coming from products established at least 3, 6, 12, 24 months. 

In [35]:
#Rmk: Should still expect very different numbers on ctime (=when product was added to shopee)
#To-do: calculate annualized GMV of 3,6,12,24 mo. established products
#same procedure use the 4-month_gmv from untouched_joined_df

In [36]:
three_mo = 3 * 30 * 24 * 60 * 60
six_mo = 6 * 30 * 24 * 60 * 60
twelve_mo = 12 * 30 * 24 * 60 * 60
twenty_four_mo = 24 * 30 * 24 * 60 * 60

In [37]:
#Make four dataframes representing slices of the untouched products
#that have sold for more than 3, 6, 12, 24 months. 

mask_3 = unix_time_now - untouched_joined_df["ctime"] > three_mo
timed_df_3 = untouched_joined_df[mask_3]

mask_6 = unix_time_now - untouched_joined_df["ctime"] > six_mo
timed_df_6 = untouched_joined_df[mask_6]

mask_12 = unix_time_now - untouched_joined_df["ctime"] > twelve_mo
timed_df_12 = untouched_joined_df[mask_12]

mask_24 = unix_time_now - untouched_joined_df["ctime"] > twenty_four_mo
timed_df_24 = untouched_joined_df[mask_24]

In [38]:
for brand in brands_df.index:
    #make a sub-df containing only rows with the correct brand
    my_brand_df = timed_df_3[timed_df_3["brand"] == brand]
    
    #gmv calculation
    brand_gmv_3 = my_brand_df["4_month_gmv"].sum()
    brands_df.loc[brand, "%Total GMV from products 3 months+"] = brand_gmv_3

for brand in brands_df.index:
    #make a sub-df containing only rows with the correct brand
    my_brand_df = timed_df_6[timed_df_6["brand"] == brand]
    
    #gmv calculation
    brand_gmv_6 = my_brand_df["4_month_gmv"].sum()
    brands_df.loc[brand, "%Total GMV from products 6 months+"] = brand_gmv_6
    
for brand in brands_df.index:
    #make a sub-df containing only rows with the correct brand
    my_brand_df = timed_df_12[timed_df_12["brand"] == brand]
    
    #gmv calculation
    brand_gmv_12 = my_brand_df["4_month_gmv"].sum()
    brands_df.loc[brand, "%Total GMV from products 12 months+"] = brand_gmv_12
    
for brand in brands_df.index:
    #make a sub-df containing only rows with the correct brand
    my_brand_df = timed_df_24[timed_df_24["brand"] == brand]
    
    #gmv calculation
    brand_gmv_24 = my_brand_df["4_month_gmv"].sum()
    brands_df.loc[brand, "%Total GMV from products 24 months+"] = brand_gmv_24


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i
i


In [39]:
#Now we have baht gmv in the four recency columns, 
#just convert into ratio compared to total_gmv 
for brand in brands_df.index:
    percentage = brands_df.loc[brand, "%Total GMV from products 3 months+"] / brands_df.loc[brand, 
                                                                                   "Brand_GMV_All_Cats"]
    brands_df.loc[brand, "%Total GMV from products 3 months+"] = round(percentage, 3)
    
for brand in brands_df.index:
    percentage = brands_df.loc[brand, "%Total GMV from products 6 months+"] / brands_df.loc[brand, 
                                                                                   "Brand_GMV_All_Cats"]
    brands_df.loc[brand, "%Total GMV from products 6 months+"] = round(percentage, 3)
    
for brand in brands_df.index:
    percentage = brands_df.loc[brand, "%Total GMV from products 12 months+"] / brands_df.loc[brand, 
                                                                                   "Brand_GMV_All_Cats"]
    brands_df.loc[brand, "%Total GMV from products 12 months+"] = round(percentage, 3)
    
for brand in brands_df.index:
    percentage = brands_df.loc[brand, "%Total GMV from products 24 months+"] / brands_df.loc[brand, 
                                                                                   "Brand_GMV_All_Cats"]
    brands_df.loc[brand, "%Total GMV from products 24 months+"] = round(percentage, 3)

# Calculate Additional Info on Discounts for Shortlisted Brands

From the final list of brands we have, add info on how much they discount on ALL their products. 

In [40]:
#First, fill in 0 values in "price_before_discount" with "price" value
conditions = [untouched_joined_df['price_before_discount'].eq(0)]
choices = [untouched_joined_df["price"]]
untouched_joined_df['price_before_discount'] = np.select(conditions,choices,default=untouched_joined_df['price_before_discount'])

In [41]:
#Now, create a 4-month-full-priced-gmv 
#versus a 4-month actual gmv
untouched_joined_df["full_priced_gmv"] = untouched_joined_df["price_before_discount"] * untouched_joined_df["sold"]
untouched_joined_df["4_mo_full_priced_gmv"] = untouched_joined_df["full_priced_gmv"] * 4

In [42]:
#Now, for each brand calculate the would-be full_priced annual gmv. (go for each row)

for brand in brands_df.index:
    
    #make a sub-df containing only rows with the correct brand
    my_brand_df = untouched_joined_df[untouched_joined_df["brand"] == brand]
    
    #Approximated ANNUAL gmv calculation
    brand_full_priced_gmv = my_brand_df["4_mo_full_priced_gmv"].sum()
    brands_df.loc[brand, "Brand_Full_Priced_GMV"] = brand_full_priced_gmv #this number is inherently an annual

In [44]:
#Divide the two numbers to get brands avg. discount

brands_df["Average_Disc"] = 1 - brands_df["Brand_GMV_All_Cats"] / brands_df["Brand_Full_Priced_GMV"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# Output: Brands that Meet Our Criterion

In [45]:
brands_df = brands_df.rename(columns={"Cat_2_Percent_of_Total_GMV" : "Cat_2_Contr_to_Total_GMV"})
brands_df.sort_index(inplace=True)
display(brands_df)
#NOTE: Anything before the Brand_GMV_All_Cats uses FILTERED product data 
#i.e. data where we've the products to count for by category, and time. 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Filtered_Brand_Annual_GMV,Filtered_Average_Unit_Price,%_Filtered_GMV_from_Top_3_Products,Filtered_Weighted_Star_Rating,Filtered_Bad_Rating_Percent,Brand_GMV_All_Cats,Conversion_Rate_All_Cats_(gmv/views),Cat_2_Percent_of_Total_GMV,Brand_Category_2,%Total GMV from products 3 months+,%Total GMV from products 6 months+,%Total GMV from products 12 months+,%Total GMV from products 24 months+,Brand_Full_Priced_GMV,Average_Disc
Brand Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ARROW (แอร์โร่ว์),6781068,1261.359375,0,4.819529,0.010,6849340.0,26.391119,0.956,Shorts,1.000,1.000,0.010,0.010,12757840.0,0.463127
Adidas(อดิดาส),21242968,380.180543,0,4.666737,0.022,43556488.0,5.493020,0.270,Men's sportswear,0.902,0.581,0.183,0.042,63467788.0,0.313723
Adidas(อะดิดาส),4817288,733.448234,0,4.718524,0.014,5675624.0,4.774182,0.849,Slippers,0.988,0.864,0.202,0.031,7332240.0,0.225936
Adidas(อาดิดาส),142223340,788.605030,0,4.702753,0.013,162783916.0,5.442152,0.552,Lace-up sneakers,0.769,0.514,0.278,0.062,296695096.0,0.451343
Aerosoft(เอโรซอฟท์),4599272,595.452097,0,4.749303,0.004,4599272.0,15.417037,0.534,Casual shoes,0.988,0.695,0.143,0.076,8056852.0,0.429148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xiaomi,86480,407.924528,0,4.820886,0.006,14673588.0,12.806502,0.271,Bathroom,0.998,0.875,0.385,0.004,31247360.0,0.530406
ห้างขายทองสุวรรณภูมิ3,11410416,1671.121265,0,4.724927,0.021,11410416.0,6.988953,1.000,Gold,1.000,1.000,0.649,0.000,15022404.0,0.240440
ห้างทองรวมสินไทยขอนแก่น,82607844,3631.433269,0,4.931094,0.001,82607844.0,49.348992,0.992,Gold,0.995,0.995,0.973,0.000,106929792.0,0.227457
ห้างทองแม่ทองสุก,16158120,9925.135135,0,4.712561,0.000,16177720.0,47.280050,0.607,Gold,0.782,0.720,0.000,0.000,19974400.0,0.190077


In [48]:
display(brands_df)

Unnamed: 0_level_0,Filtered_Brand_Annual_GMV,Filtered_Average_Unit_Price,%_Filtered_GMV_from_Top_3_Products,Filtered_Weighted_Star_Rating,Filtered_Bad_Rating_Percent,Brand_GMV_All_Cats,Conversion_Rate_All_Cats_(gmv/views),Cat_2_Contr_to_Total_GMV,Brand_Category_2,%Total GMV from products 3 months+,%Total GMV from products 6 months+,%Total GMV from products 12 months+,%Total GMV from products 24 months+,Brand_Full_Priced_GMV,Average_Disc
Brand Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ARROW (แอร์โร่ว์),6781068,1261.359375,0,4.819529,0.010,6849340.0,26.391119,0.956,Shorts,1.000,1.000,0.010,0.010,12757840.0,0.463127
Adidas(อดิดาส),21242968,380.180543,0,4.666737,0.022,43556488.0,5.493020,0.270,Men's sportswear,0.902,0.581,0.183,0.042,63467788.0,0.313723
Adidas(อะดิดาส),4817288,733.448234,0,4.718524,0.014,5675624.0,4.774182,0.849,Slippers,0.988,0.864,0.202,0.031,7332240.0,0.225936
Adidas(อาดิดาส),142223340,788.605030,0,4.702753,0.013,162783916.0,5.442152,0.552,Lace-up sneakers,0.769,0.514,0.278,0.062,296695096.0,0.451343
Aerosoft(เอโรซอฟท์),4599272,595.452097,0,4.749303,0.004,4599272.0,15.417037,0.534,Casual shoes,0.988,0.695,0.143,0.076,8056852.0,0.429148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xiaomi,86480,407.924528,0,4.820886,0.006,14673588.0,12.806502,0.271,Bathroom,0.998,0.875,0.385,0.004,31247360.0,0.530406
ห้างขายทองสุวรรณภูมิ3,11410416,1671.121265,0,4.724927,0.021,11410416.0,6.988953,1.000,Gold,1.000,1.000,0.649,0.000,15022404.0,0.240440
ห้างทองรวมสินไทยขอนแก่น,82607844,3631.433269,0,4.931094,0.001,82607844.0,49.348992,0.992,Gold,0.995,0.995,0.973,0.000,106929792.0,0.227457
ห้างทองแม่ทองสุก,16158120,9925.135135,0,4.712561,0.000,16177720.0,47.280050,0.607,Gold,0.782,0.720,0.000,0.000,19974400.0,0.190077


In [None]:
#make to csv here if desired 
#brands_df.to_csv("test_3.csv")

In [49]:
brands_df.columns

Index(['Filtered_Brand_Annual_GMV', 'Filtered_Average_Unit_Price',
       '%_Filtered_GMV_from_Top_3_Products', 'Filtered_Weighted_Star_Rating',
       'Filtered_Bad_Rating_Percent', 'Brand_GMV_All_Cats',
       'Conversion_Rate_All_Cats_(gmv/views)', 'Cat_2_Contr_to_Total_GMV',
       'Brand_Category_2', '%Total GMV from products 3 months+',
       '%Total GMV from products 6 months+',
       '%Total GMV from products 12 months+',
       '%Total GMV from products 24 months+', 'Brand_Full_Priced_GMV',
       'Average_Disc'],
      dtype='object')

In [54]:
brands_df = brands_df[['Filtered_Brand_Annual_GMV', 'Brand_Full_Priced_GMV', 
                       'Filtered_Average_Unit_Price', 
                       'Filtered_Weighted_Star_Rating',
                       'Filtered_Bad_Rating_Percent', 'Brand_GMV_All_Cats',
                       'Conversion_Rate_All_Cats_(gmv/views)', 'Cat_2_Contr_to_Total_GMV',
                       'Brand_Category_2', '%Total GMV from products 3 months+',
                       '%Total GMV from products 6 months+',
                       '%Total GMV from products 12 months+',
                       '%Total GMV from products 24 months+',
                       'Average_Disc']]

#figure this out:'%_Filtered_GMV_from_Top_3_Products'

In [55]:
brands_df.to_csv("fashion_brands_shortlist.csv")

***
# Nakrin's Task of Finding Cat 1 Annual GMV Values from the Data

# Calculate Cat 1 Annual GMV Values for the categories that McGroup is in

In [10]:
#Approach 1: Use static Data (less accurate b/c using 1-month snapshot)

cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

#Create new df to store cat 1 info
results_df = pd.DataFrame()
i = 0
for cat_1 in cat_1_focus:
    
    #take slice of products_static df include only the right cat
    mask = products_static["category_one_en"] == cat_1
    cat_1_df = products_static[mask]
    
    cat_1_gmv = cat_1_df["gmv"].sum()
    
    results_df.loc[i, "Category_One"] = cat_1
    results_df.loc[i, "Category_One_Dec_GMV"] = cat_1_gmv
    results_df.loc[i, "Category_One_Annual_GMV"] = cat_1_gmv * 12
    
    i += 1

results_df 

#Approach 2: Use daily data 

'''
#Interpretation: Use Nick's Interpretation i.e. if a product
#is sold only 1 of 3 months, we assume it is sold only 8 of 12
#months. 

#Implementation: create new column for products_daily called
#4-month GMV. Then just sum up 4-month GMV column to get annual. 
#Products that were present in all Oct, Nov, Dec will have 12 months
#worth of GMV then, weighted equally across Oct, Nov, Dec GMVs. 

#Would be interesting to test via a lower bound methhod i.e. include
#only products that have been on for all Oct, Nov, Dec, but am unsure 
#how to do that efficiently 
'''
joined_df["4_month_gmv"] = joined_df["gmv"] * 4 

cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

#Create new df to store cat 1 info
results_df = pd.DataFrame()
i = 0
for cat_1 in cat_1_focus:
    
    #take slice of products_static df include only the right cat
    mask = joined_df["category_one_en"] == cat_1
    cat_1_df = joined_df[mask]
    
    cat_1_annual_gmv = cat_1_df["4_month_gmv"].sum()
    
    results_df.loc[i, "Category_One"] = cat_1
    results_df.loc[i, "Category_One_Annual_GMV"] = cat_1_annual_gmv
    i += 1

results_df 

#results_df.to_csv("cat_1_gmvs.csv")

Unnamed: 0,Category_One,Category_One_Dec_GMV,Category_One_Annual_GMV
0,Accessories,253648878.0,3043787000.0
1,Bag,280760974.0,3369132000.0
2,Men's fashion,185695784.0,2228349000.0
3,Men's Shoes,115676793.0,1388122000.0
4,Women's fashion,548695736.0,6584349000.0
5,Women's Shoes,131829899.0,1581959000.0


# Calculating Cat 1 Growth

In [47]:
Oct = 1604102400
Nov = 1606694400
Dec = 1609372800

mask_oct = joined_df["added_at"] == Oct
oct_data = joined_df[mask_oct]

mask_nov = joined_df["added_at"] == Nov
nov_data = joined_df[mask_nov]

mask_dec = joined_df["added_at"] == Dec
dec_data = joined_df[mask_dec]

In [73]:
cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

#Create new df to store cat 1 info
results_df = pd.DataFrame()
i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat
    mask = oct_data["category_one_en"] == cat_1
    cat_1_df = oct_data[mask]
    
    cat_1_gmv = cat_1_df["gmv"].sum()
    
    results_df.loc[i, "Category_One"] = cat_1
    results_df.loc[i, "Category_One_Oct_GMV"] = cat_1_gmv
    
    i += 1

i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat
    mask = nov_data["category_one_en"] == cat_1
    cat_1_df = nov_data[mask]
    
    cat_1_gmv = cat_1_df["gmv"].sum()
    
    results_df.loc[i, "Category_One"] = cat_1
    results_df.loc[i, "Category_One_Nov_GMV"] = cat_1_gmv
    
    i += 1

i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat
    mask = dec_data["category_one_en"] == cat_1
    cat_1_df = dec_data[mask]
    
    cat_1_gmv = cat_1_df["gmv"].sum()
    
    results_df.loc[i, "Category_One"] = cat_1
    results_df.loc[i, "Category_One_Dec_GMV"] = cat_1_gmv
    
    i += 1
    
results_df["Oct_to_Nov"] = results_df["Category_One_Nov_GMV"] / results_df["Category_One_Oct_GMV"] - 1
results_df["Nov_to_Dec"] = results_df["Category_One_Dec_GMV"] / results_df["Category_One_Nov_GMV"] - 1

results_df

Unnamed: 0,Category_One,Category_One_Oct_GMV,Category_One_Nov_GMV,Category_One_Dec_GMV,Oct_to_Nov,Nov_to_Dec
0,Accessories,187904865.0,177479220.0,246703309.0,-0.055484,0.390041
1,Bag,239114870.0,228619246.0,263781015.0,-0.043894,0.153801
2,Men's fashion,193851253.0,165768971.0,188135247.0,-0.144865,0.134924
3,Men's Shoes,101629221.0,90250304.0,104093270.0,-0.111965,0.153384
4,Women's fashion,537097328.0,528241014.0,557742972.0,-0.016489,0.055849
5,Women's Shoes,126427631.0,115749937.0,137222235.0,-0.084457,0.185506


In [74]:
results_df.to_csv("Cat_1_Growth.csv")

# Calculate Cat 2 Annual GMV Values for the categories that McGroup is in

In [49]:
'''
#Make a df for each cat 1 then run through cat 2's. 

#Approach 1: Use static Data (less accurate b/c using 1-month snapshot)
cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

#Create new df to store cat 1 info
results_df = pd.DataFrame()
i = 0
for cat_1 in cat_1_focus:
    
    #take slice of products_static df include only the right cat
    mask = products_static["category_one_en"] == cat_1
    cat_1_df = products_static[mask]
    
    #now iterate over cat_2's that are sub to this cat_1
    for cat_2 in cat_1_df["category_two_en"].unique():
        mask2 = cat_1_df["category_two_en"] == cat_2
        cat_2_df = cat_1_df[mask2]
        
        cat_2_gmv = cat_2_df["product_gmv"].sum()
        
        results_df.loc[i, "Category_One"] = cat_1
        results_df.loc[i, "Category_Two"] = cat_2
        results_df.loc[i, "Category_Two_Dec_GMV"] = cat_2_gmv
        results_df.loc[i, "Category_Two_Annual_GMV"] = cat_2_gmv * 12
        
        i += 1

results_df 
'''

Unnamed: 0,Category_One,Category_Two,Category_Two_Dec_GMV,Category_Two_Annual_GMV
0,Accessories,Belt,9927493.0,119129916.0
1,Accessories,Necklace,16302723.0,195632676.0
2,Accessories,Rings,7785881.0,93430572.0
3,Accessories,Bracelet,13683288.0,164199456.0
4,Accessories,Earrings,8461123.0,101533476.0
...,...,...,...,...
72,Women's Shoes,Canvas shoes,34359645.0,412315740.0
73,Women's Shoes,Accessories for shoes,2090580.0,25086960.0
74,Women's Shoes,Boots and ankle boots,2840056.0,34080672.0
75,Women's Shoes,Socks and stockings,14493562.0,173922744.0


In [67]:
#Approach 2: Use Daily Data


#Create new df to store results
results_df = pd.DataFrame()
i = 0

cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

for cat_1 in cat_1_focus: 
    
    #take slice of products_static df include only the right cat_1
    mask = joined_df["category_one_en"] == cat_1
    cat_1_df = joined_df[mask]
    
    #now iterate over cat_2's that are sub to this cat_1
    for cat_2 in cat_1_df["category_two_en"].unique():
        mask2 = cat_1_df["category_two_en"] == cat_2
        cat_2_df = cat_1_df[mask2]
        
        cat_2_annual_gmv = cat_2_df["4_month_gmv"].sum()
        
        results_df.loc[i, "Category_One"] = cat_1
        results_df.loc[i, "Category_Two"] = cat_2
        results_df.loc[i, "Category_Two_Annual_GMV"] = cat_2_annual_gmv
        
        i += 1

results_df 

Unnamed: 0,Category_One,Category_Two,Category_Two_Annual_GMV
0,Accessories,Belt,106183840.0
1,Accessories,Necklace,165866056.0
2,Accessories,Rings,79873036.0
3,Accessories,Key ring,26672396.0
4,Accessories,Earrings,91987748.0
...,...,...,...
72,Women's Shoes,Casual shoes,69383012.0
73,Women's Shoes,Accessories for shoes,25177912.0
74,Women's Shoes,Boots and ankle boots,30312832.0
75,Women's Shoes,Socks and stockings,147970364.0


In [69]:
results_df = results_df.sort_values(by=["Category_One", "Category_Two"])

In [70]:
results_df.to_csv("cat_2_gmv.csv")

# Calculate Cat 2 Growth

In [71]:
cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]

#Create new df to store cat 1 info
results_df = pd.DataFrame()
i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat_1
    mask = oct_data["category_one_en"] == cat_1
    cat_1_df = oct_data[mask]
    
    #now iterate over cat_2's that are sub to this cat_1
    for cat_2 in cat_1_df["category_two_en"].unique():
        mask2 = cat_1_df["category_two_en"] == cat_2
        cat_2_df = cat_1_df[mask2]
        
        cat_2_gmv = cat_2_df["gmv"].sum()
        
        results_df.loc[i, "Category_One"] = cat_1
        results_df.loc[i, "Category_Two"] = cat_2
        results_df.loc[i, "Category_Two_Oct_GMV"] = cat_2_gmv
        
        i += 1

i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat
    mask = nov_data["category_one_en"] == cat_1
    cat_1_df = nov_data[mask]
    
    #now iterate over cat_2's that are sub to this cat_1
    for cat_2 in cat_1_df["category_two_en"].unique():
        mask2 = cat_1_df["category_two_en"] == cat_2
        cat_2_df = cat_1_df[mask2]
        
        cat_2_gmv = cat_2_df["gmv"].sum()
        
        results_df.loc[i, "Category_One"] = cat_1
        results_df.loc[i, "Category_Two"] = cat_2
        results_df.loc[i, "Category_Two_Nov_GMV"] = cat_2_gmv
        
        i += 1

i = 0
for cat_1 in cat_1_focus:
    #take slice of products_static df include only the right cat
    mask = dec_data["category_one_en"] == cat_1
    cat_1_df = dec_data[mask]
    
    #now iterate over cat_2's that are sub to this cat_1
    for cat_2 in cat_1_df["category_two_en"].unique():
        mask2 = cat_1_df["category_two_en"] == cat_2
        cat_2_df = cat_1_df[mask2]
        
        cat_2_gmv = cat_2_df["gmv"].sum()
        
        results_df.loc[i, "Category_One"] = cat_1
        results_df.loc[i, "Category_Two"] = cat_2
        results_df.loc[i, "Category_Two_Dec_GMV"] = cat_2_gmv
        
        i += 1
    
results_df["Oct_to_Nov"] = results_df["Category_Two_Nov_GMV"] / results_df["Category_Two_Oct_GMV"] - 1
results_df["Nov_to_Dec"] = results_df["Category_Two_Dec_GMV"] / results_df["Category_Two_Nov_GMV"] - 1

results_df

Unnamed: 0,Category_One,Category_Two,Category_Two_Oct_GMV,Category_Two_Nov_GMV,Category_Two_Dec_GMV,Oct_to_Nov,Nov_to_Dec
0,Accessories,Belt,9672038.0,7501793.0,9372129.0,-0.224383,0.249319
1,Accessories,Necklace,15060666.0,13256299.0,13149549.0,-0.119807,-0.008053
2,Accessories,Bracelet,6908711.0,1791204.0,12462170.0,-0.740733,5.957426
3,Accessories,Earrings,2600376.0,9631080.0,7340383.0,2.703726,-0.237844
4,Accessories,Hat,8364018.0,7292536.0,24775045.0,-0.128106,2.397315
...,...,...,...,...,...,...,...
72,Women's Shoes,Canvas shoes,6593395.0,5330739.0,32343549.0,-0.191503,5.067367
73,Women's Shoes,Accessories for shoes,2265966.0,1976549.0,2051963.0,-0.127723,0.038154
74,Women's Shoes,Boots and ankle boots,2762158.0,2138025.0,2678025.0,-0.225958,0.252570
75,Women's Shoes,Socks and stockings,11416117.0,11390633.0,14185841.0,-0.002232,0.245395


In [72]:
results_df = results_df.sort_values(by=["Category_One", "Category_Two"])
results_df.to_csv("cat_2_growth.csv")

### Export joined df into Tableau for effective visualization. 

In [88]:
#Include only needed info like product_gmv, categories, month to make Tableau fast. 
tableau_df = joined_df[["added_at", "gmv", "category_one_en", "category_two_en"]]

cat_1_focus = ["Accessories", "Bag", "Men's fashion", 
               "Men's Shoes", "Women's fashion", 
               "Women's Shoes"]
mask = tableau_df["category_one_en"].isin(cat_1_focus)
tableau_df = tableau_df[mask]

tableau_df = tableau_df.sort_values(by=["category_one_en", "category_two_en"])

In [97]:
tableau_df["Month"] = 0
tableau_df.sample(n=10)

Unnamed: 0,added_at,gmv,category_one_en,category_two_en,Month
1658019,1609372800,9880,Men's fashion,Uniforms,0
3754345,1604102400,119,Bag,Cloth bag,0
2092491,1604102400,13497,Women's fashion,Jackets and Coats,0
5502153,1609372800,185,Bag,Handbag,0
7001196,1609372800,452,Men's fashion,Shirt,0
5165513,1606694400,7500,Women's Shoes,Socks and stockings,0
1773128,1604102400,290,Bag,Backpack,0
4829248,1604102400,882,Men's fashion,Shorts,0
4997940,1606694400,654,Women's fashion,Sets,0
6331011,1606694400,139,Women's fashion,Sleepwear,0


In [96]:
tableau_df["added_at"].unique()


[1604102400, 1606694400, 1609372800]
Categories (3, int64): [1604102400, 1606694400, 1609372800]

In [100]:
tableau_df.loc[tableau_df["added_at"] == 1604102400, "Month"] = "October"
tableau_df.loc[tableau_df["added_at"] == 1606694400, "Month"] = "November"
tableau_df.loc[tableau_df["added_at"] == 1609372800, "Month"] = "December"

In [105]:
tableau_df["Month"] = tableau_df["Month"].astype("category")

In [107]:
tableau_df.to_csv("cat_sizes_for_McGroup_Presentation_Jan_8.csv")

# End of Nakrin's Task
***