In [1]:
import pandas as pd
import numpy as np
products_static = pd.read_csv("products_static_1607946056.csv")

# INPUTS: 
Input the values you would like to filter by:

In [2]:
#Filter 1: Categories
(cat_1_on, cat_2_on, cat_3_on) = False, True, False
cat_lst_1 = []
cat_lst_2 = [12999, 1301, 21762, 1297, 1295, 17095, 893, 
              17089, 12738, 2037,2035, 9197, 20768, 2033, 
              1329, 14492, 14503, 21849, 16502, 16500]
cat_lst_3 = []

#Filter 2: Times (Update-status, how long been sold)
time_on = True
unix_time_now = 1607946056 #Change based on Nick's input
min_years = 1
updated_previously_weeks = 2


#Filter 3: Brand GMV
min_gmv_on = False
min_gmv = 1000

#Filter 4: Brand Avg Price 
min_unit_price_on = True
min_unit_price = 300

#Filter 5: GMV Concentration
min_gmv_conc_on = False
top_n = 3 #concetration num of products
min_concentration = 0.2

#Filter 6: Avg Product Rating
min_avg_star_on = False
min_avg_star = 4.5

#Filter 7: Bad Customer Ratings 
max_bad_rating_on = False 
max_bad_rating = 0.5

#Filter 8: Brand TOTAL GMV 
min_total_brand_gmv_on = True
min_total_brand_gmv = 2000000 / 12

# Import Data

In [3]:
shops_gmv = pd.read_csv("shops_gmv_data.csv")
products_static = pd.read_csv("products_static_1607946056.csv")
models_static = pd.read_csv("models_static_data.csv")

Change data types in dataframes to conserve memory:

In [4]:
def convert_to_cats(df, to_convert):
    '''
    Converts cols in to_convert_list to categories. 
    
    Inputs:
        df: a DataFrame
        to_convert: a list of column names (as strings)
                    we'd like to convert
    Returns:
        Nothing; modifies dataframe in place
    '''
    for col in df.columns:
        if col in to_convert:
            df[col] = df[col].astype("category")

In [5]:
to_convert_models = ['category_one', 'category_one_en',
                     'category_one_th', 'category_two',
                     'category_two_en', 'category_two_th',
                     'category_three', 'category_three_en',
                     'category_three_th']
convert_to_cats(models_static, to_convert_models)

to_convert_products = ['category_one', 'category_one_en',
                      'category_one_th', 'category_two', 
                      'category_two_en', 'category_two_th',
                      'category_three', 'category_three_en', 
                      'category_three_th', 'reviews_count_context', 
                      'reviews_count_image', 'shopee_verified', 
                      'show_discount']
convert_to_cats(products_static, to_convert_products)

to_convert_shops = ["shopid", "category_one", "category_two",
                    "category_three"]
convert_to_cats(shops_gmv, to_convert_shops)

In [6]:
def downcast_numbers(df):
    '''
    Downcasts floats and ints.
    
    Inputs:
        df: a DataFrame objecet. 
    Returns:
        Nothing; modifies list in place
    '''
    for col in df.columns:
        if df[col].dtype == "float":
            df[col] = pd.to_numeric(df[col], downcast="float")
        if df[col].dtype == "int":
            df[col] = pd.to_numeric(df[col], downcast="unsigned")

In [7]:
downcast_numbers(models_static)
downcast_numbers(products_static)
downcast_numbers(shops_gmv)

#For Step 9:
products_static["product_gmv"] = products_static["price"] * products_static["sold"] 
products_static_untouched = products_static.copy(deep=True)


# Filter 1: Category Filter

Filter our DataFrames to only keep rows with Categories we care for. 


In [8]:
def filter_by_category(df, category_n, ok_cat_lst):
    '''
    Keeps rows in the df that have cats in ok_cat_lst. 
    
    Inputs:
        df: a DataFrame
        category_n: (str) eg. "category_one"
        ok_cat_lst: (lst) of acceptable cat for that 
          cat as numbers eg [50, 26]
    Returns:
        Nothing; modifies list in place
    '''
    
    mask = df[category_n].isin(ok_cat_lst)
    df = df[mask]

In [9]:
def filter_by_category(df, category_n, ok_cat_lst):
    '''
    Keeps rows in the df that have cats in ok_cat_lst. 
    
    Inputs:
        df: a DataFrame
        category_n: (str) eg. "category_one"
        ok_cat_lst: (lst) of acceptable cat for that 
          cat as numbers eg [50, 26]
    Returns:
        The modified df 
    '''
    
    mask = df[category_n].isin(ok_cat_lst)
    return df[mask]

In [10]:
if cat_1_on:
    models_static = filter_by_category(models_static, "category_one", cat_lst_1)
    products_static = filter_by_category(products_static, "category_one", cat_lst_1) 

In [11]:
if cat_2_on:
    models_static = filter_by_category(models_static, "category_two", cat_lst_2)
    products_static = filter_by_category(products_static, "category_two", cat_lst_2) 

In [12]:
if cat_3_on:
    models_static = filter_by_category(models_static, "category_three", cat_lst_3)
    products_static = filter_by_category(products_static, "category_three", cat_lst_3) 

# Filter 2: Keep only Established Products and Products that Sellers are Updating

Filter out products that have been sold for less than a specified time. 

In [13]:
if time_on: 
    from datetime import datetime
    import math 

    def filter_out_new_products(df, min_years, unix_time_now):
        '''
        Filters out rows from our products_static df that 
        have been sold less than a specified time
        '''

        min_s = min_years * 365 * 24 * 60 * 60 

        mask = (unix_time_now - products_static["ctime"] > min_s)

        return df[mask]

    products_static = filter_out_new_products(products_static, min_years,
                                              unix_time_now)

    def filter_out_unupdated_products(df, updated_previously_weeks):
        '''
        Filters out rows from our products_static df that 
        have been sold less than a specified time
        '''

        unix_time_now = math.ceil(datetime.now().timestamp())

        max_s = updated_previously_weeks * 7 * 24 * 60 * 60

        mask = (unix_time_now - products_static["modified_at"] < max_s)

        return df[mask]

    products_static = filter_out_unupdated_products(products_static, 
                                                    updated_previously_weeks)

# Brand Definition

Extract unique brand names from the remaining products

In [14]:
mask = products_static["brand"] != "No Brand(ไม่มียี่ห้อ)"
products_static = products_static[mask]

brands_list = products_static["brand"].unique()
brands_list = np.delete(brands_list, np.where(brands_list == ('No Brand(ไม่มียี่ห้อ)')))

In [15]:
#Clean up brands_df
brands_df = pd.DataFrame(brands_list, columns = ["Brand Name"])

brands_df.dropna(inplace=True)

s1 = brands_df["Brand Name"] != "None"
brands_df = brands_df[s1]

s2 = brands_df["Brand Name"] != "0"
brands_df = brands_df[s2] 


In [16]:
#Set up columns for our brands_df
brands_df["Brand_GMV"] = 0
brands_df["Average_Unit_Price"] = 0
brands_df["GMV_Concentration"] = 0
brands_df["Weighted_Star_Rating"] = 0
brands_df["Bad_Rating_Percent"] = 0
brands_df.set_index("Brand Name", inplace = True)

In [17]:
#Make new column in products_static for GMV of a product (price * sold)
products_static["weighted_star"] = products_static["rating_star"] * products_static["sold"]

In [18]:
for brand in brands_df.index:
    
    #make a sub-df containing only rows with the correct brand
    my_brand_df = products_static[products_static["brand"] == brand]
    my_brand_df = my_brand_df.sort_values(by=["product_gmv"], ascending=False) 
    
    #gmv calculation
    brand_gmv = my_brand_df["product_gmv"].sum()
    brands_df.loc[brand, "Brand_GMV"] = brand_gmv
    
    #volume calculation
    brand_volume = my_brand_df["sold"].sum()
    brands_df.loc[brand, "Average_Unit_Price"] = brand_gmv / brand_volume
    
    #GMV Concentration Calculation
    if len(my_brand_df) >= top_n: #in case brand sells less than like 5 products
        top_n_df = my_brand_df.head(top_n)
        top_n_gmv = top_n_df["product_gmv"].sum()
        brands_df.loc[brand, "GMV_Concentration"] = top_n_gmv / brand_gmv
        
    #Weighted Star Rating
    total_star = my_brand_df["weighted_star"].sum()
    brands_df.loc[brand, "Weighted_Star_Rating"] = total_star / brand_volume
    
    #Bad Rating Count
    bad_rating_count = my_brand_df["rating_count_one"].sum() + my_brand_df["rating_count_two"].sum()
    total_rating_count = my_brand_df["rating_count_total"].sum()
    
    if total_rating_count != 0:
        val = bad_rating_count / total_rating_count 
    else: 
        val = -1
    
    brands_df.loc[brand, "Bad_Rating_Percent"] = val

brands_df["Bad_Rating_Percent"] = brands_df["Bad_Rating_Percent"].round(decimals=3)

# Filter 3: Brand Total GMV

In [19]:
if min_gmv_on:
    brands_df = brands_df[brands_df["Brand_GMV"] >= min_gmv]

# Filter 4: Brand Average Price

In [20]:
if min_unit_price_on:
    brands_df = brands_df[brands_df["Average_Unit_Price"] >= min_unit_price]

# Filter 5: GMV Concentration

In [21]:
if min_gmv_conc_on:
    brands_df = brands_df[brands_df["GMV_Concentration"] >= min_concentration]

# Filter 6: Average Product Rating

In [22]:
if min_avg_star_on:
    brands_df = brands_df[brands_df["Weighted_Star_Rating"] >= min_avg_star]

# Filter 7: Bad Customer Ratings

In [23]:
if max_bad_rating_on:
    brands_df = brands_df[brands_df["Bad_Rating_Percent"] <= max_bad_rating]

# (GMV Growth?)

In [24]:
# number of brands in our brands_df
print(len(brands_df))

2573


# Filter 8: Brand GMV (Across All Categories) 

Reimport the products data but only filter for the brands in our df. Note: We do this step at the end b/c we'll have to reimport all of products data, so makes sense to have the smallest brands list possible. 

### Calculate Category of the Brands

Do this by looping over again, since going through each cat 2 for each brand is computationally expensive, want to get the brands list down as much as possible.
Note: 
- If set min_total_brand_gmv = 2,000,000 / 12, go from 415 --> 415
- If set min_total_brand_gmv = 2,000,000 / 10, go from 415 --> 364
- If set min_total_brand_gmv = 2,000,000 / 8, go from 415 --> 314
- If set min_total_brand_gmv = 2,000,000 / 6, go from 415 --> 258
- If set min_total_brand_gmv = 2,000,000 / 4, go from 415 --> 193

This is just a sample but goes to show that doing our brand's cat_2 calculation can be expensive, so that's why I wanted to put it after we had chipped away at list. However, if we set min_total_brand_gmv = 2,000,000 / 12, there's no effect so might as well just include our brand cat 2 calculation in Filter 8's for loop. 


In [51]:
#Assume this is always kept on for simplicity
if min_total_brand_gmv_on:
        
    for brand in brands_df.index:

        #make a sub-df containing only rows with the correct brand
        mask_brand = products_static_untouched["brand"] == brand
        my_brand_df = products_static_untouched[mask_brand]

        #TOTAL Brand gmv calculation
        total_brand_gmv = my_brand_df["product_gmv"].sum()
        brands_df.loc[brand, "Brand_GMV_All_Cats"] = total_brand_gmv
        
        #Total brand view count 
        total_brand_views = my_brand_df["view_count"].sum()
        conv_rate = total_brand_gmv / total_brand_views
        brands_df.loc[brand, "Conversation_Rate_All_Cats"] = conv_rate
        
        #--------------------------------------------------------------
        #Take this out and put later if the min_total_brand_gmv 
        #becomes higher, otherwise inclue this in the for loop
        winning_cat_2 = None
        winning_cat_2_gmv = 0.0

        for cat_2 in my_brand_df["category_two_en"].unique():
            mask_cat_2 = my_brand_df["category_two_en"] == cat_2
            cat_2_df = my_brand_df[mask_cat_2]

            cat_2_gmv = cat_2_df["product_gmv"].sum()
            if cat_2_gmv > winning_cat_2_gmv:
                winning_cat_2 = cat_2
                winning_cat_2_gmv = cat_2_gmv
                    
        percent = (winning_cat_2_gmv / total_brand_gmv).round(decimals=3)
        brands_df.loc[brand, "Brand_Cat_2_Contr"] = percent 
        
        brands_df.loc[brand, "Brand_Category_2"] = winning_cat_2
        
        
        print("for brand: ", brand, "Winning cat 2 is: ", winning_cat_2)
        print("with a cat_2_gmv value of: ", winning_cat_2_gmv)

        
        #--------------------------------------------------------------
            
        
        
    #Apply the total brand gmv filter
    brands_df = brands_df[brands_df["Brand_GMV_All_Cats"] >= min_total_brand_gmv]
    
brands_df = brands_df[brands_df["Brand_GMV_All_Cats"] >= min_total_brand_gmv]

for brand:  Genie Bra(จีนี่ บรา) Winning cat 2 is:  Underwear
with a cat_2_gmv value of:  200022
for brand:  SKG(เอสเคจี) Winning cat 2 is:  Small kitchen appliances
with a cat_2_gmv value of:  1182053
for brand:  Bandai(บันได)​ Winning cat 2 is:  Game collectibles
with a cat_2_gmv value of:  1892788
for brand:  BANDAI(บันได) Winning cat 2 is:  Game collectibles
with a cat_2_gmv value of:  366985
for brand:  Bandai(บันได) Winning cat 2 is:  Game collectibles
with a cat_2_gmv value of:  260032
for brand:  WELNESS(เวลเนส) Winning cat 2 is:  Health equipment
with a cat_2_gmv value of:  517900
for brand:  Triumph(ไทรอัมพ์) Winning cat 2 is:  Underwear
with a cat_2_gmv value of:  168954
for brand:  Bandai Namco (บันได นัมโกะ) Winning cat 2 is:  Game collectibles
with a cat_2_gmv value of:  191944
for brand:  Columbia(โคลัมเบีย) Winning cat 2 is:  Men's sportswear
with a cat_2_gmv value of:  226680
for brand:  Wacoal Winning cat 2 is:  Underwear
with a cat_2_gmv value of:  281539
for brand: 

for brand:  Grand sport(แกรนด์สปอร์ต) Winning cat 2 is:  Men's sportswear
with a cat_2_gmv value of:  90735
for brand:  Frolina(โฟรลินา) Winning cat 2 is:  Bathroom
with a cat_2_gmv value of:  175381
for brand:  Stiebel Eltron(สตีเบล เอลทรอน) Winning cat 2 is:  Small kitchen appliances
with a cat_2_gmv value of:  1677405
for brand:  Stiebel Eltron Winning cat 2 is:  Water heater
with a cat_2_gmv value of:  551895
for brand:  PETKIT(เพ็ทคิท) Winning cat 2 is:  Cat
with a cat_2_gmv value of:  510953
for brand:  Petkit(เพ็ทคิท) Winning cat 2 is:  Cat
with a cat_2_gmv value of:  523179
for brand:  Certainty Winning cat 2 is:  Health equipment
with a cat_2_gmv value of:  4474548
for brand:  CERTAINTY(เซอร์เทนตี้) Winning cat 2 is:  Diapers and wet wipes
with a cat_2_gmv value of:  1852159
for brand:  Certainty(เซอร์เทนตี้) Winning cat 2 is:  Health equipment
with a cat_2_gmv value of:  2806862
for brand:  CUSHY(คุซชี่) Winning cat 2 is:  Furniture
with a cat_2_gmv value of:  162855
for bran

for brand:  Colandas(คอแลนดาส) Winning cat 2 is:  Small kitchen appliances
with a cat_2_gmv value of:  288984
for brand:  Royal Canin Winning cat 2 is:  Cat
with a cat_2_gmv value of:  657486
for brand:  Taste Of The Wild(เท็ดออฟเดอะไวท์) Winning cat 2 is:  Cat
with a cat_2_gmv value of:  435941
for brand:  MARA(มาร่า) Winning cat 2 is:  Small kitchen appliances
with a cat_2_gmv value of:  478718
for brand:  Fast Pure(ฟาส์ท เพียว) Winning cat 2 is:  Small kitchen appliances
with a cat_2_gmv value of:  672041
for brand:  SHARP Winning cat 2 is:  Small kitchen appliances
with a cat_2_gmv value of:  140576
for brand:  Mazuma(มาซูม่า) Winning cat 2 is:  Small kitchen appliances
with a cat_2_gmv value of:  532426
for brand:  YOUHA PLUS (ยูฮาพลัส) Winning cat 2 is:  Baby goods
with a cat_2_gmv value of:  303990
for brand:  Freena(ฟรีน่า) Winning cat 2 is:  Baby goods
with a cat_2_gmv value of:  354956
for brand:  Dog days(ด็อกเดย์) Winning cat 2 is:  Dog
with a cat_2_gmv value of:  224208
fo

for brand:  Miren(มิเร้น) Winning cat 2 is:  Furniture
with a cat_2_gmv value of:  1745687
for brand:  Clearblue(เคลียร์บลู) Winning cat 2 is:  Health equipment
with a cat_2_gmv value of:  196700
for brand:  Inn Home(อินน์โฮม) Winning cat 2 is:  Furniture
with a cat_2_gmv value of:  496759
for brand:  Diy Siam(ดีไอวาย สยาม） Winning cat 2 is:  Camping and hiking
with a cat_2_gmv value of:  1149680
for brand:  Carlino Design(คาลิโน่ ดีไซน์) Winning cat 2 is:  Furniture
with a cat_2_gmv value of:  388617
for brand:  Myhome Design(มายโฮม ดีไซน์) Winning cat 2 is:  Furniture
with a cat_2_gmv value of:  350862
for brand:  IKEA(อีเกีย) Winning cat 2 is:  Bathroom
with a cat_2_gmv value of:  1152210
for brand:  Inntech(อินน์เทค) Winning cat 2 is:  Power tools and tools
with a cat_2_gmv value of:  276652
for brand:  BG Winning cat 2 is:  Fitness and exercise equipment
with a cat_2_gmv value of:  206889
for brand:  Beanbag(บีนแบ็ค) Winning cat 2 is:  Health food supplements
with a cat_2_gmv valu

for brand:  Topper(ทอปเปอร์) Winning cat 2 is:  Bedroom
with a cat_2_gmv value of:  173048
for brand:  Denso(เดนโซ) Winning cat 2 is:  Spare parts and car accessories
with a cat_2_gmv value of:  672064
for brand:  FLEX(เฟล็กซ์) Winning cat 2 is:  Spare parts and car accessories
with a cat_2_gmv value of:  331180
for brand:  YSS(วายเอสเอส) Winning cat 2 is:  Spare parts and motorcycle accessories
with a cat_2_gmv value of:  793253
for brand:  NGK(เอ็นจีเค) Winning cat 2 is:  Spare parts and car accessories
with a cat_2_gmv value of:  169912
for brand:  Aisin(ไอซิน) Winning cat 2 is:  Oil and liquid
with a cat_2_gmv value of:  468129
for brand:  Diff(ดิฟฟ์) Winning cat 2 is:  Car accessories
with a cat_2_gmv value of:  188285
for brand:  Hitachi(ฮิตาชิ) Winning cat 2 is:  Home improvement equipment
with a cat_2_gmv value of:  1607302
for brand:  No Name(โนเนม) Winning cat 2 is:  Sleepwear
with a cat_2_gmv value of:  7595327
for brand:  Nexzter(เน็กซ์เตอร์) Winning cat 2 is:  Spare parts 

for brand:  Centrum(เซนทรัม) Winning cat 2 is:  Health food supplements
with a cat_2_gmv value of:  1005487


# Output: Brands that Meet Our Criterion

In [52]:
brands_df

Unnamed: 0_level_0,Brand_GMV,Average_Unit_Price,GMV_Concentration,Weighted_Star_Rating,Bad_Rating_Percent,Brand_GMV_All_Cats,Brand_Category_2,Brand_Cat_2_Contr,Conversation_Rate_All_Cats
Brand Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Genie Bra(จีนี่ บรา),138452,854.641975,0.591974,4.832525,0.004,200022.0,Underwear,1.000,12.231517
SKG(เอสเคจี),747026,373.513000,0.595177,4.639416,0.022,2329345.0,Small kitchen appliances,0.507,8.569944
Bandai(บันได)​,334183,705.027426,0.260809,4.935471,0.001,1892788.0,Game collectibles,1.000,5.939500
BANDAI(บันได),366985,824.685393,0.149328,4.959240,0.000,366985.0,Game collectibles,1.000,4.645027
Bandai(บันได),214314,965.378378,0.687141,4.950898,0.002,590250.0,Game collectibles,0.441,5.753597
...,...,...,...,...,...,...,...,...,...
Dermatix(เดอร์มาติกซ์),6680,835.000000,0.000000,4.958333,0.000,440039.0,Skin care products,0.928,15.982239
Casiko(คาซิโก้),429,429.000000,0.000000,4.800000,0.000,738071.0,Microwaves and ovens,0.952,25.525540
Citizen(ซิติเซ็น),1840,460.000000,0.000000,4.500000,0.000,249717.0,Men's watch,0.545,14.292411
Hot Toys Limited.(ฮ็อททอยลิมิเต็ด),28000,28000.000000,0.000000,5.000000,0.000,1906765.0,Game collectibles,1.000,56.742203


In [53]:
#make to csv here if desired 
#brands_df.to_csv("test_2.csv")

In [29]:
'''
mask = products_static_untouched["brand"] == "SKG(เอสเคจี)"
sample_brand_df = products_static_untouched[mask]
products_static_untouched[mask].sort_values("category_two_en", 
                                            ascending = False).tail(20)
'''


In [33]:
'''
winning_cat_2 = None
winning_cat_2_gmv = 0.0
    
for cat_2 in sample_brand_df["category_two_en"].unique(): 
    mask_cat_2 = sample_brand_df["category_two_en"] == cat_2
    cat_2_df = sample_brand_df[mask_cat_2]
    
    cat_2_gmv = cat_2_df["product_gmv"].sum()
    
    if cat_2_gmv > winning_cat_2_gmv:
        winning_cat_2 = cat_2
        winning_cat_2_gmv = cat_2_gmv
    
print(winning_cat_2, "with a cat gmv of: winning_cat_2_gmv")

'''

SyntaxError: unexpected EOF while parsing (<ipython-input-33-d48cdbd1c010>, line 14)

In [None]:
'''
To get category name info
models_static = pd.read_csv("models_static_data.csv")

mapping_1 = models_static[["category_one", "category_one_en", "category_one_th"]].drop_duplicates()
mapping_1 = mapping_1.set_index("category_one").dropna().sort_index()

mapping_2 = models_static[["category_two", "category_two_en", "category_two_th"]].drop_duplicates()
mapping_2 = mapping_2.set_index("category_two").dropna().sort_index()

mapping_3 = models_static[["category_three", "category_three_en", "category_three_th"]].drop_duplicates()
mapping_3 = mapping_3.set_index("category_three").dropna().sort_index()

mapping_1.to_csv("mapping_1.csv")
mapping_2.to_csv("mapping_2.csv")
mapping_3.to_csv("mapping_3.csv")
'''