In [2]:
import pandas as pd
import numpy as np

# INPUTS: 
Input the values you would like to filter by:

In [3]:
# Every filter below has an on/off switch (`*_on`) plus the value(s) it uses.

#Filter 1: Categories
# One switch per category level; each list holds the category ids to keep
# for the matching column ("category_one", "category_two", "category_three").
(cat_1_on, cat_2_on, cat_3_on) = True, True, False
cat_lst_1 = [47, 49, 1017]
cat_lst_2 = [875, 887, 1117]
cat_lst_3 = [8669, 8637, 9189, 9266, 10723, 21419, 10710]

#Filter 2: Times (Update-status, how long been sold)
time_on = True
unix_time_now = 1607946056 #Change based on Nick's input
min_years = 1                 # compared against the product "ctime" column
updated_previously_weeks = 2  # compared against the product "modified_at" column


#Filter 3: Brand GMV
min_gmv_on = True
min_gmv = 1000

#Filter 4: Brand Avg Price 
min_unit_price_on = True
min_unit_price = 100

#Filter 5: GMV Concentration
min_gmv_conc_on = True
top_n = 3 #concentration: number of top products considered
min_concentration = 0.2

#Filter 6: Avg Product Rating
min_avg_star_on = True
min_avg_star = 4.5

#Filter 7: Bad Customer Ratings 
max_bad_rating_on = True
max_bad_rating = 0.5

#Filter 8: Brand TOTAL GMV 
# The switch gets its own `_on` name like every other filter; previously it
# shared the threshold's name and was overwritten by the next assignment.
# NOTE: the Filter 8 cell tests `min_total_brand_gmv` itself for truthiness,
# so a threshold of 0 also turns that filter off.
min_total_brand_gmv_on = True
min_total_brand_gmv = 200

# Import Data

In [4]:
# Read the three CSV exports (shop GMV, product snapshot, model snapshot)
# from the working directory, in that order.
shops_gmv, products_static, models_static = [
    pd.read_csv(csv_name)
    for csv_name in (
        "shops_gmv_data.csv",
        "products_static_1607946056.csv",
        "models_static_data.csv",
    )
]

Change data types in dataframes to conserve memory:

In [5]:
def convert_to_cats(df, to_convert):
    '''
    Casts the requested columns of df to the pandas "category" dtype.

    Inputs:
        df: a DataFrame
        to_convert: a list of column names (as strings) to cast;
                    names that are not columns of df are ignored
    Returns:
        Nothing; modifies dataframe in place
    '''
    # Walk df's own columns so names missing from df never raise.
    present = [name for name in df.columns if name in to_convert]
    for name in present:
        df[name] = df[name].astype("category")

In [6]:
# Columns to store as categoricals, per table.
to_convert_models = [
    'category_one', 'category_one_en', 'category_one_th',
    'category_two', 'category_two_en', 'category_two_th',
    'category_three', 'category_three_en', 'category_three_th',
]

# The products table converts the same category columns plus a few extra ones.
to_convert_products = to_convert_models + [
    'reviews_count_context', 'reviews_count_image',
    'shopee_verified', 'show_discount',
]

to_convert_shops = [
    "shopid", "category_one", "category_two", "category_three",
]

# Each call converts its table in place.
convert_to_cats(models_static, to_convert_models)
convert_to_cats(products_static, to_convert_products)
convert_to_cats(shops_gmv, to_convert_shops)

In [7]:
def downcast_numbers(df):
    '''
    Downcasts float and integer columns to the smallest dtype that
    pd.to_numeric can fit them into.

    Float columns are downcast with downcast="float". Integer columns
    are downcast to an unsigned dtype when they hold no negative
    values, and to a signed dtype otherwise. All other columns are
    left untouched.

    Inputs:
        df: a DataFrame object.
    Returns:
        Nothing; modifies the DataFrame in place
    '''
    for col in df.columns:
        col_dtype = df[col].dtype

        # Use the dtype predicates rather than comparing against the
        # strings "float"/"int": those strings name one specific width
        # (and "int" is platform-dependent), so other widths were skipped.
        if pd.api.types.is_float_dtype(col_dtype):
            df[col] = pd.to_numeric(df[col], downcast="float")
        elif pd.api.types.is_integer_dtype(col_dtype):
            # An unsigned target cannot hold negative values, so such
            # columns would otherwise never be shrunk.
            has_negative = bool((df[col] < 0).any())
            target = "integer" if has_negative else "unsigned"
            df[col] = pd.to_numeric(df[col], downcast=target)

In [8]:
# Product GMV (price * sold), used by the brand metrics and by Filter 8.
# Compute it BEFORE downcasting: once "price" and "sold" have been shrunk to
# narrow unsigned integer dtypes, their product is computed in that same
# narrow dtype and can silently wrap around for large values.
products_static["product_gmv"] = products_static["price"] * products_static["sold"]

# Shrink numeric columns in place to conserve memory.
downcast_numbers(models_static)
downcast_numbers(products_static)
downcast_numbers(shops_gmv)

# Snapshot taken before any filtering, so Filter 8 can total a brand's GMV
# across all categories.
products_static_untouched = products_static.copy(deep=True)


# Filter 1: Category Filter

Filter our DataFrames to keep only the rows whose categories we care about. 


In [9]:
def filter_by_category(df, category_n, ok_cat_lst):
    '''
    Keeps rows in the df that have cats in ok_cat_lst. 
    
    Inputs:
        df: a DataFrame
        category_n: (str) eg. "category_one"
        ok_cat_lst: (lst) of acceptable cat for that 
          cat as numbers eg [50, 26]
    Returns:
        The filtered DataFrame; the df passed in is left unchanged
    '''
    
    mask = df[category_n].isin(ok_cat_lst)
    # Rebinding the local name `df` is invisible to the caller, so the
    # filtered frame must be returned for the filter to have any effect.
    return df[mask]

In [10]:
def filter_by_category(df, category_n, ok_cat_lst):
    '''
    Selects the rows of df whose category_n value is one of ok_cat_lst.

    Inputs:
        df: a DataFrame
        category_n: (str) name of the category column, eg. "category_one"
        ok_cat_lst: (lst) of acceptable values for that column,
          as numbers eg [50, 26]
    Returns:
        A DataFrame holding only the matching rows
    '''
    return df.loc[df[category_n].isin(ok_cat_lst)]

In [11]:
if cat_1_on:
    # Apply the level-one category whitelist to both tables.
    models_static, products_static = [
        filter_by_category(frame, "category_one", cat_lst_1)
        for frame in (models_static, products_static)
    ]

In [12]:
if cat_2_on:
    # Apply the level-two category whitelist to both tables.
    models_static, products_static = [
        filter_by_category(frame, "category_two", cat_lst_2)
        for frame in (models_static, products_static)
    ]

In [13]:
if cat_3_on:
    # Apply the level-three category whitelist to both tables.
    models_static, products_static = [
        filter_by_category(frame, "category_three", cat_lst_3)
        for frame in (models_static, products_static)
    ]

# Filter 2: Keep only Established Products and Products that Sellers are Updating

Filter out products that have been sold for less than a specified time, and products whose listing has not been updated within a specified number of weeks. 

In [14]:
if time_on: 
    from datetime import datetime
    import math 

    def filter_out_new_products(df, min_years, unix_time_now):
        '''
        Keeps only the rows of df whose "ctime" is more than
        min_years before unix_time_now.

        Inputs:
            df: a DataFrame with a "ctime" column; the arithmetic
                treats it as a unix timestamp in seconds
            min_years: minimum age, in 365-day years
            unix_time_now: reference time (unix seconds)
        Returns:
            The filtered DataFrame
        '''

        min_s = min_years * 365 * 24 * 60 * 60 

        # Read the column from the df argument, not from the global frame.
        # Casting to float64 keeps the subtraction from wrapping around if
        # the column was downcast to an unsigned integer dtype.
        age_s = unix_time_now - df["ctime"].astype("float64")

        return df[age_s > min_s]

    products_static = filter_out_new_products(products_static, min_years,
                                              unix_time_now)

    def filter_out_unupdated_products(df, updated_previously_weeks,
                                      unix_time_now=None):
        '''
        Keeps only the rows of df whose "modified_at" is less than
        updated_previously_weeks before unix_time_now.

        Inputs:
            df: a DataFrame with a "modified_at" column; the arithmetic
                treats it as a unix timestamp in seconds
            updated_previously_weeks: maximum time since the last
                update, in weeks
            unix_time_now: reference time (unix seconds). When omitted,
                the current wall-clock time is used.
        Returns:
            The filtered DataFrame
        '''

        if unix_time_now is None:
            unix_time_now = math.ceil(datetime.now().timestamp())

        max_s = updated_previously_weeks * 7 * 24 * 60 * 60

        # Same float64 cast as above to avoid unsigned wrap-around.
        since_update_s = unix_time_now - df["modified_at"].astype("float64")

        return df[since_update_s < max_s]

    # Measure "recently updated" against the configured snapshot time rather
    # than the wall clock: measured against today's date, the result changes
    # with every run and eventually no row of a fixed CSV snapshot passes.
    products_static = filter_out_unupdated_products(products_static, 
                                                    updated_previously_weeks,
                                                    unix_time_now)

# Brand Definition

Extract unique brand names from the remaining products

In [15]:
# Drop products explicitly labelled as having no brand.
products_static = products_static.loc[
    products_static["brand"].ne("No Brand(ไม่มียี่ห้อ)")
]

# Distinct brand names among the remaining products; the "no brand" label is
# removed from the array once more as a safeguard.
brands_list = np.delete(
    products_static["brand"].unique(),
    np.where(products_static["brand"].unique() == ('No Brand(ไม่มียี่ห้อ)')),
)

In [16]:
# Build the brand table: one row per brand name, without missing names and
# without the placeholder strings "None" and "0".
brands_df = (
    pd.DataFrame(brands_list, columns=["Brand Name"])
    .dropna()
    .loc[lambda frame: frame["Brand Name"].ne("None") & frame["Brand Name"].ne("0")]
)


In [17]:
#Set up columns for our brands_df
# The ratio/average metrics start as floats (0.0): they later receive float
# values, and writing floats into integer columns is deprecated in recent
# pandas. Brand_GMV stays an integer zero. GMV_Concentration keeps this 0.0
# default for brands with fewer than top_n products.
# .assign() returns a new frame, so nothing is written into the filtered
# slice that brands_df currently is.
brands_df = brands_df.assign(
    Brand_GMV=0,
    Average_Unit_Price=0.0,
    GMV_Concentration=0.0,
    Weighted_Star_Rating=0.0,
    Bad_Rating_Percent=0.0,
).set_index("Brand Name")

In [18]:
#Make new column in products_static: star rating weighted by units sold
#(rating_star * sold). Summed per brand and divided by the brand's units
#sold, it gives the brand's sales-weighted average rating.
# .assign() builds a new frame instead of writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
products_static = products_static.assign(
    weighted_star=products_static["rating_star"] * products_static["sold"]
)

In [19]:
# Per-brand metrics, computed with grouped aggregations instead of scanning
# (and sorting) the whole products table once per brand.

# Only products whose brand is still in brands_df are relevant.
brand_rows = products_static[products_static["brand"].isin(brands_df.index)]
by_brand = brand_rows.groupby("brand", sort=False)

#gmv calculation
brand_gmv = by_brand["product_gmv"].sum()

#volume calculation
brand_volume = by_brand["sold"].sum()

#GMV Concentration Calculation: share of brand GMV made by its top_n products
top_rows = (
    brand_rows.sort_values(by=["product_gmv"], ascending=False)
    .groupby("brand", sort=False)
    .head(top_n)
)
top_n_gmv = top_rows.groupby("brand", sort=False)["product_gmv"].sum()
# Brands with fewer than top_n products keep a concentration of 0.
gmv_concentration = (top_n_gmv / brand_gmv).where(by_brand.size() >= top_n, 0)

#Weighted Star Rating
total_star = by_brand["weighted_star"].sum()

#Bad Rating Count: one- and two-star ratings as a share of all ratings
bad_rating_count = by_brand["rating_count_one"].sum() + by_brand["rating_count_two"].sum()
total_rating_count = by_brand["rating_count_total"].sum()
# Brands without any rating get the sentinel -1. Note that -1 still passes a
# "<= max_bad_rating" comparison.
bad_rating_percent = (bad_rating_count / total_rating_count).where(
    total_rating_count != 0, -1
)

# Each Series is indexed by brand name; assign() aligns them on brands_df's
# index and returns a new frame.
brands_df = brands_df.assign(
    Brand_GMV=brand_gmv,
    Average_Unit_Price=brand_gmv / brand_volume,
    GMV_Concentration=gmv_concentration,
    Weighted_Star_Rating=total_star / brand_volume,
    Bad_Rating_Percent=bad_rating_percent.round(decimals=3),
)

# Filter 3: Brand GMV (Filtered Products Only)

In [20]:
if min_gmv_on:
    # Keep brands whose GMV reaches the configured minimum.
    brands_df = brands_df.loc[brands_df["Brand_GMV"].ge(min_gmv)]

# Filter 4: Brand Average Price

In [21]:
if min_unit_price_on:
    # Keep brands whose average unit price reaches the configured minimum.
    brands_df = brands_df.loc[brands_df["Average_Unit_Price"].ge(min_unit_price)]

# Filter 5: GMV Concentration

In [22]:
if min_gmv_conc_on:
    # Keep brands whose top-product GMV share reaches the configured minimum.
    brands_df = brands_df.loc[brands_df["GMV_Concentration"].ge(min_concentration)]

# Filter 6: Average Product Rating

In [23]:
if min_avg_star_on:
    # Keep brands whose weighted star rating reaches the configured minimum.
    brands_df = brands_df.loc[brands_df["Weighted_Star_Rating"].ge(min_avg_star)]

# Filter 7: Bad Customer Ratings

In [24]:
if max_bad_rating_on:
    # Keep brands whose bad-rating share does not exceed the configured maximum.
    brands_df = brands_df.loc[brands_df["Bad_Rating_Percent"].le(max_bad_rating)]

# (GMV Growth?)

# Filter 8: Brand GMV (Across All Categories) 

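Compute each brand's GMV across all categories from the unfiltered copy of the products data (`products_static_untouched`), restricted to the brands still in our df. Note: We do this step at the end because it has to go through all of the products data, so it makes sense to have the smallest brands list possible. 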

In [25]:
# The numeric threshold doubles as this filter's on/off switch: a falsy
# value (0) skips the filter entirely.
if min_total_brand_gmv:
    # Rows of the unfiltered product snapshot that belong to a shortlisted brand.
    in_shortlist = products_static_untouched["brand"].isin(brands_df.index)

    #TOTAL Brand gmv calculation, for all shortlisted brands in one grouped pass
    gmv_all_cats = (
        products_static_untouched[in_shortlist]
        .groupby("brand", sort=False)["product_gmv"]
        .sum()
    )

    # reindex() lines the totals up with brands_df (0 for a brand with no rows)
    # and always creates the column, so an empty brands_df no longer raises a
    # KeyError on the comparison below.
    brands_df = brands_df.assign(
        Brand_GMV_All_Cats=gmv_all_cats.reindex(brands_df.index, fill_value=0)
                                       .astype("float64")
    )

    brands_df = brands_df[brands_df["Brand_GMV_All_Cats"] >= min_total_brand_gmv]

# Output: Brands that Meet Our Criteria

In [26]:
# Final result: the brands left after every enabled filter, one row per
# brand, with the metrics computed above as columns.
brands_df

Unnamed: 0_level_0,Brand_GMV,Average_Unit_Price,GMV_Concentration,Weighted_Star_Rating,Bad_Rating_Percent,Bad_Rating_count,Total_Rating_count,Brand_GMV_All_Cats
Brand Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bata(บาต้า),16222,331.061224,0.458267,4.822607,0.017,4.0,233.0,16222.0
Adidas(อาดิดาส),988356,729.414022,0.625807,4.778315,0.013,347.0,26687.0,11521804.0
Bata(บาจา),12595,151.746988,0.541405,4.761095,0.022,14.0,637.0,1024224.0
Ten&Co(เทนแอนด์โค),3950,197.500000,0.810127,4.791701,0.006,2.0,342.0,19220.0
The Simsons(เดอะซิมป์สันส์),2300,230.000000,0.945652,4.884444,0.000,0.0,137.0,3620.0
...,...,...,...,...,...,...,...,...
Madyombyyanis(มัดย้อมบายยานิส),9800,140.000000,1.000000,4.869820,0.006,2.0,351.0,16940.0
Coworker(โคเวิคเกอร์),12394,563.363636,0.806358,4.922396,0.000,0.0,246.0,29803.0
Chinrada(ชินรดา),2350,156.666667,0.872340,4.801933,0.012,2.0,160.0,2866.0
Sitranansilk (ศิร์ตราซิลล์),10602,279.000000,1.000000,4.660874,0.031,3.0,97.0,10602.0


#### Misc: Category Mappings (to convert word-cats into numbers)

In [None]:
# Lookup table: level-one category id -> English name.
# Built from the unfiltered snapshot: by this point products_static holds
# only the categories that were already selected, so it cannot be used to
# look up the ids of other categories.
mapping_1 = products_static_untouched[["category_one", "category_one_en"]].drop_duplicates()
mapping_1.set_index("category_one").dropna().sort_index()

In [None]:
# Lookup table: level-two category id -> English name.
# Built from the unfiltered snapshot: by this point products_static holds
# only the categories that were already selected, so it cannot be used to
# look up the ids of other categories.
mapping_2 = products_static_untouched[["category_two", "category_two_en"]].drop_duplicates()
mapping_2.set_index("category_two").dropna().sort_index()

In [None]:
# Lookup table: level-three category id -> English name.
# Built from the unfiltered snapshot: by this point products_static holds
# only the categories that were already selected, so it cannot be used to
# look up the ids of other categories.
mapping_3 = products_static_untouched[["category_three", "category_three_en"]].drop_duplicates()
mapping_3.set_index("category_three").dropna().sort_index()