In [1]:
import os
import sys

# Put the parent directory on the import path so that the `src` package used
# below can be imported from this notebook. Guarded so that re-running the
# cell does not add the same entry twice.
module_path = os.path.abspath(os.path.join(".."))
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [2]:
import pandas as pd

from src.process_labels import (
    dataframe_to_pydantic,
    get_weight,
    parse_composition,
    preprocess_series,
    pydanticlist_to_json,
    replace_words,
    split_colors,
    split_components,
    split_sentence,
    strip_and_trim_punctuation,
)

# Constants

# Spellings of the weight unit found in the labels. Each one is replaced by
# "gsm" when the unit of measure is standardized. Surrounding spaces and dots
# are part of the patterns.
ALL_GSM = [
    "g/m2",
    "g/m²",
    "gm²",
    "gm2",
    " gram.",
    " gram ",
    "gr ",
    "gr.",
    " g ",
]

# Load the raw care labels and preview the first rows
raw_labels_path = "../data/raw/care_labels.csv"
label_file = pd.read_csv(raw_labels_path)
label_file.head()

Unnamed: 0,product_id,product_category,care_label
0,#113,PANTS,"Main: 40% Cotton, 60% Polyester, 290 g/m².\nCo..."
1,#212,PANTS,"Main: DuraTwill, 52% Cotton 48% Polyamide, 240..."
2,#213,PANTS,"Main: 40% Cotton, 60% Polyester, 290 g/m².\nCo..."
3,#214,PANTS,"Main: Canvas+, 60% Cotton, 40% Polyester, 340 ..."
4,#312,PANTS,"Main: DuraTwill, 52% Cotton 48% Polyamide, 240..."


In [3]:
# Clean dataframe

# Lower-case every string cell. Anything that is not a string (NaN, None,
# numbers) is returned unchanged, so a non-text column cannot raise
# AttributeError on `.lower()`.
# `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
# `DataFrame.map`; use whichever one the installed pandas provides.
elementwise = label_file.map if hasattr(pd.DataFrame, "map") else label_file.applymap
clean_label_file = elementwise(lambda x: x.lower() if isinstance(x, str) else x)

# Split "main/sub" categories into a main category and a subcategory field.
# `n=1` splits on the first "/" only, and `reindex` guarantees exactly two
# columns, so the assignment works whether the categories contain zero, one or
# several "/" (rows without a subcategory get a missing value).
category_parts = (
    clean_label_file["product_category"]
    .str.split("/", n=1, expand=True)
    .reindex(columns=[0, 1])
)
clean_label_file[["product_main_category", "product_sub_category"]] = category_parts

# Preprocess care label series
clean_label_file["updated_care_label"] = preprocess_series(clean_label_file["care_label"])

# Standardize unit of measure: every spelling listed in ALL_GSM becomes "gsm"
clean_label_file["updated_care_label"] = replace_words(
    clean_label_file["updated_care_label"], ALL_GSM, "gsm"
)

clean_label_file

Unnamed: 0,product_id,product_category,care_label,product_main_category,product_sub_category,updated_care_label
0,#113,pants,"main: 40% cotton, 60% polyester, 290 g/m².\nco...",pants,,"main: 40% cotton, 60% polyester, 290 gsm. cont..."
1,#212,pants,"main: duratwill, 52% cotton 48% polyamide, 240...",pants,,"main: duratwill, 52% cotton 48% polyamide, 240..."
2,#213,pants,"main: 40% cotton, 60% polyester, 290 g/m².\nco...",pants,,"main: 40% cotton, 60% polyester, 290 gsm. cont..."
3,#214,pants,"main: canvas+, 60% cotton, 40% polyester, 340 ...",pants,,"main: canvas, 60% cotton, 40% polyester, 340 g..."
4,#312,pants,"main: duratwill, 52% cotton 48% polyamide, 240...",pants,,"main: duratwill, 52% cotton 48% polyamide, 240..."
...,...,...,...,...,...,...
568,#9794,pants,100% cordura®-polyamide 300 g/m².,pants,,100% cordura-polyamide 300 gsm
569,#9795,accessory/phone-case,100% leather.,accessory,phone-case,100% leather
570,#9796,accessory/phone-case,100% polyamide.,accessory,phone-case,100% polyamide
571,#9797,pants,"49% modacrylic fr, 42% cotton, 5% aramid, 3% p...",pants,,"49% modacrylic fr, 42% cotton, 5% aramid, 3% p..."


In [4]:
# Extract the color codes mentioned in the labels into a `color` column
clean_label_file_colors = split_colors(clean_label_file, "updated_care_label")
print(clean_label_file_colors.shape)

# Preview only the rows for which a color was found
has_color = clean_label_file_colors["color"].notna()
clean_label_file_colors[has_color].head()

(615, 7)


Unnamed: 0,product_id,product_category,care_label,product_main_category,product_sub_category,updated_care_label,color
43,#1549,jacket,"main: 47% cotton, 53% polyester, 237 g/m². co...",jacket,,"main: 61% polyester 39% sorona polyester, 252 ...",0904
66,#2405,tshirt/long-sleeve,"color 9567: main: 100% polyester, 140 g/m².",tshirt,long-sleeve,"main: 100% polyester, 140 gsm",9567
78,#2496,tshirt/long-sleeve,"col 0400, 0900, 5800 and 9500: 100% cotton, 16...",tshirt,long-sleeve,"100% cotton, 160 gsm","0400, 0900, 5800, 9500"
79,#2496,tshirt/long-sleeve,"col 0400, 0900, 5800 and 9500: 100% cotton, 16...",tshirt,long-sleeve,"95% cotton, 5% viscose, 160 gsm",2800
83,#2502,tshirt,"100% cotton, col. 2800: 95% cotton, 5% viscose...",tshirt,,"95% cotton, 5% viscose 160 gsm",2800


In [5]:
# Explode each label so that every component of an item gets its own row
clean_label_file_item = split_sentence(clean_label_file_colors, "updated_care_label")
print(clean_label_file_item.shape)
clean_label_file_item["updated_care_label"].head(10)

(1141, 7)


0             main: 40% cotton, 60% polyester, 290 gsm
1          contrast: 53% cotton 47% polyester, 290 gsm
2    reinforcement knee: 100% cordura-polyamide, 20...
3    main: duratwill, 52% cotton 48% polyamide, 240...
4                reinforcement: 100% cordura-polyamide
5             main: 40% cotton, 60% polyester, 290 gsm
6         contrast: 53% cotton, 47% polyester, 290 gsm
7    reinforcement knee: 100% cordura-polyamide, 20...
8     main: canvas, 60% cotton, 40% polyester, 340 gsm
9                reinforcement: 100% cordura-polyamide
Name: updated_care_label, dtype: object

In [6]:
# Move the component name out of the label text into a `component` column
clean_label_file_component = split_components(clean_label_file_item, "updated_care_label")

component_preview_columns = ["updated_care_label", "component"]
clean_label_file_component[component_preview_columns].head(10)

Unnamed: 0,updated_care_label,component
0,"40% cotton, 60% polyester, 290 gsm",main
1,"53% cotton 47% polyester, 290 gsm",contrast
2,"100% cordura-polyamide, 205 gsm",reinforcement knee
3,"duratwill, 52% cotton 48% polyamide, 240 gsm",main
4,100% cordura-polyamide,reinforcement
5,"40% cotton, 60% polyester, 290 gsm",main
6,"53% cotton, 47% polyester, 290 gsm",contrast
7,"100% cordura-polyamide, 205 gsm",reinforcement knee
8,"canvas, 60% cotton, 40% polyester, 340 gsm",main
9,100% cordura-polyamide,reinforcement


In [7]:
# List the distinct component names that were extracted, to spot parsing mistakes
clean_label_file_component["component"].unique()

array(['main', 'contrast', 'reinforcement knee', 'reinforcement',
       'lining', 'cuff stretch', 'insulation', 'mesh', 'pocket lining',
       'contrast main', 'padding', 'collar lining', 'reinforcements',
       'material', 'main fabric', 'stretch', 'isolation', 'cuffs', 'rib',
       'stretch fabric', 'detail', 'contrast stretch back thigh panels',
       'knee pad', 'pockets', 'gusset', 'reinforced with', 'ripstop',
       'polartec power stretch', 'polartec thermal pro', 'cuff',
       'contrast fabric', 'polartec insulation', 'main face',
       'main backing', 'polartec', 'coating', 'elastane mesh',
       'main material', 'pile', 'acrylic lining', 'shell', 'filling',
       'dipping', 'palm', 'backing', 'weight'], dtype=object)

In [8]:
# Known parsing failure: one item writes "weight:" with a colon, so the word
# "weight" ends up as a component name :(
is_weight_component = clean_label_file_component["component"] == "weight"
clean_label_file_component[is_weight_component]

Unnamed: 0,product_id,product_category,care_label,product_main_category,product_sub_category,updated_care_label,color,component
1080,#9448,sweater,main: 96% polyester 4% elastane. weight: 198 g...,sweater,,198 gsm,,weight
1082,#9448,sweater,main: 96% polyester 4% elastane. weight: 198 g...,sweater,,94 gsm,,weight


In [9]:
# Pull the weight out of the updated care label text into its own `weight` column
clean_label_file_weight = get_weight(clean_label_file_component, "updated_care_label")

weight_preview_columns = ["product_id", "updated_care_label", "weight"]
clean_label_file_weight[weight_preview_columns].head(10)

Unnamed: 0,product_id,updated_care_label,weight
0,#113,"40% cotton, 60% polyester",290.0
1,#113,53% cotton 47% polyester,290.0
2,#113,100% cordura-polyamide,205.0
3,#212,"duratwill, 52% cotton 48% polyamide",240.0
4,#212,100% cordura-polyamide,
5,#213,"40% cotton, 60% polyester",290.0
6,#213,"53% cotton, 47% polyester",290.0
7,#213,100% cordura-polyamide,205.0
8,#214,"canvas, 60% cotton, 40% polyester",340.0
9,#214,100% cordura-polyamide,


In [10]:
# Quick check of the first missing weight: show the original label of product
# #212 (row label 3 is its first component)
is_product_212 = clean_label_file_weight["product_id"] == "#212"
clean_label_file_weight[is_product_212].loc[3, "care_label"]

'main: duratwill, 52% cotton 48% polyamide, 240 g/m².\nreinforcement: 100% cordura®-polyamide.'

Sometimes weight information is not available for all components of an item. 

In [11]:
# Columns shown in the final, display-only view of the cleaned labels
kept_columns = [
    "product_id",
    "product_main_category",
    "product_sub_category",
    "component",
    "color",
    "updated_care_label",
    "weight",
]
clean_label_file_weight.loc[:, kept_columns]


Unnamed: 0,product_id,product_main_category,product_sub_category,component,color,updated_care_label,weight
0,#113,pants,,main,,"40% cotton, 60% polyester",290
1,#113,pants,,contrast,,53% cotton 47% polyester,290
2,#113,pants,,reinforcement knee,,100% cordura-polyamide,205
3,#212,pants,,main,,"duratwill, 52% cotton 48% polyamide",240
4,#212,pants,,reinforcement,,100% cordura-polyamide,
...,...,...,...,...,...,...,...
1146,#9794,pants,,main,,100% cordura-polyamide,300
1147,#9795,accessory,phone-case,main,,100% leather,
1148,#9796,accessory,phone-case,main,,100% polyamide,
1149,#9797,pants,,main,,"49% modacrylic fr, 42% cotton, 5% aramid, 3% p...",300


In [12]:
# Extract composition details: `parse_composition` yields two values per label,
# stored as the leftover text and the composition dictionary
parsed_composition = clean_label_file_weight["updated_care_label"].apply(
    lambda label: pd.Series(parse_composition(label))
)
clean_label_file_weight[["remaining_text", "composition_dict"]] = parsed_composition

# Clean remaining text: commas become spaces, then punctuation is trimmed
remaining_without_commas = clean_label_file_weight["remaining_text"].str.replace(",", " ", regex=True)
clean_label_file_weight["remaining_text"] = strip_and_trim_punctuation(remaining_without_commas)

# Set weight data type
clean_label_file_weight["weight"] = clean_label_file_weight["weight"].astype(float)

# Products with remaining text may need special attention: report how many
# there are and export them for manual review
needs_review = clean_label_file_weight[clean_label_file_weight["remaining_text"] != ""]
print(needs_review.shape)
needs_review.to_excel("to_review.xlsx")

(80, 11)


In [13]:
# Save the results: the dataframe as CSV, then its structured JSON export
final_csv_path = "../data/processed/final_care_label.csv"
products_json_path = "../data/processed/products_database"

clean_label_file_weight.to_csv(final_csv_path)
products = dataframe_to_pydantic(clean_label_file_weight)
pydanticlist_to_json(products, products_json_path)

Possible improvements:
- remove the word "weight" and the punctuation that follows it from the label text
- handle both orderings: material name before its percentage, or percentage before the material name
- handle text in brackets
- handle sub-components

Ideally, in the future:
define a standard label format that leaves no room for free-form natural language.