### Installs

In [2]:
! pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.5-py3-none-any.whl.metadata (9.5 kB)
Collecting setuptools>=0.7.0 (from fasttext)
  Using cached setuptools-74.0.0-py3-none-any.whl.metadata (6.7 kB)
Collecting numpy (from fasttext)
  Downloading numpy-2.1.0-cp312-cp312-win_amd64.whl.metadata (59 kB)
Using cached pybind11-2.13.5-py3-none-any.whl (240 kB)
Using cached setuptools-74.0.0-py3-none-any.whl (1.3 MB)
Downloading numpy-2.1.0-cp312-cp312-win_amd64.whl (12.6 MB)
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
   -------------- ------------------------

  error: subprocess-exited-with-error
  
  × Building wheel for fasttext (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [33 lines of output]
      !!
      
              ********************************************************************************
              Usage of dash-separated 'description-file' will not be supported in future
              versions. Please use the underscore name 'description_file' instead.
      
              By 2024-Sep-26, you need to update your project and remove deprecated calls
              or your builds will no longer be supported.
      
              See https://setuptools.pypa.io/en/latest/userguide/declarative_config.html for details.
              ********************************************************************************
      
      !!
        opt = self.warn_dash_deprecation(opt, section)
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib.win-amd64-cp

### Imports

In [3]:
import fasttext
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

ModuleNotFoundError: No module named 'fasttext'

### Data Loading

In [3]:
! unzip input_data.zip

Archive:  input_data.zip
  inflating: attribute_test.data     
  inflating: attribute_train.data    
  inflating: attribute_train.solution  
  inflating: attribute_val.data      
  inflating: attribute_val.solution  


In [75]:
# Training split: product inputs (.data) and their target attributes (.solution),
# both stored as JSON Lines (one record per line).
x_train_df, y_train_df = (
    pd.read_json(source_path, lines=True)
    for source_path in ("./attribute_train.data", "./attribute_train.solution")
)

# Show (rows, columns) of both frames as a quick sanity check.
tuple(frame.shape for frame in (x_train_df, y_train_df))

((443499, 4), (443499, 7))

In [76]:
# Validation split: product inputs (.data) and their target attributes (.solution),
# both stored as JSON Lines (one record per line).
x_val_df, y_val_df = (
    pd.read_json(source_path, lines=True)
    for source_path in ("./attribute_val.data", "./attribute_val.solution")
)

# Show (rows, columns) of both frames as a quick sanity check.
tuple(frame.shape for frame in (x_val_df, y_val_df))

((95035, 4), (95035, 7))

In [77]:
# Test split: inputs only (no .solution file is loaded for it), JSON Lines format.
x_test_df = pd.read_json(path_or_buf="./attribute_test.data", lines=True)
x_test_df.shape

(95036, 4)

### Preprocess

In [78]:
def preprocess(df):
  """Build fastText features, labels and training rows for a labelled frame.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the text columns ``title``, ``store`` and
      ``details_Manufacturer`` plus the target columns ``details_Brand`` and
      ``L0_category`` .. ``L4_category``.

  Returns
  -------
  pandas.DataFrame
      A new frame (the input is not modified) in which missing values are the
      string ``"None"``, spaces inside the target columns are encoded as
      ``%%``, and three columns are added:

      * ``features``     - ``"title: ... store: ... manufacturer: ..."``
      * ``label``        - ``"__label__"`` followed by the six targets joined by ``__``
      * ``training_row`` - ``label + " " + features`` (one fastText training line)
  """
  target_columns = ["details_Brand", "L0_category", "L1_category", "L2_category", "L3_category", "L4_category"]

  # fillna without inplace returns a new frame, so the caller's frame keeps
  # its original values and re-running a cell cannot double-process it.
  df = df.fillna("None")

  df["features"] = (
    "title: " + df["title"]
    + " store: " + df["store"]
    + " manufacturer: " + df["details_Manufacturer"]
  )

  # A label must be a single whitespace-free token, so spaces are swapped for a
  # placeholder. regex=False pins literal replacement across pandas versions.
  for column in target_columns:
    df[column] = df[column].str.replace(" ", "%%", regex=False)

  first_target, *other_targets = target_columns
  df["label"] = "__label__" + df[first_target].str.cat([df[column] for column in other_targets], sep="__")
  df["training_row"] = df["label"] + " " + df["features"]
  return df


In [79]:
# Pair each product with its solution by key. pd.concat(axis=1) aligns rows on
# the index labels, so sorting both frames first does not guarantee that rows
# with the same indoml_id end up side by side; a keyed merge does.
# validate="one_to_one" makes duplicated ids fail loudly.
train_df = (
  x_train_df
  .merge(y_train_df, on="indoml_id", how="inner", validate="one_to_one")
  .sort_values(by="indoml_id", ascending=True)
  .reset_index(drop=True)
)
# An inner merge silently drops unmatched ids, so check nothing was lost.
assert len(train_df) == len(x_train_df) == len(y_train_df), "train data/solution ids do not match"

train_df = preprocess(train_df)

train_df.head()

Unnamed: 0,indoml_id,title,store,details_Manufacturer,details_Brand,L0_category,L1_category,L2_category,L3_category,L4_category,features,label,training_row
0,0,"Enclume Angled Pot Hook, Set of 6, Use with Po...",Enclume,Enclume,Enclume,Home%%&%%Kitchen,Kitchen%%&%%Dining,Storage%%&%%Organization,Racks%%&%%Holders,Pot%%Racks,"title: Enclume Angled Pot Hook, Set of 6, Use ...",__label__Enclume__Home%%&%%Kitchen__Kitchen%%&...,__label__Enclume__Home%%&%%Kitchen__Kitchen%%&...
1,1,Schutt Vengeance DCT Hybrid Youth Football H,Schutt,Schutt,Schutt,Sports%%&%%Outdoors,Sports,Team%%Sports,Football,Protective%%Gear,title: Schutt Vengeance DCT Hybrid Youth Footb...,__label__Schutt__Sports%%&%%Outdoors__Sports__...,__label__Schutt__Sports%%&%%Outdoors__Sports__...
2,2,Easton 2014 MAKO SL14MK9 Baseball Bat (-9),Easton,"Easton Sports, Inc.",Easton,Sports%%&%%Outdoors,Sports,Team%%Sports,Baseball,Baseball%%Bats,title: Easton 2014 MAKO SL14MK9 Baseball Bat (...,__label__Easton__Sports%%&%%Outdoors__Sports__...,__label__Easton__Sports%%&%%Outdoors__Sports__...
3,3,Bilstein B46-0929 Heavy-Duty Gas Shock Absorber,Bilstein,Bilstein,Bilstein,Automotive,Replacement%%Parts,"Shocks,%%Struts%%&%%Suspension",Shocks,na,title: Bilstein B46-0929 Heavy-Duty Gas Shock ...,__label__Bilstein__Automotive__Replacement%%Pa...,__label__Bilstein__Automotive__Replacement%%Pa...
4,4,Apple Red Cardstock - 8.5 x 11 inch - 65Lb Cov...,Clear Path Paper,Clear Path Paper,Clear%%Path%%Paper,"Arts,%%Crafts%%&%%Sewing",Crafting,Paper%%&%%Paper%%Crafts,Paper,Card%%Stock,title: Apple Red Cardstock - 8.5 x 11 inch - 6...,"__label__Clear%%Path%%Paper__Arts,%%Crafts%%&%...","__label__Clear%%Path%%Paper__Arts,%%Crafts%%&%..."


In [80]:
# Pair each product with its solution by key. pd.concat(axis=1) aligns rows on
# the index labels, so sorting both frames first does not guarantee that rows
# with the same indoml_id end up side by side; a keyed merge does.
# validate="one_to_one" makes duplicated ids fail loudly.
val_df = (
  x_val_df
  .merge(y_val_df, on="indoml_id", how="inner", validate="one_to_one")
  .sort_values(by="indoml_id", ascending=True)
  .reset_index(drop=True)
)
# An inner merge silently drops unmatched ids, so check nothing was lost.
assert len(val_df) == len(x_val_df) == len(y_val_df), "val data/solution ids do not match"

val_df = preprocess(val_df)

val_df.head()

Unnamed: 0,indoml_id,title,store,details_Manufacturer,details_Brand,L0_category,L1_category,L2_category,L3_category,L4_category,features,label,training_row
0,0,"Pendleton, Eco-Wise Washable Wool Blanket, Bla...",Pendleton,Pendleton Woolen Mills,Pendleton,Home%%&%%Kitchen,Bedding,Blankets%%&%%Throws,Bed%%Blankets,na,"title: Pendleton, Eco-Wise Washable Wool Blank...",__label__Pendleton__Home%%&%%Kitchen__Bedding_...,__label__Pendleton__Home%%&%%Kitchen__Bedding_...
1,1,JP London MD3A049 DM1578 Space Nebula Removabl...,JP London,JP London,JP%%London,Tools%%&%%Home%%Improvement,"Paint,%%Wall%%Treatments%%&%%Supplies",Wall%%Stickers%%&%%Murals,na,na,title: JP London MD3A049 DM1578 Space Nebula R...,__label__JP%%London__Tools%%&%%Home%%Improveme...,__label__JP%%London__Tools%%&%%Home%%Improveme...
2,2,Lawn Fawn LF2938 Fangtastic Friends - Lawn Cut...,Lawn Fawn,Lawn Fawn,Lawn%%Fawn,"Arts,%%Crafts%%&%%Sewing",Scrapbooking%%&%%Stamping,Die-Cutting%%&%%Embossing,Die-Cuts,na,title: Lawn Fawn LF2938 Fangtastic Friends - L...,"__label__Lawn%%Fawn__Arts,%%Crafts%%&%%Sewing_...","__label__Lawn%%Fawn__Arts,%%Crafts%%&%%Sewing_..."
3,3,ANCHEER Foldable Elliptical Machine for Home U...,ANCHEER,ANCHEER,ANCHEER,Sports%%&%%Outdoors,Exercise%%&%%Fitness,Cardio%%Training,Elliptical%%Trainers,na,title: ANCHEER Foldable Elliptical Machine for...,__label__ANCHEER__Sports%%&%%Outdoors__Exercis...,__label__ANCHEER__Sports%%&%%Outdoors__Exercis...
4,4,Schecter Jeff Loomis JLV-7 NT Left Handed 7-St...,Schecter,,Schecter,Musical%%Instruments,Guitars,Electric%%Guitars,Solid%%Body,na,title: Schecter Jeff Loomis JLV-7 NT Left Hand...,__label__Schecter__Musical%%Instruments__Guita...,__label__Schecter__Musical%%Instruments__Guita...


### Training

In [81]:
# Full fastText line (label + features) of the row whose index label is 0.
train_df.loc[0, "training_row"]

'__label__Enclume__Home%%&%%Kitchen__Kitchen%%&%%Dining__Storage%%&%%Organization__Racks%%&%%Holders__Pot%%Racks title: Enclume Angled Pot Hook, Set of 6, Use with Pot Racks, Copper Plated store: Enclume manufacturer: Enclume'

In [83]:
# Feature text of the row whose index label is 0.
train_df.loc[0, "features"]

'title: Enclume Angled Pot Hook, Set of 6, Use with Pot Racks, Copper Plated store: Enclume manufacturer: Enclume'

In [82]:
# Combined label token of the row whose index label is 0.
train_df.loc[0, "label"]

'__label__Enclume__Home%%&%%Kitchen__Kitchen%%&%%Dining__Storage%%&%%Organization__Racks%%&%%Holders__Pot%%Racks'

In [84]:
training_rows = train_df["training_row"].tolist()

# fastText expects UTF-8 text with one example per "\n"-terminated line.
# Without explicit arguments open() uses the platform's default encoding and
# newline translation (e.g. a legacy code page and "\r\n" on Windows), which can
# fail on non-ASCII titles or leave a stray "\r" glued to the last token.
with open("indoml.train", "w", encoding="utf-8", newline="\n") as fp:
  for row in tqdm(training_rows):
    fp.write(row + "\n")

100%|██████████| 443499/443499 [00:00<00:00, 783969.72it/s]


In [85]:
# Hyper-parameters of the supervised fastText classifier, gathered in one place
# so they are easy to read and tune.
training_options = dict(
  lr=1.0,
  epoch=25,
  wordNgrams=2,
  bucket=200000,
  dim=50,
  loss="hs",
)
model = fasttext.train_supervised(input="indoml.train", **training_options)

In [73]:
# Persist the trained classifier to "indoml.bin".
# NOTE(review): run this cell after the training cell so the file matches the
# in-memory model — the recorded execution counts suggest it was last executed
# before the latest training run; confirm.
model.save_model("indoml.bin")

### Validation

In [86]:
# Full fastText line (label + features) of the validation row with index label 0.
val_df.loc[0, "training_row"]

'__label__Pendleton__Home%%&%%Kitchen__Bedding__Blankets%%&%%Throws__Bed%%Blankets__na title: Pendleton, Eco-Wise Washable Wool Blanket, Black Watch, King store: Pendleton manufacturer: Pendleton Woolen Mills'

In [87]:
# Feature text of the validation row with index label 0.
val_df.loc[0, "features"]

'title: Pendleton, Eco-Wise Washable Wool Blanket, Black Watch, King store: Pendleton manufacturer: Pendleton Woolen Mills'

In [88]:
# Combined label token of the validation row with index label 0.
val_df.loc[0, "label"]

'__label__Pendleton__Home%%&%%Kitchen__Bedding__Blankets%%&%%Throws__Bed%%Blankets__na'

In [96]:
validation_rows = val_df["features"]

# Predict the single best label for every validation example.
# tqdm wraps the Series directly (not enumerate(...)) so it knows the total and
# can show a percentage/ETA; the loop index was never used.
val_result_list = []
for text in tqdm(validation_rows, total=len(validation_rows)):
  labels, _probabilities = model.predict(text)
  val_result_list.append(labels[0])

95035it [00:03, 28651.64it/s]


### Metrics

In [98]:
# Exact-match accuracy on the combined label string: a prediction only counts
# when the whole predicted label equals the true one.
accuracy_score(y_true=val_df["label"], y_pred=val_result_list)

0.8731835639501236

In [99]:
# Macro-averaged precision over the combined labels; zero_division=0 scores a
# label with an undefined precision as 0 instead of emitting a warning.
precision_score(
  y_true=val_df["label"],
  y_pred=val_result_list,
  average="macro",
  zero_division=0,
)

0.8690874097438622

In [100]:
# Macro-averaged recall over the combined labels; zero_division=0 scores a
# label with an undefined recall as 0 instead of emitting a warning.
recall_score(
  y_true=val_df["label"],
  y_pred=val_result_list,
  average="macro",
  zero_division=0,
)

0.8526913936104816

### Test

In [102]:
# (rows, columns) of the test inputs, shown before the feature column is added.
x_test_df.shape

(95036, 4)

In [103]:
# Same feature text as the training data: missing values become the string
# "None", then title / store / manufacturer are concatenated with field tags.
x_test_df.fillna("None", inplace=True)
x_test_df["features"] = (
  "title: " + x_test_df["title"]
  + " store: " + x_test_df["store"]
  + " manufacturer: " + x_test_df["details_Manufacturer"]
)

In [105]:
test_rows = x_test_df["features"]

# Predict the single best label for every test example.
# tqdm wraps the Series directly (not enumerate(...)) so it knows the total and
# can show a percentage/ETA; the loop index was never used.
test_result_list = []
for text in tqdm(test_rows, total=len(test_rows)):
  labels, _probabilities = model.predict(text)
  test_result_list.append(labels[0])

95036it [00:03, 31228.33it/s]


In [122]:
# Test ids in the same row order as the "features" column the predictions were
# made from. .tolist() is used so the ids are plain Python values, which the
# later json.dumps call can serialise.
ids = x_test_df["indoml_id"].tolist()

In [124]:
# Layout of a predicted label: "__label__" followed by six fields joined with
# "__", with spaces inside each field encoded as "%%".
LABEL_PREFIX = "__label__"
TARGET_COLUMNS = ["details_Brand", "L0_category", "L1_category", "L2_category", "L3_category", "L4_category"]


def decode_label(label):
  """Turn one predicted fastText label back into a ``{column: value}`` dict.

  Only a leading ``__label__`` is stripped, and the fields are split from the
  right, so a brand (the first field) that itself contains ``__`` is kept
  intact instead of breaking the six-way unpacking.

  Raises
  ------
  ValueError
      If the label does not contain all six fields.
  """
  body = label[len(LABEL_PREFIX):] if label.startswith(LABEL_PREFIX) else label
  fields = body.rsplit("__", len(TARGET_COLUMNS) - 1)
  if len(fields) != len(TARGET_COLUMNS):
    raise ValueError(
      f"cannot decode label {label!r}: expected {len(TARGET_COLUMNS)} fields, got {len(fields)}"
    )
  return {column: value.replace("%%", " ") for column, value in zip(TARGET_COLUMNS, fields)}


# zip() would silently truncate on a length mismatch, so check it explicitly.
assert len(ids) == len(test_result_list), "ids and predictions are out of sync"
category_list = [
  {"indoml_id": test_id, **decode_label(label)}
  for test_id, label in zip(ids, test_result_list)
]

In [126]:
# Peek at the first decoded prediction (prints nothing when the list is empty).
first_record = next(iter(category_list), None)
if first_record is not None:
  print(str(first_record))

{'indoml_id': 0, 'details_Brand': 'CURT', 'L0_category': 'Automotive', 'L1_category': 'Exterior Accessories', 'L2_category': 'Towing Products & Winches', 'L3_category': 'Hitch Accessories', 'L4_category': 'Wiring'}


In [128]:
import json

# Submission file in JSON Lines format: one prediction object per line.
with open("attribute_test_30082024.predict", "w") as fp:
  fp.writelines(json.dumps(record) + "\n" for record in category_list)