In [22]:
import pandas as pd
import numpy as np
import torch
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import joblib
import lightgbm as lgb

In [2]:
def split_features_and_labels(dataframe: pd.DataFrame) -> pd.DataFrame:
    features = dataframe.iloc[:, :-11]  # All columns except the last eleven
    labels = dataframe.iloc[:, -11:]    # The last eleven columns
    return features, labels

In [None]:

# Load embeddings data
embeddings_file = "image_embeddings.pt"

embeddings_dict = torch.load(embeddings_file)

products = pd.read_csv('data/product_with_attributes.csv')
embeddings = [embeddings_dict[filename] for filename in products['des_filename']]

# embeddings = [embedding.numpy() for embedding in embeddings]



features, labels = split_features_and_labels(products)

# Convert the list of embeddings to a NumPy array
embeddings_array = np.array(embeddings)

# Add each dimension as a separate feature
for i in range(512):
    features[f'embedding_dim_{i}'] = embeddings_array[:, i]
# products = pd.concat([products, embedding_columns], axis=1)



# Given a fashion clip embedding
# input_embedding = np.array([...])  # Replace with the actual embedding
# embeddings_file_test = "image_embeddings.pt"
# embeddings_dict_test = torch.load(embeddings_file)
# mbeddings = [embeddings_dict[filename] for filename in products['des_filename']]

encoded_labels = labels.apply(lambda x: pd.factorize(x)[0])
unique_values = {col: pd.factorize(labels[col])[1] for col in labels.columns}

In [42]:
encoded_labels

Unnamed: 0,silhouette_type,neck_lapel_type,woven_structure,knit_structure,heel_shape_type,length_type,sleeve_length_type,toecap_type,waist_type,closure_placement,cane_height_type
0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,1,1,0,0,1,0
2,0,0,1,1,0,0,1,0,0,2,0
3,2,2,2,1,0,1,2,0,1,2,0
4,3,0,1,1,0,0,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...
61479,0,4,1,1,0,0,1,0,0,2,0
61480,0,0,1,1,0,0,3,0,0,2,0
61481,16,2,0,1,0,2,2,0,0,2,0
61482,0,3,1,1,0,5,5,0,0,5,0


In [44]:
unique_values

{'silhouette_type': Index(['Recto', 'Evase', 'Skinny', 'Regular', 'Relaxed', 'INVALID', 'Jogger',
        'Oversize', 'Slim', 'Palazzo', 'Paperbag', 'Acampanado/Flare',
        'Push Up', 'Acampanado/Bootcut', 'Cargo', 'Tapered', 'Fino', 'Culotte',
        'Slouchy', '5 Bolsillos', 'Mom', 'Wide leg', 'Lápiz', 'Boyfriend',
        'Ancho', 'Chino', 'Superslim', 'Loose', 'Parachute', 'Halter',
        'Modern slim', 'Bandeau', 'Sarouel', 'Carrot'],
       dtype='object'),
 'neck_lapel_type': Index(['Redondo', 'Caja', 'INVALID', 'Camisero', 'Pico', 'Peak Lapel',
        'Alto/Envolvente', 'Chimenea', 'Polo', 'Halter', 'Shawl', 'Capucha',
        'Escotado', 'Mao', 'Off Shoulder', 'Solapa', 'Cruzado', 'Panadero',
        'Regular', 'Drapeado', 'Barca', 'Perkins', 'Hawaiano/Bowling',
        'Palabra Honor', 'Cisne', 'Button Down', 'Cutaway', 'Asimétrico',
        'Waterfall', 'Babydoll/Peter Pan', 'Espalda Abierta', 'Smoking',
        'Kimono', 'Sin solapa'],
       dtype='object'),
 'wove

In [5]:
features = features.drop('cod_modelo_color', axis=1)
features.head(1)

Unnamed: 0,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,des_filename,des_color,...,embedding_dim_502,embedding_dim_503,embedding_dim_504,embedding_dim_505,embedding_dim_506,embedding_dim_507,embedding_dim_508,embedding_dim_509,embedding_dim_510,embedding_dim_511
0,Female,Kids,KIDS,TRICOT,Tops,Sweaters and Cardigans,Sweater,Sweater,83_1124642_17074019-82_B.jpg,ROSA LIGHT,...,0.252419,-0.853426,-0.126018,-0.65395,0.638628,0.435513,0.365749,0.165159,0.394995,0.221404


In [4]:
features_arr = np.array(features)
 
features_arr[:, -512:].shape


(61484, 512)

In [9]:
labels

Unnamed: 0,silhouette_type,neck_lapel_type,woven_structure,knit_structure,heel_shape_type,length_type,sleeve_length_type,toecap_type,waist_type,closure_placement,cane_height_type
0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,1,1,0,0,1,0
2,0,0,1,1,0,0,1,0,0,2,0
3,2,2,2,1,0,1,2,0,1,2,0
4,3,0,1,1,0,0,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...
61479,0,4,1,1,0,0,1,0,0,2,0
61480,0,0,1,1,0,0,3,0,0,2,0
61481,16,2,0,1,0,2,2,0,0,2,0
61482,0,3,1,1,0,5,5,0,0,5,0


In [26]:
X_train, X_test, y_train, y_test = train_test_split(features_arr[:, -512:], encoded_labels, test_size=0.9, random_state=42)
# X_train = features_arr[:, -512:]
# y_train = encoded_labels

In [27]:
X_train.shape

(6148, 512)

In [None]:
for label in labels:
    # Use XGBoost as an example
    clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_jobs=-1)
    # clf = XGBClassifier(
    #     use_label_encoder=False,
    #     eval_metric='mlogloss',
    #     n_jobs=-1,
    #     max_depth=8,
    #     learning_rate=0.1,
    #     n_estimators=200,
    #     # subsample=0.8,
    #     # colsample_bytree=0.8,
    #     gamma=0.1,
    #     min_child_weight=5
    # )
    # clf = lgb.LGBMClassifier(
    #     n_estimators=200,
    #     learning_rate=0.1,
    #     max_depth=8,
    #     subsample=0.8,
    #     colsample_bytree=0.8,
    #     min_child_weight=5,
    #     n_jobs=-1,
    #     verbose=-1
    # )
    clf.fit(X_train, y_train[label])

    # Save the model
    model_filename = f"model_{label}_xgb_3.joblib"
    joblib.dump(clf, model_filename)
    print(f"Model for {label} saved as {model_filename}")

In [28]:
param_dist = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for label in labels:
    # Use XGBoost as an example
    clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_jobs=-1)
    
    random_search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_dist,
        n_iter=10,  # Limit iterations for speed
        scoring='accuracy',
        cv=cv,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )
    
    random_search.fit(X_train, y_train[label])
    best_clf = random_search.best_estimator_
    
    best_clf.fit(
        X_train, y_train[label],
        early_stopping_rounds=10,
        eval_set=[(X_val, y_val[label])],
        verbose=False
    )
    
    # clf = XGBClassifier(
    #     use_label_encoder=False,
    #     eval_metric='mlogloss',
    #     n_jobs=-1,
    #     max_depth=8,
    #     learning_rate=0.1,
    #     n_estimators=200,
    #     # subsample=0.8,
    #     # colsample_bytree=0.8,
    #     gamma=0.1,
    #     min_child_weight=5
    # )
    # clf = lgb.LGBMClassifier(
    #     n_estimators=200,
    #     learning_rate=0.1,
    #     max_depth=8,
    #     subsample=0.8,
    #     colsample_bytree=0.8,
    #     min_child_weight=5,
    #     n_jobs=-1,
    #     verbose=-1
    # )
    clf.fit(X_train, y_train[label])

    # Save the model
    model_filename = f"model_{label}_xgb_4_optimized.joblib"
    joblib.dump(clf, model_filename)
    print(f"Model for {label} saved as {model_filename}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits




ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/guimcc/anaconda3/envs/datascience/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/guimcc/anaconda3/envs/datascience/lib/python3.12/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/guimcc/anaconda3/envs/datascience/lib/python3.12/site-packages/xgboost/sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29], got [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 24
 25 26 27 28 29 30]

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/guimcc/anaconda3/envs/datascience/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/guimcc/anaconda3/envs/datascience/lib/python3.12/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/guimcc/anaconda3/envs/datascience/lib/python3.12/site-packages/xgboost/sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31], got [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 33]

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/guimcc/anaconda3/envs/datascience/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/guimcc/anaconda3/envs/datascience/lib/python3.12/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/guimcc/anaconda3/envs/datascience/lib/python3.12/site-packages/xgboost/sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30], got [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 30 33]


In [9]:
model_lo_p = "model_neck_lapel_type_xgb_2.joblib"  # Replace with your actual file name
model_lo = joblib.load(model_lo_p)

In [10]:
y_pred = model_lo.predict(X_test)
yo = labels['neck_lapel_type']

In [16]:
y_test['neck_lapel_type'].shape

(12297,)

In [18]:
accuracy = accuracy_score(y_test['neck_lapel_type'], y_pred)
accuracy

0.6856956981377572

In [10]:
models = {}

for label in labels:
    model_filename = f"model_{label}_xgb_3.joblib"
    
    models[label] = joblib.load(model_filename)


In [11]:
for label in labels:
    model = models[label]
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test[label], y_pred)
    print(f"Accuracy for {label}: {accuracy}")

Accuracy for silhouette_type: 0.0
Accuracy for neck_lapel_type: 0.0
Accuracy for woven_structure: 0.0
Accuracy for knit_structure: 0.0
Accuracy for heel_shape_type: 0.0
Accuracy for length_type: 0.0
Accuracy for sleeve_length_type: 0.0
Accuracy for toecap_type: 0.0
Accuracy for waist_type: 0.0
Accuracy for closure_placement: 0.0
Accuracy for cane_height_type: 0.0


### Test

In [14]:
PATH = "/media/guimcc/Elements/datathon"
IMAGES_PATH = f"{PATH}/archive/images/images/"
CSV_PATH = f"{PATH}/archive/"
HEIGHT = 224
WIDTH = 160

In [15]:
embeddings_file_test = "image_embeddings_test.pt"
embeddings_dict_test = torch.load(embeddings_file_test)
df_test = pd.read_csv(f"{CSV_PATH}/test_data.csv")
embeddings_test = [embeddings_dict_test[filename] for filename in df_test['des_filename']]

embeddings_test = np.array(embeddings_test)

  embeddings_dict_test = torch.load(embeddings_file_test)


In [16]:
predictions = pd.DataFrame()
predictions['des_filename'] = df_test['des_filename']
predictions['test_id'] = df_test['test_id']
for label in labels:
    model = models[label]
    prediction = model.predict(embeddings_test)
    predictions[label] = prediction


In [17]:
def map_back_to_original(predictions, unique_values):
    # Iterate over each column that needs to be mapped
    for column in unique_values.keys():
        # Replace the encoded values with the original labels
        predictions[column] = predictions[column].apply(lambda x: unique_values[column][x])
    return predictions

predictions_mapped = map_back_to_original(predictions, unique_values)

In [18]:
predictions_mapped['attribute_name'] = predictions_mapped['test_id'].apply(lambda x: "_".join(x.split('_')[2:]))

predictions_mapped

Unnamed: 0,des_filename,test_id,silhouette_type,neck_lapel_type,woven_structure,knit_structure,heel_shape_type,length_type,sleeve_length_type,toecap_type,waist_type,closure_placement,cane_height_type,attribute_name
0,88_49711373_67080432-99_.jpg,88_49711373_cane_height_type,INVALID,INVALID,INVALID,INVALID,Plano,INVALID,INVALID,Redonda,INVALID,INVALID,Alta,cane_height_type
1,88_49718802_67030656-99_.jpg,88_49718802_cane_height_type,INVALID,INVALID,INVALID,INVALID,Bloque,INVALID,INVALID,Con punta,INVALID,INVALID,INVALID,cane_height_type
2,88_49709572_67030418-01_B.jpg,88_49709572_cane_height_type,Recto,Redondo,Ligero,INVALID,INVALID,Standard,Corta,INVALID,INVALID,INVALID,INVALID,cane_height_type
3,88_49722701_67066002-02_.jpg,88_49722701_cane_height_type,Evase,Redondo,Ligero,INVALID,INVALID,Corto,INVALID,INVALID,INVALID,INVALID,INVALID,cane_height_type
4,88_49724926_67056330-02_B.jpg,88_49724926_cane_height_type,Recto,Redondo,Ligero,INVALID,INVALID,Standard,Corta,INVALID,INVALID,Cierre Delantero,INVALID,cane_height_type
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71814,88_49727540_67069223-56_.jpg,88_49727540_knit_structure,Slim,Regular,Ligero,INVALID,INVALID,Standard,Larga,INVALID,INVALID,Cierre Delantero,INVALID,knit_structure
71815,88_49733648_67017145-56_.jpg,88_49733648_knit_structure,Recto,Polo,INVALID,INVALID,INVALID,Standard,Corta,INVALID,INVALID,Cuello,INVALID,knit_structure
71816,88_49735572_67076755-81_.jpg,88_49735572_knit_structure,Regular,Pico,Ligero,INVALID,INVALID,Standard,Corta,INVALID,INVALID,INVALID,INVALID,knit_structure
71817,88_49713624_67092528-70_.jpg,88_49713624_knit_structure,Evase,Caja,Ligero,INVALID,INVALID,Largo,Tirante Fino,INVALID,INVALID,Sin cierre,INVALID,knit_structure


In [19]:
submission = pd.DataFrame({
    'test_id': predictions_mapped['test_id'],
    'des_value': predictions_mapped.apply(lambda row: row[row['attribute_name']], axis=1)
})

In [57]:
submission

Unnamed: 0,test_id,des_value
0,88_49711373_cane_height_type,Alta
1,88_49718802_cane_height_type,INVALID
2,88_49709572_cane_height_type,INVALID
3,88_49722701_cane_height_type,INVALID
4,88_49724926_cane_height_type,INVALID
...,...,...
71814,88_49727540_knit_structure,INVALID
71815,88_49733648_knit_structure,INVALID
71816,88_49735572_knit_structure,INVALID
71817,88_49713624_knit_structure,INVALID


In [20]:
submission.to_csv('submission_2.csv', index=False)