In [25]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

from xgboost import XGBClassifier
import joblib

### Preprocessing

In [26]:
def split_features_and_labels(
    dataframe: pd.DataFrame, n_label_columns: int = 11
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Split a table into leading feature columns and trailing label columns.

    Args:
        dataframe: Table whose last ``n_label_columns`` columns are the labels.
        n_label_columns: Number of trailing columns to treat as labels.
            Defaults to 11, the value this notebook originally hard-coded.

    Returns:
        A ``(features, labels)`` pair: ``features`` holds every column before
        the label columns, ``labels`` holds the trailing label columns.

    Raises:
        ValueError: If ``n_label_columns`` is negative.
    """
    if n_label_columns < 0:
        raise ValueError("n_label_columns must be non-negative")

    # Use an explicit positive split point: a negative slice bound breaks for
    # n_label_columns == 0 (``[:-0]`` selects nothing). Clamp to 0 so a table
    # narrower than n_label_columns yields no features and all-label columns,
    # exactly as the original negative slicing did.
    split_at = max(dataframe.shape[1] - n_label_columns, 0)
    features = dataframe.iloc[:, :split_at]
    labels = dataframe.iloc[:, split_at:]
    return features, labels

Embedding and data loading

In [None]:
# Read the product table and separate the input features from the attribute labels
products = pd.read_csv('data/product_with_attributes.csv')
features, labels = split_features_and_labels(products)

# Fetch the precomputed image embedding of every product, following the row
# order of `products` (the mapping is keyed by the `des_filename` value)
embeddings_file = "image_embeddings.pt"
embeddings_dict = torch.load(embeddings_file)
embeddings = [embeddings_dict[filename] for filename in products['des_filename']]
embeddings_array = np.array(embeddings)

# One named column per embedding dimension
embedding_columns = [f'embedding_dim_{i}' for i in range(embeddings_array.shape[1])]
embedding_df = pd.DataFrame(embeddings_array, columns=embedding_columns)

# Append the embedding columns to the tabular features, then discard the
# columns that are not used as model inputs
features = pd.concat([features, embedding_df], axis=1)
features = features.drop(columns=['cod_modelo_color', 'des_filename'])

Label encoding - Store the encodings for later use

In [None]:
# Fit one LabelEncoder per attribute column. The fitted encoders are kept in
# `label_encoder_dict` so integer predictions can be mapped back to the
# original values later on.
label_encoder_dict = {}
encoded_columns = {}
for label in labels.columns:
    encoder = LabelEncoder()
    encoded_columns[label] = encoder.fit_transform(labels[label])
    label_encoder_dict[label] = encoder

# Assemble the integer-encoded labels, one column per attribute
encoded_labels = pd.DataFrame(encoded_columns)

Turn to a np.array

In [None]:
# Materialise the feature table as an independent NumPy array; it is edited
# in place when the categorical columns are encoded.
features_arr = features.to_numpy(copy=True)
features_arr.shape

Encode the categorical features

In [None]:
# The first 9 columns of `features_arr` are label-encoded in place. Each fitted
# encoder is stored under its column index so the same mapping can be reused.
feature_encoder_dict = {}
for column in range(9):
    encoder = LabelEncoder()
    encoded_column = encoder.fit_transform(features_arr[:, column])
    features_arr[:, column] = encoded_column
    feature_encoder_dict[column] = encoder

**Filtering invalid types:**
- Set $\texttt{FILTER\_INVALID\_TRAIN} = \texttt{TRUE}$ if invalid types are to be filtered before **training**.
- Let $\texttt{FILTER\_INVALID\_TRAIN} = \texttt{FALSE}$ to just filter at **predict** time.

**Training with all data:**
- Set $\texttt{TRAIN\_ALL\_DATA} = \texttt{TRUE}$ to train on all available data (no validation split) and try to achieve better metrics on the **test** dataset.
- Let $\texttt{TRAIN\_ALL\_DATA} = \texttt{FALSE}$ to split a **validation** dataset.

In [None]:
# Run configuration read by the dataset-building cell below.
# When True, rows whose decoded column-7 value is listed as invalid for an
# attribute are removed from that attribute's training data.
FILTER_INVALID_TRAIN = True
# Selects between training on the full filtered data and holding out a
# validation split; the intended meaning is described in the markdown above.
TRAIN_ALL_DATA = True

Obtain the dataset for each attribute

In [None]:
# Per-attribute datasets, keyed by label column name. The validation dicts
# are only populated when a validation split is requested.
X_train_dict = dict()
y_train_dict = dict()
X_val_dict = dict()
y_val_dict = dict()

# Mapping: attribute name -> collection of values for which the attribute is
# invalid. Assumed to be matched against the decoded column 7 (presumably the
# product type, as used at predict time) -- TODO confirm.
invalid_types = torch.load('invalid_types.pth')

# Recover the original categorical values of column 7. This does not depend
# on the attribute, so it is decoded once instead of once per label.
transformed_features = feature_encoder_dict[7].inverse_transform(features_arr[:, 7].astype(int))

for label in labels.columns:
    # True for rows whose decoded value is invalid for this attribute
    mask = np.isin(transformed_features, invalid_types[label])

    if FILTER_INVALID_TRAIN:
        filtered_features = features_arr[~mask]
        filtered_encoded_labels = encoded_labels[label][~mask]
    else:
        filtered_features = features_arr
        filtered_encoded_labels = encoded_labels[label]

    if TRAIN_ALL_DATA:
        # Train on every available row. The original code had these two
        # branches swapped, so TRAIN_ALL_DATA=True silently held out 10% of
        # the data, contradicting the documented meaning of the flag.
        X_train_dict[label] = filtered_features
        y_train_dict[label] = filtered_encoded_labels
    else:
        # Hold out 10% of the rows as a validation set (fixed seed for reproducibility)
        X_train_dict[label], X_val_dict[label], y_train_dict[label], y_val_dict[label] = train_test_split(
            filtered_features, filtered_encoded_labels, test_size=0.1, random_state=42
        )

### Training

Fit an $\texttt{XGBClassifier}$ model for each attribute. After extensive hyperparameter tuning, this configuration performed best:
- $\texttt{eval\_metric = 'mlogloss'}$
- $\texttt{max\_depth = 8}$
- $\texttt{learning\_rate = 0.1}$
- $\texttt{n\_estimators = 400}$
- $\texttt{subsample = 0.8}$
- $\texttt{colsample\_bytree = 0.8}$
- $\texttt{gamma = 0.1}$
- $\texttt{min\_child\_weight = 5}$

In [None]:
# Hyperparameters shared by every per-attribute classifier
xgb_params = dict(
    eval_metric='mlogloss',
    max_depth=8,
    learning_rate=0.1,
    n_estimators=400,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    min_child_weight=5,
    n_jobs=-1,
)

for label in labels.columns:
    # Build a fresh classifier for this attribute and fit it on its dataset
    clf_model = XGBClassifier(**xgb_params)
    clf_model.fit(X_train_dict[label], y_train_dict[label])

    # Persist the fitted model under a per-attribute filename
    model_filename = f"model_{label}_xgb.joblib"
    joblib.dump(clf_model, model_filename)

### Test

In [34]:
# Root directory of the datathon data.
# NOTE(review): this is an absolute path that looks specific to one machine's
# mount point -- adjust it before running the notebook elsewhere.
PATH = "/media/guimcc/Elements/datathon"
# Directory prefix for CSV files; it ends with "/" so file names can be appended directly.
CSV_PATH = f"{PATH}/archive/"

Load the test embeddings

In [None]:
# Test-set table, read from the CSV directory configured above
test_csv_file = CSV_PATH + 'test_data.csv'
test = pd.read_csv(test_csv_file)

# Precomputed image embeddings for the test set
embeddings_test = torch.load('image_embeddings_test.pt')

Load the already trained models

In [None]:
# Reload the fitted classifier of every attribute from the files written
# during training, keyed by attribute name
models = {}
for label in labels.columns:
    models[label] = joblib.load(f"model_{label}_xgb.joblib")

Some initial declarations

In [None]:
# Test-table columns holding the categorical features; position i is encoded
# with feature_encoder_dict[i] at predict time
col_names = [
    'des_sex',
    'des_age',
    'des_line',
    'des_fabric',
    'des_product_category',
    'des_product_aggregated_family',
    'des_product_family',
    'des_product_type',
    'des_color',
]

# Submission frame, initialised with the ids of the test rows
predictions = pd.DataFrame({'test_id': test['test_id']})

Make the predictions using the corresponding model for each attribute

In [None]:
# Predict one attribute value per test row.
#
# Each test_id has the form "<a>_<b>_<attribute>": everything after the second
# underscore names the attribute, which selects the model and label encoder.
invalid_count = 0
n_categorical = len(col_names)

# Predicted values in row order; written to `predictions` once after the loop
# instead of growing the DataFrame cell by cell.
des_values = []

for index, row in tqdm(test.iterrows(), total=len(test)):
    attribute = '_'.join(row['test_id'].split('_')[2:])

    # If the product type is invalid for this attribute, emit INVALID and skip the model
    if row['des_product_type'] in invalid_types[attribute]:
        des_values.append('INVALID')
        invalid_count += 1
        continue

    # Encode the categorical features with the encoders fitted on the training data
    encoded_features = np.zeros((1, n_categorical))
    for feature_idx in range(n_categorical):
        le = feature_encoder_dict[feature_idx]
        try:
            encoded_features[0, feature_idx] = le.transform([row[col_names[feature_idx]]])[0]
        except ValueError:
            # Category not seen during training: leave it as NaN, which
            # XGBoost treats as a missing value by default.
            encoded_features[0, feature_idx] = np.nan

    # Image embedding of the product, as a single-row 2-D array
    embedding = embeddings_test[row['des_filename']].numpy().reshape(1, -1)

    # Model input: encoded categorical features followed by the embedding
    input_features = np.concatenate((encoded_features, embedding), axis=1)

    # Predict the encoded class and map it back to the original label value.
    # inverse_transform returns a length-1 array; store the scalar it contains.
    prediction = models[attribute].predict(input_features)
    des_values.append(label_encoder_dict[attribute].inverse_transform(prediction)[0])

predictions['des_value'] = des_values

print(f"Number of invalid products: {invalid_count}")

Store the predictions for submission

In [37]:
predictions.to_csv('data/submission.csv', index=False)