In [None]:
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from scipy.sparse import hstack, vstack, csr_matrix
from datetime import datetime
import itertools
import pandas as pd
from lightgbm import LGBMClassifier
import gc
import functools
import pickle
from collections import Counter

!pip install lsi-tagger
from lsi_tagger.text_cleaner import TextCleaner

from tqdm import tqdm
tqdm.pandas()

from IPython.display import clear_output
clear_output()

# Read in Training Data
Generate labels by treating each item in an outfit as the missing item. Leverage both test sets (Stage 1 and Stage 2) since they are nearly complete outfits.

In [None]:
def get_full_outfits(path='/content/drive/MyDrive/Colab Notebooks/sigir_2022/data/parquet_files/manual_outfits.parquet'):
    """Load the outfits parquet file and flag its rows as not coming from a test set.

    Parameters
    ----------
    path : str, optional
        Location of the outfits parquet file. Defaults to the Google Drive
        path this notebook was originally run against.

    Returns
    -------
    pandas.DataFrame
        The file's contents with an extra boolean column ``stage1`` set to
        ``False`` on every row.
    """
    df = pd.read_parquet(path)
    df['stage1'] = False
    return df

def add_stage1_data(full_outfits, path='/content/drive/MyDrive/Colab Notebooks/sigir_2022/data/parquet_files/stage1.parquet'):
    """Append the Stage 1 outfits to ``full_outfits``.

    Only ``outfit_id`` and ``incomplete_outfit`` are read from the Stage 1
    file; ``incomplete_outfit`` is renamed to ``products`` so that it lines up
    with the column of the same name in ``full_outfits``. The appended rows
    carry ``stage1 = True``.

    Parameters
    ----------
    full_outfits : pandas.DataFrame
        Frame to append to (not modified).
    path : str, optional
        Location of the Stage 1 parquet file.

    Returns
    -------
    pandas.DataFrame
        ``full_outfits`` followed by the Stage 1 rows. The original index
        values of both frames are kept, so index labels may repeat.
    """
    # Build the frame with a method chain rather than renaming / assigning in
    # place on a column slice, which triggers pandas' SettingWithCopyWarning.
    stage1_df = (
        pd.read_parquet(path)[['outfit_id', 'incomplete_outfit']]
        .rename(columns={'incomplete_outfit': 'products'})
        .assign(stage1=True)
    )
    return pd.concat([full_outfits, stage1_df])

def add_stage2_data(full_outfits, path='/content/drive/MyDrive/Colab Notebooks/sigir_2022/data/parquet_files/stage2.parquet'):
    """Append the Stage 2 outfits to ``full_outfits``.

    Only ``outfit_id`` and ``incomplete_outfit`` are read from the Stage 2
    file; ``incomplete_outfit`` is renamed to ``products`` so that it lines up
    with the column of the same name in ``full_outfits``.

    Parameters
    ----------
    full_outfits : pandas.DataFrame
        Frame to append to (not modified).
    path : str, optional
        Location of the Stage 2 parquet file.

    Returns
    -------
    pandas.DataFrame
        ``full_outfits`` followed by the Stage 2 rows. The original index
        values of both frames are kept, so index labels may repeat.
    """
    # Build the frame with a method chain rather than renaming / assigning in
    # place on a column slice, which triggers pandas' SettingWithCopyWarning.
    stage2_df = (
        pd.read_parquet(path)[['outfit_id', 'incomplete_outfit']]
        .rename(columns={'incomplete_outfit': 'products'})
        # NOTE(review): Stage 2 rows are flagged with stage1=True, exactly like
        # the Stage 1 rows. Presumably the column means "comes from a test
        # set"; confirm before relying on it to tell the two stages apart.
        .assign(stage1=True)
    )
    return pd.concat([full_outfits, stage2_df])

def generate_leave_one_out_sample_per_outfit(df):
    """Expand every outfit into one training row per product it contains.

    For each product of an outfit a row is produced in which that product is
    the ``missing_product`` and ``incomplete_outfit`` lists the outfit's
    remaining products. Every entry equal to the missing product is removed,
    so a product that occurs more than once in an outfit disappears entirely
    from that row's ``incomplete_outfit``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a list-like ``products`` column. The frame itself is
        left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns ``missing_product`` and
        ``incomplete_outfit``. Rows that stem from the same outfit share the
        outfit's original index label.
    """
    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # `missing_product` column as a side effect of calling this function.
    exploded = df.assign(missing_product=df['products']).explode('missing_product')
    exploded['incomplete_outfit'] = exploded.progress_apply(
        lambda row: [pid for pid in row['products'] if pid != row['missing_product']],
        axis=1
    )
    return exploded



# Assemble the training frame: load the base outfits, append the Stage 1 and
# Stage 2 outfits, then expand every outfit into leave-one-out samples.
full_outfits = (
    get_full_outfits()
    .pipe(add_stage1_data)
    .pipe(add_stage2_data)
    .pipe(generate_leave_one_out_sample_per_outfit)
)

# Generate negative product samples

In [None]:
def _filter_candidates(x, n_candidates):
    res = set([x['missing_product']])
    for candidate in list(set(x["candidates"].tolist())):
        if (candidate not in res) & (candidate not in x['incomplete_outfit']):
            res.add(candidate)
        if len(res) >= n_candidates:
            break
    return list(res)

def generate_n_random_candidates_per_outfit(full_outfits, n_candidates=5, random_state=None,
                                            products_path='/content/drive/MyDrive/Colab Notebooks/sigir_2022/data/parquet_files/products.parquet'):
    """Attach a ``candidates`` list (true product plus random negatives) to every row.

    ``n_candidates + 1`` product ids are drawn with replacement for each row
    from the unique ``product_id`` values of the products file. The draw is
    then reduced by ``_filter_candidates``, which guarantees that the row's
    ``missing_product`` is included and drops duplicates as well as products
    that are already part of the incomplete outfit. Because of that
    filtering a row can end up with fewer than ``n_candidates`` ids.

    Parameters
    ----------
    full_outfits : pandas.DataFrame
        Needs the columns ``missing_product`` and ``incomplete_outfit``.
        The frame is modified in place (a ``candidates`` column is written).
    n_candidates : int, optional
        Target size of each candidate list.
    random_state : int or None, optional
        Seed for the negative sampling. ``None`` (the default) draws from
        NumPy's global random state, as the function always did; pass an
        integer to make the sampled negatives reproducible.
    products_path : str, optional
        Location of the products parquet file.

    Returns
    -------
    pandas.DataFrame
        The same ``full_outfits`` object that was passed in.
    """
    all_products = pd.read_parquet(products_path)['product_id']
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    possible_choices = rng.choice(all_products.unique(),
                                  size=(len(full_outfits), n_candidates + 1),
                                  replace=True)
    # One array of raw draws per row; assigned positionally, so the repeated
    # index labels of an exploded frame are not a problem.
    full_outfits["candidates"] = list(possible_choices)
    full_outfits["candidates"] = full_outfits.progress_apply(
        functools.partial(_filter_candidates, n_candidates=n_candidates),
        axis=1)
    return full_outfits

def validate_missing_product_in_candidates(full_outfits):
    """Assert that every row lists its ``missing_product`` among its ``candidates``.

    Raises ``AssertionError`` if at least one row does not.
    """
    def _contains_target(row):
        return row['missing_product'] in row['candidates']

    hits = full_outfits.progress_apply(_contains_target, axis=1)
    assert hits.sum() == len(full_outfits)



# Draw the candidate list for every leave-one-out row (target size 20), then
# check that each list still contains the row's true missing product.
full_outfits = full_outfits.pipe(generate_n_random_candidates_per_outfit, n_candidates=20)
validate_missing_product_in_candidates(full_outfits)

# Process product information

In [None]:
class Products:
    """Load the products table and turn the configured columns into prefixed tokens.

    ``product_features_config`` maps a column name to the name of the method
    of this class that prepares that column. ``run()`` walks the mapping in
    order, calls each method and then prefixes every value of the column with
    ``"<column>_"`` so that values from different columns cannot collide.

    ``run()`` is not idempotent: calling it a second time on the same instance
    would prefix the already-prefixed values again.
    """

    def __init__(self, product_features_config,
                 products_path='/content/drive/MyDrive/Colab Notebooks/sigir_2022/data/parquet_files/products.parquet'):
        """Read the products parquet file and report how it differs from the config.

        Parameters
        ----------
        product_features_config : dict
            ``{column name: processing method name}``.
        products_path : str, optional
            Location of the products parquet file.
        """
        self.product_features_config = product_features_config
        self.products_df = pd.read_parquet(products_path)
        configured = set(self.product_features_config.keys())
        available = set(self.products_df.columns)
        print(f'Warning: Not processing/using columns ({available - configured})')
        print(f'Adding new columns ({configured - available})')

    def _fill_missing(self, col, value='null'):
        """Replace missing values of ``col`` with ``value``.

        The filled column is assigned back to the frame. Calling
        ``fillna(..., inplace=True)`` on ``self.products_df[col]`` is chained
        in-place assignment, which does not update the frame when pandas
        Copy-on-Write is active.
        """
        self.products_df[col] = self.products_df[col].fillna(value)

    def process_product_image_path(self):
        """Turn the image path into depth-tagged directory tokens.

        The path is split on ``/``, the last component is dropped and each
        remaining component is prefixed with its position, e.g.
        ``'a/b/c.jpg' -> ['0_a', '1_b']``.
        """
        self.products_df['product_image_path'] = self.products_df['product_image_path'].apply(
            lambda x: [f"{n}_{y}" for n, y in enumerate(x.split('/')[:-1])]
        )

    def _process_product_highlights(self, x):
        """Parse one raw highlights string into a list of cleaned tokens.

        Brackets, non-breaking spaces and full stops are removed, hyphens
        become spaces, the text is lower-cased and split on ``', '``; every
        piece is split again on ``/``, stripped of surrounding whitespace and
        of trailing ``;``. ``None`` and NaN yield ``['null']``.
        """
        # `x == x` is False only for NaN.
        if x is not None and x == x:
            res = []
            list_x = x.replace('[','').replace(']','').lower().replace('\xa0','').replace('-', ' ').replace('.', '').split(', ')
            for word in list_x:
                for split_word in word.split('/'):
                    res.append(split_word.strip().rstrip(';'))
            return res
        return ['null']

    def _postprocess_product_highlights(self, x, keep_values):
        """Keep only the tokens of ``x`` found in ``keep_values``; ``['null']`` if none remain."""
        res = [y for y in x if y in keep_values]
        if len(res) == 0:
            return ['null']
        return res

    def process_product_highlights(self):
        """Tokenise the highlights and drop tokens seen fewer than 50 times overall."""
        self.products_df['product_highlights'] = self.products_df['product_highlights'].apply(
            self._process_product_highlights
        )
        counts = Counter(itertools.chain.from_iterable(self.products_df['product_highlights'].values.tolist()))
        keep_values = {token for token, count in counts.items() if count >= 50}
        self.products_df['product_highlights'] = self.products_df['product_highlights'].apply(
            functools.partial(self._postprocess_product_highlights, keep_values=keep_values)
        )

    def _process_product_attributes(self, x):
        """Flatten a list of attribute dicts into a de-duplicated list of tokens.

        For every dict the lower-cased ``attribute_name``, each lower-cased
        value and the combination ``"<name>_<value>"`` are emitted. The order
        of the returned list is that of a set and carries no meaning.
        """
        res = []
        for d in x:
            attribute_name = d['attribute_name'].lower()
            attribute_values = d['attribute_values']
            res.append(attribute_name)
            for attribute_value in attribute_values:
                attribute_value = attribute_value.lower()
                res.append(f"{attribute_name}_{attribute_value}")
                res.append(attribute_value)
        return list(set(res))

    def process_product_attributes(self):
        """Parse the serialised attributes column and flatten it into tokens.

        Missing entries are replaced by a single ``null`` attribute.
        """
        # NOTE(security): eval() executes whatever code the stored string
        # contains. This is only acceptable because the parquet file is
        # trusted; ast.literal_eval would presumably be a safer drop-in if the
        # column only ever holds Python literals (TODO confirm).
        self.products_df['product_attributes'] = self.products_df['product_attributes'].apply(
            lambda x: eval(x) if (x is not None and x == x) else [{'attribute_name':'null','attribute_values':['null']}]
        )
        self.products_df['product_attributes'] = self.products_df['product_attributes'].apply(
            self._process_product_attributes
        )

    def add_full_category(self):
        """Create ``full_category`` as ``family||category||sub_category``.

        Reads the three source columns as they currently are, so it has to run
        before ``run()`` prefixes them.
        """
        self.products_df['full_category'] = self.products_df['product_family'] + '||' \
                                                + self.products_df['product_category'] + '||' \
                                                + self.products_df['product_sub_category']

    def process_product_materials(self):
        """Lower-case the materials lists; missing entries become ``['null']``.

        Any value that is a plain string (which includes the ``'null'`` filler
        inserted for missing entries) is replaced by ``['null']``.
        """
        self._fill_missing('product_materials')
        self.products_df['product_materials'] = self.products_df['product_materials'].apply(
            lambda x: x if (not isinstance(x, str)) else ['null']
        )
        self.products_df['product_materials'] = self.products_df['product_materials'].apply(lambda x: [y.lower() for y in x])

    def process_product_second_color(self):
        """Lower-case the second colour; missing entries become ``'null'``."""
        self._fill_missing('product_second_color')
        self.products_df['product_second_color'] = self.products_df['product_second_color'].apply(lambda x: x.lower())

    def process_product_main_colour(self):
        """Lower-case the main colour."""
        self.products_df['product_main_colour'] = self.products_df['product_main_colour'].apply(lambda x: x.lower())

    def _process_product_short_description(self):
        """Replace the short descriptions by the output of ``TextCleaner.fit_transform``.

        ``run()`` treats this column as a list column, so the cleaner
        presumably returns one list of tokens per description (TODO confirm
        against the lsi-tagger documentation).
        """
        tc = TextCleaner(word_count_min=50, word_length_min=2,
                         bigram_kwargs={'bigrams_pmi_min_value':1, 'bigrams_min_freq':20})
        self.products_df['product_short_description'] = tc.fit_transform(
            self.products_df['product_short_description'].values.tolist()
        )

    def process_product_short_description(self):
        """Fill missing descriptions with ``'null'``, lower-case them and clean the text."""
        self._fill_missing('product_short_description')
        self.products_df['product_short_description'] = self.products_df['product_short_description'].apply(lambda x: x.lower())
        self._process_product_short_description()

    def process_product_gender(self):
        """Lower-case the gender."""
        self.products_df['product_gender'] = self.products_df['product_gender'].apply(lambda x: x.lower())

    def process_product_brand(self):
        """Lower-case the brand."""
        self.products_df['product_brand'] = self.products_df['product_brand'].apply(lambda x: x.lower())

    def keep_as_is(self):
        """No-op processor for columns that only need the column-name prefix."""
        pass

    def run(self):
        """Process every configured column and prefix its values with the column name.

        Columns are handled in the order of ``product_features_config``.

        Raises
        ------
        ValueError
            If a configured column is neither a known string column nor a
            known list column. The column's processing method has already
            been called by then.
        AssertionError
            If a configured column still contains nulls afterwards.
        """
        str_cols = ['product_family','product_category','product_sub_category',
                    'product_gender','product_main_colour','product_second_color',
                    'product_brand','full_category']
        list_cols = ['product_attributes','product_materials','product_highlights',
                     'product_short_description', 'product_image_path']

        for col, func in tqdm(self.product_features_config.items(),
                              desc='Process product columns'):
            # Process columns
            getattr(self, func)()

            # Prepend column name to values
            if col in str_cols:
                self.products_df[col] = self.products_df[col].apply(lambda x: f"{col}_{x}")
            elif col in list_cols:
                self.products_df[col] = self.products_df[col].apply(lambda x: [f"{col}_{y}" for y in x])
            else:
                raise ValueError(f'Not implemented: column type for {col}')

        assert self.products_df[list(self.product_features_config.keys())].isnull().sum().sum() == 0



# Maps each feature column to the name of the `Products` method that prepares
# it; `Products.run()` processes the entries in exactly this order.
# 'full_category' has to stay first: `add_full_category` concatenates the
# family / category / sub-category columns, and `run()` prefixes those columns
# with their own names as soon as their entries are processed.
product_features_config = {
    'full_category': 'add_full_category',
    'product_family': 'keep_as_is',
    'product_category': 'keep_as_is',
    'product_sub_category': 'keep_as_is',
    'product_gender': 'process_product_gender',
    'product_main_colour': 'process_product_main_colour',
    'product_second_color': 'process_product_second_color',
    'product_brand': 'process_product_brand',
    'product_attributes': 'process_product_attributes',
    'product_materials': 'process_product_materials',
    'product_image_path': 'process_product_image_path',
    'product_highlights': 'process_product_highlights',
    'product_short_description': 'process_product_short_description'
}
# Load the products table, then process and prefix every configured column.
products = Products(product_features_config)
products.run()

In [None]:
def add_popular_product_ids(full_outfits, products, product_features_config, min_outfit_count=149):
    """Add a feature that identifies frequently used products by their id.

    Occurrences of every product id are counted over the ``products`` lists
    of all rows of ``full_outfits`` as given. Products reaching
    ``min_outfit_count`` get the token ``'popular_product_ids_<id>'``; all
    other products get ``'popular_product_ids_null'``.

    Parameters
    ----------
    full_outfits : pandas.DataFrame
        Needs a list-like ``products`` column.
    products : object
        Anything exposing a ``products_df`` DataFrame with a ``product_id``
        column; the new ``popular_product_ids`` column is written to it in
        place.
    product_features_config : dict
        Receives the key ``'popular_product_ids'`` (with an empty processing
        method name) in place.
    min_outfit_count : int, optional
        Minimum number of occurrences for a product to count as popular.
        The default of 149 was chosen by the original author as the 99.5th
        percentile of the counts.

    Returns
    -------
    tuple
        ``(products, product_features_config)``: the same objects that were
        passed in.
    """
    pid_counts = Counter(itertools.chain.from_iterable(full_outfits['products'].values.tolist()))
    popular_products = {pid for pid, count in pid_counts.items() if count >= min_outfit_count}
    products.products_df['popular_product_ids'] = products.products_df['product_id'].apply(
        lambda x: 'popular_product_ids_'+str(x) if x in popular_products else 'popular_product_ids_null'
    )
    product_features_config['popular_product_ids'] = ''
    return products, product_features_config



# Adds the `popular_product_ids` column to products.products_df and registers
# the same key in product_features_config (both objects are updated in place
# and returned).
products, product_features_config = add_popular_product_ids(full_outfits, products, product_features_config)

In [None]:
def unpack(x):
    """Flatten ``x`` by one level.

    Elements that are ``list`` instances are spliced into the result; every
    other element (including tuples, strings and arrays) is kept as a single
    item. Nested lists are flattened by one level only.
    """
    flat = []
    for item in x:
        if isinstance(item, list):
            flat.extend(item)
        else:
            flat.append(item)
    return flat

def get_pid2ohe_all_products(products_df, cols):
    """Map every product id to a sparse multi-hot encoding of its feature tokens.

    The values of ``cols`` are flattened per product with ``unpack`` and
    encoded with a ``MultiLabelBinarizer`` fitted on all products. If a
    product id occurs on several rows, the features of its last row are used.

    Returns a dict ``{product_id: one row of the sparse encoding}``.
    """
    feature_rows = [unpack(row) for row in products_df[cols].values]
    pid2features = dict(zip(products_df['product_id'], feature_rows))

    pids = products_df['product_id'].values.tolist()
    token_lists = np.array([pid2features[pid] for pid in pids], dtype='object')

    binarizer = MultiLabelBinarizer(sparse_output=True)
    encoded = binarizer.fit_transform(token_lists)
    return dict(zip(pids, encoded))



# Multi-hot encode the configured feature columns of every product.
pid2ohe = get_pid2ohe_all_products(products.products_df, list(product_features_config.keys()))

# pid2phash.p holds a perceptual image hash per product id, produced with the
# ImageHash library (https://pypi.org/project/ImageHash/).
# NOTE(security): unpickling can execute arbitrary code; only load this file
# from a trusted source.
# The context manager closes the file handle, which a bare `open(...).read()`
# leaves to the garbage collector.
with open('/content/drive/MyDrive/Colab Notebooks/sigir_2022/data/pid2phash.p', 'rb') as phash_file:
    pid2hash = pickle.load(phash_file)

# Append the image hash to each product's feature vector. A product id that is
# missing from pid2hash raises a KeyError here.
pid2ohe = {pid:hstack([ohe, csr_matrix(pid2hash[pid])]) for pid, ohe in tqdm(pid2ohe.items(), desc='Adding image hashes')}

# Modeling

In [None]:
def df2Xy(df, pid2ohe):
    """Turn outfit rows into a sparse feature matrix and a label vector.

    One sample is produced per (row, candidate) pair. Its features are the
    sum of the encodings of the row's ``incomplete_outfit`` products,
    concatenated with the encoding of the candidate. Its label is 1.0 when
    the candidate equals the row's ``missing_product`` and 0.0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``incomplete_outfit``, ``candidates`` and
        ``missing_product``.
    pid2ohe : dict
        Product id -> sparse feature row.

    Returns
    -------
    (X, y)
        ``X`` is a float sparse matrix with one row per pair, ``y`` the
        matching float label array.
    """
    outfit_vectors = df['incomplete_outfit'].progress_apply(
        lambda pids: np.array([pid2ohe[pid] for pid in pids]).sum(0)
    )
    candidate_vectors = df['candidates'].progress_apply(
        lambda pids: [pid2ohe[pid] for pid in pids]
    )

    outfit_rows, candidate_rows = [], []
    for outfit_vec, cand_vecs in tqdm(zip(outfit_vectors.values, candidate_vectors.values)):
        # Repeat the outfit vector once per candidate so both sides line up.
        outfit_rows.extend([outfit_vec] * len(cand_vecs))
        candidate_rows.extend(cand_vecs)
    X = hstack([vstack(outfit_rows), vstack(candidate_rows)]).astype(float)
    del outfit_rows
    del candidate_rows

    is_target = df.progress_apply(
        lambda row: [row['missing_product'] == cand for cand in row['candidates']],
        axis=1
    )
    y = np.array(list(itertools.chain.from_iterable(is_target))).astype(float)
    return X, y



# One LightGBM classifier is trained per chunk of the training frame; the
# resulting models are kept in `models`, keyed by chunk number.
models = {}
model_kwargs = {
    'random_state': 42, 
    'verbose': 1,
    'n_jobs': -1, 
    'class_weight': 'balanced',
    'num_leaves': 1000,
    'n_estimators': 1000,
    'device': 'gpu'
} # With a NVIDIA P100 GPU and ~50GB RAM, this takes ~20 minutes per iteration
for n, df_chunk in enumerate(np.array_split(full_outfits, 20)):
    start = datetime.utcnow()
    print(f"Iteration: {n}")
    print('Converting df to ohe...')
    X_train, y_train = df2Xy(df_chunk, pid2ohe)
    print('Training model...')
    model = LGBMClassifier(**model_kwargs)
    model.fit(X_train, y_train)
    models[n] = model
    # Drop the references before collecting; otherwise this chunk's matrices
    # stay alive while the next chunk's matrices are being built.
    del X_train, y_train
    gc.collect()
    # total_seconds() covers the whole duration; `.seconds` leaves out the
    # days component of the timedelta.
    print(f'Took {(datetime.utcnow() - start).total_seconds()/60} minutes.')
    print()
    print('='*100)
    print()

# Predict missing products from Stage 2's test set

In [None]:
def _filter_candidates(x):
    candidates = x['candidates'].values
    sorted_inds = np.argsort(x['predicted_product_proba'].values)[::-1]
    for candidate in candidates[sorted_inds]:
        if candidate not in x['incomplete_outfit']:
            return candidate

# Read in data
stage2_df = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/sigir_2022/data/parquet_files/stage2.parquet')

# Format stage2_df: one feature row per (outfit, candidate) pair, built the
# same way as the training features (summed outfit encoding + candidate encoding).
incomplete_outfit_ohe = stage2_df['incomplete_outfit'].progress_apply(lambda x: np.array([pid2ohe[y] for y in x]).sum(0))
candidates_ohe = stage2_df['candidates'].progress_apply(lambda x: [pid2ohe[y] for y in x])

X1, X2 = [], []
for incomplete_outfit, candidates in tqdm(zip(incomplete_outfit_ohe.values, candidates_ohe.values)):
    for candidate in candidates:
        X1.append(incomplete_outfit)
        X2.append(candidate)
# Cast to float once here instead of once per model inside the prediction loop.
X_stage2 = hstack([vstack(X1), vstack(X2)]).astype(float)
del X1
del X2
gc.collect()

# Make predictions
stage2_df['row_id'] = list(range(len(stage2_df)))

# Exploding on `candidates` yields one row per (outfit, candidate) pair in the
# same order in which the rows of X_stage2 were appended above.
explode_stage2_df = stage2_df.explode('candidates').copy()
for model_num, model in enumerate(tqdm(models.values(), desc='Making predictions')):
    explode_stage2_df[f'predicted_product_proba_{model_num}'] = model.predict_proba(X_stage2)[:,1]
# Ensemble score: sum of the positive-class probabilities of all models.
explode_stage2_df['predicted_product_proba'] = explode_stage2_df[
    [c for c in explode_stage2_df.columns if c.startswith('predicted_product_proba')]
].sum(1)

# Choose one candidate per outfit and map it back onto stage2_df via row_id.
g = explode_stage2_df.groupby('row_id').progress_apply(_filter_candidates)
outfit2predicted = dict(g)
stage2_df['predicted_product'] = stage2_df['row_id'].map(outfit2predicted)

stage2_df[['outfit_id', 'predicted_product']].to_csv('2022_06_13_3_submission.csv', index=False)