In [1]:
import os
import gc
import re
import csv
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
# Read only the columns each frame needs. Quoting is disabled and a backslash
# is used as the escape character for all three competition CSV files.
train = pd.read_csv(
    "../input/amazon-ml-challenge-2021-hackerearth/train.csv",
    usecols=["TITLE", "BROWSE_NODE_ID"],
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)
test = pd.read_csv(
    "../input/amazon-ml-challenge-2021-hackerearth/test.csv",
    usecols=["PRODUCT_ID", "TITLE"],
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)
ss = pd.read_csv(
    "../input/amazon-ml-challenge-2021-hackerearth/sample_submission.csv",
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)

In [5]:
# Temporary workaround for missing values: every NaN in the frame becomes a
# single space, so the regex-based title cleaning always receives a str.
train = train.fillna(value=" ")

# TITLE Only

In [7]:
def clean_title(string):
    """Normalise a product title before tokenization.

    The steps run in this order:

    1. every non-alphanumeric character, together with one directly
       following whitespace character if present, is replaced by a space;
    2. runs of space-separated single characters are replaced by a space;
    3. consecutive whitespace is collapsed to a single space;
    4. the result is lower-cased.

    Args:
        string: Raw title text. Must be a ``str``; ``re.sub`` raises
            ``TypeError`` for other types such as NaN.

    Returns:
        The cleaned, lower-cased title. A leading or trailing single space
        left behind by the substitutions is not stripped.
    """
    # Raw string literals keep "\s" a regex escape instead of an invalid
    # Python string escape (which newer interpreters warn about).
    string = re.sub(r"[^a-zA-Z0-9]\s?", " ", string)
    # "." also matches a space here, so an isolated blank counts as a
    # "single character" and is folded into the replacement space.
    string = re.sub(r"(^| ).(( ).)*( |$)", " ", string)
    string = re.sub(r"\s+", " ", string)
    return string.lower()

In [8]:
# Clean every training title (progress_apply shows a tqdm progress bar), then
# remove the raw column from the frame in place and run a garbage-collection
# pass; the collect() return value is bound to "_" so the cell prints nothing.
train["cleaned_title"] = train["TITLE"].progress_apply(clean_title)
train.drop(columns=["TITLE"], inplace=True)
_ = gc.collect()

100%|██████████| 2903024/2903024 [01:30<00:00, 32152.19it/s]


# Modeling

In [9]:
import tensorflow as tf
from tensorflow.keras import backend as K
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel, create_optimizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Input
from keras.preprocessing.sequence import pad_sequences

In [10]:
class CFG:
    """Notebook-wide configuration constants plus the shared BERT tokenizer."""

    # Fixed token length: passed as the truncation length to the tokenizer and
    # as ``maxlen`` to pad_sequences for both train and test titles.
    MAX_LEN_TITLE = 96
    # NOTE(review): the visible model.fit call hard-codes epochs=10 and does
    # not read this value — confirm which one is intended.
    EPOCHS = 20
    # Batch sizes intended for the training and validation passes.
    TRAIN_BS = 32
    VALIDATION_BS = 64
    # NOTE(review): the visible build_model call passes len(le.classes_)
    # rather than this constant — presumably the expected class count; verify.
    N_CLASSES = 9919
    
    # Checkpoint name shared by the tokenizer below and TFBertModel.
    model_name = "bert-base-uncased"
    # Created once, when the class body executes; must stay after model_name
    # because it reads that name from the class-body scope.
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
    
def tokenize_sentences(sentences, tokenizer, max_seq_len):
    """Encode each sentence into a list of token ids.

    Args:
        sentences: Iterable of strings to encode; wrapped in ``tqdm`` so a
            progress bar is shown while iterating.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            max_length=..., truncation=...)``.
        max_seq_len: Maximum length handed to the tokenizer as ``max_length``
            (truncation is enabled).

    Returns:
        A list with one encoded sequence per input sentence, in input order.
        Sequences are not padded here.
    """
    return [
        tokenizer.encode(
            text,
            add_special_tokens=True,  # ask for '[CLS]' / '[SEP]' markers
            max_length=max_seq_len,   # upper bound on the encoded length
            truncation=True,
        )
        for text in tqdm(sentences)
    ]

def create_attention_masks(tokenized_and_padded_sentences):
    """Build attention masks for already-padded token-id sequences.

    A position is marked 1 when its token id is greater than 0 and 0
    otherwise, so positions filled with the padding id 0 are masked out.

    The comparison is vectorised instead of looping over every token in
    Python, and the input shape is preserved even when there are no rows.

    Args:
        tokenized_and_padded_sentences: Rectangular array-like of integer
            token ids (one row per sentence, all rows the same length).

    Returns:
        ``numpy.ndarray`` of the platform default integer dtype with the same
        shape as the input, containing only 0s and 1s.
    """
    token_ids = np.asarray(tokenized_and_padded_sentences)
    return (token_ids > 0).astype(int)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
def focal_loss(inputs, targets, alpha=1, gamma=2):
    """Focal loss built on top of categorical cross-entropy.

    Computes ``alpha * (1 - pt) ** gamma * ce`` where ``ce`` is the
    categorical cross-entropy and ``pt = exp(-ce)``.

    NOTE: the parameter names are kept for backward compatibility but are
    easy to misread. Both arguments are forwarded unchanged, in the same
    order, to ``tf.keras.losses.categorical_crossentropy``, whose signature
    is ``(y_true, y_pred)``. When Keras calls this function as a loss it
    passes ``(y_true, y_pred)`` positionally, so ``inputs`` receives the
    labels and ``targets`` receives the predictions.

    Args:
        inputs: Forwarded as ``y_true`` (one-hot labels).
        targets: Forwarded as ``y_pred`` (predicted class probabilities).
        alpha: Constant scale factor applied to the loss.
        gamma: Exponent of the ``(1 - pt)`` modulating factor.

    Returns:
        Tensor of focal-loss values, one per cross-entropy value.
    """
    ce_loss = tf.keras.losses.categorical_crossentropy(inputs, targets)
    # exp(-CE) recovers the probability term that the cross-entropy was the
    # negative log of.
    pt = K.exp(-ce_loss)
    return alpha * (1 - pt) ** gamma * ce_loss

In [12]:
# Encode the cleaned training titles, then right-pad / right-truncate every
# sequence to CFG.MAX_LEN_TITLE using id 0, and derive the attention masks
# from the padded ids.
input_ids = tokenize_sentences(
    train['cleaned_title'],
    CFG.tokenizer,
    CFG.MAX_LEN_TITLE,
)
input_ids = pad_sequences(
    input_ids,
    maxlen=CFG.MAX_LEN_TITLE,
    dtype="long",
    value=0,
    padding="post",
    truncating="post",
)
attention_masks = create_attention_masks(input_ids)

100%|██████████| 2903024/2903024 [28:45<00:00, 1682.91it/s]


In [None]:
# Map the raw BROWSE_NODE_ID values to contiguous class indices; `le` is kept
# so predictions can be mapped back with inverse_transform at inference time.
le = LabelEncoder().fit(train["BROWSE_NODE_ID"].values)
labels_map = le.transform(train["BROWSE_NODE_ID"].values)
# NOTE(review): to_categorical materialises a dense (n_samples, n_classes)
# one-hot matrix. With the ~2.9M rows shown by the progress bars and several
# thousand classes this is presumably far larger than available RAM —
# confirm, and consider integer labels with a sparse loss instead.
labels = tf.keras.utils.to_categorical(labels_map)

# Split ids, masks and labels in ONE call so all three are guaranteed to use
# the same shuffle, instead of relying on two separate calls happening to
# share a random_state. The resulting partitions are the same as before.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels, random_state=0, test_size=0.3
)

train_size = len(train_inputs)
validation_size = len(validation_inputs)

In [23]:
def build_model(max_len, n_classes, loss, optimizer):
    """Build and compile a BERT-based classifier with a softmax head.

    The network takes two int32 inputs of length ``max_len`` (token ids and
    attention mask), runs them through ``TFBertModel`` loaded from
    ``CFG.model_name`` and feeds element 1 of the BERT output (presumably the
    pooled representation — confirm against the transformers docs) into a
    ``Dense(n_classes, softmax)`` layer.

    Args:
        max_len: Sequence length of both inputs.
        n_classes: Number of output classes.
        loss: Loss passed to ``model.compile``.
        optimizer: Optimizer passed to ``model.compile`` (a Keras optimizer
            instance or identifier string). Previously this argument was
            ignored and always replaced by ``Adam(learning_rate=1e-3)``; that
            optimizer is now only the fallback used when ``None`` is given.

    Returns:
        The compiled ``tf.keras`` model, tracking the ``accuracy`` metric.
    """
    ids = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    att = tf.keras.layers.Input((max_len,), dtype=tf.int32)

    bert_model = TFBertModel.from_pretrained(CFG.model_name)
    x = bert_model(ids, attention_mask=att)
    out = Dense(n_classes, activation="softmax")(x[1])

    model = tf.keras.models.Model(inputs=[ids, att], outputs=out)
    if optimizer is None:
        # Fallback keeps the optimizer the original code always used.
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])

    return model

model = build_model(CFG.MAX_LEN_TITLE, len(le.classes_),
                    loss=focal_loss,
                    optimizer="adam")

# Pass the configured batch sizes explicitly. Previously steps_per_epoch and
# validation_steps were derived from CFG.TRAIN_BS / CFG.VALIDATION_BS while
# fit() itself ran with its default batch size, so validation_size // 64
# steps at the default batch size covered only part of the validation set.
# With in-memory arrays Keras works out the number of steps itself and also
# includes the final partial batch.
# NOTE(review): epochs is hard-coded to 10 here while CFG.EPOCHS is 20 —
# confirm which value is intended.
history = model.fit(x=(train_inputs, train_masks),
                    y=train_labels,
                    validation_data=((validation_inputs, validation_masks), validation_labels),
                    batch_size=CFG.TRAIN_BS,
                    validation_batch_size=CFG.VALIDATION_BS,
                    epochs=10)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Inference

In [13]:
# Give the test titles the same treatment as the training titles: replace
# NaN with a single space, clean, tokenize, pad to CFG.MAX_LEN_TITLE with id 0
# and build the attention masks.
test = test.fillna(value=" ")
test["cleaned_title"] = test["TITLE"].progress_apply(clean_title)

test_input_ids = tokenize_sentences(
    test['cleaned_title'],
    CFG.tokenizer,
    CFG.MAX_LEN_TITLE,
)
test_input_ids = pad_sequences(
    test_input_ids,
    maxlen=CFG.MAX_LEN_TITLE,
    dtype="long",
    value=0,
    padding="post",
    truncating="post",
)
test_attention_masks = create_attention_masks(test_input_ids)

# Predict for every test row in batches of 64.
preds = model.predict(
    (test_input_ids, test_attention_masks),
    batch_size=64,
    verbose=1,
)

100%|██████████| 110775/110775 [00:03<00:00, 30174.48it/s]
100%|██████████| 110775/110775 [01:12<00:00, 1518.40it/s]




In [14]:
# Take the highest-scoring class index for each row and map it back to the
# original BROWSE_NODE_ID value with the fitted label encoder.
sub = pd.DataFrame(
    {
        "PRODUCT_ID": test["PRODUCT_ID"].values,
        "BROWSE_NODE_ID": le.inverse_transform(np.argmax(preds, axis=1)),
    }
)

sub

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,4
1,2,55
2,3,55
3,4,98
4,5,55
...,...,...
110770,110771,55
110771,110772,209
110772,110773,55
110773,110774,75


In [15]:
# Write the predictions to disk; the DataFrame index is not written.
sub.to_csv(path_or_buf="submission.csv", index=False)