In [1]:
%config Completer.use_jedi = False

# Import Packages

In [2]:
import sys
sys.path.append('../input/timmmaster')
import timm

In [3]:
import math
import os
import numpy as np
import cv2
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import timm
import torch
from torch import nn 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F 
import albumentations
from albumentations.pytorch.transforms import ToTensorV2
from torch.optim import lr_scheduler
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn import metrics
from datetime import date

# Load Dataset


In [4]:
# Read the competition's training metadata CSV and preview its first rows.
train_df = pd.read_csv('../input/shopee-product-matching/train.csv')
train_df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069


In [5]:
# Read the competition's test metadata CSV and preview its first rows.
# The preview below shows no `label_group` column in this file.
test_df = pd.read_csv('../input/shopee-product-matching/test.csv')
test_df.head()

Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


# Load Cross Validation Folds

In [6]:
# Precomputed cross-validation folds: the training metadata with an added `fold`
# column (see the preview below). The file was shared publicly by another Kaggle
# user whose name the author could not recall; reusing it saved preparation time.
data = pd.read_csv('../input/crossvalidationfolds/folds.csv')
data.head(10)

Unnamed: 0,posting_id,image,image_phash,title,label_group,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,0
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,2
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,0
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,1
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,3
5,train_2464356923,0013e7355ffc5ff8fb1ccad3e42d92fe.jpg,bbd097a7870f4a50,CELANA WANITA (BB 45-84 KG)Harem wanita (bisa...,2660605217,0
6,train_1802986387,00144a49c56599d45354a1c28104c039.jpg,f815c9bb833ab4c8,Jubah anak size 1-12 thn,1835033137,0
7,train_1806152124,0014f61389cbaa687a58e38a97b6383d.jpg,eea7e1c0c04da33d,KULOT PLISKET SALUR /CANDY PLISKET /WISH KULOT...,1565741687,0
8,train_86570404,0019a3c6755a194cb2e2c12bfc63972e.jpg,ea9af4f483249972,"[LOGU] Tempelan kulkas magnet angka, tempelan ...",2359912463,2
9,train_831680791,001be52b2beec40ddc1d2d7fc7a68f08.jpg,e1ce953d1a70618f,BIG SALE SEPATU PANTOFEL KULIT KEREN KERJA KAN...,2630990665,0


In [7]:
# Number of distinct `label_group` values in the folds file, i.e. the number of
# target classes the classifier has to separate.
print(len(np.unique(data['label_group'])))

11014


# Configuration Options


In [8]:
# Folder holding the training images; used as the image root for the datasets built in run_training().
TRAIN_DIR = '../input/shopee-product-matching/train_images'
# Folder holding the test images (not referenced by the cells visible in this section).
TEST_DIR = '../input/shopee-product-matching/test_images'
# CSV with the precomputed fold assignments (the same file that is previewed above).
TRAIN_CSV = '../input/crossvalidationfolds/folds.csv'
# Prefix prepended to the file names of saved weights and checkpoints.
MODEL_PATH = './'


class CFG:
    """Static configuration namespace for the notebook; values are read as ``CFG.<name>``."""

    # --- data / reproducibility -------------------------------------------------
    seed = 123            # random seed value (not applied by the cells visible here)
    img_size = 512        # square side length handed to getAugmentation()
    classes = 11014       # matches the unique label_group count printed above
    fc_dim = 512          # default width of the classifier's embedding layer

    # --- training loop ----------------------------------------------------------
    epochs = 15
    batch_size = 32
    num_workers = 3       # only mentioned in a comment next to the DataLoader calls
    model_name = 'tf_efficientnet_b4'   # timm architecture identifier for this run
    # 'cuda' when a CUDA device is available, otherwise 'cpu'.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # --- learning-rate settings -------------------------------------------------
    # `lr_max` is derived from `batch_size`, so it must be defined after it.
    # The visible training code reads only `lr_start` (as the Adam learning rate).
    scheduler_params = dict(
        lr_start=1e-3,
        lr_max=1e-5 * batch_size,
        lr_min=1e-6,
        lr_ramp_ep=5,
        lr_sus_ep=0,
        lr_decay=0.8,
    )

    # Master switch: the training and plotting cells only run when this is True.
    isTraining = False
    

# Create Custom Dataset

In [9]:
class ShopeeDataset(Dataset):
    """Map-style dataset that yields ``(image, label)`` pairs from a metadata frame.

    Each row of ``df`` must provide an ``image`` field (file name relative to
    ``root_dir``) and a ``label_group`` field (the class label).

    Args:
        df: pandas DataFrame indexed positionally via ``.iloc``.
        root_dir: directory that contains the image files.
        isTraining: stored on the instance for callers' convenience; it does not
            change how items are loaded.
        transform: optional callable invoked as ``transform(image=ndarray)`` and
            expected to return a mapping with an ``'image'`` entry
            (the albumentations calling convention).
    """

    def __init__(self, df,root_dir, isTraining=False, transform=None):
        self.df = df
        self.transform = transform
        self.root_dir = root_dir
        self.isTraining = isTraining

    def __len__(self):
        """Return the number of rows (samples) in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            tuple: ``(image, label)`` where ``image`` is the RGB image after the
            optional transform and ``label`` is a ``torch.long`` scalar tensor.

        Raises:
            FileNotFoundError: if the image file does not exist.
            OSError: if the file exists but OpenCV cannot decode it.
        """
        row = self.df.iloc[idx]
        label = row.label_group
        image_path = os.path.join(self.root_dir, row.image)

        # cv2.imread signals failure by returning None rather than raising, which
        # would otherwise surface as an opaque cvtColor error. Fail early and name
        # the offending path instead.
        if not os.path.isfile(image_path):
            raise FileNotFoundError("Image file not found: {}".format(image_path))
        image = cv2.imread(image_path)
        if image is None:
            raise OSError("Image file could not be decoded: {}".format(image_path))

        # OpenCV loads images as BGR; convert to RGB before augmenting.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            aug = self.transform(image=image)
            image = aug['image']

        return image, torch.tensor(label).long()
            




# Create Data Augmentation for Training and Validation Data

In [10]:

def getAugmentation(IMG_SIZE, isTraining=False):
    """Build the albumentations pipeline for training or evaluation images.

    Both pipelines resize to ``IMG_SIZE x IMG_SIZE``, normalise with the
    mean/std values commonly used for ImageNet-pretrained backbones, and convert
    the result to a tensor. The training pipeline additionally inserts random
    flips, a random rotation and a random brightness change between the resize
    and the normalisation.

    Args:
        IMG_SIZE: target height and width in pixels.
        isTraining: when True, include the random augmentations.

    Returns:
        An ``albumentations.Compose`` wrapping the ordered list of transforms.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    # The resize always comes first.
    steps = [albumentations.Resize(IMG_SIZE, IMG_SIZE, always_apply=True)]

    # Random augmentations are only applied to training images.
    if isTraining:
        steps += [
            albumentations.HorizontalFlip(p=0.5),
            albumentations.VerticalFlip(p=0.5),
            albumentations.Rotate(limit=120, p=0.75),
            albumentations.RandomBrightness(limit=(0.09, 0.6), p=0.5),
        ]

    # Normalisation and tensor conversion close both pipelines.
    steps.append(albumentations.Normalize(mean = channel_mean, std = channel_std))
    steps.append(ToTensorV2(p=1.0))

    return albumentations.Compose(steps)

# Build Model

In [11]:
class ShopeeLabelGroupClassfier(nn.Module):
    """timm backbone with an optional embedding layer and a softmax classification head.

    The backbone's own classifier and global pooling are replaced by identities,
    so the backbone output is pooled here with adaptive average pooling. When
    ``use_fc`` is True, the pooled features pass through
    dropout -> linear -> batch-norm to produce an ``fc_dim``-wide embedding.

    Args:
        model_name: timm architecture name. The model must expose a
            ``classifier`` attribute with ``in_features``.
        loss_fn: head type; only ``'softmax'`` builds a classification layer.
            Any other value yields a feature extractor (``get_features`` only).
        classes: number of output classes for the softmax head.
        fc_dim: width of the embedding layer when ``use_fc`` is True.
        pretrained: forwarded to ``timm.create_model``.
        use_fc: whether to add the dropout/linear/batch-norm embedding block.
        isTraining: accepted for interface compatibility; currently unused.
    """

    def __init__(self,
                     model_name='tf_efficientnet_b0',
                     loss_fn='softmax',
                     classes = CFG.classes,
                     fc_dim = CFG.fc_dim,
                     pretrained=True,
                     use_fc=True,
                     isTraining=False
                ):

        super(ShopeeLabelGroupClassfier,self).__init__()

        # Backbone: keep the feature extractor, strip its classifier and pooling.
        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        in_features = self.backbone.classifier.in_features
        self.backbone.classifier = nn.Identity()
        self.backbone.global_pool = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.use_fc = use_fc
        self.loss_fn = loss_fn

        # Optional embedding block on top of the pooled backbone features.
        if self.use_fc:
            self.dropout = nn.Dropout(0.2)
            self.fc = nn.Linear(in_features,fc_dim )
            self.bn = nn.BatchNorm1d(fc_dim)
            in_features = fc_dim

        if self.loss_fn=='softmax':
            # Honour the `classes` argument rather than always using CFG.classes.
            self.final = nn.Linear(in_features, classes)

    def forward(self, image, label=None):
        """Return class logits for ``image``.

        Args:
            image: batch of input images, passed to the backbone.
            label: unused by the softmax head; kept (now optional) so existing
                ``model(images, targets)`` calls keep working.

        Raises:
            NotImplementedError: if the model was built with a ``loss_fn`` other
                than ``'softmax'`` and therefore has no classification head.
        """
        features = self.get_features(image)

        if self.loss_fn=='softmax':
            return self.final(features)

        raise NotImplementedError(
            "forward() only supports loss_fn='softmax' (got {!r}); "
            "use get_features() to obtain embeddings.".format(self.loss_fn)
        )

    def get_features(self,inp):
        """Return pooled backbone features, passed through the embedding block if enabled."""
        batch_dim = inp.shape[0]
        inp = self.backbone(inp)
        # Pool each feature map to 1x1, then flatten to (batch, features).
        inp = self.pooling(inp).view(batch_dim, -1)
        if self.use_fc:
            inp = self.dropout(inp)
            inp = self.fc(inp)
            inp = self.bn(inp)

        return inp
    
    
# shoppe_label_classfier = ShopeeLabelGroupClassfier()


# Build Training and Validation Loops

In [12]:
def training_one_epoch(epoch_num,model, dataloader,optimizer, scheduler, device, loss_criteria):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        epoch_num: index of the current epoch (not used inside the loop).
        model: module called as ``model(images, targets)``.
        dataloader: sized iterable of batches whose first two items are the
            images and the targets.
        optimizer: optimiser stepped once per batch.
        scheduler: learning-rate scheduler stepped once, after the last batch.
        device: device the batch tensors are moved to.
        loss_criteria: callable ``loss_criteria(output, targets)`` returning a
            loss that supports ``backward()`` and ``item()``.

    Returns:
        float: sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()  # enable training-mode behaviour in the model
    running_loss = 0.0
    progress = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, batch in progress:
        inputs, labels = batch[0], batch[1]

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass, loss, backward pass, then the parameter update.
        loss = loss_criteria(model(inputs, labels), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        progress.set_postfix({
            'loss' : '%.6f' %float(running_loss/(step+1)),
            'LR' : optimizer.param_groups[0]['lr'],
        })

    # The learning-rate schedule advances once per epoch, not per batch.
    scheduler.step()
    return running_loss / len(dataloader)
    
    
    
    
    


def validation_one_epoch(model, dataloader, epoch, device, loss_criteria):
    """Evaluate ``model`` on ``dataloader`` without gradients and return the mean batch loss.

    Args:
        model: module called as ``model(images, targets)``.
        dataloader: sized iterable of batches whose first two items are the
            images and the targets.
        epoch: zero-based epoch index, shown one-based in the progress bar.
        device: device the batch tensors are moved to.
        loss_criteria: callable ``loss_criteria(output, targets)`` returning a
            loss that supports ``item()``.

    Returns:
        float: sum of the per-batch losses divided by ``len(dataloader)``.
    """
    avgloss = 0.0
    # Switch the model to evaluation mode.
    model.eval()
    # Label the bar as validation (it previously read "Training Epoch { }") and
    # pass the total so it shows progress, like the training loop's bar.
    tq = tqdm(enumerate(dataloader),
              total=len(dataloader),
              desc="Validation Epoch {}".format(epoch + 1))

    with torch.no_grad():
        for idx, data in tq:
            images = data[0].to(device)
            targets = data[1].to(device)

            output = model(images,targets)
            loss = loss_criteria(output,targets)

            avgloss += loss.item()

            tq.set_postfix({'validation loss' : '%.6f' %float(avgloss/(idx+1))})

    return avgloss / len(dataloader)
        
        

In [13]:
import numpy as np 
def get_class_weights(data):
    """Return one sampling weight per row, inversely proportional to its class frequency.

    The class of a row is taken from the ``label_group`` column when the frame
    has one; otherwise from the fifth column (position 4), which is where the
    previous implementation always looked.

    Args:
        data: pandas DataFrame with one row per sample.

    Returns:
        torch.Tensor: float64 tensor of length ``len(data)`` where entry ``i``
        is ``1 / (number of rows sharing row i's label)``, in row order.
        Suitable as the ``weights`` argument of ``WeightedRandomSampler``.
    """
    # Prefer the column name so that column order does not silently change
    # which field is treated as the label.
    if 'label_group' in data.columns:
        labels = data['label_group']
    else:
        labels = data.iloc[:, 4]

    class_counts = labels.value_counts()
    # Kept from the original implementation: report the number of distinct classes.
    print(len(class_counts))

    # Look up each row's class count in one vectorised pass.
    class_sample_count = labels.map(class_counts).to_numpy(dtype=np.float64)
    weight = 1. / class_sample_count
    return torch.from_numpy(weight)

In [14]:
# data = pd.read_csv('../input/crossvalidationfolds/folds.csv')
    
# # label encoding
# labelencoder= LabelEncoder()
# data['label_group_original']=data['label_group']
# data['label_group'] = labelencoder.fit_transform(data['label_group'])
# #data['weights'] = data['label_group'].map(1/data['label_group'].value_counts())
# # create training_data and validation data initially not using k fold
# train_data = data[data['fold']!=0]

In [15]:


    
    
    

# def make_weights_for_balanced_classes(images, nclasses):
#     '''
#         Make a vector of weights for each image in the dataset, based
#         on class frequency. The returned vector of weights can be used
#         to create a WeightedRandomSampler for a DataLoader to have
#         class balancing when sampling for a training batch.
#             images - torchvisionDataset.imgs
#             nclasses - len(torchvisionDataset.classes)
#         https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/3
#     '''
#     count = [0] * nclasses
#     for item in images:
#         count[item[1]] += 1  # item is (img-data, label-id)
#     weight_per_class = [0.] * nclasses
#     N = float(sum(count))  # total number of images
#     for i in range(nclasses):
#         weight_per_class[i] = N / float(count[i])
#     weight = [0] * len(images)
#     for idx, val in enumerate(images):
#         weight[idx] = weight_per_class[val[1]]

#     return weight

In [16]:
def run_training():
    """Train the softmax label-group classifier, validating on fold 0.

    Steps:
      1. Load the fold file and encode ``label_group`` to contiguous class ids
         (the raw ids are kept in ``label_group_original``).
      2. Use every fold except 0 for training and fold 0 for validation.
      3. Draw training samples with a ``WeightedRandomSampler`` whose weights
         come from ``get_class_weights``.
      4. Train for ``CFG.epochs`` epochs with Adam and cosine annealing with
         warm restarts, saving the weights and a resumable checkpoint after
         every epoch (each save overwrites the file of the same name).

    Returns:
        tuple: ``(model, history)`` where ``history`` maps ``'train_loss'`` and
        ``'validation_loss'`` to lists with one value per epoch.
    """
    data = pd.read_csv(TRAIN_CSV)

    # Label encoding: map raw label_group ids to class indices.
    labelencoder = LabelEncoder()
    data['label_group_original'] = data['label_group']
    data['label_group'] = labelencoder.fit_transform(data['label_group'])

    # Single hold-out split for now (no full k-fold loop): fold 0 is validation.
    train_data = data[data['fold'] != 0]
    validation_data = data[data['fold'] == 0]

    # Per-sample weights for the sampler below.
    samples_weight = get_class_weights(train_data)
    print("samples_weight", len(samples_weight))

    train_aug = getAugmentation(CFG.img_size, isTraining=True)
    validation_aug = getAugmentation(CFG.img_size, isTraining=False)

    trainset = ShopeeDataset(train_data, TRAIN_DIR, isTraining=True, transform=train_aug)
    validset = ShopeeDataset(validation_data, TRAIN_DIR, isTraining=False, transform=validation_aug)
    print(len(data), len(samples_weight))
    print(len(trainset))

    sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, num_samples=len(samples_weight))

    # num_workers=CFG.num_workers is intentionally left out, as in the original cell.
    train_dataloader = DataLoader(trainset, batch_size=CFG.batch_size,
                                  drop_last=True, pin_memory=True, sampler=sampler)

    # Keep the final partial batch so every validation sample contributes to
    # the reported validation loss.
    validation_dataloader = DataLoader(validset, batch_size=CFG.batch_size,
                                       drop_last=False, pin_memory=True)

    loss_criteria = nn.CrossEntropyLoss()
    loss_criteria.to(CFG.device)

    # Build the architecture named in CFG so the trained network matches the
    # model name embedded in the saved file names. Previously the class default
    # (tf_efficientnet_b0) was trained while files were labelled CFG.model_name.
    # NOTE(review): a larger backbone needs more GPU memory; lower
    # CFG.batch_size if this runs out of memory.
    model = ShopeeLabelGroupClassfier(model_name=CFG.model_name,
                                      classes=CFG.classes,
                                      fc_dim=CFG.fc_dim)
    model.to(CFG.device)

    optimizer = torch.optim.Adam(model.parameters(), lr=CFG.scheduler_params['lr_start'])

    scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=7, T_mult=1, eta_min=1e-6, last_epoch=-1)

    history = {'train_loss':[],'validation_loss':[]}
    for epoch in range(CFG.epochs):

        avg_train_loss = training_one_epoch(epoch_num = epoch,
                                           model = model,
                                           dataloader = train_dataloader,
                                           optimizer = optimizer,
                                           scheduler = scheduler,
                                           device = CFG.device,
                                           loss_criteria = loss_criteria)

        avg_validation_loss = validation_one_epoch(model = model,
                                           dataloader = validation_dataloader,
                                           epoch = epoch,
                                           device = CFG.device,
                                           loss_criteria = loss_criteria)

        history['train_loss'].append(avg_train_loss)
        history['validation_loss'].append(avg_validation_loss)

        # Plain weights, for inference.
        torch.save(model.state_dict(), MODEL_PATH + str(date.today()) +'_softmax_512x512_{}.pt'.format(CFG.model_name))
        # Resumable checkpoint. The scheduler state is included so that a
        # resumed run continues the learning-rate schedule where it stopped.
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
            },
            MODEL_PATH + str(date.today()) +'_softmax_512x512_{}_checkpoints.pt'.format(CFG.model_name)
        )

    return model, history

In [17]:
# weight = 1. / class_sample_count
# samples_weight = np.array([weight[t] for t in y_train])
# samples_weight = torch.from_numpy(samples_weight)
# Now, that we have the weights for each of the classes, we can define a sampler.

# sampler = WeightedRandomSampler(samples_weight.type('torch.DoubleTensor'), len(samples_weight))
# Finally, we can use the sampler, while defining the Dataloader.

# train_dataloader = DataLoader(train_dataset, batch_size=4, sampler=sampler)

In [18]:
CFG.epochs

15

In [19]:
# Training only runs when CFG.isTraining is True; otherwise `history` stays None
# and `model` is not defined by this cell.
history=None
if CFG.isTraining:
    model, history = run_training()
    

In [20]:
if CFG.isTraining:
    # Build the one-based epoch axis from the recorded history instead of a
    # hard-coded 15, so the plot still works when CFG.epochs changes.
    epoch_lst = list(range(1, len(history['train_loss']) + 1))
    plt.plot(epoch_lst,history['train_loss'])

    plt.xlabel("Epoch number")
    plt.ylabel('Training Loss')
    plt.title('Training Loss SoftMax Loss Function')
    plt.show()

In [21]:
if CFG.isTraining:
    # Validation-loss curve; reuses the epoch axis (`epoch_lst`) built by the
    # training-loss plotting cell, so that cell must have run first.
    validation_curve = history['validation_loss']
    plt.plot(epoch_lst, validation_curve)
    plt.title('Validation Loss SoftMax Loss Function')
    plt.ylabel('Validation Loss')
    plt.xlabel("Epoch number")
    plt.show()