In [None]:
import numpy as np
import pandas as pd

import os
import sys
import hashlib
import time
import torch as th
import json
import gc
import pdb
from tqdm import tqdm
from datetime import datetime

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
 
import torch
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt


# Colab-only: mount Google Drive at /content/gdrive so the workspace paths used below resolve.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Configuration

In [None]:
# Root of the project on the mounted Google Drive; every data/model path below is derived from it.
WORKSPACEK = '/content/gdrive/MyDrive/Kaggle_HM/Kaggle_HM/'


# Central configuration dictionary read by the cells below.
args = {

    # Dataset
    "train_data"    : WORKSPACEK + 'data/item_cus_train.zip',
    "predict_data"   : WORKSPACEK + 'data/item_cus_predict.zip',
    "submit"      : './sample_submission_cb_v1.csv',

    # Fraction of samples held out as the test split, and the seed used for
    # the split (the same value also seeds torch before training).
    "test_data_proportion"   : 0.1,
    "test_splite_random"    : 42,

    # Number of leading rows kept while developing.
    "use_data"     :  20000, 

    # Model

    # Derived from WORKSPACEK instead of repeating the absolute path, so moving
    # the workspace only requires changing one line.
    "model_name"         : WORKSPACEK + 'model/GRU_2.pt',

    # Train
    
    "epoch"           : 50,
    "batch_size"        : 256,
    "lr"            : 1e-4,

    # Log
    "log_loss_period"       : 10,
    "evaluate_period"       : 100,

}

# os.path.join avoids the doubled slash that plain concatenation produced
# (WORKSPACEK already ends with '/').
NOTEBOOK_DIR = os.path.join(WORKSPACEK, 'notebook')
os.chdir(NOTEBOOK_DIR)
# Guarded so that re-running this cell does not keep growing sys.path.
if NOTEBOOK_DIR not in sys.path:
    sys.path.append(NOTEBOOK_DIR)

DEVICE = 'cuda' if th.cuda.is_available() else 'cpu'

# Data Preparation

### Utility

In [None]:
def generate_text_file(length=1_000_000, ncols=20, filename='large_text_file.csv'):
    """Write a CSV of uniform random floats, useful for exercising the loaders.

    Args:
        length: Number of rows. Integral floats such as ``1e6`` are accepted
            and converted to ``int`` (NumPy rejects float shapes).
        ncols: Number of columns.
        filename: Destination path; defaults to ``large_text_file.csv`` in the
            current working directory.
    """
    # np.random.random requires integer dimensions; the previous float default
    # (1e6) made the call fail with a TypeError.
    shape = (int(length), int(ncols))
    data = np.random.random(shape)
    np.savetxt(filename, data, delimiter=',')

def iter_loadtxt(filename, delimiter=',', skiprows=0, dtype=float):
    """Stream a delimited text file into a 2-D NumPy array.

    Values are yielded one at a time into ``np.fromiter`` so the file is never
    held in memory as a list of Python objects.

    Args:
        filename: Path of the text file to read.
        delimiter: Field separator.
        skiprows: Number of leading lines to skip (e.g. a header).
        dtype: Callable used to convert each field; also the array dtype.

    Returns:
        Array of shape ``(n_rows, n_cols)``. A file with no data rows yields
        an empty ``(0, 0)`` array.

    Raises:
        ValueError: If rows have differing numbers of fields, or a field
            cannot be converted by ``dtype``.
    """
    row_length = None

    def iter_func():
        nonlocal row_length
        with open(filename, 'r') as infile:
            for _ in range(skiprows):
                # A bare next() would raise StopIteration inside the generator
                # (a RuntimeError under PEP 479) when the file is too short.
                if next(infile, None) is None:
                    return
            for line_no, line in enumerate(infile, start=skiprows + 1):
                line = line.rstrip()
                if not line:
                    # Blank lines (typically a trailing newline) carry no data.
                    continue
                fields = line.split(delimiter)
                if row_length is None:
                    row_length = len(fields)
                elif len(fields) != row_length:
                    raise ValueError(
                        f"{filename}: line {line_no} has {len(fields)} fields, "
                        f"expected {row_length}"
                    )
                for item in fields:
                    yield dtype(item)

    data = np.fromiter(iter_func(), dtype=dtype)
    if row_length is None:
        # No data rows at all.
        iter_loadtxt.rowlength = 0
        return data.reshape((0, 0))

    # Kept for backward compatibility: earlier versions exposed the row width
    # as a function attribute.
    iter_loadtxt.rowlength = row_length
    return data.reshape((-1, row_length))


In [None]:
# Load the training table: every column except the last is a numeric feature,
# the last column holds the labels of a row joined with '#'.
train_frame = pd.read_csv(args["train_data"])
train_values = train_frame.values

X = train_values[:, :-1].copy().astype(np.float32)
y_true = [label_string.split("#") for label_string in train_values[:, -1]]

# Drop the raw frame and its array right away so the memory can be reclaimed.
del train_frame, train_values
gc.collect(generation=2)

0

## Use a subset of the data for development

In [None]:
# Keep only the first args["use_data"] samples so development runs stay fast.
X, y_true = X[:args["use_data"]], y_true[:args["use_data"]]

## Fit MultiLabelBinarizer on y

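We cannot transform all of y into a multi-label indicator matrix up front because the result would be too large, so the binarizer is only fitted here and each sample's labels are transformed when the sample is loaded.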

In [None]:
# Learn the label vocabulary from every label list in y_true. Only the fit happens here;
# individual samples are transformed later, one at a time.
mlb = MultiLabelBinarizer()
mlb.fit(y_true)

MultiLabelBinarizer()

In [None]:
# Report the size of the learned label vocabulary, then sanity-check one sample:
# the sum of its binarized row is the number of labels set for that sample.
print(f" there are {len(mlb.classes_)} labels")
mlb.transform([y_true[0]]).sum()

 there are 17382 labels


4

In [None]:
# Train / test split: hold out args["test_data_proportion"] of the samples,
# shuffled with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_true,
    test_size=args["test_data_proportion"],
    random_state=args["test_splite_random"],
    shuffle=True,
)

# Network input width (feature columns) and output width (label vocabulary).
FEATURE_SIZE = X.shape[1]
LABLE_SIZE = len(mlb.classes_)


print(f"Train include: {len(X_train)} recoreds.")
print(f"Test include: {len(X_test)} recoreds.")

# X may be a view on a larger array, in which case sys.getsizeof only measures
# the small array header; nbytes reports the size of the data itself.
print(f"X memory size: {X.nbytes} bytes.")
# For a list, getsizeof covers the list container only, not the label strings.
print(f"y memory size: {sys.getsizeof(y_true)} bytes (list container only).")

print(f"Feature size: {FEATURE_SIZE}")
print(f"Output size: {LABLE_SIZE}")

Train include: 18000 recoreds.
Test include: 2000 recoreds.
X memory size: 120 recoreds.
y memory size: 160072 recoreds.
Feature size: 313
Output size: 17382


In [None]:
from torch.utils.data import DataLoader, Dataset

class HMDataset(Dataset):
    """Dataset pairing feature rows with lazily binarized label lists.

    Labels are converted to a multi-hot row only when a sample is requested,
    so the full label matrix is never materialized.

    Args:
        x: Indexable collection of feature rows.
        y: Indexable collection of label lists, aligned with ``x``.
        binarizer: Optional object with a ``transform`` method (for example a
            fitted ``MultiLabelBinarizer``). When omitted, the module-level
            ``mlb`` is looked up at access time.
    """

    def __init__(self, x, y, binarizer=None):
        self.X = x
        self.y = y
        self.binarizer = binarizer

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, labels)`` for the sample at ``idx``.

        ``transform`` is called on a one-element list, so the label array keeps
        the leading axis of length 1 that the binarizer returns; consumers are
        expected to drop it.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Resolve the global lazily so the class can be defined before `mlb`.
        binarizer = self.binarizer if self.binarizer is not None else mlb
        return self.X[idx], binarizer.transform([self.y[idx]])


# Reshuffle the training samples every epoch so mini-batches differ between
# epochs; evaluation order does not matter, so the test loader stays sequential.
train_loader = DataLoader(HMDataset(X_train, y_train), batch_size=args["batch_size"], shuffle=True)
test_loader = DataLoader(HMDataset(X_test, y_test), batch_size=args["batch_size"])

# Model

In [None]:


class ItemNet(nn.Module):
    """Three-layer fully connected network for multi-label prediction.

    Args:
        feature_size: Width of the input vector. Defaults to the module-level
            ``FEATURE_SIZE`` when omitted.
        label_size: Number of output labels. Defaults to the module-level
            ``LABLE_SIZE`` when omitted.

    The forward pass ends with a sigmoid, so the outputs are per-label
    probabilities in (0, 1), not logits. A loss that applies its own sigmoid
    (such as ``nn.BCEWithLogitsLoss``) would squash them a second time.
    """

    def __init__(self, feature_size=None, label_size=None):
        super().__init__()
        # Fall back to the notebook globals so the existing ItemNet() call keeps working.
        if feature_size is None:
            feature_size = FEATURE_SIZE
        if label_size is None:
            label_size = LABLE_SIZE

        self.fc1 = nn.Linear(feature_size, 120)
        self.fc2 = nn.Linear(120, 64)
        self.fc3 = nn.Linear(64, label_size)

    def forward(self, x):
        """Map a batch of feature rows to per-label probabilities."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return torch.sigmoid(x)


## Utility

In [None]:
def get_accuracy(model, data_loader, device):
    '''
    Compute the fraction of true labels that the model predicts, over the
    entire data_loader.

    A label counts as predicted when its probability is above 0.5. Only
    positive labels enter the score (hits / number of positive labels), so
    this is a recall-style measure rather than element-wise accuracy.

    Returns a 0-dim float tensor on `device`; it is 0 when the loader is empty
    or contains no positive labels. The model is left in eval mode.
    '''
    # Tensor accumulators keep the return type stable even for an empty loader.
    hits = torch.zeros((), device=device)
    positives = torch.zeros((), device=device)

    # Set model to eval mode: disables dropout and similar train-only layers.
    model.eval()

    with torch.no_grad():
        for X, y_true in data_loader:
            X = X.to(device)
            y_prob = model(X)

            # Labels may carry an extra axis of length 1; reshape to the
            # prediction shape. A bare squeeze() would also drop the batch
            # axis when the batch holds a single sample.
            y_true = y_true.to(device).reshape(y_prob.shape)

            predicted = y_prob > 0.5
            hits += (predicted & (y_true == 1)).sum()
            positives += y_true.sum()

    if positives.item() == 0:
        # Avoid a NaN from 0 / 0 when there is nothing to score.
        return torch.zeros((), device=device)

    return hits / positives

def _masked_bce_loss(criterion, y_hat, y_true):
    '''
    Average of a positives-only and a negatives-only loss term.

    Each term zeroes the predictions outside its mask and pairs them with a
    zero target, so that under a probability-based criterion masked-out
    entries add nothing. (Assumes `criterion` takes probabilities, e.g.
    nn.BCELoss, matching a model that ends in a sigmoid.)
    '''
    target = y_true.to(torch.float32)

    # Positive term: non-positive entries become (prediction 0, target 0).
    pos_loss = criterion(y_hat * target, target)

    # Negative term: positive entries become (prediction 0, target 0). Using
    # the original labels as target here scored those entries against 1.
    neg_loss = criterion(y_hat * (y_true == 0), torch.zeros_like(target))

    return (pos_loss + neg_loss) / 2


def validate(valid_loader, model, criterion, device):
    '''
    Function for the validation step of the training loop.

    Runs the model in eval mode without gradient tracking and returns
    (model, epoch_loss), where epoch_loss is the per-sample average of the
    batch losses (0.0 when the loader yields no samples).
    '''
    model.eval()
    running_loss = 0.0
    n_seen = 0

    # Gradients are never needed for validation; disable them here so the
    # function is safe regardless of how it is called.
    with torch.no_grad():
        for X, y_true in valid_loader:

            X = X.to(device)

            # Forward pass and record loss
            y_hat = model(X)

            # Align labels with the predictions; a bare squeeze() would also
            # drop the batch axis for a batch of one.
            y_true = y_true.to(device).reshape(y_hat.shape)

            loss = _masked_bce_loss(criterion, y_hat, y_true)

            running_loss += loss.item() * X.size(0)
            n_seen += X.size(0)

    # Divide by the samples actually seen; max() guards the empty case.
    epoch_loss = running_loss / max(n_seen, 1)

    return model, epoch_loss

def train(train_loader, model, criterion, optimizer, device):
    '''
    Function for the training step of the training loop.

    Runs one pass over train_loader, updating the model after every batch, and
    returns (model, optimizer, epoch_loss), where epoch_loss is the per-sample
    average of the batch losses (0.0 when the loader yields no samples).
    '''
    model.train()
    running_loss = 0.0
    n_seen = 0

    for X, y_true in tqdm(train_loader):

        # PyTorch accumulates gradients, so zero them before each step
        optimizer.zero_grad()

        X = X.to(device)

        # Forward pass
        y_hat = model(X)

        # Align labels with the predictions; a bare squeeze() would also
        # drop the batch axis for a batch of one.
        y_true = y_true.to(device).reshape(y_hat.shape)

        loss = _masked_bce_loss(criterion, y_hat, y_true)

        running_loss += loss.item() * X.size(0)
        n_seen += X.size(0)

        # Backward pass
        loss.backward()
        optimizer.step()

    # Divide by the samples actually seen; max() guards the empty case.
    epoch_loss = running_loss / max(n_seen, 1)
    return model, optimizer, epoch_loss

def plot_losses_accuracy(train_losses, valid_losses, train_accu, valid_accu):
    '''
    Plot training/validation loss and accuracy per epoch on one figure.

    Losses are drawn on the left y-axis and accuracies on a twin right y-axis.
    The selected Matplotlib style stays active after the call.
    '''
    # Newer Matplotlib releases renamed 'seaborn' to 'seaborn-v0_8' and later
    # removed the old name; use whichever this installation provides.
    style_name = next(
        (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
        'default',
    )
    plt.style.use(style_name)

    train_losses = np.array(train_losses) 
    valid_losses = np.array(valid_losses)

    fig, ax = plt.subplots(figsize = (8, 4.5))

    ax.plot(train_losses, color='blue', label='Training loss') 
    ax.plot(valid_losses, color='red', label='Validation loss')
    ax.set(title="Loss over epochs", 
            xlabel='Epoch',
            ylabel='Loss') 
    
    # Accuracy lives on a different scale, so it gets its own y-axis.
    ax2 = ax.twinx()
    ax2.plot(train_accu, color='orange', label='Training accuracy') 
    ax2.plot(valid_accu, color='green', label='Validation accuracy')

    ax.legend(loc='upper right')
    ax2.legend(loc='lower left')

    # Hide the twin axis grid so only one grid is drawn.
    ax2.grid(None)

    fig.show()
    
    # change the plot style to default
    #plt.style.use('default')


def training_loop(model, criterion, optimizer, train_loader, valid_loader,
                  epochs=None, device=None, save_path=None):
    '''
    Function defining the entire training loop.

    Each epoch runs one training pass and one validation pass, measures the
    accuracy on both loaders and prints a one-line summary.

    Args:
        model, criterion, optimizer: objects handed to train() / validate().
        train_loader, valid_loader: batch iterables for training / validation.
        epochs: number of epochs; defaults to args["epoch"].
        device: device used for the batches; defaults to DEVICE.
        save_path: optional file path. When given, the model's state_dict is
            saved every time the validation loss reaches a new minimum.

    Returns:
        (model, optimizer, train_losses, valid_losses, train_accuracy,
        valid_accuracy); the last four are per-epoch lists.
    '''
    if epochs is None:
        epochs = args["epoch"]
    if device is None:
        device = DEVICE

    # set objects for storing metrics
    best_loss = float('inf')
    train_losses, train_accuracy = [], []
    valid_losses, valid_accuracy = [], []
 
    train_start = time.time()
 
    # Train model
    for epoch in range(epochs):

      # Start timing
      epoch_start = time.time()

      # training
      model, optimizer, train_loss = train(train_loader, model, criterion, optimizer, device)
      train_losses.append(train_loss)

      # validation
      with torch.no_grad():
        model, valid_loss = validate(valid_loader, model, criterion, device)
        valid_losses.append(valid_loss)

      # Track the best validation loss; checkpoint only when asked to.
      if valid_loss < best_loss:
        best_loss = valid_loss
        if save_path is not None:
          torch.save(model.state_dict(), save_path)

      train_acc = get_accuracy(model, train_loader, device=device).to('cpu')
      valid_acc = get_accuracy(model, valid_loader, device=device).to('cpu')

      train_accuracy.append(train_acc)
      valid_accuracy.append(valid_acc)
          
      epoch_end = time.time()
      print(f'{datetime.now().time().replace(microsecond=0)} --- '
            f'Epoch: {epoch}\t'
            f'Train loss: {train_loss:.4f}\t'
            f'Valid loss: {valid_loss:.4f}\t'
            f'Train accuracy: {100 * train_acc:.2f}\t'
            f'Valid accuracy: {100 * valid_acc:.2f}\t'
            f'Elapse: {(epoch_end - epoch_start):.2f} seconds\t'
            )
      
    # Print total training time
    train_end = time.time()
    print("="*50)
    print(f'Total Elapse: {(train_end - train_start):.2f} seconds\t')

    return model, optimizer, train_losses, valid_losses, train_accuracy, valid_accuracy

# Train

In [None]:
# Set fixed random number seed
torch.manual_seed(args["test_splite_random"])

model = ItemNet().to(DEVICE)

# Define optimizer and loss function.
opt = torch.optim.Adam(model.parameters(), lr=args["lr"])
# ItemNet.forward already ends with a sigmoid, so the loss must take
# probabilities. BCEWithLogitsLoss would apply a second sigmoid, squeezing
# every prediction into [0.5, 0.73] and flattening the gradients.
criterion = nn.BCELoss()

model, optimizer, train_losses, valid_losses, train_accu, valid_accu = training_loop(model, criterion, opt, train_loader, test_loader) 
plot_losses_accuracy(train_losses, valid_losses, train_accu, valid_accu)