# Import Data

In [1]:
# Colab-only: open the browser file picker and upload the CSVs into the
# runtime's working directory (the recorded output shows train.csv and test.csv).
from google.colab import files
uploaded = files.upload()
# List the working directory to confirm the uploads landed.
%ls

Saving test.csv to test.csv
Saving train.csv to train.csv
[0m[01;34msample_data[0m/  test.csv  train.csv


# Import Libraries

In [62]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow import feature_column

import time
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from torch.optim import Adam, SGD
import math
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
import torch
from torch.utils.data import Dataset, DataLoader

# Load data

In [77]:
def load_data(train_file="train.csv", test_file="test.csv"):
  """Read the training and test CSV files into DataFrames.

  Args:
    train_file: Path to the training CSV. Defaults to "train.csv" in the
      working directory, which is what the notebook originally hard-coded.
    test_file: Path to the test CSV. Defaults to "test.csv".

  Returns:
    A ``(df_train, df_test)`` tuple of pandas DataFrames.
  """
  df_train = pd.read_csv(train_file)
  df_test = pd.read_csv(test_file)

  return df_train, df_test

In [78]:
df_train, df_test = load_data()

# Target column, taken straight from the training frame.
y_train = df_train['category']

# The noisy text description is not used for this part; the test frame
# additionally drops its 'id' column.
train_data = df_train.drop(columns=['noisyTextDescription'])
test_data = df_test.drop(columns=['id', 'noisyTextDescription'])

In [21]:
categorical_columns = ['gender', 'baseColour', 'season', 'usage']
y_columns = ['category']

# For each column keep the raw Series, its distinct values and their count.
# The distinct values are sorted so that list positions are stable from run to
# run (iteration order of a set of strings is arbitrary); this matters wherever
# a list is indexed by position, e.g. all_categorys in bin_to_category.
# The trailing counts are the values noted by the original author.
train_categorys = df_train["category"]
all_categorys = sorted(set(train_categorys))
n_categorys = len(all_categorys) # 27

train_genders = df_train["gender"]
all_genders = sorted(set(train_genders))
n_genders = len(all_genders) # 5

train_baseColour = df_train["baseColour"]
all_baseColour = sorted(set(train_baseColour))
n_baseColour = len(all_baseColour) # 46

train_seasons = df_train["season"]
all_seasons = sorted(set(train_seasons))
n_seasons = len(all_seasons) # 4

train_usages = df_train["usage"]
all_usages = sorted(set(train_usages))
n_usages = len(all_usages) # 7

### Use label encoder to preprocess tabular data

In [51]:
def _fit_and_encode(column):
    """Fit a LabelEncoder on a raw training column and encode train/test with it.

    The encoder is fitted on, and the codes are computed from, the untouched
    ``df_train`` / ``df_test`` frames; only the results are written into
    ``train_data`` / ``test_data``. Re-running the cell therefore yields the
    same encoders instead of refitting them on already-encoded integers.

    Returns the fitted encoder.
    """
    encoder = LabelEncoder()
    encoder.fit(df_train[column])
    train_data[column] = encoder.transform(df_train[column])
    # transform() raises ValueError if the test column holds a value that was
    # never seen in training.
    test_data[column] = encoder.transform(df_test[column])
    return encoder


GENDER_ENCODER = _fit_and_encode('gender')
SEASON_ENCODER = _fit_and_encode('season')
COLOR_ENCODER = _fit_and_encode('baseColour')
USAGE_ENCODER = _fit_and_encode('usage')

# The target exists only in the training frame.
CATEGORY_ENCODER = LabelEncoder()
CATEGORY_ENCODER.fit(df_train['category'])
train_data['category'] = CATEGORY_ENCODER.transform(df_train['category'])
y_train = CATEGORY_ENCODER.transform(df_train['category'])

In [81]:
# Hold out 20% of the training rows for validation; the seed is fixed so the
# split is repeatable.
train_df, valid_df = train_test_split(
    train_data,
    test_size=0.2,
    random_state=11,
)

In [82]:
# Renumber both splits from 0 and discard the old index. The frames are
# reassigned rather than mutated in place, so the split results themselves are
# never modified.
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

In [54]:
class Table_Dataset(Dataset):
    """Map-style dataset over the tabular feature columns.

    Each item is ``(features, label)`` where ``features`` is a float32 tensor
    holding, in order, the row's gender, season, baseColour and usage values,
    and ``label`` is the row's ``category`` value.
    """

    def __init__(self, df):
        """Copy the needed columns of ``df`` into plain Python lists.

        Args:
            df: DataFrame with columns 'id', 'gender', 'season', 'baseColour',
                'usage' and 'category'. The four feature columns must already
                be numeric, since they are packed into a float tensor.
        """
        self.IDs = list(df['id'])
        self.genders = list(df['gender'])
        self.seasons = list(df['season'])
        self.colors = list(df['baseColour'])
        self.usages = list(df['usage'])
        self.labels = list(df['category'])

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.IDs)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair for row ``idx``."""
        label = self.labels[idx]
        # Explicit float32 matches what the legacy torch.Tensor(...) constructor
        # produced, without relying on the global default tensor type.
        features = torch.tensor(
            [self.genders[idx], self.seasons[idx], self.colors[idx], self.usages[idx]],
            dtype=torch.float32,
        )

        return features, label

In [65]:
# Training batches are reshuffled every epoch.
train_dataset = Table_Dataset(train_df)
train_iterator = DataLoader(train_dataset, batch_size=512, shuffle=True)

# Validation is evaluation-only, so a fixed order is used: shuffling adds
# nothing here and would change which rows fall into the short final batch
# from one epoch to the next.
valid_dataset = Table_Dataset(valid_df)
valid_iterator = DataLoader(valid_dataset, batch_size=512, shuffle=False)

# Create NN model and related methods

In [87]:
class NN(nn.Module):
    """Fully connected classifier with a skip connection from the raw input.

    The input goes through four Linear+ReLU layers (fc1, fc4, fc5, fc6). Their
    output is concatenated with the original input, passed through fc7+ReLU,
    and projected to ``output_size`` logits by fc2.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Build the layers.

        Args:
            input_size: Number of input features per sample.
            hidden_size: Width of every hidden layer. The argument is honoured;
                it was previously overwritten with a constant 27.
            output_size: Number of output logits.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)

        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.fc6 = nn.Linear(hidden_size, hidden_size)
        # fc7 consumes the hidden activations concatenated with the raw input,
        # so its input width depends on input_size (was a hard-coded +4).
        self.fc7 = nn.Linear(hidden_size + input_size, hidden_size)

        # Defined but not applied in forward(); kept so the attribute remains
        # available to existing code.
        self.dropout = nn.Dropout(0.25)

        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute logits for a batch ``x`` of shape (batch, input_size)."""
        skip = x

        x = self.relu(self.fc1(x))
        x = self.relu(self.fc4(x))
        x = self.relu(self.fc5(x))
        x = self.relu(self.fc6(x))

        # Skip connection: re-attach the raw features along the feature axis.
        x = torch.cat((skip, x), dim=1)
        x = self.relu(self.fc7(x))

        x = self.fc2(x)
        # Squeeze only the last axis (a no-op unless output_size == 1). A bare
        # squeeze() would also drop the batch axis for a batch of one sample.
        return x.squeeze(-1)

In [88]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Both timestamps are in seconds. Returns ``(minutes, seconds)`` as ints,
    each truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

def calculate_accuracy(y_pred, y):
    """Fraction of rows whose highest-scoring column equals the target.

    ``y_pred`` is indexed along dim 1 for the arg-max; ``y`` is reshaped to
    match the predictions before comparison. Returns a 0-dim float tensor.
    """
    predicted = torch.argmax(y_pred, dim=1, keepdim=True)
    n_correct = (predicted == y.view_as(predicted)).sum()
    return n_correct.float() / y.shape[0]

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: Module called as ``model(feature)``; put into train mode here.
        iterator: Iterable yielding ``(feature, label)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(y_pred, label)``. It is assumed
            to return the mean loss over the batch, as CrossEntropyLoss does
            by default.

    Returns:
        ``(mean_loss, mean_accuracy)`` over all samples seen. Batches are
        weighted by their size, so a short final batch does not count as much
        as a full one.
    """
    total_loss = 0.0
    total_acc = 0.0
    n_samples = 0

    model.train()

    for feature, label in iterator:
        batch_size = label.shape[0]

        optimizer.zero_grad()

        y_pred = model(feature)
        loss = criterion(y_pred, label)
        acc = calculate_accuracy(y_pred, label)

        loss.backward()
        optimizer.step()

        # Convert per-batch means back into sums so the final division yields
        # a true per-sample average.
        total_loss += loss.item() * batch_size
        total_acc += acc.item() * batch_size
        n_samples += batch_size

    return total_loss / n_samples, total_acc / n_samples

def evaluate_nn(model, iterator, criterion):
    """Evaluate ``model`` over ``iterator`` without updating its weights.

    Args:
        model: Module called as ``model(feature)``; put into eval mode here.
        iterator: Iterable yielding ``(feature, label)`` batches.
        criterion: Loss called as ``criterion(y_pred, label)``. It is assumed
            to return the mean loss over the batch, as CrossEntropyLoss does
            by default.

    Returns:
        ``(mean_loss, mean_accuracy)`` over all samples seen. Batches are
        weighted by their size, so the result does not depend on how the rows
        happen to be split into batches.
    """
    total_loss = 0.0
    total_acc = 0.0
    n_samples = 0

    model.eval()

    # No gradients are needed for evaluation.
    with torch.no_grad():

        for feature, label in iterator:
            batch_size = label.shape[0]

            y_pred = model(feature)
            loss = criterion(y_pred, label)
            acc = calculate_accuracy(y_pred, label)

            # Convert per-batch means back into sums so the final division
            # yields a true per-sample average.
            total_loss += loss.item() * batch_size
            total_acc += acc.item() * batch_size
            n_samples += batch_size

    return total_loss / n_samples, total_acc / n_samples

In [89]:
# Seed torch's global RNG so weight initialisation is repeatable across runs.
torch.manual_seed(11)

criterion = CrossEntropyLoss()

# 4 input features, 27 output classes. 27 hidden units is the width the
# recorded training run actually used.
model = NN(input_size=4, hidden_size=27, output_size=27)

optimizer = Adam(model.parameters(), lr=0.001)

In [90]:
# Per-epoch metric histories, one entry appended per epoch.
train_accuracy_list, train_loss_list = [], []
valid_acc_list, valid_loss_list = [], []

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(100):

    start_time = time.monotonic()

    # One pass over the training data, then one over the validation data.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate_nn(model, valid_iterator, criterion)

    # Checkpoint whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model, 'linear-model.pt')

    # Record this epoch's metrics.
    train_accuracy_list.append(train_acc)
    train_loss_list.append(train_loss)
    valid_acc_list.append(valid_acc)
    valid_loss_list.append(valid_loss)

    # Report timing and metrics for the epoch.
    end_time = time.monotonic()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 3.248 | Train Acc: 17.03%
	 Val. Loss: 2.647 |  Val. Acc: 33.19%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 2.516 | Train Acc: 33.07%
	 Val. Loss: 2.374 |  Val. Acc: 32.69%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 2.352 | Train Acc: 34.88%
	 Val. Loss: 2.297 |  Val. Acc: 36.15%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 2.297 | Train Acc: 35.85%
	 Val. Loss: 2.233 |  Val. Acc: 37.35%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 2.229 | Train Acc: 36.68%
	 Val. Loss: 2.165 |  Val. Acc: 40.09%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 2.168 | Train Acc: 38.44%
	 Val. Loss: 2.109 |  Val. Acc: 40.24%
Epoch: 07 | Epoch Time: 0m 0s
	Train Loss: 2.122 | Train Acc: 40.02%
	 Val. Loss: 2.074 |  Val. Acc: 42.38%
Epoch: 08 | Epoch Time: 0m 0s
	Train Loss: 2.080 | Train Acc: 42.18%
	 Val. Loss: 2.029 |  Val. Acc: 43.17%
Epoch: 09 | Epoch Time: 0m 0s
	Train Loss: 2.039 | Train Acc: 42.88%
	 Val. Loss: 1.992 |  Val. Acc: 43.86%
Epoch: 10 | Epoch Time: 0m 0

# One-hot encoding

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): `data_onehot` and `y_labels` are not defined anywhere in the
# visible part of this notebook; confirm the cell that builds them still exists.
# 70/30 split, stratified on the labels, with a fixed seed.
X_train, X_valid, label_train, label_valid = train_test_split(
    data_onehot,
    y_labels,
    test_size=0.3,
    random_state=0,
    stratify=y_labels,
)

In [None]:
def bin_to_category(predict_data):
  """Map each prediction row to a category name.

  For every position ``i`` in ``predict_data``, takes the arg-max of
  ``predict_data[i]`` and looks that position up in the module-level
  ``all_categorys`` list. Returns the resulting names as a list.
  """
  return [
      all_categorys[np.argmax(predict_data[position])]
      for position in range(len(predict_data))
  ]

In [None]:
# NOTE(review): `y_pred` is not defined in the visible part of this notebook;
# confirm the cell that produces it still exists.
test_id = df_test['id']
y_res = bin_to_category(y_pred)

# Walk the rows that are actually present instead of a hard-coded count
# (previously 21628), and fail loudly if predictions and ids do not line up.
if len(y_res) != len(test_id):
  raise ValueError(
      f"got {len(y_res)} predictions for {len(test_id)} test ids"
  )

# One [id, category] row per test sample.
LGBM_data = [[int(row_id), category] for row_id, category in zip(test_id, y_res)]

df = pd.DataFrame(LGBM_data, columns=['id', 'category'])
df.to_csv('LGBM.csv', index=False)