<a href="https://colab.research.google.com/github/dongr0510/deep-learning-coursera/blob/master/DL_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt

In [0]:
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random
import math

from pathlib import Path

In [0]:
import xgboost as xgb
from sklearn.metrics import r2_score

In [0]:
# Root working directory: used as the dataset cache directory by split_dataset
# and as the parent of the `models/` folder that train_model writes checkpoints to.
PATH = Path('/content')

In [7]:
pip install pmlb

Collecting pmlb
  Downloading https://files.pythonhosted.org/packages/0d/38/bb7bf1785add978af1e6c2bd27926405959c71ebaeefa6ff610bf53d8b25/pmlb-0.3.tar.gz
Building wheels for collected packages: pmlb
  Building wheel for pmlb (setup.py) ... [?25l[?25hdone
  Created wheel for pmlb: filename=pmlb-0.3-cp36-none-any.whl size=11918 sha256=6c6970915bc9a681acd5ae9ce26ded390d28fbde4fb9993334ed66e18a49e38a
  Stored in directory: /root/.cache/pip/wheels/5a/c7/a4/be59e63a2cb56f6c58f068305b95c212e0aac1a930fd77d6b0
Successfully built pmlb
Installing collected packages: pmlb
Successfully installed pmlb-0.3


In [0]:
from pmlb import fetch_data, regression_dataset_names

In [0]:
# Registry mapping dataset name -> (number of rows, number of columns).
list_dataset = dict()
X, y = fetch_data('1595_poker', return_X_y=True, local_cache_dir='/content')
list_dataset.update({'1595_poker': (X.shape[0], X.shape[1])})

In [15]:
# Display the dataset name -> (rows, columns) registry.
list_dataset

{'1595_poker': (1025010, 10)}

In [0]:
# Names of all registered datasets, in insertion order.
datasets = [name for name in list_dataset]

In [17]:
# Select the first registered dataset; `dataset` is also read as a global by
# train_model when it builds checkpoint file names.
dataset = datasets[0]
dataset

'1595_poker'

# Split train / valid / test

In [0]:
def split_dataset(dataset, seed=1):
    """Fetch a dataset by name and split it 70% / 15% / 15% into train / valid / test.

    Args:
        dataset: dataset name passed to `fetch_data` (cached under `PATH`).
        seed: random state used for both shuffled splits.

    Returns:
        (train_X, valid_X, test_X, train_y, valid_y, test_y)
    """
    features, target = fetch_data(dataset, return_X_y=True, local_cache_dir=PATH)

    # Step 1: keep 70% for training, set 30% aside as a hold-out pool.
    train_X, holdout_X, train_y, holdout_y = train_test_split(
        features, target, test_size=0.3, random_state=seed
    )

    # Step 2: cut the hold-out pool in half -> validation and test.
    valid_X, test_X, valid_y, test_y = train_test_split(
        holdout_X, holdout_y, test_size=0.5, random_state=seed
    )

    return train_X, valid_X, test_X, train_y, valid_y, test_y

In [0]:
# Materialise the train / valid / test split for the selected dataset (default seed=1).
train_X, valid_X, test_X, train_y, valid_y, test_y = split_dataset(dataset)

In [0]:
def GB_scores(train_X, valid_X, train_y, valid_y):
    """Train an XGBoost regressor as a baseline and score it on the validation set.

    Trains for up to 1000 boosting rounds, stopping early when the validation
    RMSE has not improved for 50 rounds, and logs progress every 200 rounds.

    Returns:
        (r2, booster): R^2 of the validation predictions and the trained booster.
    """
    xgb_pars = {
        'min_child_weight': 50,
        'eta': 0.01,
        'colsample_bytree': 0.5,
        'max_depth': 5,
        'subsample': 0.8,
        'lambda': 1.,
        'nthread': -1,
        'booster': 'gbtree',
        'silent': 1,
        'eval_metric': 'rmse',
        'objective': 'reg:linear',
    }

    train_matrix = xgb.DMatrix(train_X, label=train_y)
    valid_matrix = xgb.DMatrix(valid_X, label=valid_y)

    # The last entry of the watch list (validation) is the one used for early stopping.
    booster = xgb.train(
        xgb_pars,
        train_matrix,
        num_boost_round=1000,
        evals=[(train_matrix, 'train'), (valid_matrix, 'valid')],
        early_stopping_rounds=50,
        maximize=False,
        verbose_eval=200,
    )

    valid_pred = booster.predict(valid_matrix)
    return r2_score(valid_y, valid_pred), booster

In [26]:
# Fit the XGBoost baseline; `score` is its R^2 on the validation split.
score, gbm = GB_scores(train_X, valid_X, train_y, valid_y)

[0]	train-rmse:0.784316	valid-rmse:0.775518
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[200]	train-rmse:0.749144	valid-rmse:0.741102
[400]	train-rmse:0.727816	valid-rmse:0.719965
[600]	train-rmse:0.711204	valid-rmse:0.703554
[800]	train-rmse:0.696025	valid-rmse:0.688501
[999]	train-rmse:0.682363	valid-rmse:0.675066


## Encoding features

In [0]:
def encode_cat_variables(x, help_dict = None, N = 100):
    """Encode a categorical variable as integer codes.

    Index 0 is reserved for values that were not seen when the mapping was
    built (i.e. values missing from `help_dict`).

    Args:
        x: 1-D iterable of category values.
        help_dict: optional existing mapping value -> code (codes start at 1).
            When None, a new mapping is built from the sorted unique values of `x`.
        N: unused; kept only so existing callers keep working.

    Returns:
        (x_t, help_dict, levels): the integer-coded array, the mapping that was
        used, and the number of codes including the reserved 0.
    """
    if help_dict is None:
        # Only pay for the sort/unique pass when a mapping actually has to be built.
        help_dict = {value: code + 1 for code, value in enumerate(np.unique(x))}
    levels = len(help_dict) + 1
    # Explicit integer dtype so an empty input still yields an integer array.
    x_t = np.array([help_dict.get(x_i, 0) for x_i in x], dtype=np.int64)
    return x_t, help_dict, levels

In [0]:
def split_num_cat_columns_all_cat(X):
    """Split column indices into categorical and continuous, treating ALL columns as categorical.

    Returns:
       cat_ind: list with every column index of X
       cont_ind: empty list
    """
    n_columns = X.shape[1]
    return list(range(n_columns)), []

In [0]:
def split_num_cat_columns_all_con(X):
    """Split column indices into categorical and continuous, treating ALL columns as continuous.

    Returns:
       cat_ind: empty list
       cont_ind: list with every column index of X
    """
    n_columns = X.shape[1]
    return [], list(range(n_columns))

In [0]:
def split_num_cat_columns_4(X, max_levels=4):
    """Split column indices into categorical and continuous by number of unique values.

    A column with at most `max_levels` distinct values is treated as
    categorical; every other column is treated as continuous.

    Args:
        X: 2-D array; columns are inspected one at a time.
        max_levels: largest number of distinct values a column may have and
            still count as categorical (default 4).

    Returns:
       cat_ind: list of indices for categorical variables
       cont_ind: list of indices for continuous variables
    """
    cat_ind = []
    cont_ind = []
    for i in range(X.shape[1]):
        n_unique = len(np.unique(X[:, i]))
        if n_unique > max_levels:
            cont_ind.append(i)
        else:
            cat_ind.append(i)
    return cat_ind, cont_ind

In [0]:
def transform_dataset(train_X, valid_X, test_X):
    """Split each set into categorical / continuous parts, then scale and encode them.

    Column roles are decided on the training set by `split_num_cat_columns_4`.
    Continuous columns are standardised with a scaler fitted on the training
    set only; categorical columns are integer-encoded with a mapping built
    from the training set only (unseen values map to 0).

    Returns:
        ((train_X_cat, train_X_cont, valid_X_cat, valid_X_cont, test_X_cat, test_X_cont),
         level_arr) where level_arr[i] is the number of codes of categorical column i.
    """
    cat_ind, cont_ind = split_num_cat_columns_4(train_X)

    # NOTE(review): assumes the inputs are NumPy arrays, so indexing with an index
    # list returns copies and the in-place encoding below leaves the inputs alone.
    splits = (train_X, valid_X, test_X)
    train_X_cat, valid_X_cat, test_X_cat = [part[:, cat_ind] for part in splits]
    train_X_cont, valid_X_cont, test_X_cont = [part[:, cont_ind] for part in splits]

    # Standardise continuous features using training statistics only.
    scaler = StandardScaler()
    train_X_cont = scaler.fit_transform(train_X_cont)
    valid_X_cont = scaler.transform(valid_X_cont)
    test_X_cont = scaler.transform(test_X_cont)

    # Encode every categorical column; the mapping learned on train is reused
    # for valid and test.
    level_arr = []
    for col in range(train_X_cat.shape[1]):
        codes, mapping, n_levels = encode_cat_variables(train_X_cat[:, col])
        train_X_cat[:, col] = codes
        level_arr.append(n_levels)

        codes, _, _ = encode_cat_variables(valid_X_cat[:, col], mapping)
        valid_X_cat[:, col] = codes

        codes, _, _ = encode_cat_variables(test_X_cat[:, col], mapping)
        test_X_cat[:, col] = codes

    transformed = (train_X_cat, train_X_cont, valid_X_cat, valid_X_cont, test_X_cat, test_X_cont)
    return transformed, level_arr

In [0]:
def scale_dataset(train_X, valid_X, test_X):
    """Standardise all three sets with a StandardScaler fitted on the training set only.

    Returns:
        (train_X, valid_X, test_X) after scaling.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_X)
    scaled_valid, scaled_test = (scaler.transform(part) for part in (valid_X, test_X))
    return scaled_train, scaled_valid, scaled_test

In [0]:
def label_encode(train_X, valid_X, test_X):
    """Integer-encode every column of the three sets.

    The value -> code mapping of each column is built from the training set
    and reused for the validation and test sets; values unseen in training
    are encoded as 0.

    The inputs are copied first, so the caller's arrays are not modified.

    Returns:
        (train_X, valid_X, test_X): encoded copies of the inputs.
    """
    # Work on copies: encoding writes column by column into the arrays.
    train_X, valid_X, test_X = train_X.copy(), valid_X.copy(), test_X.copy()

    for i in range(train_X.shape[1]):
        x, help_arr, _ = encode_cat_variables(train_X[:,i])
        train_X[:,i] = x
        x, _, _ = encode_cat_variables(valid_X[:,i], help_arr)
        valid_X[:,i] = x
        x, _, _ = encode_cat_variables(test_X[:,i], help_arr)
        test_X[:,i] = x
    return train_X, valid_X, test_X

### Variables with 4 or fewer unique values are treated as categorical; the rest are continuous

In [34]:
# Inspect which column indices the <=4-unique-values rule assigns to each group.
cat_ind, cont_ind  = split_num_cat_columns_4(train_X)
cat_ind, cont_ind

([0, 2, 4, 6, 8], [1, 3, 5, 7, 9])

In [0]:
# Scale the continuous columns and integer-encode the categorical ones.
(
    (train_X_cat, train_X_cont, valid_X_cat, valid_X_cont, test_X_cat, test_X_cont),
    level_arr,
) = transform_dataset(train_X, valid_X, test_X)

In [0]:
class TabularDataSet(Dataset):
    """Dataset pairing categorical features, continuous features and targets by index."""

    def __init__(self, X_cat, X_cont, Y):
        # The three containers are stored as given and indexed in parallel.
        self.X_cat, self.X_cont, self.Y = X_cat, X_cont, Y

    def __len__(self):
        """Number of samples, taken from the target container."""
        return len(self.Y)

    def __getitem__(self, index):
        """Return the (categorical, continuous, target) triple at `index`."""
        sample = (self.X_cat[index], self.X_cont[index], self.Y[index])
        return sample

In [0]:
# Wrap each split in a Dataset / DataLoader. Only the training loader shuffles.
batch_size = 5
train_ds = TabularDataSet(train_X_cat, train_X_cont, train_y)
valid_ds = TabularDataSet(valid_X_cat, valid_X_cont, valid_y)
# The test features must be paired with the test targets (not the validation targets).
test_ds = TabularDataSet(test_X_cat, test_X_cont, test_y)

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [0]:
def save_model(m, p):
    """Write the state dict of module `m` to path `p`."""
    state = m.state_dict()
    torch.save(state, p)


def load_model(m, p):
    """Load the state dict stored at path `p` into module `m` (in place)."""
    state = torch.load(p)
    m.load_state_dict(state)

In [0]:
def cosine_segment(start_lr, end_lr, iterations):
    """Return `iterations` learning rates following a half cosine from start_lr towards end_lr.

    The first value equals `start_lr`; the curve approaches `end_lr` but the
    last value does not reach it (the cosine phase stops one step short of pi).
    """
    steps = np.arange(iterations)
    half_range = (start_lr - end_lr)/2
    # 1 + cos(.) goes from 2 down towards 0 as the phase runs from 0 towards pi.
    cosine_factor = 1 + np.cos(steps*np.pi/iterations)
    return end_lr + half_range *cosine_factor

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    """Build a per-iteration learning-rate schedule: cosine warm-up, then cosine decay.

    The first 30% of the iterations rise from max_lr/div_start to max_lr; the
    remaining iterations fall from max_lr towards max_lr/div_end.

    Returns:
        1-D array with one learning rate per iteration.
    """
    warmup_iters = int(0.3*iterations)
    decay_iters = iterations - warmup_iters

    lr_begin = max_lr/div_start
    lr_final = max_lr/div_end

    warmup = cosine_segment(lr_begin, max_lr, warmup_iters)
    decay = cosine_segment(max_lr, lr_final, decay_iters)
    return np.concatenate([warmup, decay])

In [0]:
def update_optimizer(optimizer, lr):
    """Set the learning rate of every parameter group of `optimizer` to `lr`."""
    for group in optimizer.param_groups:
        group["lr"] = lr

In [0]:
def val_metrics(model, valid_dl):
    """Evaluate `model` on a data loader.

    Puts the model in eval mode (it is NOT switched back to train mode here)
    and runs the forward passes without gradient tracking, on whatever device
    the model's parameters live on.

    Args:
        model: module called as model(x_cat, x_cont).
        valid_dl: loader yielding (x_cat, x_cont, y) batches.

    Returns:
        (mse, r2): sample-weighted mean MSE over all batches and the R^2 score
        of the concatenated predictions.
    """
    model.eval()
    # Follow the model's device instead of assuming a GPU is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    total = 0
    sum_loss = 0
    y_hat = []
    ys = []
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for x1, x2, y in valid_dl:
            batch = y.shape[0]
            y = y.unsqueeze(1).float()
            out = model(x1.long().to(device), x2.float().to(device))
            loss = F.mse_loss(out, y.to(device))
            # F.mse_loss averages over the batch, so re-weight by the batch size.
            sum_loss += batch*(loss.item())
            total += batch
            y_hat.append(out.detach().cpu().numpy())
            ys.append(y.cpu().numpy())

    y_hat = np.vstack(y_hat)
    ys = np.vstack(ys)
    r2 = metrics.r2_score(ys, y_hat)
    return sum_loss/total, r2

In [0]:
def train_model(model, train_dl, valid_dl, optimizer, max_lr=0.05, epochs=100):
    """Train `model` with a cosine warm-up/decay learning-rate schedule.

    The learning rate of `optimizer` is overwritten before every step from a
    schedule built with `get_cosine_triangular_lr(max_lr, ...)`, so the `lr`
    the optimizer was constructed with is not used. After each epoch the
    model is evaluated on `valid_dl`; whenever the validation R^2 improves,
    the weights are saved under `<PATH>/models/` (the file name uses the
    module-level `PATH` and `dataset`).

    Args:
        model: module called as model(x_cat, x_cont).
        train_dl, valid_dl: loaders yielding (x_cat, x_cont, y) batches.
        optimizer: optimizer over the model's parameters.
        max_lr: peak learning rate of the schedule.
        epochs: number of passes over `train_dl`.

    Returns:
        The best validation R^2 seen (-inf if no epoch was run).
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    iterations = epochs*len(train_dl)
    pbar = tqdm_notebook(total=iterations)
    idx = 0
    # R^2 can be negative, so start below any reachable score; otherwise a model
    # that never gets above 0 would never be checkpointed and 0 would be reported.
    best_val_r2 = float("-inf")
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        total = 0
        for x1, x2, y in train_dl:
            update_optimizer(optimizer, lrs[idx])
            # Move the batch to the same device the model was moved to above.
            x1 = x1.long().to(device)
            x2 = x2.float().to(device)
            y = y.unsqueeze(1).float().to(device)
            y_hat = model(x1, x2)
            loss = F.mse_loss(y_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # The loss is a batch mean; weight it by batch size for the epoch average.
            total_loss += loss.item()*y.size(0)
            total += y.size(0)
            idx +=1
            pbar.update()
        val_loss, val_r2 = val_metrics(model, valid_dl)
        print("\tTrain loss: {:.3f} \t Valid loss: {:.3f} \t Valid R2:  {:.3f}".format(
            total_loss/total, val_loss, val_r2))
        if best_val_r2 < val_r2:
            best_val_r2 = val_r2
            path = "{0}/models/model_{1}_acc_{2:.0f}.pth".format(PATH, dataset, 100*val_r2)
            # Make sure the checkpoint folder exists before writing to it.
            Path(path).parent.mkdir(parents=True, exist_ok=True)
            save_model(model, path)
            print(path)

    pbar.close()
    print("Best valid r2: {:.3f}".format(best_val_r2))
    return best_val_r2

In [0]:
class TabularNet_4_first(nn.Module):
    """Feed-forward network for tabular data with embedded categorical features.

    Each categorical column gets its own embedding table; the (dropped-out)
    embeddings are concatenated with the continuous features and passed
    through two hidden layers (Linear -> ReLU -> BatchNorm) to one output.

    Args:
        num_cont: number of continuous input features.
        num_cat: number of categorical input columns.
        level_arr: number of codes per categorical column (embedding table sizes).
        hidden_dim: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
        emb_dim: embedding width used for every categorical column.
        dropout: dropout probability applied to the embeddings and to the
            first hidden layer's output.
    """

    def __init__(self, num_cont, num_cat, level_arr, hidden_dim=1000, hidden_dim2=1000,
                 emb_dim=2, dropout=0.2):
        super(TabularNet_4_first, self).__init__()
        # Input width: continuous features plus one emb_dim-wide vector per categorical column.
        in_dim = num_cont + emb_dim*num_cat
        self.embs = nn.ModuleList([nn.Embedding(levels, emb_dim) for levels in level_arr])
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.linear1 = nn.Linear(in_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim2)
        self.linear3 = nn.Linear(hidden_dim2, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_cat, x_cont):
        """Return one prediction per row.

        Args:
            x_cat: integer codes, one column per categorical feature.
            x_cont: continuous features.
        """
        # Embed column i with embedding table i, then join along the feature axis.
        x_cat = [self.embs[i](x_cat[:,i]) for i in range(x_cat.size(1))]
        x_cat = torch.cat(x_cat, dim=1)
        x_cat = self.dropout(x_cat)
        x = torch.cat([x_cont, x_cat], dim=1)
        # Batch norm is applied after the ReLU in both hidden layers.
        x = self.bn1(F.relu(self.linear1(x)))
        x = self.dropout(x)
        x = self.bn2(F.relu(self.linear2(x)))
        return self.linear3(x)

In [0]:
class TabularNet_4_second(nn.Module):
    """Tabular network with embedded categoricals and input skip connections.

    Same layout as a two-hidden-layer MLP over [continuous, embeddings], but
    the original input vector is concatenated again to the output of each
    hidden layer before it is fed to the next linear layer.

    Args:
        num_cont: number of continuous input features.
        num_cat: number of categorical input columns.
        level_arr: number of codes per categorical column (embedding table sizes).
        hidden_dim: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
        emb_dim: embedding width used for every categorical column.
        dropout: dropout probability applied to the embeddings and to the
            input of the second linear layer.
    """

    def __init__(self, num_cont, num_cat, level_arr, hidden_dim=1000, hidden_dim2=1000,
                 emb_dim=2, dropout=0.2):
        super(TabularNet_4_second, self).__init__()
        # Input width: continuous features plus one emb_dim-wide vector per categorical column.
        in_dim = num_cont + emb_dim*num_cat
        self.embs = nn.ModuleList([nn.Embedding(levels, emb_dim) for levels in level_arr])
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.linear1 = nn.Linear(in_dim, hidden_dim)
        # The later layers also receive the raw input, hence the extra in_dim.
        self.linear2 = nn.Linear(hidden_dim+in_dim, hidden_dim2)
        self.linear3 = nn.Linear(hidden_dim2+in_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_cat, x_cont):
        """Return one prediction per row.

        Args:
            x_cat: integer codes, one column per categorical feature.
            x_cont: continuous features.
        """
        # Embed column i with embedding table i, then join along the feature axis.
        x_cat = [self.embs[i](x_cat[:,i]) for i in range(x_cat.size(1))]
        x_cat = torch.cat(x_cat, dim=1)
        x_cat = self.dropout(x_cat)
        x1 = torch.cat([x_cont, x_cat], dim=1)
        x = self.bn1(F.relu(self.linear1(x1)))
        # Skip connection: re-attach the input vector before the second layer.
        x = torch.cat([x, x1], dim=1)
        x = self.dropout(x)
        x = self.bn2(F.relu(self.linear2(x)))
        # Skip connection: re-attach the input vector before the output layer.
        x = torch.cat([x, x1], dim=1)
        return self.linear3(x)

In [0]:
# Rebuild the train / valid loaders with a large batch size for the actual training runs.
batch_size = 8192

# Feature counts needed to size the networks' input layer.
num_cont = train_X_cont.shape[1]
num_cat = train_X_cat.shape[1]

train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(dataset=valid_ds, batch_size=batch_size)

In [0]:
# Plain two-hidden-layer network (default widths), created directly on the GPU.
model = TabularNet_4_first(num_cont, num_cat, level_arr).cuda()

In [215]:
# NOTE: the `lr` given to Adam here is only a placeholder. train_model overwrites
# the learning rate of every param group from its cosine schedule (max_lr defaults
# to 0.05) before each optimizer step.
optimizer = torch.optim.Adam(model.parameters(), lr=0.5, weight_decay=1e-5)
best_val = train_model(model, train_dl, valid_dl, optimizer, epochs=100)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=8800.0), HTML(value='')))

	Train loss: 1.322 	 Valid loss: 0.554 	 Valid R2:  0.060
/content/models/model_1595_poker_acc_6.pth
	Train loss: 0.566 	 Valid loss: 0.552 	 Valid R2:  0.063
/content/models/model_1595_poker_acc_6.pth
	Train loss: 0.565 	 Valid loss: 0.553 	 Valid R2:  0.061
	Train loss: 0.575 	 Valid loss: 0.540 	 Valid R2:  0.084
/content/models/model_1595_poker_acc_8.pth
	Train loss: 0.549 	 Valid loss: 0.508 	 Valid R2:  0.138
/content/models/model_1595_poker_acc_14.pth
	Train loss: 0.537 	 Valid loss: 0.530 	 Valid R2:  0.101
	Train loss: 0.525 	 Valid loss: 0.500 	 Valid R2:  0.151
/content/models/model_1595_poker_acc_15.pth
	Train loss: 0.516 	 Valid loss: 0.485 	 Valid R2:  0.176
/content/models/model_1595_poker_acc_18.pth
	Train loss: 0.487 	 Valid loss: 0.432 	 Valid R2:  0.266
/content/models/model_1595_poker_acc_27.pth
	Train loss: 0.447 	 Valid loss: 0.398 	 Valid R2:  0.326
/content/models/model_1595_poker_acc_33.pth
	Train loss: 0.411 	 Valid loss: 0.485 	 Valid R2:  0.178
	Train loss: 

In [0]:
# Skip-connection variant (default widths), created directly on the GPU.
# This rebinds `model`, replacing the previously trained network.
model = TabularNet_4_second(num_cont, num_cat, level_arr).cuda()

In [230]:
# NOTE: the `lr` given to Adam here is only a placeholder. train_model overwrites
# the learning rate of every param group from its cosine schedule (max_lr defaults
# to 0.05) before each optimizer step.
optimizer = torch.optim.Adam(model.parameters(), lr=0.05, weight_decay=1e-5)
best_val = train_model(model, train_dl, valid_dl, optimizer, epochs=100)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=8800.0), HTML(value='')))

	Train loss: 4.040 	 Valid loss: 0.552 	 Valid R2:  0.064
/content/models/model_1595_poker_acc_6.pth
	Train loss: 0.568 	 Valid loss: 0.547 	 Valid R2:  0.072
/content/models/model_1595_poker_acc_7.pth
	Train loss: 0.563 	 Valid loss: 0.545 	 Valid R2:  0.075
/content/models/model_1595_poker_acc_8.pth
	Train loss: 0.560 	 Valid loss: 0.542 	 Valid R2:  0.081
/content/models/model_1595_poker_acc_8.pth
	Train loss: 0.554 	 Valid loss: 0.532 	 Valid R2:  0.097
/content/models/model_1595_poker_acc_10.pth
	Train loss: 0.537 	 Valid loss: 0.514 	 Valid R2:  0.128
/content/models/model_1595_poker_acc_13.pth
	Train loss: 0.525 	 Valid loss: 0.500 	 Valid R2:  0.152
/content/models/model_1595_poker_acc_15.pth
	Train loss: 0.514 	 Valid loss: 0.482 	 Valid R2:  0.182
/content/models/model_1595_poker_acc_18.pth
	Train loss: 0.504 	 Valid loss: 0.469 	 Valid R2:  0.204
/content/models/model_1595_poker_acc_20.pth
	Train loss: 0.488 	 Valid loss: 0.449 	 Valid R2:  0.238
/content/models/model_1595_p