# Train and compare two Wide and Deep models on chunks1to25 Jan_Jun_2020

In [1]:
import pandas as pd
import numpy as np

# Absolute locations of the two input files.
# NOTE(review): the /content/drive/MyDrive prefix looks like a Colab Google Drive
# mount — these paths must be adjusted when the notebook runs anywhere else.
# f_dtype: CSV whose rows are read below as (column name, dtype) pairs.
f_dtype = "/content/drive/MyDrive/Fellowship.AI/P1_bandit/dataset/dtypes.csv"
# f_chunks_1to25: the data file itself (read below with gzip compression).
f_chunks_1to25 = "/content/drive/MyDrive/Fellowship.AI/P1_bandit/dataset/chunks_1to25.csv"

In [2]:
# Read the (column name, dtype) table and turn it into the mapping expected by
# the `dtype=` argument of pd.read_csv: first column -> key, second column -> value.
dtype_table = pd.read_csv(f_dtype)
dtype = {row[0]: row[1] for row in dtype_table.values}

# Load the gzip-compressed data with explicit dtypes and discard the index
# column that was written out together with the data ("Unnamed: 0").
ch1to25 = pd.read_csv(
    f_chunks_1to25,
    dtype=dtype,
    compression='gzip',
).drop(columns="Unnamed: 0")

print("Total number of data in chunks_1to25: {}.".format(len(ch1to25)))
ch1to25.head()

Total number of data in chunks_1to25: 7123483.


Unnamed: 0,riid,opened,unsub,rev_3dv2,sends_since_last_open,message_size,retention_score,frequency_score,sent_hour,sent_dayofweek,sent_week,sent_month,campaign_Brand,campaign_Collection,campaign_Core,campaign_Dedicated,campaign_InnovationSpotlight,campaign_NewArrivals,campaign_ProductSpotlight,campaign_Replen,campaign_Tops,campaign_Trend,campaign_Other,discount,promo,sale,reward
0,254904062,1,0,58.0,1,230057,28.0,34,18,5,0,2,0,0,0,0,0,0,0,0,0,0,1,0,1,0,8.0
1,260350242,1,0,30.09,0,139229,28.0,64,17,2,1,3,0,0,0,0,0,0,1,0,0,0,0,0,1,0,9.0
2,249036942,1,0,39.99,2,199667,14.0,48,22,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,7.0
3,8684662,1,0,69.96,2,186806,14.0,12,18,6,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,7.0
4,245107842,1,0,45.6,0,141806,28.0,9,17,6,2,2,0,0,0,0,0,0,0,0,0,1,0,40,1,0,9.0


In [3]:
# Randomly choose a subset of num_data rows.
#
# The rows are drawn WITHOUT replacement: the default of `choice` is to sample
# with replacement, which duplicates rows, and a duplicated row can then land
# in both the training and the validation part of the later split (leakage).
# A seeded generator makes the subset identical on every Restart & Run All.
subset_seed = 42
num_data = min(100000, len(ch1to25))  # cannot draw more distinct rows than exist
subset_rng = np.random.default_rng(subset_seed)
indices = subset_rng.choice(len(ch1to25), size=num_data, replace=False)
df_subset = ch1to25.iloc[indices]

In [4]:
# Summary statistics of the pre-computed reward column of the sampled rows.
df_subset["reward"].describe()

count    100000.000000
mean        -12.942990
std          51.928032
min       -4852.000000
25%         -14.000000
50%          -6.000000
75%          -2.000000
max           9.000000
Name: reward, dtype: float64

It seems that most of the rewards with action "SEND" are negative, so the best action for these data is "DO NOT SEND".

Let's try a simple policy:
- always choose the action **"DO NOT SEND"**, whatever the context is (sounds ridiculous :P)

The accuracy of this policy is:

In [5]:
# Share of sampled rows whose reward is not positive, i.e. the accuracy of the
# constant "DO NOT SEND" policy.
n_non_positive = (df_subset["reward"] <= 0.0).sum()
n_non_positive / len(df_subset)

0.99746

I trained the model on this reward; its accuracy is similar, about 0.99.

In the rest of this notebook, I train the model with my own reward function instead.

In [6]:
# Vocabulary: every distinct riid gets a dense integer index, numbered in order
# of first appearance in the subset. The index is what the embedding layers of
# the models below look up.
distinct_riids = df_subset["riid"].unique()
vocab = {riid: position for position, riid in enumerate(distinct_riids)}
print("{} unique users".format(len(vocab)))

# One-column frame (same row index as df_subset) holding the index of each row's user.
user_ids = pd.DataFrame({'user_id': df_subset["riid"].apply(lambda riid: vocab[riid])})

92557 unique users


## Model 1 (class W_D): cross product features are used as the input of the Wide part, user_id is included in the DEEP part

In [7]:
# Names of the 11 campaign-type indicator columns of the data set.
# The list is used twice below: crossed with the day-of-week dummies to build
# the wide input, and (unchanged) as part of the deep-input columns.
campaign_types = ['campaign_Brand',
                  'campaign_Collection',
                  'campaign_Core',
                  'campaign_Dedicated',
                  'campaign_InnovationSpotlight',
                  'campaign_NewArrivals',
                  'campaign_Other',
                  'campaign_ProductSpotlight',
                  'campaign_Replen',
                  'campaign_Tops',
                  'campaign_Trend']

In [8]:
# A function to calculate the cross-product of two category features
def cross_product(data_1, columns_1, data_2, columns_2):
    """Return the pairwise products of two groups of columns.

    For every column c_1 in columns_1 (taken from data_1) and every column c_2
    in columns_2 (taken from data_2) the result holds one column named
    "<c_1>_<c_2>" that contains the element-wise product data_1[c_1] * data_2[c_2].
    Columns are ordered with columns_1 as the outer and columns_2 as the inner
    loop.
    """
    products = {
        str(c_1) + '_' + str(c_2): data_1[c_1] * data_2[c_2]
        for c_1 in columns_1
        for c_2 in columns_2
    }
    return pd.DataFrame(products)

# One dummy column per observed weekday value, named day_<value>.
sent_dayofweek = pd.get_dummies(df_subset["sent_dayofweek"], prefix='day')

# Wide input: every campaign-type column crossed with every weekday dummy.
# With 11 campaign types and 7 weekdays this gives 11*7 = 77 columns.
w_in = cross_product(
    data_1=df_subset,
    columns_1=campaign_types,
    data_2=sent_dayofweek,
    columns_2=list(sent_dayofweek.columns),
)

In [9]:
# Columns fed to the Deep part in addition to the user id:
# the campaign-type columns followed by four further features.
d_columns = [*campaign_types, 'frequency_score', 'retention_score', 'promo', 'sale']

In [10]:
# Full context matrix for model 1, column-wise:
#   [ wide cross-product features | user_id | deep features ]
# The user id comes right after the wide block because W_D treats the first
# deep column as the index to embed.
context_parts = (w_in, user_ids, df_subset[d_columns])
datasource = pd.concat(context_parts, axis=1)
datasource

Unnamed: 0,campaign_Brand_day_0,campaign_Brand_day_1,campaign_Brand_day_2,campaign_Brand_day_3,campaign_Brand_day_4,campaign_Brand_day_5,campaign_Brand_day_6,campaign_Collection_day_0,campaign_Collection_day_1,campaign_Collection_day_2,campaign_Collection_day_3,campaign_Collection_day_4,campaign_Collection_day_5,campaign_Collection_day_6,campaign_Core_day_0,campaign_Core_day_1,campaign_Core_day_2,campaign_Core_day_3,campaign_Core_day_4,campaign_Core_day_5,campaign_Core_day_6,campaign_Dedicated_day_0,campaign_Dedicated_day_1,campaign_Dedicated_day_2,campaign_Dedicated_day_3,campaign_Dedicated_day_4,campaign_Dedicated_day_5,campaign_Dedicated_day_6,campaign_InnovationSpotlight_day_0,campaign_InnovationSpotlight_day_1,campaign_InnovationSpotlight_day_2,campaign_InnovationSpotlight_day_3,campaign_InnovationSpotlight_day_4,campaign_InnovationSpotlight_day_5,campaign_InnovationSpotlight_day_6,campaign_NewArrivals_day_0,campaign_NewArrivals_day_1,campaign_NewArrivals_day_2,campaign_NewArrivals_day_3,campaign_NewArrivals_day_4,...,campaign_ProductSpotlight_day_4,campaign_ProductSpotlight_day_5,campaign_ProductSpotlight_day_6,campaign_Replen_day_0,campaign_Replen_day_1,campaign_Replen_day_2,campaign_Replen_day_3,campaign_Replen_day_4,campaign_Replen_day_5,campaign_Replen_day_6,campaign_Tops_day_0,campaign_Tops_day_1,campaign_Tops_day_2,campaign_Tops_day_3,campaign_Tops_day_4,campaign_Tops_day_5,campaign_Tops_day_6,campaign_Trend_day_0,campaign_Trend_day_1,campaign_Trend_day_2,campaign_Trend_day_3,campaign_Trend_day_4,campaign_Trend_day_5,campaign_Trend_day_6,user_id,campaign_Brand,campaign_Collection,campaign_Core,campaign_Dedicated,campaign_InnovationSpotlight,campaign_NewArrivals,campaign_Other,campaign_ProductSpotlight,campaign_Replen,campaign_Tops,campaign_Trend,frequency_score,retention_score,promo,sale
6774648,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,23,28.000000,1,0
236118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,6,28.000000,1,0
6429118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,4.000000,1,0
4037935,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,0,0,0,0,0,0,0,4,1.750000,1,0
2944406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,1,0,0,0,0,0,0,0,3,2.545455,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2447845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,92552,0,0,1,0,0,0,0,0,0,0,0,1,3.500000,1,0
3692080,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,92553,0,0,1,0,0,0,0,0,0,0,0,6,4.666667,1,0
3990928,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,92554,0,0,1,0,0,0,0,0,0,0,0,5,0.756757,1,0
732210,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,92555,0,0,0,1,0,0,0,0,0,0,0,0,0.622222,1,0


In [11]:
# Reward vectors: one column per action, in the order [send, not send].
# Reward of sending, term by term:
#   opened * 1.2 - 0.2   -> -0.2 for an unopened mail, +1.0 for an opened one
#   (rev_3dv2 > 0)       -> +1 when there is any revenue
#   rev_3dv2 / 75.0      -> revenue, scaled
#   5.0 * unsub          -> penalty for an unsubscribe
reward_send = (
    df_subset.opened * 1.2 - 0.2
    + (df_subset.rev_3dv2 > 0)
    + df_subset.rev_3dv2 / 75.0
    - 5.0 * df_subset.unsub
)
# Not sending is scored as the exact negative of sending.
rewardsource = pd.DataFrame({
    "reward_send": reward_send,
    "reward_not_send": -reward_send,
})
rewardsource

Unnamed: 0,reward_send,reward_not_send
6774648,-0.2,0.2
236118,-0.2,0.2
6429118,-0.2,0.2
4037935,-0.2,0.2
2944406,-0.2,0.2
...,...,...
2447845,-0.2,0.2
3692080,-0.2,0.2
3990928,-0.2,0.2
732210,-0.2,0.2


In [12]:
# Optimal action per row, expressed as a column index into rewardsource:
# 1 ("not send") when sending has a negative reward, otherwise 0 ("send").
send_is_negative = rewardsource['reward_send'] < 0
opt_a = send_is_negative.astype(int)
opt_a

6774648    1
236118     1
6429118    1
4037935    1
2944406    1
          ..
2447845    1
3692080    1
3990928    1
732210     1
2030451    1
Name: reward_send, Length: 100000, dtype: int64

In [13]:
import torch
# customized torch dataset for the training 
class Mydataset(torch.utils.data.Dataset):
    """Dataset of (context, reward vector, optimal action) triples.

    The three inputs are converted to numpy arrays once, at construction time,
    and are indexed in parallel: item i is built from row i of each of them.
    """

    def __init__(self, contexts, reward_vectors, optimal_actions):
        self.contexts, self.reward_vectors, self.optimal_actions = (
            np.array(contexts),
            np.array(reward_vectors),
            np.array(optimal_actions),
        )

    def __getitem__(self, index):
        """Return the triple stored at position `index`."""
        return (
            self.contexts[index],
            self.reward_vectors[index],
            self.optimal_actions[index],
        )

    def __len__(self):
        """Number of samples, i.e. the number of context rows."""
        return self.contexts.shape[0]

In [14]:
split = 0.8
batch_size = 1024

# First `split` share of the rows -> training set, remaining rows -> validation set.
n_train = int(num_data * split)
train_rows = slice(None, n_train)
valid_rows = slice(n_train, None)

# create datasets and dataloaders for training
dataset_train = Mydataset(datasource.iloc[train_rows], rewardsource.iloc[train_rows], opt_a.iloc[train_rows])
dataset_valid = Mydataset(datasource.iloc[valid_rows], rewardsource.iloc[valid_rows], opt_a.iloc[valid_rows])
dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_valid = torch.utils.data.DataLoader(dataset_valid, batch_size=batch_size, shuffle=True)

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import OrderedDict
import numpy as np
import time

class W_D(nn.Module):
    """Wide & Deep network with the embedded id on the Deep side.

    The input x is a 2-D tensor. Its first `wide_in_dim` columns are the wide
    features; the remaining columns are the deep block, whose first column is
    an integer index that is looked up in an embedding table and whose other
    columns are dense features.

    Args:
        wide_in_dim: number of wide columns at the start of x.
        deep_in_dim: number of deep columns, including the index column.
        action_dim: number of outputs (one predicted reward per action).
        num_embedding: number of rows of the embedding table.
        embed_dim: size of each embedding vector.
        deep_neurons: output sizes of the fully connected deep layers.
        activation: module applied after every deep layer.
    """

    def __init__(self,
                 wide_in_dim=3,
                 deep_in_dim=3,
                 action_dim=3,
                 num_embedding=100,
                 embed_dim=16,
                 deep_neurons=[32, 16],
                 activation=nn.ReLU()):
        super().__init__()
        self.wide_in_dim = wide_in_dim
        self.context_dim = wide_in_dim + deep_in_dim
        # z = [wide features | output of the last deep layer]
        self.z_dim = wide_in_dim + deep_neurons[-1]
        self.action_dim = action_dim
        self.activation = activation

        self.embedding = nn.Embedding(num_embedding, embed_dim)

        # The index column is replaced by its embedding, hence "- 1 + embed_dim".
        layer_widths = [deep_in_dim - 1 + embed_dim, *deep_neurons]
        deep_layers = OrderedDict()
        for idx, (n_in, n_out) in enumerate(zip(layer_widths, layer_widths[1:])):
            deep_layers[f"fc{idx}"] = nn.Linear(n_in, n_out)
            deep_layers[f"activation{idx}"] = activation
        self.deep = nn.Sequential(deep_layers)

        # Final linear layer on top of the joint representation z.
        self.lastlayer = nn.Linear(self.z_dim, self.action_dim)

    def forward(self, x):
        """Return the `action_dim` outputs computed from the representation z."""
        return self.lastlayer(self.get_z(x))

    def get_z(self, x):
        """Return z: the wide columns concatenated with the deep-network output."""
        wide_part = x[:, :self.wide_in_dim]
        deep_part = x[:, self.wide_in_dim:]

        # First deep column -> embedding row; the other deep columns stay dense.
        index_column = deep_part[:, 0].long()
        dense_columns = deep_part[:, 1:]
        deep_input = torch.cat((self.embedding(index_column), dense_columns), dim=1)

        deep_output = self.deep(deep_input)
        return torch.cat((wide_part, deep_output), dim=1)


In [17]:
# A function to train and valid the model for each epoch, output the losses and accuracies.
def train(device, model, optimizer, loss_func, dataloader_train, dataloader_valid, num_epoch):
    """Train `model` for `num_epoch` epochs, validating after every epoch.

    Both dataloaders must yield (context, reward_vector, optimal_action)
    batches. The model is fitted to the reward vectors with `loss_func`;
    accuracy is the share of samples for which the argmax of the predicted
    rewards equals the given optimal action. One summary line is printed per
    epoch and the total run time is printed at the end.

    Args:
        device: device the batches are moved to (the model must already be there).
        model: module mapping a float context batch to predicted reward vectors.
        optimizer: optimizer over the parameters of `model`.
        loss_func: callable (predicted rewards, target rewards) -> scalar loss.
        dataloader_train: batches used for the parameter updates.
        dataloader_valid: batches used for evaluation only.
        num_epoch: number of passes over the training data.

    Returns:
        None. The per-epoch metrics are only printed.
    """
    start_time = time.time()
    accuracy_train = []
    loss_record_train = []
    accuracy_valid = []
    loss_record_valid = []
    s_time = start_time
    for i in range(num_epoch):
        # ---- training pass ----
        model.train()
        corrects = 0.0
        train_loss = 0.0
        for c_train, r_train, opt_a_train in dataloader_train:
            c_train = c_train.float().to(device)
            r_train = r_train.float().to(device)
            opt_a_train = opt_a_train.long().to(device)
            pred_r_train = model(c_train)
            loss = loss_func(pred_r_train, r_train)
            pred_a_train = torch.argmax(pred_r_train, dim=1)
            corrects += torch.sum(pred_a_train==opt_a_train).item()
            # weight the batch loss by the batch size so that the epoch value
            # is a per-sample average even when the last batch is smaller
            train_loss += loss.item() * c_train.shape[0]

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        accuracy_train.append(corrects/len(dataloader_train.dataset))
        loss_record_train.append(train_loss/len(dataloader_train.dataset))

        # ---- validation pass ----
        model.eval()
        corrects = 0.0
        valid_loss = 0.0
        # No parameter update happens here, so autograd is switched off:
        # otherwise a graph is recorded for every validation batch, which
        # costs memory and time for nothing.
        with torch.no_grad():
            for c_valid, r_valid, opt_a_valid in dataloader_valid:
                c_valid = c_valid.float().to(device)
                r_valid = r_valid.float().to(device)
                opt_a_valid = opt_a_valid.long().to(device)
                pred_r_valid = model(c_valid)
                loss = loss_func(pred_r_valid, r_valid)
                pred_a_valid = torch.argmax(pred_r_valid, dim=1)
                corrects += torch.sum(pred_a_valid==opt_a_valid).item()
                valid_loss += loss.item() * c_valid.shape[0]

        accuracy_valid.append(corrects/len(dataloader_valid.dataset))
        loss_record_valid.append(valid_loss/len(dataloader_valid.dataset))

        print("Epoch:{:>4}\t Training Loss: {:6.3f}\t Valid Loss: {:6.3f}\t Training Accuracy: {:.3f}\t Valid Accuracy: {:.3f}\t time used: {:.3f} sec".format(i+1, loss_record_train[i], loss_record_valid[i], accuracy_train[i], accuracy_valid[i], time.time()-s_time))
        s_time = time.time()
    end_time = time.time()
    total_time = end_time - start_time
    print("total time used: {} hrs {} min {:.3f} sec".format(int(total_time//3600), int((total_time%3600)//60), total_time%60))


In [18]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print('device: ', device)

# The layer sizes are derived from the frames that feed the model instead of
# being typed in by hand, so they cannot drift out of sync when the feature
# lists change:
#   wide_in_dim - number of cross-product columns (11 campaign types x 7 days = 77)
#   deep_in_dim - all remaining context columns: user_id + deep features (16)
#   action_dim  - one output per reward column: send / not send (2)
wide_in_dim = w_in.shape[1]
deep_in_dim = datasource.shape[1] - wide_in_dim
action_dim = rewardsource.shape[1]

model = W_D(wide_in_dim=wide_in_dim, deep_in_dim=deep_in_dim, action_dim=action_dim, num_embedding=len(vocab), embed_dim=64, deep_neurons=[128, 64]).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_func = nn.MSELoss()
num_epoch = 10
train(device, model, optimizer, loss_func, dataloader_train, dataloader_valid, num_epoch)

device:  cuda:0
Epoch:   1	 Training Loss:  0.231	 Valid Loss:  0.224	 Training Accuracy: 0.698	 Valid Accuracy: 0.742	 time used: 1.701 sec
Epoch:   2	 Training Loss:  0.221	 Valid Loss:  0.223	 Training Accuracy: 0.744	 Valid Accuracy: 0.733	 time used: 1.485 sec
Epoch:   3	 Training Loss:  0.219	 Valid Loss:  0.223	 Training Accuracy: 0.747	 Valid Accuracy: 0.727	 time used: 1.498 sec
Epoch:   4	 Training Loss:  0.217	 Valid Loss:  0.224	 Training Accuracy: 0.749	 Valid Accuracy: 0.740	 time used: 1.459 sec
Epoch:   5	 Training Loss:  0.213	 Valid Loss:  0.227	 Training Accuracy: 0.758	 Valid Accuracy: 0.731	 time used: 1.490 sec
Epoch:   6	 Training Loss:  0.206	 Valid Loss:  0.232	 Training Accuracy: 0.768	 Valid Accuracy: 0.718	 time used: 1.475 sec
Epoch:   7	 Training Loss:  0.198	 Valid Loss:  0.240	 Training Accuracy: 0.781	 Valid Accuracy: 0.730	 time used: 1.484 sec
Epoch:   8	 Training Loss:  0.186	 Valid Loss:  0.258	 Training Accuracy: 0.798	 Valid Accuracy: 0.656	 time 

## Model_2 (class W_D_2): user_id as input of WIDE part

In [19]:
# user_id in wide
class W_D_2(nn.Module):
    """Wide & Deep network with the embedded id on the Wide side.

    The input x is a 2-D tensor. Its first `wide_in_dim` columns form the wide
    block, of which only the first column is used: it is cast to an integer
    index and looked up in an embedding table. All remaining columns are dense
    features that go through the deep network.

    Args:
        wide_in_dim: number of wide columns at the start of x.
        deep_in_dim: number of deep (dense) columns.
        action_dim: number of outputs (one predicted reward per action).
        num_embedding: number of rows of the embedding table.
        embed_dim: size of each embedding vector.
        deep_neurons: output sizes of the fully connected deep layers.
        activation: module applied after every deep layer.
    """

    def __init__(self,
                 wide_in_dim=3,
                 deep_in_dim=3,
                 action_dim=3,
                 num_embedding=100,
                 embed_dim=16,
                 deep_neurons=[32, 16],
                 activation=nn.ReLU()):
        super().__init__()
        self.wide_in_dim = wide_in_dim
        self.context_dim = wide_in_dim + deep_in_dim
        # z = [embedding of the index | output of the last deep layer]
        self.z_dim = embed_dim + deep_neurons[-1]
        self.action_dim = action_dim
        self.activation = activation

        self.embedding = nn.Embedding(num_embedding, embed_dim)

        layer_widths = [deep_in_dim, *deep_neurons]
        deep_layers = OrderedDict()
        for idx, (n_in, n_out) in enumerate(zip(layer_widths, layer_widths[1:])):
            deep_layers[f"fc{idx}"] = nn.Linear(n_in, n_out)
            deep_layers[f"activation{idx}"] = activation
        self.deep = nn.Sequential(deep_layers)

        # Final linear layer on top of the joint representation z.
        self.lastlayer = nn.Linear(self.z_dim, self.action_dim)

    def forward(self, x):
        """Return the `action_dim` outputs computed from the representation z."""
        return self.lastlayer(self.get_z(x))

    def get_z(self, x):
        """Return z: the index embedding concatenated with the deep-network output."""
        wide_part = x[:, :self.wide_in_dim]
        deep_part = x[:, self.wide_in_dim:]

        # Only the first wide column is read; it holds the index to embed.
        index_column = wide_part[:, 0].long()
        embedded = self.embedding(index_column)

        deep_output = self.deep(deep_part)
        return torch.cat((embedded, deep_output), dim=1)

In [20]:
# Context matrix for the W_D_2 model, column-wise: [ user_id | deep features ].
# The user id is the first (and only) wide column, which is the column that
# W_D_2 embeds.
context_parts_2 = (user_ids, df_subset[d_columns])
datasource_2 = pd.concat(context_parts_2, axis=1)
datasource_2

Unnamed: 0,user_id,campaign_Brand,campaign_Collection,campaign_Core,campaign_Dedicated,campaign_InnovationSpotlight,campaign_NewArrivals,campaign_Other,campaign_ProductSpotlight,campaign_Replen,campaign_Tops,campaign_Trend,frequency_score,retention_score,promo,sale
6774648,0,0,0,0,0,0,0,0,0,0,0,1,23,28.000000,1,0
236118,1,0,0,0,0,0,0,0,1,0,0,0,6,28.000000,1,0
6429118,2,0,0,0,1,0,0,0,0,0,0,0,0,4.000000,1,0
4037935,3,0,0,0,1,0,0,0,0,0,0,0,4,1.750000,1,0
2944406,4,0,0,0,1,0,0,0,0,0,0,0,3,2.545455,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2447845,92552,0,0,1,0,0,0,0,0,0,0,0,1,3.500000,1,0
3692080,92553,0,0,1,0,0,0,0,0,0,0,0,6,4.666667,1,0
3990928,92554,0,0,1,0,0,0,0,0,0,0,0,5,0.756757,1,0
732210,92555,0,0,0,1,0,0,0,0,0,0,0,0,0.622222,1,0


In [21]:
# datasets and dataloaders for training of W_D_2 model
# Same row split as for model 1: the first `split` share of the rows is used
# for training, the remaining rows for validation.
n_train = int(num_data * split)
train_rows = slice(None, n_train)
valid_rows = slice(n_train, None)

dataset_train_2 = Mydataset(datasource_2.iloc[train_rows], rewardsource.iloc[train_rows], opt_a.iloc[train_rows])
dataset_valid_2 = Mydataset(datasource_2.iloc[valid_rows], rewardsource.iloc[valid_rows], opt_a.iloc[valid_rows])
dataloader_train_2 = torch.utils.data.DataLoader(dataset_train_2, batch_size=batch_size, shuffle=True)
dataloader_valid_2 = torch.utils.data.DataLoader(dataset_valid_2, batch_size=batch_size, shuffle=True)

In [22]:
# Layer sizes are derived from the frames that feed the model rather than
# hard-coded, so they stay correct when the feature lists change:
#   wide part - only the user_id column (column 0 of datasource_2)
#   deep part - every other column of datasource_2 (15 with the current features)
#   outputs   - one per reward column: send / not send (2)
wide_in_dim_2 = 1
deep_in_dim_2 = datasource_2.shape[1] - wide_in_dim_2
action_dim_2 = rewardsource.shape[1]

model_2 = W_D_2(wide_in_dim=wide_in_dim_2, deep_in_dim=deep_in_dim_2, action_dim=action_dim_2, num_embedding=len(vocab), embed_dim=64, deep_neurons=[128, 64]).to(device)
optimizer_2 = optim.Adam(model_2.parameters(), lr=0.001)
loss_func = nn.MSELoss()
num_epoch = 10
train(device, model_2, optimizer_2, loss_func, dataloader_train_2, dataloader_valid_2, num_epoch)

Epoch:   1	 Training Loss:  0.310	 Valid Loss:  0.231	 Training Accuracy: 0.643	 Valid Accuracy: 0.750	 time used: 1.338 sec
Epoch:   2	 Training Loss:  0.223	 Valid Loss:  0.224	 Training Accuracy: 0.739	 Valid Accuracy: 0.731	 time used: 1.312 sec
Epoch:   3	 Training Loss:  0.222	 Valid Loss:  0.223	 Training Accuracy: 0.739	 Valid Accuracy: 0.754	 time used: 1.311 sec
Epoch:   4	 Training Loss:  0.221	 Valid Loss:  0.224	 Training Accuracy: 0.737	 Valid Accuracy: 0.760	 time used: 1.314 sec
Epoch:   5	 Training Loss:  0.220	 Valid Loss:  0.227	 Training Accuracy: 0.740	 Valid Accuracy: 0.705	 time used: 1.322 sec
Epoch:   6	 Training Loss:  0.218	 Valid Loss:  0.226	 Training Accuracy: 0.740	 Valid Accuracy: 0.696	 time used: 1.296 sec
Epoch:   7	 Training Loss:  0.216	 Valid Loss:  0.228	 Training Accuracy: 0.735	 Valid Accuracy: 0.710	 time used: 1.306 sec
Epoch:   8	 Training Loss:  0.213	 Valid Loss:  0.232	 Training Accuracy: 0.734	 Valid Accuracy: 0.692	 time used: 1.315 sec
