In [1]:
import numpy as np
import pandas as pd
import json
import tensorflow as tf
import nltk
import pickle 

import matplotlib
import torch

from tqdm import tqdm

GLOVE_PATH = 'InferSent/glove.840B.300d.txt'
INFERSENT_MODEL_PATH = 'InferSent/infersent.allnli.pickle'
MODEL_PATH = 'model/'

  from ._conv import register_converters as _register_converters


In [9]:
with open('data/dataset_cleanup.json') as file :
    dataset  = json.load(file)

### Create Y

In [10]:
%%time 

Y = [] #is conversation with bot
X_df = pd.DataFrame(columns=['dialog_indx','is_user1','is_user2','text','embeding'])
X_tmp = []

for indx,sample in enumerate(dataset) :
    
    user1_id = sample['thread'][0]['userId'] #будим считать первым того user который начал диалог
    
    users_id = np.array([user['id'] for user in sample['users']])
    user2_id = users_id[users_id != user1_id][0]
    
    usersid_2_isbot_list = {user['id'] : 'Bot' in user['userType'] for user in sample['users']}    
    
    y = [int(usersid_2_isbot_list[user1_id]),int(usersid_2_isbot_list[user2_id])]
        
    Y.append(y)
    
    for phraze in sample['thread'] :
        is_user1 = int(phraze['userId'] == user1_id)
        is_user2 = int(phraze['userId'] == user2_id) #если сказал 2й юзер то Ture. если 1й то False
        X_tmp.append([indx,is_user1,is_user2,phraze['text']])

X_tmp = np.array(X_tmp)      
X_df['dialog_indx'] = X_tmp[:,0]
X_df['is_user1'] = X_tmp[:,1]
X_df['is_user2'] = X_tmp[:,2]
X_df['text'] = X_tmp[:,3]
Y = np.array(Y)

Wall time: 1.36 s


In [11]:
X_df['dialog_indx'] = X_df['dialog_indx'].apply(pd.to_numeric)
X_df['is_user1'] = X_df['is_user1'].apply(pd.to_numeric)
X_df['is_user2'] = X_df['is_user2'].apply(pd.to_numeric)

In [12]:
X_df.head()

Unnamed: 0,dialog_indx,is_user1,is_user2,text,embeding
0,0,1,0,f,
1,0,0,1,What is the text about?,
2,0,1,0,High-definition image sources,
3,0,1,0,Blu-ray video disc?,
4,0,0,1,Never heard about Roku boxes,


In [13]:
print('Y : %s ' % str(Y.shape))
print('len_X : %s ' % len(X_text))
print('X_text[0] : %s' % X_text[0])

Y : (2759, 2) 


NameError: name 'X_text' is not defined

### Create sent embedings with InferSent

In [14]:
%%time
model_inferSent = torch.load(INFERSENT_MODEL_PATH, map_location=lambda storage, loc: storage)
torch.set_num_threads(5)

model_inferSent.set_glove_path(GLOVE_PATH)
model_inferSent.build_vocab_k_words(K=100000)



Vocab size : 100000
Wall time: 24 s


In [16]:
%%time
embedings = model_inferSent.encode(X_df['text'].values, bsize=128, tokenize=False, verbose=True)

Nb words kept : 236938/281054 (84.3 %)
Speed : 50.82 sentences/s (cpu mode, bsize=128)
Wall time: 11min 37s


In [21]:
EMPTY_INFERSENT_VECT = np.zeros(shape=4096) #model_inferSent.encode([''])[0]

In [17]:
for indx,embd in enumerate(embedings) :
    X_df.at[indx,'embeding'] = embd

In [18]:
X_df.head()

Unnamed: 0,dialog_indx,is_user1,is_user2,text,embeding
0,0,1,0,f,"[0.011967509, -0.06771015, -0.014055675, -0.08..."
1,0,0,1,What is the text about?,"[0.0711433, 0.054118324, 0.04687867, -0.073211..."
2,0,1,0,High-definition image sources,"[0.07495881, 0.058115814, 0.08857289, -0.06326..."
3,0,1,0,Blu-ray video disc?,"[0.091638826, 0.03246896, -0.026797447, 0.0587..."
4,0,0,1,Never heard about Roku boxes,"[0.03827866, 0.04058365, 0.004118561, 0.012654..."


In [19]:
COUNT_OF_DIALOG = X_df['dialog_indx'].max() + 1

In [20]:
MAX_DIALOG_LENGTH = -100

for indx in range(COUNT_OF_DIALOG) :
    curr_dialg_len = len(X_df.loc[X_df['dialog_indx'] == indx]['dialog_indx'].values)
    if curr_dialg_len > MAX_DIALOG_LENGTH :
        MAX_DIALOG_LENGTH = curr_dialg_len

print('Max dialog len : %s ' % MAX_DIALOG_LENGTH)

Max dialog len : 34 


In [22]:
%%time

X = []
EMPTY_INFERSENT_VECT_with_user1or2 = np.hstack(([0,0],EMPTY_INFERSENT_VECT))

for indx in range(COUNT_OF_DIALOG) :
    
    curr_dialog_embeding = list(X_df.loc[X_df['dialog_indx'] == indx]['embeding'].values)
    curr_dialog_user1or2 = X_df.loc[X_df['dialog_indx'] == indx][['is_user1','is_user2']].values

    curr_dialog = []
    for embeding,user1or2 in zip(curr_dialog_embeding,curr_dialog_user1or2) :
        curr_dialog.append(np.hstack((user1or2,embeding)))
    
    while len(curr_dialog) < MAX_DIALOG_LENGTH :
        curr_dialog.append(EMPTY_INFERSENT_VECT_with_user1or2)
    curr_dialog = np.array(curr_dialog)
    X.append(curr_dialog)
    
X = np.array(X)

Wall time: 21.1 s


In [23]:
print('X shape %s ' % str(X.shape))

X shape (2759, 34, 4098) 


In [27]:
#np.save(open('tmp/X_InferSent_with_user_marker_empty0','wb'),X)
#np.save(open('tmp/Y_InferSent_v2','wb'),Y)

In [2]:
X = np.load(open('tmp/X_InferSent_with_user_marker_empty0','rb'))
Y = np.load(open('tmp/Y_InferSent_v2','rb'))

## Explore class balans

In [3]:
print('First bots part : %s' % np.average(Y[:,0]))
print('Second bots part : %s' % np.average(Y[:,1]))

First bots part : 0.5059804276911924
Second bots part : 0.35773831098223996


## Convert dataset to indx

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size = 0.2)

In [5]:
print('X_train shape %s' % str(X_train.shape))
print('X_test shape %s' % str(X_test.shape))

print('y_train shape %s' % str(y_train.shape))
print('y_test shape %s' % str(y_test.shape))

X_train shape (2207, 34, 4098)
X_test shape (552, 34, 4098)
y_train shape (2207, 2)
y_test shape (552, 2)


### Train model

In [4]:
def get_lstm_cell(lstm_hiden_units, keep_prob):
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_hiden_units, activation=tf.tanh)
    drop = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
    return drop

In [20]:
tf.reset_default_graph()
lstm_hiden_units = 32
#lstm_layers = 1
#keep_prob = 1
embeding_size = 4096 + 2

input_vect = tf.placeholder(tf.float32,shape=[None,None,embeding_size],name='input_vect')
input_y = tf.placeholder(tf.float32, shape=[None,2], name='input_y')
sequence_length = tf.placeholder(tf.float32, shape=[None], name='seq_length')


#rnn = tf.nn.rnn_cell.MultiRNNCell([get_lstm_cell(lstm_hiden_units,keep_prob) for _ in range(lstm_layers) ])

lstm = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_hiden_units, activation=tf.tanh)
lstm_outputs, states = tf.nn.dynamic_rnn(lstm, input_vect,sequence_length = sequence_length, dtype=tf.float32)

#flatten = tf.reshape(lstm_outputs,shape=[-1,MAX_DIALOG_LENGTH*lstm_hiden_units],name='flatten')

dense_layer = tf.layers.dense(inputs=states.c,
                              units=64,
                              activation=tf.nn.relu,
                              kernel_initializer = tf.glorot_normal_initializer(), #tf.initializers.identity(gain=0.5)
                              kernel_regularizer = tf.contrib.layers.l2_regularizer(scale=1.0),
                              name='dense_layer')

logits = tf.layers.dense(dense_layer,
                         units=2,
                         kernel_initializer = tf.glorot_normal_initializer(),
                         activation = tf.nn.sigmoid,
                         name='logits')
print(logits)

Tensor("logits/Sigmoid:0", shape=(?, 2), dtype=float32)


In [21]:
def get_seq_len(X) :
    return np.sum(X.sum(axis=2) != 0,axis=1)

def get_next_batch(X,Y,batch_size,batch_indx) :
    '''
    Iterate over batches.
        return X_batch,y_bath,seq_length_inbatch
    
    '''
    left_bound = batch_indx*batch_size
    right_bound = (batch_indx + 1)*batch_size
    if right_bound > X.shape[0] :
        right_bound = X.shape[0]
    
    X_batch = X[left_bound:right_bound,:]
    Y_batch = Y[left_bound:right_bound,:]
    seq_length = get_seq_len(X_batch)
    return (X_batch,Y_batch,seq_length)

In [22]:
sess.close()

In [23]:
sess = tf.InteractiveSession()

In [24]:
loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=input_y, logits = logits)

lr = tf.placeholder(dtype=tf.float32,shape=None)
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

init = tf.global_variables_initializer()
sess.run(init)

In [29]:
%%time

test_each_iter = 10
train_epoch = 501
batch_size = 256
learning_rate = 0.0001
batch_per_epoch_train = X_train.shape[0] // batch_size
batch_per_epoch_test = X_test.shape[0] // batch_size

if batch_per_epoch_train < 1 : batch_per_epoch_train = 1
if batch_per_epoch_test < 1 : batch_per_epoch_test = 1


for current_epoch in range(train_epoch) :
    
    train_loss = []
    train_auc = []
    for batch_indx in range(batch_per_epoch_train) :    
        X_train_batch,y_train_batch,seq_length = get_next_batch(X_train,y_train,batch_size,batch_indx)
        
        _,tr_loss = sess.run([optimizer,loss],feed_dict={input_vect : X_train_batch,
                                                         input_y : y_train_batch,
                                                         sequence_length : seq_length,
                                                         lr : learning_rate})
        train_loss.append(tr_loss)
        #check train auc
        if current_epoch % test_each_iter == 0 :
            pred_train = sess.run(logits,feed_dict={input_vect:X_train_batch, 
                                                    input_y:y_train_batch,
                                                    sequence_length : seq_length})            
            auc = roc_auc_score(y_train_batch,pred_train)
            train_auc.append(auc)        
            
    train_loss = np.average(train_loss)
    train_auc = np.average(train_auc)
    print('current_epoch : %s, train_loss : %s' % (current_epoch,train_loss))
    
    
    #testing
    if current_epoch % test_each_iter == 0 :
        test_loss = []
        test_acc = []
        test_auc = []
        
        for batch_indx in range(batch_per_epoch_test) :
            X_test_batch,y_test_batch,seq_length = get_next_batch(X_test,y_test,batch_size,batch_indx)
            
            #loss
            ts_loss = sess.run(loss,feed_dict={input_vect:X_test_batch, 
                                               input_y : y_test_batch,
                                              sequence_length : seq_length})
            test_loss.append(ts_loss)
            
            #acc
            pred_test = sess.run(logits,feed_dict={input_vect:X_test_batch, 
                                                   input_y:y_test_batch,
                                                   sequence_length : seq_length})
            acc_test = np.average((pred_test > 0.5) == y_test_batch)
            test_acc.append(acc_test)

            #aucroc
            auc = roc_auc_score(y_test_batch,pred_test)
            test_auc.append(auc)
        
        test_loss = np.average(test_loss)
        test_acc = np.average(test_acc)
        test_auc = np.average(test_auc)
        print('current_epoch : %s, train_loss : %.6f, test_loss : %.6f, train_auc : %.5f, test_auc : %.5f, test_acc : %.5f ' % \
              (current_epoch,train_loss,test_loss,train_auc,test_auc,test_acc))

current_epoch : 0, train_loss : 0.5387866
current_epoch : 0, train_loss : 0.538787, test_loss : 0.577713, train_auc : 0.99364, test_auc : 0.95435, test_acc : 0.89062 


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


current_epoch : 1, train_loss : 0.5344663
current_epoch : 2, train_loss : 0.53407097
current_epoch : 3, train_loss : 0.5334981
current_epoch : 4, train_loss : 0.5334687
current_epoch : 5, train_loss : 0.5332824
current_epoch : 6, train_loss : 0.53323996
current_epoch : 7, train_loss : 0.53322375
current_epoch : 8, train_loss : 0.53319985
current_epoch : 9, train_loss : 0.5331813
current_epoch : 10, train_loss : 0.5331653
current_epoch : 10, train_loss : 0.533165, test_loss : 0.578755, train_auc : 0.99416, test_auc : 0.95607, test_acc : 0.89453 
current_epoch : 11, train_loss : 0.53314954
current_epoch : 12, train_loss : 0.5331347
current_epoch : 13, train_loss : 0.53312135
current_epoch : 14, train_loss : 0.53310955
current_epoch : 15, train_loss : 0.53309906
current_epoch : 16, train_loss : 0.53308964
current_epoch : 17, train_loss : 0.5330811
current_epoch : 18, train_loss : 0.53307325
current_epoch : 19, train_loss : 0.5330661
current_epoch : 20, train_loss : 0.5330594
current_epoch

current_epoch : 290, train_loss : 0.532903, test_loss : 0.578894, train_auc : 0.99410, test_auc : 0.95334, test_acc : 0.89062 
current_epoch : 291, train_loss : 0.5329032
current_epoch : 292, train_loss : 0.5329031
current_epoch : 293, train_loss : 0.53290296
current_epoch : 294, train_loss : 0.53290284
current_epoch : 295, train_loss : 0.5329027
current_epoch : 296, train_loss : 0.5329026
current_epoch : 297, train_loss : 0.5329025
current_epoch : 298, train_loss : 0.53290236
current_epoch : 299, train_loss : 0.53290224
current_epoch : 300, train_loss : 0.5329021
current_epoch : 300, train_loss : 0.532902, test_loss : 0.578856, train_auc : 0.99410, test_auc : 0.95336, test_acc : 0.89160 
current_epoch : 301, train_loss : 0.532902
current_epoch : 302, train_loss : 0.5329019
current_epoch : 303, train_loss : 0.53290176
current_epoch : 304, train_loss : 0.53290164
current_epoch : 305, train_loss : 0.5329015
current_epoch : 306, train_loss : 0.5329014
current_epoch : 307, train_loss : 0.5

In [31]:
#eval honest test accuracy : 
seq_lengh = get_seq_len(X_test)

pred_test = sess.run(logits,feed_dict={input_vect:X_test, input_y:y_test , sequence_length:seq_lengh}) 
honest_test_acc = np.average((pred_test > 0.5) == y_test)
print('Honest test acc : %s ' % honest_test_acc)


#eval honest test auc (over 2d class): 
honest_test_auc = roc_auc_score(y_test,pred_test)
print('Honest test auc : : %s ' % honest_test_auc)

Honest test acc : 0.8931159420289855 
Honest test auc : : 0.9539141082111628 


In [43]:
#stuped prediction
Y_stuped = np.zeros(Y.shape)
Y_stuped[:,0] = 1

Y_stuped_test = np.zeros(y_test.shape)
Y_stuped_test[:,0] = 1

print('Stuped prediction accuracy over all dataset : %s ' %np.average(Y == Y_stuped))
print('Stuped prediction accuracy over test : %s ' % np.average(y_test == Y_stuped_test))

Stuped prediction accuracy over all dataset : 0.574121058354 
Stuped prediction accuracy over test : 0.584239130435 


# Save Model

In [28]:
!dir

 Volume in drive D is D
 Volume Serial Number is E211-0876

 Directory of D:\Programing\00_ipython_projects\CS224 Deep Learning with NLP. IPavlov\project

10-Apr-18  20:17    <DIR>          .
10-Apr-18  20:17    <DIR>          ..
03-Apr-18  16:45    <DIR>          .ipynb_checkpoints
10-Apr-18  20:17           119,743 baseline with InferSent _ smart user split.ipynb
03-Apr-18  16:44            63,644 baseline with InferSent.ipynb
28-Mar-18  21:05            14,888 baseline.ipynb
10-Apr-18  20:11    <DIR>          data
27-Mar-18  20:37            12,997 explore data.ipynb
30-Mar-18  14:20    <DIR>          InferSent
30-Mar-18  13:52            33,674 models.py
09-Apr-18  19:11    <DIR>          tmp
30-Mar-18  14:18    <DIR>          __pycache__
               5 File(s)        244,946 bytes
               7 Dir(s)  505,381,924,864 bytes free


In [19]:
# Save the variables to disk.
MODEL_NAME = 'lstm32d64_c_seql_paddzer_lr3_200_lr4_500'

saver = tf.train.Saver()
save_path = saver.save(sess, MODEL_PATH + MODEL_NAME + '.ckpt')
print("Model saved in path: %s" % save_path)

Model saved in path: model/lstm32d64_c_seql_paddzer_lr3_200_lr4_500.ckpt


## Understand lstm output

In [261]:
#test out
lstm_outputs_arr,flatten_arr,states_list = sess.run([lstm_outputs,flatten,states],
                                                    feed_dict = {input_vect : X_train_batch, input_y : y_train_batch})

In [281]:
print('lstm_outputs_arr shape : %s' % str(lstm_outputs_arr.shape))
print('flatten_arr shape %s' % str(flatten_arr.shape))

print('Sum over all lstm out : %s' % lstm_outputs_arr.sum())
print('Sum over all lstm c : %s ' % str(states_list.c.shape))
print('Sum over all lstm h : %s ' % str(states_list.h.shape))

lstm_outputs_arr shape : (256, 34, 512)
flatten_arr shape (256, 17408)
Sum over all lstm out : -4359.21
Sum over all lstm c : (256, 512) 
Sum over all lstm h : (256, 512) 


In [287]:
print(states_list.h[0].min())

-0.996468
