In [1]:
import sys, os
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
cur_dir=os.getcwd()
data_dir=os.path.join(cur_dir, 'data')
save_dir=os.path.join(data_dir, 'save')
model_dir=os.path.join(os.path.dirname(cur_dir), 'models')
if not os.path.exists(save_dir): os.mkdir(save_dir)

In [5]:
df=pd.read_csv(os.path.join(data_dir, 'train.csv'))

In [8]:
df['question1']=df['question1'].apply(str)
df['question2']=df['question2'].apply(str)

In [11]:
nlp=spacy.load('en')
vecs1=[doc.vector for doc in nlp.pipe(df['question1'], n_threads=50)]
vecs1=np.array(vecs1)
df['q1_feats']=list(vecs1)

In [12]:
vecs2=[doc.vector for doc in nlp.pipe(df['question2'], n_threads=50)]
vecs2=np.array(vecs2)
df['q2_feats']=list(vecs2)

In [14]:
pd.to_pickle(df, os.path.join(save_dir, '1_df.pkl'))

In [26]:
questions=list(df['question1'])+list(df['question2'])
tfidf=TfidfVectorizer(lowercase=False)
tfidf.fit_transform(questions)
word2tfidf=dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [34]:
def weighted_vec(df, col, word2tfidf):
    vecs=[]
    for qu in tqdm(list(df[col])):
        doc=nlp(qu)
        mean_vec=np.zeros([len(doc), 300])
        for word in doc:
            vec=word.vector
            try:
                idf=word2tfidf[str(word)]
            except:
                idf=0
            mean_vec+=vec*idf
        mean_vec=mean_vec.mean(axis=0)
        vecs.append(mean_vec)
    return list(vecs)

In [35]:
df['q1_feats']=weighted_vec(df, 'question1', word2tfidf)
df['q2_feats']=weighted_vec(df, 'question2', word2tfidf)

100%|██████████| 404290/404290 [07:23<00:00, 911.19it/s] 
100%|██████████| 404290/404290 [07:19<00:00, 919.79it/s] 


In [36]:
pd.to_pickle(df, os.path.join(save_dir, '2_df.pkl'))

In [3]:
df=pd.read_pickle(os.path.join(save_dir, '2_df.pkl'))

In [4]:
df=df.reindex(np.random.permutation(df.index))
train_split=0.88
num_train=int(df.shape[0]*train_split)
num_val=df.shape[0]-num_train
print("Train: {}\nValidation: {}".format(num_train, num_val))

Train: 355775
Validation: 48515


In [5]:
X_train=np.zeros([num_train, 2, 300])
X_val=np.zeros([num_val, 2, 300])
y_train=np.zeros([num_train])
y_val=np.zeros([num_val])

In [6]:
b=[a[None,:] for a in list(df['q1_feats'].values)]
q1_feats=np.concatenate(b, axis=0)
b=[a[None,:] for a in list(df['q2_feats'].values)]
q2_feats=np.concatenate(b, axis=0)

In [7]:
X_train[:,0,:]=q1_feats[:num_train]
X_train[:,1,:]=q2_feats[:num_train]
y_train=df[:num_train]['is_duplicate'].values

X_val[:,0,:]=q1_feats[num_train:]
X_val[:,1,:]=q2_feats[num_train:]
y_val=df[num_train:]['is_duplicate'].values

In [11]:
del df

NameError: name 'df' is not defined

In [12]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Lambda, merge, BatchNormalization, Activation, Input, Merge, Add, Concatenate, add, concatenate
from keras import backend as K

Using TensorFlow backend.


In [13]:
def euclidean_distance(vecs):
    x, y=vecs
    return K.sqrt(K.sum(K.square(x-y), axis=1, keepdims=True))

def eucl_dist_output_shape(shapes):
    shape1, shape2=shapes
    return (shape1[0], 1)

def cosine_distance(vecs):
    x, y=vecs
    x=K.l2_normalize(x, axis=-1)
    y=K.l2_normalize(y, axis=-1)
    return -K.mean(x*y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes

    return (shape1[0],1)

def contrastive_loss(y_true, y_pred):
    margin=1
    return K.mean(y_true*K.square(y_pred)+(1-y_true)*K.square(K.maximum(margin-y_pred, 0)))

def compute_accuracy(preds, labels):
    return labels[preds.ravel()<0.5].mean()

In [14]:
def create_base_network(input_dim):
    input=Input(shape=(input_dim, ))
    
    dense1=Dense(128)(input)
    bn1=BatchNormalization()(dense1)
    relu1=Activation('relu')(bn1)
    
    dense2=Dense(128)(relu1)
    bn2=BatchNormalization()(dense2)
    res2=add([relu1, bn2])
    relu2=Activation('relu')(res2)
    
    dense3=Dense(128)(relu2)
    bn3=BatchNormalization()(dense3)
    res3=add([relu2, bn3])
    relu3=Activation('relu')(res3)
    
    feats=concatenate([relu3, relu2, relu1])
    bn4=BatchNormalization()(feats)
    
    model=Model(inputs=input, outputs=bn4)
    return model

In [15]:
def create_model(input_dim):
    base_network=create_base_network(input_dim)
    
    input_a=Input(shape=(input_dim, ))
    input_b=Input(shape=(input_dim, ))
    
    processed_a=base_network(input_a)
    processed_b=base_network(input_b)    
    
    distance=Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    
    model=Model(inputs=[input_a, input_b], outputs=distance)
    return model

In [16]:
from keras.optimizers import RMSprop, SGD, Adam

In [17]:
net=create_model(300)

In [18]:
optimizer=Adam(lr=0.001)
metric=[compute_accuracy]
net.compile(loss=contrastive_loss, optimizer=optimizer)

In [None]:
net.fit([X_train[:,0,:], X_train[:,0,:]], y_train, batch_size=32, epochs=1)

Epoch 1/1