<a href="https://colab.research.google.com/github/devloper13/SiameseNetworkProject/blob/master/Siamese.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Siamese Network Code 

The following modules in python has been used 
<ul>
  <li>Tensorflow for constructing Siamese Networks</li>
  <li>NLTK library for stemming and stop words removal</li>
  <li>Google Drive library for data strage</li>
  <li>Scikit, Numpy and Pandas for misc. tensor computations</li>
<ul>
  

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from builtins import input

#import system things
from tensorflow.examples.tutorials.mnist import input_data # for data
import tensorflow as tf
import numpy as np
import os
import keras


# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
import tensorflow as tf
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
import pickle
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.util import ngrams
import nltk
nltk.download('stopwords')

nltk.download('punkt')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm
import string

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Siamese Network Implementaiton

Implementation is mostly done using barebones tensorflow with tensor variables, placeholders and constants . No predefined library was used in this implementation. The structure below is not a generic siamese network but can be made one by generalizing a few parameters.

In [0]:
idim = 30628

class Siamese:

    # Create model
    def __init__(self,dname="Siamese"):
        self.x1 = tf.placeholder(tf.float32, [None, idim])
        self.x2 = tf.placeholder(tf.float32, [None, idim])

        with tf.variable_scope(dname) as scope:
            self.o1 = self.network(self.x1)
            scope.reuse_variables()
            self.o2 = self.network(self.x2)

        # Create loss
        self.y_ = tf.placeholder(tf.float32, [None])
        #self.y_ = tf.placeholder(tf.int32, [None])
        self.loss = self.cosineLoss()

    #create the network 
    def network(self, x):
        
        x = tf.reshape(x,shape=[-1,1,idim,1])
        activated_conv1 = self.conv_layer('conv_1',x,3)
        maxpool1 = self.maxpool_layer('maxp_1',activated_conv1)
        
        flattened_conv = tf.layers.flatten(maxpool1)   #To be removed
        activated_fc1 = self.fc_layer( "fc1",flattened_conv, 128)
        #activated_fc2 = self.fc_layer("fc2",activated_fc1, 1024)
        #activated_fc3 = self.fc_layer("fc3",activated_fc2, 2)
        
        return activated_fc1
        
    #create the convolution layer 
    def conv_layer(self,name,inputs,cur_channel):
        #print(inputs.get_shape())
        prev_channel = inputs.get_shape()[-1]
        #print(prev_channel)
        init = tf.variance_scaling_initializer(scale=2.0)
        w = tf.get_variable(name+"_w",dtype=tf.float32,shape=[1,10,prev_channel,cur_channel],initializer=init)
        b = tf.get_variable(name+"_b",dtype=tf.float32,shape=[cur_channel],initializer = init)
        conv = tf.nn.conv2d(inputs,w,strides=[1,1,1,1],padding = "SAME")
        activation = conv+b
        return activation  
      
    def maxpool_layer(self,name,inputs):
        return tf.nn.relu(tf.nn.max_pool(inputs,ksize=[1,1,100,1],strides=[1,1,100,1],padding="SAME"))  
    
    def fc_layer(self,name,inputs,cur_layer):
        print(inputs.get_shape())
        prev_layer = inputs.get_shape()[-1]
        init = tf.truncated_normal_initializer(stddev=0.01)
        w = tf.get_variable(name+"_w",dtype=tf.float32,shape=[prev_layer,cur_layer],initializer=init)
        b = tf.get_variable(name+"_b",dtype=tf.float32,shape=[cur_layer],initializer=init)
        activation = tf.matmul(inputs,w)+b
        return activation
    
        
    def cosineLoss(self):
        
        norms1 = tf.norm(self.o1,axis=1)
        norms2 = tf.norm(self.o2,axis=1)
        norm = tf.multiply(norms1,norms2)
        cosines = tf.div(tf.reduce_sum(tf.multiply(self.o1,self.o2),axis=1),norm)
        
        labels_t = self.y_
        labels_f = tf.subtract(1.0, self.y_, name="1-yi")          # labels_ = !labels;
        
        
        C = tf.constant(0.5, name="C")
        
        pos = tf.multiply(labels_t,tf.subtract(1.0,cosines), name="yi_x_cosine")
        
        neg = tf.multiply(labels_f, tf.maximum(tf.subtract(cosines,C),0), name="Nyi_x_C-cosine")
        losses = tf.add(pos, neg, name="losses")
        loss = tf.reduce_mean(losses, name="loss")
        return loss
        

          

### Get Pickled Dictionary stored in Drive


In [0]:
link="https://drive.google.com/open?id=1Jhj9OazxPnvLcuuZsZvNFfpnnsFg88I7"
fluff, id = link.split('=')
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('three_hash')  
with open('three_hash','rb') as f:
   three_hash_dict = pickle.load(f)
print(len(three_hash_dict))

30628


### Three hash generation function

This function takes the text and tokenizes it, performs steming and stop words removal using NLTK. 

In [0]:
stop_words = set(stopwords.words('english')) 
def getThreeHash(text):
  vectorizer = CountVectorizer()
  tokenizer=vectorizer.build_tokenizer()
  
  hashes=""
  tokens=tokenizer(text)
  ps = PorterStemmer()
  tokens = [ps.stem(word) for word in tokens]
  for token in tokens:
    if token not in stop_words:
      tokenModi="#"+token+"#"
      output = list(ngrams(tokenModi, 3))
      for a in output:
        hashes+=(''.join(a))+" "
  
  return(hashes)


### Get Data Set

In [0]:
link="https://drive.google.com/open?id=17VMN5CJA05vTPEs15gw-W2ocxmEITQEH" #create shareable link of google drive file
fluff, id = link.split('=')
print (id) # Verify that you have everything after '='

downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('QA.csv')  
ndata = pd.read_csv('QA.csv',error_bad_lines=False).values


17VMN5CJA05vTPEs15gw-W2ocxmEITQEH


### Take only a subset as training

In [0]:
data = ndata[:5000,1:]

### Cosine Distance SImilarity 

This is done so that systactically similar questions are taken into consideration. The distance metric used is a cosine distance on simple word hashed representation of texts.

In [0]:
def preprocess(text):
    tokens = word_tokenize(text)
    tokenlower = [word.lower() for word in tokens]
    
    stopWords = set(stopwords.words('english'))
    tokenlower = [word for word in stopWords if word not in stopWords]
    
    tokenDict = nltk.defaultdict(int)
    for word in tokens:
        tokenDict[word] += 1
    return tokenDict

def cosDistance(v1,v2):
    dotProduct = np.dot(v1,v2)
    normV1 = np.linalg.norm(v1)
    normV2 = np.linalg.norm(v2)
#     print(dotProduct,normV1,normV2)
    return dotProduct / (normV1 * normV2)

def getSimilarity(vDict1,vDict2):
    allWords = []
    for key in vDict1:
        allWords.append(key)
    for key in vDict2:
        allWords.append(key)
    allWordsSize = len(allWords)
    
    v1 = np.zeros(allWordsSize,dtype=np.int)
    v2 = np.zeros(allWordsSize,dtype=np.int)
    
    i = 0
    for key in allWords:
#         print(key)
        v1[i] = vDict1.get(key, 0)
        v2[i] = vDict2.get(key, 0)
        i += 1
#     print(v1,v2)    
    return cosDistance(v1,v2)

### Batch Creation and vectorization of text

In [0]:
def vectorize(hashString,dictionary):
  
  vectorizer = CountVectorizer()
  tokenizer = vectorizer.build_tokenizer()
  vec = [0]*idim
  
  for token in tokenizer(hashString):
    try:
      vec[dictionary[token]] += 1
    except:
      pass
    
  return vec

def createBatch(data,goodSet,badSetQues,badSetAns,dictionary):
  
  good_data = data[goodSet]
  bad_quest = data[badSetQues,0]
  bad_anser = data[badSetAns,1]
  questions = np.concatenate((good_data[:,0],bad_quest))
  answers = np.concatenate((good_data[:,1],bad_anser))
  label = np.array([1]*good_data.shape[0] + [0]*bad_quest.shape[0])
  
  ques,ans = [],[]
  for d in questions:
    ques += [vectorize(getThreeHash(d.lower()),dictionary)]
  for i,d in enumerate(answers):
    ans += [vectorize(getThreeHash(str(d).lower()),dictionary)]
  return np.array(ques),np.array(ans),label
    
def getRandomBatch(data,batchsize,dictionary,good_bad=0.5):
  
  goodSet = np.random.permutation(data.shape[0])[:int(batchsize*good_bad)]
  badSetQues = np.random.permutation(data.shape[0])[:batchsize - int(batchsize*good_bad)]
  badSetAns = np.random.permutation(data.shape[0])[:batchsize - int(batchsize*good_bad)]
  return createBatch(data,goodSet,badSetQues,badSetAns,dictionary)
  

    

### Initializing Siamese networks and optimizer

In [0]:
tf.reset_default_graph()
siamese = Siamese('siamese6')
try:
  sess.close()
except:
  pass
sess = tf.InteractiveSession()
train_Step = tf.train.MomentumOptimizer(0.01,0.05).minimize(siamese.loss)
tf.initialize_all_variables().run()

(?, 921)
(?, 921)


### Training with siamese Networs

In [0]:
def trainSiameseNetwork(data,siamese,sess,batchSize,epochs,dictionary):
  
  #Siamese Network
  #sess = tf.Session()
  #tf.reset_default_graph()
  #siamese = Siamese()
  
  while epochs > 0:
    
    avg = 0.0
    permSet = np.random.permutation(data.shape[0])

    for p in range(0,permSet.shape[0],batchSize):

      goodSet = np.array(list(range(p,p+batchSize)))
      badSetQ = goodSet.copy()
      badSetA = np.array(permSet[p:p+batchSize])
      ques,ans,labl = createBatch(data,goodSet,badSetQ,badSetA,dictionary)


      #print(ques.shape,ans.shape,labl.shape)
      _, Loss = sess.run([train_Step,siamese.loss], feed_dict={
                    siamese.x1: ques,
                    siamese.x2: ans,
                    siamese.y_: labl})
      avg += Loss
      #print("Epoch",epochs,"Batch",p/batchSize,"Loss",Loss)
    
    print("Average Epoch ",epochs,"Loss ",avg/(data.shape[0]/batchSize))
    epochs -= 1
        

In [0]:
def predict(qset,ques,siamese,sess,dictionary,alpha=0.5,best=3):
  
  vques  = preprocess(ques)
  thQues = np.array(vectorize(getThreeHash(ques.lower()),dictionary)).reshape(1,-1)

  scores = []

  for q in qset:

    oQues = np.array(vectorize(getThreeHash(str(q).lower()),dictionary)).reshape(1,-1)
    vQues = preprocess(q)
    Loss = sess.run([siamese.loss],feed_dict={
                    siamese.x1: thQues,
                    siamese.x2: oQues,
                    siamese.y_: np.array([1.0])})
    tLoss = getSimilarity(vques,vQues)
    #print(alpha*Loss[0],(1-alpha)*(1-tLoss))
    scores += [alpha*Loss[0] + (1-alpha)*(1-tLoss)]

  sscores = np.argsort(np.array(scores))[:best]
  return qset[sscores],scores
  

## Training With Data

In [0]:

trainSiameseNetwork(data,siamese,sess,100,30,three_hash_dict)

Average Epoch  30 Loss  0.2242837056517601
Average Epoch  29 Loss  0.21048615053296088
Average Epoch  28 Loss  0.2039389818906784
Average Epoch  27 Loss  0.20255933552980424
Average Epoch  26 Loss  0.20143184140324594
Average Epoch  25 Loss  0.20044633969664574
Average Epoch  24 Loss  0.20001706555485727
Average Epoch  23 Loss  0.19893319755792618
Average Epoch  22 Loss  0.1970414862036705
Average Epoch  21 Loss  0.19635800570249556
Average Epoch  20 Loss  0.19561513587832452
Average Epoch  19 Loss  0.1954384095966816
Average Epoch  18 Loss  0.19422918558120728
Average Epoch  17 Loss  0.19333649873733522
Average Epoch  16 Loss  0.1926034264266491
Average Epoch  15 Loss  0.19089846909046174
Average Epoch  14 Loss  0.1923725850880146
Average Epoch  13 Loss  0.18939405381679536
Average Epoch  12 Loss  0.1894181936979294
Average Epoch  11 Loss  0.1890207976102829
Average Epoch  10 Loss  0.18854231387376785
Average Epoch  9 Loss  0.18777668699622155
Average Epoch  8 Loss  0.1863868422806263

In [0]:
trainQues = ndata[:15000,1]

## Some Results

In [0]:
ques = input("Enter a question - ")
score = predict(trainQues,ques,siamese,sess,three_hash_dict,alpha=0.8)
print("Best Three Answers \n%s\n%s\n%s" % (score[0][0],score[0][1],score[0][2]))

Enter a question - how to attract a girl
Best Three Answers 
How should I act on a date?
Is a transponder required to fly in class C airspace?
How do I end a date with a girl I'm not interested in?


In [0]:
ques = input("Enter a question - ")
score = predict(trainQues,ques,siamese,sess,three_hash_dict,alpha=0.8)
print("Best Three Answers \n%s\n%s\n%s" % (score[0][0],score[0][1],score[0][2]))

Enter a question - best place to eat
Best Three Answers 
best place to meet guys in the bay area?
Whats a good place to eat in LA?
Where's the best place to get my FICO score?


In [0]:
ques = input("Enter a question - ")
score = predict(trainQues,ques,siamese,sess,three_hash_dict,alpha=0.8)
print("Best Three Answers \n%s\n%s\n%s" % (score[0][0],score[0][1],score[0][2]))

Enter a question - best internet site
Best Three Answers 
what is the best news site on the net?
How many websites are on the internet?
What is the singel most important thing you are missing on the internet ?


In [0]:
ques = input("Enter a question - ")
score = predict(trainQues,ques,siamese,sess,three_hash_dict,alpha=0.8)
print("Best Three Answers \n%s\n%s\n%s" % (score[0][0],score[0][1],score[0][2]))

Enter a question - find peace
Best Three Answers 
Who won the first nobel peace prize?
Where can I find help about a war?
need to find reserch about work ethics?


In [0]:
ques = input("Enter a question - ")
score = predict(trainQues,ques,siamese,sess,three_hash_dict,alpha=0.8)
print("Best Three Answers \n%s\n%s\n%s" % (score[0][0],score[0][1],score[0][2]))

Enter a question - which fruit is good for health
Best Three Answers 
Why is red wine good for your heart?
Why are blueberries so good for your health?
How do you get a toddler to eat what's good for him?


In [0]:
ques = input("Enter a question - ")
score = predict(trainQues,ques,siamese,sess,three_hash_dict,alpha=0.8)
print("Best Three Answers \n%s\n%s\n%s" % (score[0][0],score[0][1],score[0][2]))

Enter a question - Who is most famous celebrity
Best Three Answers 
Who is the most famous woman athlete of all time?
What is emo?
What is "I"?


In [0]:
ques = input("Enter a question - ")
score = predict(trainQues,ques,siamese,sess,three_hash_dict,alpha=0.8)
print("Best Three Answers \n%s\n%s\n%s" % (score[0][0],score[0][1],score[0][2]))

Enter a question - Who is most famous celebrity
Best Three Answers 
Who is the most famous hacker?
What is the most famous breed of horse?
Which celebrity would you?


In [0]:
ques = input("Enter a question - ")
score = predict(trainQues,ques,siamese,sess,three_hash_dict,alpha=0.8)
print("Best Three Answers \n%s\n%s\n%s" % (score[0][0],score[0][1],score[0][2]))

Enter a question - best way to publish a book
Best Three Answers 
how do you get a book published?
whats the best way to get blood stains out of a white t-shirt?
What is the best way to stop smoking?


In [0]:
ques = input("Enter a question - ")
score = predict(trainQues,ques,siamese,sess,three_hash_dict,alpha=0.8)
print("Best Three Answers \n%s\n%s\n%s" % (score[0][0],score[0][1],score[0][2]))

Enter a question - How to win a game?
Best Three Answers 
How long have you been a gamer? And Why?
game zone.?
What or who is a Gant?


In [0]:
ques = input("Enter a question - ")
score = predict(trainQues,ques,siamese,sess,three_hash_dict,alpha=0.9)
print("Best Three Answers \n%s\n%s\n%s" % (score[0][0],score[0][1],score[0][2]))

Enter a question - What are the advantages of Facebook?
Best Three Answers 
Facebook or MySpace?
what is rebate?and what the advantage?
what are the advantages of antibacterial products?
