# Using BERT to compare sentences

In [2]:
#150%
# Start the BERT server in a separate shell before running this notebook, e.g.:
#bert-serving-start -cpu -model_dir C:\Users\Colin\Documents\bert\uncased_L-12_H-768_A-12 -num_worker=2

from bert_serving.client import BertClient
import pandas as pd
import numpy as np
from termcolor import colored
from sklearn import preprocessing as pre
# Create the BERT client with default arguments; every `bc.encode(...)` call below goes through it.
# NOTE(review): presumably this connects to the bert-serving server started by the
# command in the comment above -- confirm host/port defaults.
bc = BertClient()

# Example 1.

In [3]:
# Three one-element lists, each holding a single sentence to embed.
# sentence_1 is compared against sentence_2 and sentence_3 further down.
sentence_1, sentence_2, sentence_3 = (
    ["I like apples"],
    ["I like Apple computers"],
    ["I like oranges"],
)

In [4]:
# Encode each sentence list with the BERT client: one `encode` call per
# sentence, issued in order (sentence_1, sentence_2, sentence_3).
sen_1_embed, sen_2_embed, sen_3_embed = (
    bc.encode(sentence) for sentence in (sentence_1, sentence_2, sentence_3)
)

#### Similarity between vector embeddings

In [5]:
# Here we compute scores that show the similarity between the vector embeddings.
# The more similar the vectors are, the closer the score is to 1.


def embedding_similarity(embed_a, embed_b):
    """Return the summed element-wise product of two L2-normalized embeddings.

    Both arguments are passed through ``pre.normalize(..., norm="l2")`` and the
    element-wise product of the results is summed over all entries.

    NOTE(review): for two single-row embeddings of equal length this is their
    cosine similarity -- confirm the shapes returned by ``bc.encode``.
    """
    return np.sum(pre.normalize(embed_a, norm="l2") *
                  pre.normalize(embed_b, norm="l2"))


# Similarity between the sentence 1 and sentence 2 embeddings
similarity_1_2 = embedding_similarity(sen_1_embed, sen_2_embed)

# Similarity between the sentence 1 and sentence 3 embeddings
similarity_1_3 = embedding_similarity(sen_1_embed, sen_3_embed)


In [6]:
# Similarity score between sentence_1 ("I like apples") and
# sentence_2 ("I like Apple computers"); closer to 1 means more similar.

print(similarity_1_2)

0.88875353


In [7]:
# Similarity score between sentence_1 ("I like apples") and
# sentence_3 ("I like oranges"); closer to 1 means more similar.

print(similarity_1_3)

0.93167186


# Example 2.

In [8]:
# Load the demo statements from CSV, then embed every entry of the
# "statement" column in a single `encode` call (one vector per statement).
search = pd.read_csv("search_demo.csv")
doc_vecs = bc.encode(search["statement"].tolist())

#### Searching for similar sentences

In [9]:
# Show the loaded statements; the search loop below returns entries from this "statement" column.
print(search)

                                            statement
0                               Data science is cool!
1                             CMDA is the best major.
2                        I like coding in R the most.
3              Python is a very good coding language.
4        Virginia Tech football is coached by Fuente.
5                   Virginia Tech lost to Notre Dame.
6                         My favorite color is green.
7            My favorite football team is The Ravens.
8            My favorite baseball team is The Orioles
9                           Math is the best subject.
10                        Data Science is the future!
11  Our capstone project is part of the history de...
12                    There are four letters in CMDA.
13  CMDA stands for Computational Modeling and Dat...
14                    Virginia Tech is in Blacksburg.
15                                  Orange and Maroon
16                         I like the color blue too.


In [10]:
# Example queries to type at the prompt:
#   what is my favorite color
#   what are my favorite sports team
#   what does cmda stand for
#   Data Science

# Interactive search: read a query, embed it, score it against every encoded
# statement, and print the two best-scoring statements. Type "exit" to stop.
while True:
    query = input(colored('Search for : ', 'green'))
    if query == "exit":
        break
    query_vec = bc.encode([query])[0]
    # Dot product of the query with each document vector, divided by that
    # document's norm. The query's own norm is not divided out; it is the same
    # for every document, so it does not affect the ranking.
    score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(doc_vecs, axis=1)
    # One id per encoded document, derived from the data instead of a
    # hard-coded count so the loop keeps working if the CSV changes size.
    ids = np.arange(len(doc_vecs))
    data = {'ids':ids, 'scores':score}
    scores = pd.DataFrame(data)
    scores = scores.sort_values(by = "scores", ascending = False)
    # Look up the statements belonging to the two highest scores.
    print(search["statement"].values[scores["ids"].head(2).values])

Search for : what does cmda stand for
['CMDA stands for Computational Modeling and Data Analytics.'
 'There are four letters in CMDA.']
Search for : what is my favorite color
['My favorite color is green.' 'I like the color blue too.']
Search for : what are my favorite sports teams
['My favorite baseball team is The Orioles'
 'My favorite football team is The Ravens.']
Search for : exit
