### Importing libraries

In [5]:
import pandas as pd
import numpy as np


### Importing Dataset

In [7]:
# Read the headlines CSV (path is relative to the notebook's working directory)
# and show its (rows, columns) shape as the cell output.
news_csv = "news-text.csv"
data = pd.read_csv(news_csv)
data.shape

(1186018, 2)

### Viewing Head of Data

In [9]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [10]:
# Work on the first 10,000 rows only (positional slice) to keep embedding time manageable.
data = data.iloc[:10000]

### Installing libraries

In [11]:
!pip install transformers



In [12]:
!pip install sentence_transformers



### Importing the Sentence Transformers library

In [13]:
import scipy
import string
from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by every `model.encode(...)` call in this notebook.
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

In [20]:
# Keep only real headline strings: drop the "Unknown" placeholder and any
# non-string entries (pandas represents an empty CSV cell as float NaN, which
# would pass the != "Unknown" test and then fail when clean_text iterates over it).
all_headlines = [
    h for h in data['headline_text']
    if isinstance(h, str) and h != "Unknown"
]
len(all_headlines)

10000

### Cleaning Text

#### Clean each headline (strip punctuation, lowercase, drop non-ASCII characters) to build the corpus

In [21]:
def clean_text(txt):
    """Normalise one headline for embedding.

    Removes every character listed in ``string.punctuation``, lowercases the
    remainder, and finally discards anything that is not plain ASCII.

    Args:
        txt: The raw headline text.

    Returns:
        The cleaned, lowercase, ASCII-only string.
    """
    # A translation table with only a "delete" set strips the punctuation characters.
    punctuation_remover = str.maketrans("", "", string.punctuation)
    lowered = txt.translate(punctuation_remover).lower()
    # Every byte of a non-ASCII character's UTF-8 encoding is invalid ASCII, so
    # decoding with errors="ignore" silently drops those characters.
    return lowered.encode("utf8").decode("ascii", "ignore")

# Apply clean_text to every headline, preserving order, then preview the first ten.
corpus = list(map(clean_text, all_headlines))
corpus[:10]

['aba decides against community broadcasting licence',
 'act fire witnesses must be aware of defamation',
 'a g calls for infrastructure protection summit',
 'air nz staff in aust strike for pay rise',
 'air nz strike to affect australian travellers',
 'ambitious olsson wins triple jump',
 'antic delighted with record breaking barca',
 'aussie qualifier stosur wastes four memphis match',
 'aust addresses un security council over iraq',
 'australia is locked into war timetable opp']

### Get a vector for each headline (sentence) in the corpus

In [22]:

# Embed every cleaned headline in one call. The search cells look headlines up
# with corpus[idx], where idx is a position in this result, so entry i is assumed
# to be the embedding of corpus[i] — TODO confirm encode() preserves input order.
corpus_embeddings = model.encode(corpus)

### Define search queries and embed them to vectors as well

In [23]:
# Search queries; this one is a headline taken verbatim from the cleaned corpus,
# so it should come back as its own best match.
queries = ['australia is locked into war timetable opp']
# Embed the queries with the same model that embedded the corpus.
query_embeddings = model.encode(queries)

### For each search term return 5 closest sentences

In [25]:

# Import cdist explicitly: a bare `import scipy` is not guaranteed to load the
# scipy.spatial submodule, so `scipy.spatial.distance` can raise AttributeError.
from scipy.spatial.distance import cdist

# Number of nearest headlines to report per query.
closest_n = 5
for query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance from this query to every corpus embedding; cdist returns a
    # 1 x len(corpus_embeddings) matrix, so take its single row.
    distances = cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # Pair each corpus position with its distance and rank nearest-first.
    # sorted() is stable, so equal distances keep their corpus order.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    # Derive the count from closest_n so the heading cannot disagree with the listing.
    print("\nTop %d most similar sentences in corpus:" % closest_n)

    for idx, distance in results[:closest_n]:
        # Report cosine similarity, i.e. 1 - cosine distance.
        print(corpus[idx].strip(), "(Score: %.4f)" % (1 - distance))





Query: australia is locked into war timetable opp

Top 5 most similar sentences in corpus:
australia is locked into war timetable opp (Score: 1.0000)
australia reconsider us trip as war looms (Score: 0.7789)
australian troops ready for war (Score: 0.7621)
war of words over aus open scheduling (Score: 0.7499)
howard commits australian troops to us led war (Score: 0.7478)


In [29]:
# Second search: another headline taken verbatim from the cleaned corpus.
queries = ['aust addresses un security council over iraq']
# Embed the queries with the same model that embedded the corpus.
query_embeddings = model.encode(queries)

In [30]:
# For each search term, print the closest_n nearest corpus sentences.

# Import cdist explicitly: a bare `import scipy` is not guaranteed to load the
# scipy.spatial submodule, so `scipy.spatial.distance` can raise AttributeError.
from scipy.spatial.distance import cdist

# Number of nearest headlines to report per query.
closest_n = 5
for query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance from this query to every corpus embedding; cdist returns a
    # 1 x len(corpus_embeddings) matrix, so take its single row.
    distances = cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # Pair each corpus position with its distance and rank nearest-first.
    # sorted() is stable, so equal distances keep their corpus order.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    # Derive the count from closest_n so the heading cannot disagree with the listing.
    print("\nTop %d most similar sentences in corpus:" % closest_n)

    for idx, distance in results[:closest_n]:
        # Report cosine similarity, i.e. 1 - cosine distance.
        print(corpus[idx].strip(), "(Score: %.4f)" % (1 - distance))





Query: aust addresses un security council over iraq

Top 5 most similar sentences in corpus:
aust addresses un security council over iraq (Score: 1.0000)
security council meets on iraq draft (Score: 0.8859)
downer to meet us officials over post war iraq (Score: 0.8778)
powell invites un to post war iraq (Score: 0.8617)
pm to state case against iraq (Score: 0.8532)
