In [79]:
import json
import csv
import math
import pandas as pd
import copy

#NOTlinks = pd.read_csv('webpages/NOTlinks.csv', names=['Code', 'Title'], header = 0)
#dfList = list(NOTlinks['Code']) + [9429, 9671]
voc = json.load(open('vocabulary.json'))
inverted_index = json.load(open('inverted_index.json'))

In [84]:
voc['sun']

26137

# TF (Term Frequency)

In [128]:
#Calculate data frequency

dict_freq = {}
for i in range(10000):
    file = open('webpages/tsv clean/filtered_%d.tsv'%i).read().split('\n\n')[1]
    tabs = file.split('\t')[1]+file.split('\t')[2]  # list of words in intro and plots
    for word in tabs.split():            
        if word not in dict_freq.keys():
            dict_freq[word] = {i: [tabs.split().count(word)/len(tabs.split())]}
        else:
            dict_freq[word][i] = [tabs.split().count(word)/len(tabs.split())]

In [220]:
# Function that returns the data frequency of a word in a document

def df(term, document_id):  # term is a string, document_id an integer
    dict_freq[term][document_id][0]

### Compact function (without storing the dictionary)

In [75]:
def data_freq(term, i):   # (string, integer)
    file = open('webpages/tsv clean/filtered_%d.tsv'%i).read().split('\n\n')[1]
    tabs = file.split('\t')[1]+file.split('\t')[2]  # list of words in intro and plots
    if term in tabs.split():
        df = tabs.split().count(term)/len(tabs.split())
    else:
        df = 0
    return df

In [86]:
data_freq('sun', 9929)

0.021739130434782608

# IDF (Inverse Data Frequency)

In [89]:
# Calculate Inverse Data Frequency

idf = {}
N = 9898
for word in voc:
    val = len(inverted_index[str(voc[word])])
    idf[word] = math.log(N/val)

### Compact function (without storing the dictionary)

In [27]:
N = 10000
def idf(term):
    val = len(inverted_index[str(voc[term])])  # this is the number of documents containing the given word
    return math.log(N/val)

In [87]:
idf('sun')

5.521460917862246

# TF-IDF (Term Frequency - Inverse Data Frequency)

In [137]:
# Calculate the TF-IDF

for word in dict_freq:
    for document_id in dict_freq[word].keys():
        dict_freq[word][document_id].append(dict_freq[word][document_id][0]*idf[word])

# dict_freq is a dictionary with the words as keys and as values a list whose elements are {document_id : [df_{word}, TF-IDF_{document_id, word}]} 

In [28]:
def tfidf(term_id, document_id): # (integer, integer)
    return data_freq(get_key(term_id), document_id)*idf(get_key(term_id))

In [88]:
tfidf(26137, 9929)

0.12003175908396187

# Inverted index with TF-IDF score

In [99]:
# Create the Inverted Index with TF-IDF score

inverted_index_freq = dict.fromkeys(range(len(voc)))
for term_id in inverted_index_freq.keys():
    for document_id in inverted_index[str(term_id)]:
        if inverted_index_freq[term_id] == None:
            inverted_index_freq[term_id] = [(document_id, tfidf(term_id, document_id))]
        else:
            inverted_index_freq[term_id] += [(document_id, tfidf(term_id, document_id))]
            
# The new inverted_index_freq is as follow: {term_id : [document_id, TF-IDF_{document_id, term}]] 

In [100]:
with open('inverted_index_freq.json', 'w') as fp:
    json.dump(inverted_index_freq, fp)

In [212]:
# The following function returns the TF-IDF of the given word in the given document

def tfidf2(term_id, document_id):  # term_id is a string here, document_id an integer
    for doc in inverted_index_freq[term_id]:
        if doc[0][0] == document_id:
            return doc[0][1]

In [8]:
# Function that returns the a term given its term_id

def get_key(term_id): 
    for key, value in voc.items(): 
         if term_id == value:
                return key   
    return "key doesn't exist"

# Search engine 2

### Preliminary functions

In [104]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 

stop_words = set(stopwords.words('english')) 
ps = PorterStemmer()
urls = json.load(open('urls.json'))
inverted_index = json.load(open('inverted_index.json'))

In [157]:
# Build the query vector

def query_vector(query):
    query_vector = {}
    for word in query:  # words are strings
        df = query.count(word)/len(query)
        query_vector[voc[word]] = df * idf(word)
    return query_vector

In [174]:
# Function that returns the vector of a certain document

def vector(i):  # integer
    vec = {}
    file = open('webpages/tsv clean/filtered_%d.tsv'%i).read().split('\n\n')[1]
    tabs = file.split('\t')[1]+file.split('\t')[2]  # list of words in intro and plots
    for word in tabs.split():
        vec[voc[word]] = tfidf(voc[word], i)
    return vec

In [179]:
def cosine_similarity(query_vec, document_id):  # (dict, integer)
    norm_query = math.sqrt(sum(n**2 for n in query_vec.values()))
    norm_doc = math.sqrt(sum(tfidf(word,document_id) for word in vector(document_id)))
    dot_pr = 0
    for word in query_vec.keys():
        dot_pr += query_vec[word]*tfidf(word, document_id)  # (string, integer)
    return dot_pr/(norm_query*norm_doc)

### Execution of the query search

In [145]:
query = input().split()

english united states love


In [147]:
# Clean input:

for i in range(len(query)):
    if not query[i] in stop_words and query[i].isalnum():
        query[i] = ps.stem(query[i])
    else:
        del query[i]      

In [148]:
# Get the term_id of the words in the query (integers)
query_index = [voc[word] for word in query]

# Get the document_if of the documents containing the words in the query
allDOC = [inverted_index[str(word)] for word in query_index]
query_match = set(allDOC[0]).intersection(*allDOC[1:])


In [181]:
#Build the query vector

query_vec = query_vector(query)

# Rank the results by cosine similarity

# Show the result of the query in a dataframe

df = pd.DataFrame(columns=['Title','Intro','Wikipedia Url', 'Similarity'])

def make_clickable(val):  # function that make the links clickable
    return '<a target="_blank" href="{}">{}</a>'.format(val, val)

for i in query_match:
    file = open('webpages/tsv/output_%d.tsv' %i).read().split('\n\n')[1].split('\t')
    title, intro, link = file[3].encode('utf8').decode("unicode_escape"), file[1].encode('utf8').decode("unicode_escape"), urls[str(i)]
    sim = cosine_similarity(query_vec, i)
    new_row = {'Title':title, 'Intro': intro, 'Wikipedia Url': link, 'Similarity': sim}
    df = df.append(new_row, ignore_index=True)

In [182]:
# Visualization of the top 5 documents related to the query

d = dict(selector="th", props=[('text-align', 'center')])
df1 = df.sort_values(by=['Similarity'], ascending = False).head(5)
df1.style.format({'Wikipedia Url': make_clickable}).hide_index().set_table_styles([d]).set_properties(**{'text-align': 'center'}).set_properties(subset=['Title'], **{'width': '130px'})


Title,Intro,Wikipedia Url,Similarity
A Farewell to Arms,"A Farewell to Arms is a 1932 American pre-Code romance drama film directed by Frank Borzage and starring Helen Hayes, Gary Cooper, and Adolphe Menjou.[2] Based on the 1929 semi-autobiographical novel A Farewell to Arms by Ernest Hemingway, with a screenplay by Oliver H.P. Garrett and Benjamin Glazer, the film is about a tragic romantic love affair between an American ambulance driver and an English nurse in Italy during World War I. The film received Academy Awards for Best Cinematography and Best Sound, and was nominated for Best Picture and Best Art Direction.[2]",https://en.wikipedia.org/wiki/A_Farewell_to_Arms_(1932_film),0.0592752
The Gilded Lily,"The Gilded Lily is a 1935 American romantic comedy film directed by Wesley Ruggles and starring Claudette Colbert, Fred MacMurray, Ray Milland, and C. Aubrey Smith. The production's screenplay, written by Claude Binyon, is about a stenographer who becomes a famous cafÃ© entertainer courted by an English aristocrat and an American newspaper reporter. Released by Paramount Pictures in the United States on January 25, 1935, the film is one of the English language films chosen by the National Board of Review for its top-10 list of 1935. The Gilded Lily is also the first of seven films in which Claudette Colbert and Fred MacMurray costar.",https://en.wikipedia.org/wiki/The_Gilded_Lily_(1935_film),0.0531571
Ace Eli and Rodger of the Skies,"Ace Eli and Rodger of the Skies is a 1973 American adventure-comedy film based on a story by Steven Spielberg. The film centers on a barnstorming pilot (Cliff Robertson) and his son (Eric Shea) as they fly around the United States in the 1920s, having adventures along the way. English actress Pamela Franklin provided the love interest. One of the driving forces behind the production, Robertson was a pilot in real life, although Hollywood stunt pilot Frank Tallman flew most of the aerial scenes.[3]",https://en.wikipedia.org/wiki/Ace_Eli_and_Rodger_of_the_Skies,0.0338266
"Munster, Go Home!","Munster, Go Home! is a 1966 American comedy horror film based on the hit 1960s family television sitcom The Munsters. It was directed by Earl Bellamy, who also directed a number of episodes in the series. The film was produced immediately after the television series completed filming for its original run, and included the original cast with the exception of Marilyn, who was played by Debbie Watson replacing Pat Priest from the series.","https://en.wikipedia.org/wiki/Munster,_Go_Home!",0.0318399
