In [199]:
import json
import csv
import math
import pandas as pd
import copy

NOTlinks = pd.read_csv('webpages/NOTlinks.csv', names=['Code', 'Title'], header = 0)
dfList = list(NOTlinks['Code']) + [9429, 9671]
voc = json.load(open('vocabulary.json'))
inverted_index = json.load(open('inverted_index.json'))

# TF (Term Frequency)

In [128]:
#Calculate data frequency

dict_freq = {}
for i in range(10000):
    if i not in dfList:
        file = open('webpages/tsv clean/filtered_%d.tsv'%i).read().split('\n\n')[1]
        tabs = file.split('\t')[1]+file.split('\t')[2]  # list of words in intro and plots
        for word in tabs.split():            
            if word not in dict_freq.keys():
                dict_freq[word] = {i: [tabs.split().count(word)/len(tabs.split())]}
            else:
                dict_freq[word][i] = [tabs.split().count(word)/len(tabs.split())]

In [220]:
# Function that returns the data frequency of a word in a document

def df(term, document_id):  # term is a string, document_id an integer
    dict_freq[term][document_id][0]

# IDF (Inverse Data Frequency)

In [89]:
# Calculate Inverse Data Frequency

idf = {}
N = 9898
for word in voc:
    val = len(inverted_index[str(voc[word])])
    idf[word] = math.log(N/val)

# TF-IDF (Term Frequency - Inverse Data Frequency)

In [137]:
# Calculate the TF-IDF

for word in dict_freq:
    for document_id in dict_freq[word].keys():
        dict_freq[word][document_id].append(dict_freq[word][document_id][0]*idf[word])

# dict_freq is a dictionary with the words as keys and as values a list whose elements are {document_id : [df_{word}, TF-IDF_{document_id, word}]} 

In [201]:
# Create the Inverted Index with TF-IDF score

inverted_index_freq = inverted_index.copy()
for term_id in inverted_index_freq:
    for i in range(len(inverted_index_freq[term_id])):
        document_id = inverted_index_freq[term_id][i]  # integer
        inverted_index_freq[term_id][i] = [(document_id, dict_freq[get_key(int(term_id))][document_id][1])]

# The new inverted_index_freq is as follow: {term_id : [document_id, TF-IDF_{document_id, term}]] 

In [214]:
with open('inverted_index_freq.json', 'w') as fp:
    json.dump(inverted_index_freq, fp)

In [212]:
# The following function returns the TF-IDF of the given word in the given document

def tfidf(term_id, document_id):  # term_id is a string here, document_id an integer
    for doc in inverted_index_freq[term_id]:
        if doc[0][0] == document_id:
            return doc[0][1]

In [108]:
# Function that returns the a term given its term_id

def get_key(term_id): 
    for key, value in voc.items(): 
         if term_id == value:
                return key   
    return "key doesn't exist"

# Search engine 2

In [222]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 

stop_words = set(stopwords.words('english')) 
ps = PorterStemmer()
urls = json.load(open('webpages/urls.json'))
inverted_index = json.load(open('inverted_index.json'))

In [302]:
# Build the query vector

def query_vector(query):
    query_vector = {}
    for word in query:  # words are strings
        df = query.count(word)/len(query)
        query_vector[voc[word]] = df * idf[word]
    return query_vector

In [292]:
# Function that returns the vector of a certain document

def vector(i):  # integer
    vec = {}
    file = open('webpages/tsv clean/filtered_%d.tsv'%i).read().split('\n\n')[1]
    tabs = file.split('\t')[1]+file.split('\t')[2]  # list of words in intro and plots
    for word in tabs.split():
        vec[voc[word]] = tfidf(str(voc[word]), i)
    return vec

In [332]:
def cosine_similarity(query_vec, document_id):  # (dict, integer)
    norm_query = math.sqrt(sum(n**2 for n in query_vec.values()))
    norm_doc = math.sqrt(sum(tfidf(str(word),document_id) for word in vector(document_id)))
    dot_pr = 0
    for word in query_vec.keys():
        dot_pr += query_vec[word]*tfidf(str(word), document_id)  # (string, integer)
    return dot_pr/(norm_query*norm_doc)

In [300]:
query = input().split()

english united states


In [301]:
# Clean input:

for i in range(len(query)):
    if not query[i] in stop_words and query[i].isalnum():
        query[i] = ps.stem(query[i])
    else:
        del query[i]      

In [336]:
# Get the document_if of the documents containing the words in the query

allDOC = [inverted_index[str(word)] for word in query_vec.keys()]  # I'm using the simple inverted index here
query_match = set(allDOC[0]).intersection(*allDOC[1:])

# Rank the results by cosine similarity


# Show the result of the query in a dataframe

df = pd.DataFrame(columns=['Title','Intro','Wikipedia Url', 'Similarity'])

def make_clickable(val):  # function that make the links clickable
    return '<a target="_blank" href="{}">{}</a>'.format(val, val)

for i in query_match:
    file = open('webpages/tsv/output_%d.tsv' %i).read().split('\n\n')[1].split('\t')
    title, intro, link = file[3].encode('utf8').decode("unicode_escape"), file[1].encode('utf8').decode("unicode_escape"), urls[str(i)]
    sim = cosine_similarity(query_vec, i)
    new_row = {'Title':title, 'Intro': intro, 'Wikipedia Url': link, 'Similarity': sim}
    df = df.append(new_row, ignore_index=True)

In [343]:
# Visualization of the top 5 documents related to the query

d = dict(selector="th", props=[('text-align', 'center')])
df1 = df.sort_values(by=['Similarity'], ascending = False).head(5)
df1.style.format({'Wikipedia Url': make_clickable}).hide_index().set_table_styles([d]).set_properties(**{'text-align': 'center'}).set_properties(subset=['Title'], **{'width': '130px'})


Title,Intro,Wikipedia Url,Similarity
The Yankee Clipper,The Yankee Clipper is a 1927 American adventure film produced by Cecil B. DeMille and directed by Rupert Julian. It is set against the maritime rivalry between the United States and Great Britain in the mid-19th century.[1][2],https://en.wikipedia.org/wiki/The_Yankee_Clipper_(1927_film),0.0704867
A Farewell to Arms,"A Farewell to Arms is a 1932 American pre-Code romance drama film directed by Frank Borzage and starring Helen Hayes, Gary Cooper, and Adolphe Menjou.[2] Based on the 1929 semi-autobiographical novel A Farewell to Arms by Ernest Hemingway, with a screenplay by Oliver H.P. Garrett and Benjamin Glazer, the film is about a tragic romantic love affair between an American ambulance driver and an English nurse in Italy during World War I. The film received Academy Awards for Best Cinematography and Best Sound, and was nominated for Best Picture and Best Art Direction.[2]",https://en.wikipedia.org/wiki/A_Farewell_to_Arms_(1932_film),0.0556505
The Last of the Mohicans,"The Last of the Mohicans is a 1920 American film adapted from James Fenimore Cooper's novel of the same name. Clarence Brown and Maurice Tourneur directed an adaption by Robert Dillon â a story of two English sisters meeting danger on the frontier of the American colonies, in and around the fort commanded by their father. The adventure film stars Wallace Beery, Barbara Bedford, Lillian Hall and Alan Roscoe.",https://en.wikipedia.org/wiki/The_Last_of_the_Mohicans_(1920_American_film),0.0517128
The Gilded Lily,"The Gilded Lily is a 1935 American romantic comedy film directed by Wesley Ruggles and starring Claudette Colbert, Fred MacMurray, Ray Milland, and C. Aubrey Smith. The production's screenplay, written by Claude Binyon, is about a stenographer who becomes a famous cafÃ© entertainer courted by an English aristocrat and an American newspaper reporter. Released by Paramount Pictures in the United States on January 25, 1935, the film is one of the English language films chosen by the National Board of Review for its top-10 list of 1935. The Gilded Lily is also the first of seven films in which Claudette Colbert and Fred MacMurray costar.",https://en.wikipedia.org/wiki/The_Gilded_Lily_(1935_film),0.0497752
Corregidor,"""Corregidor is a 1943 American war film directed by William Nigh and starring Otto Kruger, Elissa Landi and Donald Woods.[Note 1] The film is set in December 1941 through May 1942 during the Japanese invasion of the Philippines. Corregidor opens with the following written dedication: """"Dedicated to the heroes of the United States and Philippine Armed Forces, and the American Red Cross."""" The film closes with a poem about Corregidor written and narrated by English poet Alfred Noyes.[2].mw-parser-output .toclimit-2 .toclevel-1 ul,.mw-parser-output .toclimit-3 .toclevel-2 ul,.mw-parser-output .toclimit-4 .toclevel-3 ul,.mw-parser-output .toclimit-5 .toclevel-4 ul,.mw-parser-output .toclimit-6 .toclevel-5 ul,.mw-parser-output .toclimit-7 .toclevel-6 ul{display:none}""",https://en.wikipedia.org/wiki/Corregidor_(1943_film),0.0367195
