# Homework 3 - Find the perfect place to stay in Texas!
### Group 14 - PavanKumar Alikana, Matteo Cavalletti, Francesca Porcu

The homework consists in analyzing the text of Airbnb property listings and building a search engine.

In [19]:
#Import required libraries
import pandas as pd
# For displaying search results in a table
from IPython.display import HTML, display
from os.path import join as pjoin
import csv

# For persisting indexes in an external file
import pickle
import math
import heapq
from datetime import datetime
from pathlib import Path


import nltk
import csv
import re
import os

# For word tokenization
from nltk.tokenize import RegexpTokenizer
# For stop words list
from nltk.corpus import stopwords
# For word stemming
from nltk.stem.snowball import SnowballStemmer

#First we import stopwords from nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#To remove punctuation we use regexptokenizer, but we leave dollar symbol $ because maybe is used in some queries
tokenizer = RegexpTokenizer(r'\w+|\$')
#we create the stemmer
ps = SnowballStemmer('english')

# Path to the current working directory to refer to all the files relatively
my_path = os.path.dirname(os.path.realpath('__file__'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Step 1: Data

In [20]:
#Reading the main CSV file
m = pd.read_csv("Airbnb_Texas_Rentals.csv")
doc_len = len(m)

# These are commented because we already processed the reviews in to invidual CSV file per review
#we found words like '\\n' in the dataset so we cleaned it
#m = m.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\\n',  ' ', regex=True)
#m = m.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\\t',  ' ', regex=True)
#m = m.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\\r',  ' ', regex=True)

#create tsv files,we droped the first column that was a prroblem
#m = m.drop(['Unnamed: 0'], axis=1)

# Step 2: Create documents

In [21]:
# These lines are commented because we already have the individual csv file per review,
# after first time we executed this section of code. so we no longer need to execute this every time
# Create a separate csv file for every row in the reviews csv file
#for i in range(len(m)):
#    with open(os.path.join(my_path, 'docu_hw3/doc_' + str(i) + '.tsv', 'w', newline='',encoding='utf-8') as output:
#        tsv_output = csv.writer(output, delimiter='\t')
#        tsv_output.writerow(m.iloc[i])

# Step 3: Search Engine

At this point we create the ***vocabulary***. We don't modify directly tsv files because we need them for the output of the search engines, but we create a dictionary that assigns to each file words that would have contained if we had preprocessed them. In particular, we apply to the words contained in each file these procedures:
- *Removing stopwords*
- *Removing punctuation*
- *Stemming*
- *Lower-case letters*

## 3.1) Conjunctive query

### 3.1.1) Create your index!

In [22]:
review_content_persist = {}
vocabulary_persist = {}
words_persist = {}

# Retrieving persisted information for review content and word map
# Please create a directory(in your current working directory) with name 'indexes'  
content_file = Path(os.path.join(my_path, "indexes/review_content.pkl"))
vocabulary_file = Path(os.path.join(my_path, "indexes/vocabulary.pkl"))
words_file = Path(os.path.join(my_path, "indexes/words.pkl"))


# Retrieving already persisted information

# Check if the index file exists, 
#if yes load the previously persisted indexes and content
if content_file.is_file():
    with open(content_file, "rb") as review_content:
        review_content_persist = pickle.load(review_content)
        review_content.close()
        
# Check if the vocabulary file exists, 
#if yes load the previously persisted vocabulary
if vocabulary_file.is_file():
    with open(vocabulary_file, "rb") as vocabulary:
        vocabulary_persist = pickle.load(vocabulary)
        vocabulary.close()
        
# Check if the words file exists, 
#if yes load the previously persisted words
if words_file.is_file():
    with open(words_file, "rb") as words:
        words_persist = pickle.load(words)
        words.close()

if(len(review_content_persist.keys()) == 0):
    
    review_word_map = {}
    
    # We reach here if we don't have indexes already present
    print("Indexes are being created")
    
    #we create the vocabulary of preprocessed documents,but we don't modify the documents because we''l use them in search engine
    
    for i in range(doc_len):
        with open(os.path.join(my_path, 'docu_hw3/doc_' + str(i) + '.tsv'),encoding='utf8') as tsvfile:
             tsvreader = list(csv.reader(tsvfile, delimiter="\t"))
        
        # For review title
        l1 = tsvreader[0][4]
        
        # For review content
        l2 = tsvreader[0][7]
        
        l = l1+ ' ' +l2
        l = l.lower()
        l = tokenizer.tokenize(l)
        
        # This array will contain all the valid words in a given review after removing 
        # all the stop words, punctuations, stemming etc..,, we will use this information
        # to find out the term frequency there by tf-idf values
        file_words = []
        
        for r in l :
            if not r in stop_words:
                sr = ps.stem(r)
                
                file_words.append(sr)
                
                if not  sr in review_word_map:
                    review_word_map[sr] = [i]
                else:
                    review_word_map[sr]+=[i]
                    
                    
        review_content_persist[i] = ' '.join(file_words)
    
    # Saving the content and indexes for the first time
    # We made use of pickel python module
    #Saving content dictionary
    with open(content_file, "wb") as review_content:
        pickle.dump(review_content_persist, review_content)
        review_content.close()
    
    # Word and Vocabulary indexes based on word map
    c = 0
    for key in review_word_map:
        words_persist[key] = c
        vocabulary_persist[c] = review_word_map[key]
        c += 1
    
    #Save vocabulary and words
    with open(vocabulary_file, "wb") as vocabulary:
        pickle.dump(vocabulary_persist, vocabulary)
        vocabulary.close()
        
   
    with open(words_file, "wb") as words:
        pickle.dump(words_persist, words)
        words.close()
    
    
                

            

### 3.1.2) Execute the query

In [23]:
word = input('Enter a search query: ')

# Cleaning user input similar to what we did for creating indexes for words
def clean_input(w):
    w_list = []
    w = w.lower()
    w = tokenizer.tokenize(w)
    # Check if we need to do any other preprocessing to improve the efficiency of search results
    
    for r in w :
        if not r in stop_words:
            sr = ps.stem(r)
            if not  sr in w_list:
                w_list.append(sr)
    return w_list

# Show search results in tabular format
def show_results(results, doc_list, isScore):
    
    if(len(doc_list)):
        print('Found ' + str(len(doc_list))  + ' matching reviews to your query')
    
    if(len(results)):
        if(isScore):
            tableFormat = '<table border="1"><tr><th>Title</th><th>Description</th><th>City</th><th>URL</th><th>Score</th></tr><tr>{}</tr></table>'
        else:
            tableFormat = '<table border="1"><tr><th>Title</th><th>Description</th><th>City</th><th>URL</th></tr><tr>{}</tr></table>'
        
        
        display(HTML(tableFormat.format('</tr><tr>'.join('<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in results)
)))
    else:
        display(HTML('<h1>No results found. Please try a different query</h1>'))
    


word_list = clean_input(word)

print("Cleaned word: ", word_list)

list_doc_list = []

for w in word_list:
        doc_list = []
        
        if w in words_persist: 
            doc_list = vocabulary_persist[words_persist[w]]
            
        list_doc_list.append(doc_list)  

# Initially assinging the list intersection to the matching documents of first word
list_intersect = list_doc_list[0]

def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 
        
for docList in list_doc_list:
    list_intersect = intersection(list_intersect, docList)
    
results = []

# Removing the duplicates in the document intersection
list_intersect = list(set(list_intersect))

i_len = len(list_intersect)

if(i_len):
    
    r_limit = 10
    
    if(i_len < 10):
        r_limit = i_len
    
    # Showing at most ten results
    for doc in list_intersect[:r_limit]:
        
        # Reading each document based on document id in list intersect 
        with open(os.path.join(my_path, 'docu_hw3/doc_' + str(doc) + '.tsv'),encoding='utf8') as tsvfile:
             tsvreader = list(csv.reader(tsvfile, delimiter="\t"))

        title = tsvreader[0][7]        
        description = tsvreader[0][4]
        city = tsvreader[0][2]
        url = tsvreader[0][8]

        results.append([title, description, city, url])

# Displaying the results
show_results(results, list_intersect, False)

Enter a search query: house welcome
Cleaned word:  ['hous', 'welcom']
Found 356 matching reviews to your query


Title,Description,City,URL
"The Wild, Wild, West Backpackers B&B - 4","Yeehaw!! ......... Welcome! Stay with us and let us show you the best of Texas Hospitality! In addition - get the most value for your money, whether you decide to stay for one day or for one month! We have Everything a budget traveller needs for one low price - internet, cable TV, off street parking, free laundry and last but not least - an amazing Texas Wild, Wild West Heritage Inspired Atmosphere. We are located in the very safe area of Historic Downtown Irving in an up-graded, well cared for 75 year old Historic house. We are very close to a major public transport hub - 5 minutes walk to the Downtown Irving Station, which provides easy daily access to all major attractions in downtown Dallas, downtown Fort Worth and to DFW airport - via the modern TRE train & air-conditioned buses - access to the extended metro area. You can use our fully equipped kitchen to prepare meals. Or you can sign up for our affordable daily breakfast $5.. You can enjoy our huge, private, fenced backyard to relax, sunbathe and rejuvenate. In addition we are within walking distance from banks, supermarkets and the Centenial Park with a 5 mile long walking and jogging trail, a self-guided tour of the area and a natural creek running through it. As an added FREE bonus you get a real chance to meet and greet young travellers from around the globe. We feature the best and greatest selection of international travellers in the metro - Those include our Backpacker Visitors, who've been staying at our place for the past seven years. They usually have loads of stories and impressions to share from their home countries and all the rest of the countries they've visited. Finally you'd get to meet your hosts - Bill, Carlos and Ivan - all with exclusive knowledge about the DFW area - what's best to do, what's best time to do it, how to get there, what's the best pricing, all your questions and queries will be addressed. It is also not un-common for the hosts to provide customized tours and rides in the area, upon request. And, of course, you are welcome to drive by with your own vehicle, plenty of off street parking available. Please, feel free to contact us with any questions or concerns. Come, stay with us and enjoy the Southern Hospitality and Fun Dallas - Fort Worth has to offer! NOTE: Our physical address is: 214 West 6th. Str. IRVING, TX, 75060 From DFW AIRPORT Two Options - by bus # 408 or by the Trinity Railway Express (the TRE) train By bus # 408 - Available Daily Take the Free Shuttle to South Remote Parking. Then transfer to bus # 408 to Downtown Irving/Heritage Crossing. By the Trinity Railway Express - the TRE: The TRE runs everyday, except on Sundays and major Holidays Take the Free Shuttle to the TRE train station located to the South of DFW airport. Take the TRE (train) towards Dallas and get off at the 2nd station - Downtown Irving/ Heritage Crossing.. Note: On Sundays and major holidays: Use bus #408 as explained above. From DOWNTOWN DALLAS/Greyhound Station Two Options - by the Trinity Railway Express (the TRE) train or by bus By the TRE train - Find Union Station, take the TRE train towards Fort Worth and get off at the Downtown Irving/Heritage Crossing Station. Note: Use the TRE train everyday except on Sundays or major holidays. If it is a Sunday or a major holiday, then use the bus service: Find the 'West Transit Center'. The Griffin & Pacific intersection is right next to it; take bus #44 going North to Parkland; at Parkland transfer to bus # 408, and get off at the South Irving Transit Center. Directions from the Downtown Irving/Heritage Crossing to the Wild, Wild West Dallas Irving Guest House: Once you arrive at the Station, cross the street and find either Hastings street by Burger King & Compass Bank to your left/ or O'Connor street /to your right/. Take either one and walk 5 minutes (5 blocks) South, both streets end at 6th street. If you take Hastings make a right on 6th. If you take O'Connor, make a left. We are at 214 West 6th. str, approximately 5 minutes fast walk from the Station and located straight across from the Lucky Convenience Store. DIRECTIONS FOR CAB DRIVERS, coming from DFW airport: From Hwy 183, exit O'Connor and turn right/South/ until it dead ends at 6th. Str. Make a left on 6th. We will be on your right, across from the Lucky convenience store - 214 W. 6th. Street, Irving, TX, 75060. We are looking forward to seeing you Soon! Yeeehaw!!!",Irving,https://www.airbnb.com/rooms/1135415?location=Coppell%2C%20TX
COZY BEDROOM IN S. AUSTIN,"Quiet and comfortable home where you can unwind and relax. Backyard equipped with a jacuzzi, hammocks and a grill that you can use. I have 3 friendly cats who come in and out of the house and are ready to welcome you :D Free Wifi and breakfast incl.",Austin,https://www.airbnb.com/rooms/8780307?location=Buda%2C%20TX
"The Frida House - ""Gold Bedroom"" / Free Breakfast!",NO SECURITY DEPOSIT! - Downtown SA! All ethnicities & identities welcome. Walk to major attractions! \,San Antonio,https://www.airbnb.com/rooms/18477015?location=Alamo%20Heights%2C%20TX
Private and Sunny Zilker Cottage,"Tucked behind our house in the Zilker neighborhood in South Austin, this sunny one-bedroom, one-bathroom cottage accommodates one or two people perfectly. We've got two great bikes you're welcome to borrow any time. Austin license: 2013 OL 122394.",Austin,https://www.airbnb.com/rooms/1484462?location=Colorado%20River%2C%20TX
RV / Trailer parking stay or large vehicle storage,"RV / trailer parking is available on either side of the front yard. There are fresh water hoses available, but no sewer dumping on site. If you are just parking the RV / trailer and not using it, space in the back lot can be made available. If the wifi signal reaches your vehicle, you are welcome to use it, but no promises there. Indoor house access is strictly off limits for this listing except in case of a dire emergency. Pasture boarding is available as a separate listing for horses.",Devine,https://www.airbnb.com/rooms/19182813?location=Castroville%2C%20TX
Luxurious 4BR home with pool 5 mins from Seaworld,"Welcome to your home away from home! A modern, tastefully decorated pool house awaits you. It is suitable for families with young children or family groups. A Walmart Neighborhood Market is within walking distance and many other restaurants and shops are a 5 min drive away. Seaworld & Aquatica are a 5 min drive & all other San Antonio based attractions are just 20 mins away. Visiting San Antonio for a graduation? Lackland AFB is less than 15 minutes away. We also offer a military discount.",San Antonio,https://www.airbnb.com/rooms/18401185?location=Castroville%2C%20TX
Speakman Family Lake House,"A great place to stay! Lake Fork is one of the best bass fishing lakes in the country! Bring your boat and fishing poles. Home has a deck and private boat launch. Plenty of room outside to play. With open floor plan, this is the perfect place for family to gather! Our home is your home. Relax, feel welcome, enjoy your vacation with family and friends!",Yantis,https://www.airbnb.com/rooms/18765386?location=Alba%2C%20TX
NW Austin Room,"Looking for a comfortable inexpensive room to stay for a night? A week or more? Join us in our northwest Austin home where the house is tidy, nicely decorated and we are friendly, responsible, courteous hosts. Single or couples welcome. We have two small, fairly quiet dogs and welcome travelers from all over. You can enjoy your coffee in the morning on the deck with a large backyard, or in the kitchen which is well equipped if you would like to cook a meal. Your own Parking spot can be found in front of the house on the granite gravel driveway just beside the mailbox. The bedroom available in this listing accommodates two persons on a queen bed. It is next to my study and craft room that sometimes serves as another guest room for a family member or friend but rarely list on airbnb unless there is a huge need (SXSW, ACL, etc.). This room listing is my primary listing but wanted you to be aware of the 2nd bedroom that may be occasionally occupied by another guest. The bathroom is in the hallway and would be a shared bath with whomever stays in the 2nd guest room (if any). Otherwise, it's all yours. The room has central air/heat and I also have a ceiling fan and a small space heater/fan for your own temperature control within the room. There is a Queen-sized bed (fresh linens, comforter and pillows provided), basic toiletries if needed, basic cable tv, a closet, desk, small microwave oven and secure wireless internet throughout the house. Nearby (by car) there are plenty of restaurants, grocery stores, dry cleaners, and a mall. A nice park is within walking distance with picnic area, tennis/volleyball court and a walking trail. We have the Alamo Drafthouse movie theatre just over 2 miles away and the Lakeline Shopping Mall. Transportation: Your own vehicle transportation is probably best because of the location. MetroRail station is about 2-1/2 miles from the house if you prefer to leave your car parked in their lot and have the train take you to the heart of downtown. We're non-smokers and prefer to host non-smokers, but if you don't mind disposing of your cigarettes properly outside, we will consider. Even though we're busy hosts we enjoy meeting new people and answering any questions. So if you're a friendly individual or at least know to say hello when you see us in the common areas, we would love to host you! ------------------------------------------------- Gravel Parking Kitchen We interact with our guests as little or as much as needed. Quiet neighborhood with lots of trees and good neighbors. We are approximately 16 miles from downtown Austin. When not traveling during rush hour, I can get to downtown Austin in about 25 minutes but during rush hour, it can take up to an hour. My location makes it easier for guests with their own mode of transportation but if needed, approximately 2 miles away is the train station which will get you downtown. The bus is approximately 1.2 miles from the house but sometimes it takes over an hour and a half for the bus to make its way downtown.",Austin,https://www.airbnb.com/rooms/5769?location=Cedar%20Park%2C%20TX
"The Wild, Wild West Dallas Backpacker's B&B - 2","Yeehaw!! ......... Welcome! Stay with us and let us show you the best of Texas Hospitality! In addition - get the most value for your money, whether you decide to stay for one day or for one month! We have Everything a budget traveller needs for one low price - internet, cable TV, off street parking, free laundry and last but not least - an amazing Texas Wild, Wild West Heritage Inspired Atmosphere. We are located in the very safe area of Historic Downtown Irving in an up-graded, well cared for 75 year old Historic house. We are very close to a major public transport hub - 5 minutes walk to the South Irving Station, which provides easy daily access to all major attractions in downtown Dallas, downtown Fort Worth and to DFW airport - via the modern TRE train & air-conditioned buses - access to the extended metro area. You can use our fully equipped kitchen to prepare meals. Or you can sign up for our affordable daily breakfast $5 or TX size dinner $9 meals. You can enjoy our huge, private, fenced backyard to relax, sunbathe and rejuvenate. In addition we are within walking distance from banks, supermarkets and the Centenial Park with a 5 mile long walking and jogging trail, a self-guided tour of the area and a natural creek running through it. As an added FREE bonus you get a real chance to meet and greet young travellers from around the globe. We feature the best and greatest selection of international travellers in the metro - Those include our Backpacker Visitors, who've been staying at our place for the past seven years. They usually have loads of stories and impressions to share from their home countries and all the rest of the countries they've visited. Finally you'd get to meet your hosts - Carlos and Ivan - both with exclusive knowledge about the DFW area - what's best to do, what's best time to do it, how to get there, what's the best pricing, all your questions and queries will be addressed. It is also not un-common for the hosts to provide customized tours and rides in the area, upon request. And, of course, you are welcome to drive by with your own vehicle, plenty of off street parking available. Please, feel free to contact us with any questions or concerns. And don't hesitate to book your inexpensive stay with us! Come, stay with us and enjoy the Southern Hospitality and Fun Dallas - Fort Worth has to offer!NOTE: Our physical address is: 214 West 6th. Str. IRVING, TX, 75060 From DFW AIRPORT Two Options - by bus # 408 or by the Trinity Railway Express (the TRE) train By bus # 408 - Available Daily Take the Free Shuttle to South Remote Parking. Then transfer to bus # 408 to Downtown Irving/Heritage Crossing. By the Trinity Railway Express - the TRE: The TRE runs everyday, except on Sundays and major Holidays Take the Free Shuttle to the TRE train station located to the South of DFW airport. Take the TRE (train) towards Dallas and get off at the 2nd station - Downtown Irving/ Heritage Crossing.. Note: On Sundays and major holidays: Use bus #408 as explained above. From DOWNTOWN DALLAS/Greyhound Station Two Options - by the Trinity Railway Express (the TRE) train or by bus By the TRE train - Find Union Station, take the TRE train towards Fort Worth and get off at the Downtown Irving/Heritage Crossing Station. Note: Use the TRE train everyday except on Sundays or major holidays. If it is a Sunday or a major holiday, then use the bus service: Find the 'West Transit Center'. The Griffin & Pacific intersection is right next to it; take bus #44 going North to Parkland; at Parkland transfer to bus # 408, and get off at the South Irving Transit Center. Directions from the Downtown Irving/Heritage Crossing to the Wild, Wild West Dallas Irving Guest House: Once you arrive at the Station, cross the street and find either Hastings street by Burger King & Compass Bank to your left/ or O'Connor street /to your right/. Take either one and walk 5 minutes (5 blocks) South, both streets end at 6th street. If you take Hastings make a right on 6th. If you take O'Connor, make a left. We are at 214 West 6th. str, approximately 5 minutes fast walk from the Station and located straight across from the Lucky Convenience Store. DIRECTIONS FOR CAB DRIVERS, coming from DFW airport: From Hwy 183, exit O'Connor and turn right/South/ until it dead ends at 6th. Str. Make a left on 6th. We will be on your right, across from the Lucky convenience store - 214 W. 6th. Street, Irving, TX, 75060. We are looking forward to seeing you Soon! Yeeehaw!!!",Irving,https://www.airbnb.com/rooms/1183502?location=Coppell%2C%20TX
Just Breathe,"Just Breathe is 3600 sq ft of living space plus a ground level bar. This house has it all! Luxurious accommodations and decor, tvs in every room including a 60 inch in the bar, a playground for the kids, 3 living rooms to relax after a fun filled day at the beach, and more. Every room has a private bathroom plus an additional full bathroom for guest that may be on the sofa bed. There are decks on both levels with plenty of seating to enjoy the view. We would love to welcome your family!",Bolivar Peninsula,https://www.airbnb.com/rooms/17515618?location=Anahuac%2C%20TX


## 3.2) Conjunctive query & Ranking score

### 3.2.1) Inverted index

In [24]:
iindex_tf_idf_persist = {}


# Check if the index file exists, if yes load the previously persisted indexes and content
# Please create a directory(in your current working directory) with name indexes  
index_file = Path(os.path.join(my_path, "indexes/iindex_tf_idf.pkl"))

# Check if the index file exists, if yes load the previously persisted indexes
if index_file.is_file():
    # Retriving precreated inverted indexes
    with open(index_file, "rb") as iindex_tf_idf:
        iindex_tf_idf_persist = pickle.load(iindex_tf_idf)
        iindex_tf_idf.close()
        
    
if(len(iindex_tf_idf_persist.keys()) == 0):
    
    print("Inverted Indexes are being calculated")

    word_iindex = {}

    #Creating inverted index using tf-idf and consine similarity
    for word in words_persist:
        word_doc_list = vocabulary_persist[words_persist[word]]
        word_iindex[word] = []

        # Store indexes based on number of times a particular word is present in a given document
        for doc in word_doc_list:
            doc_content = review_content_persist[doc]
            # Pushing the term frequency with document id
            word_iindex[word].append([doc, doc_content.split().count(word)])

    # Store indexes based on tf-idf
    docs_length = len(review_content_persist.keys())
    iindex_tf_idf_persist = word_iindex

    for key, word in iindex_tf_idf_persist.items():
        # find out the relative importance of a particular terms relating it to document count
        idf= math.log10( docs_length / len(word) )

        for elem in word:
            # Add the document score corresponding to a particular term which we then use in the 
            # search results ranking of documents
            elem[1] = idf * elem[1]
    
    # Persisting the indexes calculated 
    with open(index_file, "wb") as iindex_tf_idf:
        pickle.dump(iindex_tf_idf_persist, iindex_tf_idf)
        iindex_tf_idf.close()
        


### 3.2.2) Execute the query

In [25]:
dict_qcos = {}
dict_norm = {}

#print("Search started")

# Need to remove this 100 hardcoding
for doc in list_intersect[:100]:
    
    num = 0
    
    #print("Current document ID: " + str(doc))
    
    # Calculating numerator of the cosine similarity equation
    
    for word in word_list:
        w_index = iindex_tf_idf_persist[word]
        w_i_len = len(w_index)
        for i in range(w_i_len):
            if w_index[i][0] == doc: 
                num +=  w_index[i][1]
                
    dict_qcos[doc]=num

    
    # Calculating denominator of the cosine similarity equation
    norm = 0
    for word in iindex_tf_idf_persist.values():
        for i in range(len(word)):
            if word[i][0] == doc:
                norm +=  word[i][1]**2
    
    dict_norm[doc]=math.sqrt(norm)

#print("Numerator and Denominator calculated")

# Once numerator and denominator is calculated find the score of each document in the intersection list 
# By applying the consine similarity formala
for doc,num in dict_qcos.items():
    # Eleminating divided by zero problem to check if the normalization value for a document is non-zero
    if dict_norm[doc] != 0:
        dict_qcos[doc] = num/(math.sqrt(len(word_list))*dict_norm[doc])

#print("Cosine similarity done")

In [26]:
# Applying heap data structure to print to top-k documents

h = []
results = []

for doc in dict_qcos.keys():
    
    # Reading the document meta data to print in the search results
    
    with open(os.path.join(my_path, 'docu_hw3/doc_' + str(doc) + '.tsv'),encoding='utf8') as tsvfile:
         tsvreader = list(csv.reader(tsvfile, delimiter="\t"))
    
    title = tsvreader[0][7]        
    description = tsvreader[0][4]
    city = tsvreader[0][2]
    url = tsvreader[0][8]
    
    # Pushing the document information to heap data structure
    
    heapq.heappush(h,(dict_qcos[doc], title, description, city, url))

#print("Applying Heap")    
# Applying max heap algorithm
heapq._heapify_max(h)

limit = 10
doc_len = len(dict_qcos.keys())

if(doc_len < 10):
    limit = doc_len

for i in range(limit):
    # Popping the document with maximum score at every step and adding it to the result list
    
    # Since the data structure used in heap is a tuple, we convert it in to list for ease of manipulation
    results.append(list(heapq.heappop(h)))
    
    # re-applying the max heap algorithm
    heapq._heapify_max(h)

#print("Applying Heap")  

#print(results)
results_formatted = []

for i in results:
    
    first = round(i.pop(0), 4)
    i.append(first) 
    #first = round(float(results[i].pop(0)), 4)
    #results[i] = results[i].append(first)
    results_formatted.append(i)


# Displaying the results
show_results(results_formatted, [], True)

Title,Description,City,URL,Score
Room in Townhome close to airport,Hello and welcome to my recently purchased home. You'll be renting 1 room and bathroom. A tv will only be proved upon request. There is currently no stove but if you want breakfast your more than welcome to order with us when we do daily. There's no smoking in the house ( I have children) but your more than welcome to do so on the back porch. This is an air mattress (just moved in havent been able to get my things out of storage yet. Bed room/bathroom & any room downstairs I'm only a call away if you need me. This is a small community. Everyone is super nice. I'm an uber driver,Houston,https://www.airbnb.com/rooms/18390479?location=Channelview%2C%20TX,0.5394
Artsy Home off of I35,A nice spacious room with a comfy full size bed. House is located right off I35 for convenient travel and close to the airport. Guests are welcome to kitchen and living room. Convenient shops and grocery stores nearby. A traveler's home who welcomes other travelers! Always looking for long term guests. Military in town for training and travel nurses are always welcome to stay here!,San Antonio,https://www.airbnb.com/rooms/10908104?location=Cibolo%2C%20TX,0.5159
"Safe, Quiet, Private Entry Suite",We have a cozy place that is welcome to people of all walks of life. There is a spacious living room and kitchen that is closed off from the rest of the house. A private patio entrance from the right side of the house.,Frisco,https://www.airbnb.com/rooms/18734098?location=Anna%2C%20TX,0.3971
"Mamau's House - 2 bedrooms, 1 bath","Guests are welcome to stay and relax in the little house we originally built for my mother who now resides in heaven. The house overlooks a small pond, sometimes full if we get the rain, but always visited by deer and dragonflies!",Harker Heights,https://www.airbnb.com/rooms/355319?location=Colorado%20River%2C%20TX,0.3919
"Mamau's House - 2 bedrooms, 1 bath","Guests are welcome to stay and relax in the little house we originally built for my mother who now resides in heaven. The house overlooks a small pond, sometimes full if we get the rain, but always visited by deer and dragonflies!",Harker Heights,https://www.airbnb.com/rooms/355319?location=Brazos%20River%2C%20TX,0.3919
"Mamau's House - 2 bedrooms, 1 bath","Guests are welcome to stay and relax in the little house we originally built for my mother who now resides in heaven. The house overlooks a small pond, sometimes full if we get the rain, but always visited by deer and dragonflies!",Harker Heights,https://www.airbnb.com/rooms/355319?location=Belton%2C%20TX,0.3919
"Last minute deal, 1ml from COTA","This beautifull 3bdr, 2bth house with driveway and backyard is perfect for the F1 weekend. The house is occupied by a family year round, so you can expect the place to be very clean and everything in perfect working condition. We are a smoke free house and we have a small dog, so small pets will be welcome. There is also a two car garage and a driveway that could park 4 cars with no problem. The house is approximately 1 mile from COTA, the shuttle busses from Del Valle High school are within walking distance, or you could take a 20min bike ride to the track, but if you have a parking pass it will take you no longer than 10min to get there. You will also have easy access to the toll roads and to Hwy71, from the house is about a 20min drive to downtown, the airport is only a five minute drive. Last year we sat in our driveway to listen to the cars, yes is that close!!",Austin,https://www.airbnb.com/rooms/1905942?location=Bastrop%20County%2C%20TX,0.3502
1920's Bungalow Guest House,"Behind our main home in historic McKinney, we have refurbished a 1920’s arts & craft style guest house. This 570 sq. ft. house has been completely redone inside to now include a living area, full kitchen, bedroom, and full bath. We welcome guests for an overnight stay, or an extended stay over weeks or months. Amenities in the guest house include: Fully furnished rooms with all sheets, towels, kitchen utensils/dishes, and modern appliances Heating and air conditioning A spacious, fully equipped IKEA kitchen that we installed in 2011. New Whirlpool appliances include range/oven, microwave, toaster/oven, dishwasher, refrigerator, disposal, and washer/dryer. Included local TV and Roku (Netflix/Amazon Prime/streaming), secured wi-fi internet access, DVD/CD player, and sound system A full bathroom Iron, ironing board & hair dryer Onsite parking by the guest house, or on the street Basically, you can walk into our guest house with your suitcase(s), unpack them, and you’re home. We invite our guests to enjoy the flagstone patio and grill between the guest house and our main home. Guests also enjoy relaxing on the spacious front porch of our main home. We live in the heart of McKinney’s Historic District. In addition to seeing the many older, picturesque homes in this area, we are a short 10 minute walk from McKinney’s Historic Downtown Square. Many of our guests enjoy the variety of restaurants, entertainment, and unique stores on the Square. McKinney is 30 miles north of Dallas. Access to North Dallas is about a 25-30 minute drive on US75 or new tollways. We are long-time Dallas and McKinney residents, so we can assist guests with directions and suggestions for attractions, entertainment, restaurants, and shopping in the Dallas-Ft. Worth area. We will respect your privacy. Come enjoy McKinney and stay with us in our quiet, relaxed neighborhood.",McKinney,https://www.airbnb.com/rooms/454935?location=Celina%2C%20TX,0.3481
1920's Bungalow Guest House,"Behind our main home in historic McKinney, we have refurbished a 1920’s arts & craft style guest house. This 570 sq. ft. house has been completely redone inside to now include a living area, full kitchen, bedroom, and full bath. We welcome guests for an overnight stay, or an extended stay over weeks or months. Amenities in the guest house include: Fully furnished rooms with all sheets, towels, kitchen utensils/dishes, and modern appliances Heating and air conditioning A spacious, fully equipped IKEA kitchen that we installed in 2011. New Whirlpool appliances include range/oven, microwave, toaster/oven, dishwasher, refrigerator, disposal, and washer/dryer. Included local TV and Roku (Netflix/Amazon Prime/streaming), secured wi-fi internet access, DVD/CD player, and sound system A full bathroom Iron, ironing board & hair dryer Onsite parking by the guest house, or on the street Basically, you can walk into our guest house with your suitcase(s), unpack them, and you’re home. We invite our guests to enjoy the flagstone patio and grill between the guest house and our main home. Guests also enjoy relaxing on the spacious front porch of our main home. We live in the heart of McKinney’s Historic District. In addition to seeing the many older, picturesque homes in this area, we are a short 10 minute walk from McKinney’s Historic Downtown Square. Many of our guests enjoy the variety of restaurants, entertainment, and unique stores on the Square. McKinney is 30 miles north of Dallas. Access to North Dallas is about a 25-30 minute drive on US75 or new tollways. We are long-time Dallas and McKinney residents, so we can assist guests with directions and suggestions for attractions, entertainment, restaurants, and shopping in the Dallas-Ft. Worth area. We will respect your privacy. Come enjoy McKinney and stay with us in our quiet, relaxed neighborhood.",McKinney,https://www.airbnb.com/rooms/454935?location=Allen%2C%20TX,0.3481
Artsy Home Off I35,A nice spacious room with a comfy queen pillow top mattress. House is located right off I35 for convenient travel. Guests are welcome to kitchen and living room. Convenient shops and grocery stores nearby. A traveler's home who welcomes other travelers!,San Antonio,https://www.airbnb.com/rooms/9795479?location=Bulverde%2C%20TX,0.335


# Step 4: Define a new score!

New score is:
- *average price per night (av_r_n_u)*
- *n° of bedrooms (bedr_c_u)*
- *zone (zone_u)*

**Average price per night**: we simply use average price per night from the tsv files and from this variable any house can get at maximum 2 points and at least 0. Maximum number of points (2) is achieved if the price is lower than half of the price suggested by the user, instead if price is two times the requested price the house get 0. To get 1 point you need to have the same price or lower than 1.25 times the suggested price. Naturally, other intermediate marks can be achieved.

**n° of bedrooms**: if the n° of bedrooms requested are more or the same, house gets 1 point; if there are less bedrooms, house gets a lower mark. If there are three bedrooms less than requested or worse, house gets 0 point. 

**zone**: We found a set of coordinates (31,169621, -99,683617) that can be considered the center of texas; that is near to the city of Brady (County of McCulloch), so we divided Texas in four zones according to this center (like a cartesian coordinate system). Zones are: North East (NE), North West (NW), South East (SE) and South West (SW). If the house is in the requested zone it gets 1 point else 0. Is important to clarify that we didn't use cities because, given the fact that there are cities with the same name in different counties (e.g. we have 3 Austin), we can't use them because they can be confused.

Finally we can calculate the Score as sum/4 where sum is sum of the accomulated scores from individul fields based on user choices and 4 is maximum number points that can be accumulated.

In [27]:
av_r_n_u = 0
bedr_c_u = 1000
zone_u = 'NA'

try:
    av_r_n_u = int(input('please, enter maximum price that you can pay(e.g. 10): '))
except:
    print('No price suggested')
    pass
try:
    bedr_c_u = int(input('please, enter the number of bedrooms that you need(e.g 5): '))
except:
    print('No indication about the needed number of bedrooms')
    pass
try:
    zone_u = input('please enter the zone you want to reside(e.g. NE): ')
except:
    print('No zone was indicated')
    pass
list_ord =[]

n_h = []

for doc in list_intersect:
    with open(r'docu_hw3\doc_'+ str(doc) + '.tsv',encoding='utf8') as tsvfile:
         tsvreader = list(csv.reader(tsvfile, delimiter="\t"))
    title = tsvreader[0][7]        
    description = tsvreader[0][4]
    latitude = tsvreader[0][5]
    longitude = tsvreader[0][6]
    url = tsvreader[0][8]
    price = list(tsvreader[0][0])
    eff_price = ''
    for i in range(1,len(price)):
        eff_price += price[i]
    
    
    try:
        av_r_n = int(eff_price)
        bedr_c = int(tsvreader[0][1])
    except ValueError:
        av_r_n = 1000000
        bedr_c = 1
    
    sum_values = 0
    if av_r_n >= 2*av_r_n_u :
               sum_values += 0          
    elif av_r_n < 2*av_r_n_u and av_r_n >= 1.75*av_r_n_u:
               sum_values += 0.25
    elif  av_r_n < 1.75*av_r_n_u and av_r_n >= 1.5*av_r_n_u:
               sum_values += 0.50  
    elif av_r_n < 1.5*av_r_n_u and av_r_n >= 1.25*av_r_n_u:
               sum_values += 0.75 
    elif av_r_n < 1.25*av_r_n_u and av_r_n >= av_r_n_u:
               sum_values += 1           
    elif av_r_n < av_r_n_u and av_r_n >=0.75*av_r_n_u:
               sum_values += 1.25
    elif av_r_n < 0.75*av_r_n_u and av_r_n >=0.5*av_r_n_u :
               sum_values += 1.5
    elif av_r_n < 0.5*av_r_n_u :
               sum_values += 2.
    
    
    if bedr_c >= bedr_c_u:
               sum_values += 1
    elif bedr_c == bedr_c_u-1:
               sum_values += 0.75
    elif bedr_c == bedr_c_u-2:
               sum_values += 0.5
    elif bedr_c <= bedr_c_u-3:
               sum_values += 0 #only to make index easier to understand , no practical effect
            
    try:
        la = float(latitude)
        lo = float(longitude)
        if la <=31.169621 and lo <= -99.683617:
             zone ='SW'
        elif la <=31.169621 and lo > -99.683617:
             zone ='SE'
        elif la >31.169621 and lo > -99.683617:
             zone ='NE'
        elif la >31.169621 and lo <= -99.683617:
             zone ='NW'
        if zone == zone_u:
            sum_values +=1
    except:
        pass
    
    
    score = round(sum_values/4,2)
    
    #list_ord.append([title, description, city, url, str(score),score])
    
    heapq.heappush(n_h,(score, title, description, city, url))

results = []

# Applying heapify max algorithm
heapq._heapify_max(n_h)

limit = 10
d_len = len(list_intersect)

if(d_len < 10):
    limit = d_len

for i in range(limit):
    results.append(list(heapq.heappop(n_h)))
    heapq._heapify_max(n_h)

#print("Applying Heap")  

#print(results)
results_formatted = []

for i in results:
    first = round(i.pop(0), 4)
    i.append(first) 
    results_formatted.append(i)
    

# Displaying the results
show_results(results_formatted, [], True)

please, enter maximum price that you can pay(e.g. 10): 30
please, enter the number of bedrooms that you need(e.g 5): 5
please enter the zone you want to reside(e.g. NE): NE


Title,Description,City,URL,Score
Guest room in North Richland Hills,"Cozy spare bedroom with queen airbed. Overhead fan with dimmable light, plenty of closet space, and a locking door. There's a full bath in the hall with small essentials. You're welcome to use the kitchen or the washer & dryer. Located off of Davis Blvd just north of I-820 & HWY 183, real close to NRH20 Water Park. Multi-purpose trail access just three house down. You can park in the driveway or on the street.",Port Aransas,https://www.airbnb.com/rooms/17859326?location=Colleyville%2C%20TX,0.75
Guest room in North Richland Hills,"Cozy spare bedroom with queen airbed. Overhead fan with dimmable light, plenty of closet space, and a locking door. There's a full bath in the hall with small essentials. You're welcome to use the kitchen or the washer & dryer. Located off of Davis Blvd just north of I-820 & HWY 183, real close to NRH20 Water Park. Multi-purpose trail access just three house down. You can park in the driveway or on the street.",Port Aransas,https://www.airbnb.com/rooms/17859326?location=Burleson%2C%20TX,0.75
Guest room in North Richland Hills,"Cozy spare bedroom with queen airbed. Overhead fan with dimmable light, plenty of closet space, and a locking door. There's a full bath in the hall with small essentials. You're welcome to use the kitchen or the washer & dryer. Located off of Davis Blvd just north of I-820 & HWY 183, real close to NRH20 Water Park. Multi-purpose trail access just three house down. You can park in the driveway or on the street.",Port Aransas,https://www.airbnb.com/rooms/17859326?location=Bedford%2C%20TX,0.75
"Cozy Shared Space in Little ""D""","Welcome to our Air BnB page! We are very excited to Welcome you into our home! We have a charming 3 bedrooms’ house in Denton Texas. Yes… we do have 3 DIFFERENT listings for each space accommodating up to 2 people/rental space! More than 1 room may be booked at the same time. Feel free to reserve all the rooms if you do not want to encounter others AirBnBers! We are laid back, fun, and outgoing people! We are located about 10 minutes from city center of Denton, UNT, and TWU.",Port Aransas,https://www.airbnb.com/rooms/18043945?location=Corinth%2C%20TX,0.62
"Cozy Shared Space in Little ""D""","Welcome to our Air BnB page! We are very excited to Welcome you into our home! We have a charming 3 bedrooms’ house in Denton Texas. Yes… we do have 3 DIFFERENT listings for each space accommodating up to 2 people/rental space! More than 1 room may be booked at the same time. Feel free to reserve all the rooms if you do not want to encounter others AirBnBers! We are laid back, fun, and outgoing people! We are located about 10 minutes from city center of Denton, UNT, and TWU.",Port Aransas,https://www.airbnb.com/rooms/18043945?location=Celina%2C%20TX,0.62
"Cozy Shared Space in Little ""D""","Welcome to our Air BnB page! We are very excited to Welcome you into our home! We have a charming 3 bedrooms’ house in Denton Texas. Yes… we do have 3 DIFFERENT listings for each space accommodating up to 2 people/rental space! More than 1 room may be booked at the same time. Feel free to reserve all the rooms if you do not want to encounter others AirBnBers! We are laid back, fun, and outgoing people! We are located about 10 minutes from city center of Denton, UNT, and TWU.",Port Aransas,https://www.airbnb.com/rooms/18043945?location=Aubrey%2C%20TX,0.62
"Cozy Shared Space in Little ""D""","Welcome to our Air BnB page! We are very excited to Welcome you into our home! We have a charming 3 bedrooms’ house in Denton Texas. Yes… we do have 3 DIFFERENT listings for each space accommodating up to 2 people/rental space! More than 1 room may be booked at the same time. Feel free to reserve all the rooms if you do not want to encounter others AirBnBers! We are laid back, fun, and outgoing people! We are located about 10 minutes from city center of Denton, UNT, and TWU.",Port Aransas,https://www.airbnb.com/rooms/18043945?location=Argyle%2C%20TX,0.62
Casa de Mike #1,"Welcome to Casa de Mike!! The house is a fully renovated 50’s style ranch house has everything you need to have a comfortable no-frills homebase while you are in Dallas. Centrally located with easy access to all major highways and about 10 minutes from downtown and White Rock Lake. Full kitchen, 1GB wifi (super fast!!) , washer dryer, dishes, fridge, cooktop with ventahood it’s all here! Perfect for business travelers or anyone looking for an economical place to stay while in Dallas.",Port Aransas,https://www.airbnb.com/rooms/4942620?location=Balch%20Springs%2C%20TX,0.62
Dog lovers welcome b,2 bedrooms for booking. 1 full bathroom. you get your own corner of the house. plus full access to the kitchen. please smoke outside. free WiFi.,Port Aransas,https://www.airbnb.com/rooms/17959635?location=Colleyville%2C%20TX,0.56
"Cozy Private Room in Little ""D""","Welcome to our Air BnB page! We are very excited to Welcome you into our home! We have a charming 3 bedrooms’ house in Denton Texas. Yes… we do have 3 DIFFERENT listings for each space accommodating up to 2 people/rental space! More than 1 room may be booked at the same time. Feel free to reserve all the rooms if you do not want to encounter others AirBnBers! We are laid back, fun, and outgoing people! We are located about 10 minutes from city center of Denton, UNT, and TWU.",Port Aransas,https://www.airbnb.com/rooms/18043785?location=Corinth%2C%20TX,0.56


# Bonus Step: Make a nice visualization!

An important feature of Airbnb is the search on the map. 

Our tool will works in the following way:

- Takes in input a set of coordinates and a maximum distance from the coordinates.
- Generate a map, with a circle of the given radius, where the center is represented by the coordinates given in input.
- Shows the houses that are inside the circle of the given radius.

In [28]:
#We drop rows where latitude and/or longitude column contains missing values (NaN)
m = m.dropna(subset=['latitude', 'longitude'])

In [29]:
#We ask as input latitude, longitude and a maximum distance to generate the radius
lat = float(input('Enter a latitude: '))
lon = float(input('Enter a longitude: '))
dis = float(input('Enter distance range (in km): '))

Enter a latitude: 30.02
Enter a longitude: -95.29
Enter distance range (in km): 10


In [30]:
#!pip install geopy
import folium
from geopy import distance

#We create the map with given coordinates
mp = folium.Map(location = [lat, lon], zoom_start = 12)

In [31]:
#Search represents the set of given coordinates
search = (lat, lon)

#Now, we create the marker associated with input set of coordinates, that we call origin
folium.Marker(location = [lat, lon], popup = 'origin', icon = folium.Icon(color = 'green', icon = 'home')).add_to(mp)

#Then, we generate the circle with input set of coordinates as center. 
folium.Circle(location = [lat, lon], radius = dis * 1000).add_to(mp)

#For each house that is in the given distance from the input set of coordinates, we create a marker with price that 
#can be clicked to access the house web page
for row in m.itertuples():
    if distance.distance(search, (row.latitude, row.longitude)).km <= dis:
        folium.Marker(location = [row.latitude, row.longitude], popup = folium.Popup('<a href=' + row.url + '>' + row.average_rate_per_night + ' </a>')).add_to(mp)
        
mp.save('map.html')