# ADM - Homework 03

Importing the libraries

In [1]:
from os.path import isdir
from os import mkdir
import pandas as pd
import numpy as np
import csv
import math
from os import listdir
from os.path import isfile
from os import remove
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

from nltk import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import string
import pickle
import inflect


Creating the class which it will handle the work

**CLEANING DATA PROCESS:**
1. Detect NA:

    -Delete all rows with *(lat,long)* equal to NA since if an house is not locatable it is useless for a customer.
            
    -Delete rows with both *description* and *title* equal to NA since it wouldn't be possible to match the user query.
            
    -Retain all others rows, maybe later penalizing records with NA values for other columns(e.g.:*bedrooms_count*).

In [3]:
class AirBnbPy():

    def __init__(self, data_dir_path="./data/",all_reviews_dir = "allReviews/"):
        self.dir_path = data_dir_path
        #If the path given as input doesn't exist on the filesystem
        #it is created
        if not isdir(self.dir_path):
            mkdir(self.dir_path)
        
        self.review_dir = all_reviews_dir
        
        #Initialize the data
        self.data = None
        
        #If the path given as input doesn't exist on the filesystem
        #it is created
        if not isdir(self.dir_path+self.review_dir):
            mkdir(self.dir_path+self.review_dir)
            
        #Lazy initialization of nltk objects for preprocessing
        self.tokenizer = None
        self.stopwords = None
        self.stemmer = None
        self.number_to_words = None
        
        #Initialize list which will contain indexes of file with chinese characters
        self.non_english = []
        
        #Before saving the files on the file system, the preprocessing steps are performed
        #Store in two variables the indexes of the input list which reference to the 
        #'description' and 'title' attributes.
        self.DESCRIPTION = 4
        self.TITLE = 7
        
        #Initialize an empty dictionary which will contains the integer encoding 
        #of each word
        self.term_enc = {}
        self.VOCABULARY_SIZE = 1
        
        #Initialization of the reverted indexing
        self.RI = {}
        
        #Initialization of corpus and indexToIndex
        self.corpus = None
        self.indexToindex = None
        
        #default fileName for the tfidf object 
        self.tfIdfFileName = "tfIdfMatrix.pickle"
        #default fileName for vectorizer
        self.tfIdfVectorizerFileName = "vectorizer.pickle"
        #default fileNames for corpus and indexToindex
        self.corpusFileName = "corpus.pickle"
        self.indexToindexFileName = "indexToindex.pickle"
        #default fileName for the term encoding of tfIdf
        self.term_enc2FileName = "term_encodingTFIDF.pickle"
        
        #Initialization of term_encoding 2
        self.term_enc2 = None
        
        #Initialization of the inverted index for the Ranked Search Engine
        self.RI2 = None
        #default fileName for inverted index for ranking records
        self.RI2FileName = "RI2.pickle"
        
        #Initialization of tfidf matrix
        self.tfIdfMatrix = None
        self.vectorizer = None
        
    
    def loadData(self):
        #Hardcoded the file name of the dataset
        dataFilename = "Airbnb_Texas_Rentals.csv"
        #A pandas.DataFrame is returned
        #Given the structure of the dataset, having the index encoded as a column
        #So the parameter 'index_col' is specified
        self.data = pd.read_csv(self.dir_path+dataFilename, index_col = "Unnamed: 0", encoding = 'utf8')
        self._cleanData()
        return self.data
    
    def _cleanData(self):
        #Invoking the code line "data.isnull().sum()", it is possible to observe
        #the presence of 34 NA value in the latitude,longitude columns.
        #
        #It is not possible to retain those houses, since it's not possible to
        #locate them which it is a fundamental info for users
        self.data = self.data[pd.notnull(self.data['latitude'])]
        #If the code line "data.isnull().sum()", it's newly invoked, it is possible
        #to observe that there are 2 NA values in the "description" column
        #and 3 NA values in the "title" column.
        #
        #Since the work requirements ask to realize the search engine on the "description"
        #and the "title" column, all rows which have both columns equal to NA will be
        #dropped.
        self.data = self.data.dropna(subset = ['description','title'], how = 'all')
        #The last check to be done if row duplicated exists
        #If so delete them and leave only one copy
        self.data = self.data.drop_duplicates()
        #TODO:
        #Check if the (lat,long) coordinates refer to the city and don't point
        #to another part of the world.
        #
        #TODO:
        #Check if the URL is still valid.
        #
        #So far, two assumptions are made regarding these things
        
    def _createTsv(self,x):
        
        x[self.DESCRIPTION] = "NaN" if pd.isna(x[self.DESCRIPTION]) else self._nltkProcess(x[self.DESCRIPTION])
        x[self.TITLE] = "NaN" if pd.isna(x[self.TITLE]) else self._nltkProcess(x[self.TITLE])
        for fieldString in x:
            if not self._isEnglish(str(fieldString)):
                return
        
        with open(self.dir_path+self.review_dir+"doc_"+str(x.name)+".tsv", 'w') as file:
            #Need to express delimiter as "\t" since the requested format is a .tsv
            #Was noticed strings in descriptions with foreign language characters which was not
            #possible to encode while writing tsv file so the try/except was added to skip this
            #house
            try:
                wr = csv.writer(file, quoting=csv.QUOTE_ALL, delimiter='\t')
                wr.writerow(x)
            except:
                self.non_english.append(x.name)
            
    def _nltkProcess(self, string):
        try:
            #Transform all words to lowercase
            string = string.lower()
        except:
            print(str(string)+" type:= "+str(type(string)))
        #Setup nltk objects to perform preprocessing
        self._setupNltk()
        #Tokenize the string removing puntuactions
        tokens = self.tokenizer.tokenize(string) 
        #Create new sentence
        new_sentence = []
        #Scroll through each word and stemming it
        for word in tokens:
            word = self.stemmer.stem(word)
            #exclude the word if it is a stopword
            if not word in self.stopwords:
                #if the word has length greater than one, it has sufficient information
                #value to be added
                if len(word) > 1:
                    new_sentence.append(word)
                #if the word length is equal to one and it is numeric
                #then the string representation of the number is added
                elif word.isnumeric():
                    new_sentence.append(self.number_to_words.number_to_words(word))
        #Since the object must later be saved on a .tsv file,
        #it is needed to return a string rather than a list of words
        return " ".join(new_sentence)
    
    def _setupNltk(self):
        #Lazy initialization of objects needed to preprocess strings
        if self.tokenizer == None:
            self.tokenizer = RegexpTokenizer(r'\w+')
        if self.stopwords == None:
            self.stopwords = set(stopwords.words('english'))
        if self.stemmer == None:
            self.stemmer = SnowballStemmer('english') 
        if self.number_to_words == None:
            self.number_to_words = inflect.engine()
        
    def createAllReviews(self):
        self.data.apply(lambda x: self._createTsv(x), axis = 1)
        #remove all files with chinese characters
        [remove(self.dir_path+self.review_dir+"doc_"+str(index)+".tsv") for index in self.non_english]
    
    #Function needed to check the correct language
    def _isEnglish(self,s):
        try:
            s.encode(encoding='utf-8').decode('ascii')
        except:
            #print('not english'+s)
            return False
        else:
            return True

    def buildEncoding(self,fileName):
        
        #If the file already exists then we load it, instead to compute it another time.
        if isfile(self.dir_path+fileName):
            with open(self.dir_path+fileName, 'rb') as handle:
                self.term_enc = pickle.load(handle)
                self.VOCABULARY_SIZE = len(list(self.term_enc.keys()))
            return
        
        for filePath in listdir(self.dir_path+self.review_dir):
            data = pd.read_csv(self.dir_path+self.review_dir+ filePath, delimiter = '\t',header = None, encoding = 'utf8')
            title = set(str(data.values[0][self.TITLE]).strip().split(' '))
            description = set(str(data.values[0][self.DESCRIPTION]).strip().split(' '))
            concatenatedwords = title.union(description)
            #for each word contained in the title and description fields
            for word in concatenatedwords:
                #if the word is not encoded yet
                if word not in self.term_enc.keys():
                    #store the encoding into the @term_enc dictionary
                    self.term_enc[word] = self.VOCABULARY_SIZE
                    #update the vocabulary size variable
                    self.VOCABULARY_SIZE += 1
        
        #In the end, the dictionary is saved on the filesystem to be loaded further.           
        with open(self.dir_path+fileName, 'wb') as handle:
            pickle.dump(self.term_enc, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

    def createRevertedIndex(self, fileName):
        
        #If the file already exists then we load it, instead to compute it another time.
        if isfile(self.dir_path+fileName):
            with open(self.dir_path+fileName, 'rb') as handle:
                self.RI = pickle.load(handle)
            return
        
        #Each word ID is associated to a list of docs which contains it
        #At the beginning is empty of course
        for i in range(1,self.VOCABULARY_SIZE+1):
            self.RI[i] = list()
        
        #For each .tsv file contained the records of the original dataset
        for filePath in listdir(self.dir_path+self.review_dir):
            #The file is open through Pandas remembering that since it is a .tsv file,
            #the delimiter will be a tab character and the header is not present
            data = pd.read_csv(self.dir_path+self.review_dir+ filePath, delimiter = '\t',header = None, encoding = 'utf8')
            
            #Now all words from the 'title' and 'description' fields are extracted
            #and concatenated in a set since we are not interested in repetition
            #of the same doc
            title = set(str(data.values[0][self.TITLE]).strip().split(' '))
            description = set(str(data.values[0][self.DESCRIPTION]).strip().split(' '))
            concatenatedwords = title.union(description)
            #for each word contained in the title and description fields
            for word in concatenatedwords:
                #the previous list of docs associated to the word is retrieved
                mocklist = self.RI[self.term_enc[word]]
                #It is updated with the new value
                mocklist.append(filePath[:-4])
                #Finally assigned in the reverted index data structure
                self.RI[self.term_enc[word]] = mocklist
                #the mockllist is deleted since it is no useful anymore
                del mocklist
        
        #The reverted index is saved on the filesystem for further uses.
        with open(self.dir_path+fileName, 'wb') as handle:
            pickle.dump(self.RI, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
    
    def query(self,queryString):
        #Eventually setup the nltk enviornment
        self._setupNltk()
        #Process the query as we have processed the dataset
        q = self._nltkProcess(queryString).split(' ')
        #list of docs for each word
        result = list()
        #For each word in the query
        for word in q:
            #If the word isn't encoded then it isn't present in any document
            #so we can return no result.
            if word not in self.term_enc.keys():
                print("No results available")
                return None
            #otherwise we append the set of the document to the list
            result.append(set(self.RI[self.term_enc[word]]))
        
        tmp = set.intersection(*result)
        #initialize the dataframe which will contain the final result of the query
        result = pd.DataFrame(columns = ["title","description", "city", "url"])
        
        #for each doc to be retrieved
        for doc in tmp:
            #retrieve the index of the record (from doc_xxx.tsv to xxx)
            index = int(doc[4:])
            #retrieve the row
            row = self.data.loc[index]
            #Append the row to the final result
            result = result.append(row[["title","description", "city", "url"]])
        
        return result, q
    
    def _buildCorpus(self):
        
        #If the variables are not yet initialized and the file names have not been specified
        #the variables are created and saved into memory.
        if self.corpus == None and self.indexToindex == None:
            #If the files exist then they are loaded, instead to compute them again.
            if isfile(self.dir_path+self.corpusFileName) and isfile(self.dir_path+self.indexToindexFileName):
                with open(self.dir_path+self.corpusFileName, 'rb') as handle:
                    self.corpus = pickle.load(handle)
                with open(self.dir_path+self.indexToindexFileName, 'rb') as handle:
                    self.indexToindex = pickle.load(handle)
                print("[LOG]: Corpus and indexToindex have been correctly loaded from the memory")
                return
            
            #Otherwise they are newly created
            self.corpus = list()
            #This dictionary maps the index position inside the corpus list to 
            #the index of the .tsv file
            self.indexToindex = dict()
            #For each .tsv file containing the records of the original dataset
            for filePath in tqdm(listdir(self.dir_path+self.review_dir)):
                #The file is open through Pandas remembering that since it is a .tsv file,
                #the delimiter will be a tab character and the header is not present
                data = pd.read_csv(self.dir_path+self.review_dir+ filePath, delimiter = '\t',header = None, encoding = 'utf8')

                #retrieve the title and the description fields
                title = str(data.values[0][self.TITLE]).strip()
                description = str(data.values[0][self.DESCRIPTION]).strip()

                #concatenate them in a string
                document = title + " " + description
                #append the document to the corpus
                self.corpus.append(document)
                #store the index association
                self.indexToindex[len(self.corpus)-1] = int(filePath[4:][:-4])
            #Save the variables on the filesystem
            with open(self.dir_path+self.corpusFileName, 'wb') as handle:
                pickle.dump(self.corpus, handle, protocol=pickle.HIGHEST_PROTOCOL)
            with open(self.dir_path+self.indexToindexFileName, 'wb') as handle:
                pickle.dump(self.indexToindex, handle, protocol=pickle.HIGHEST_PROTOCOL)
            return
           
        #If they are both initialized all went fine.
        print("[LOG]: corpus and indexToindex are already loaded")
    
    def _tfIdfSetup(self):
        #Check if the tfidf matrix exists.
        #If not then, if the file exists, it is loaded from the filesystem
        #otherwise it is created and stored on the filesystem
        if self.tfIdfMatrix == None and self.vectorizer == None:
            #If the file already exists then we load it, instead to compute it another time.
            if isfile(self.dir_path+self.tfIdfFileName) and isfile(self.dir_path+self.tfIdfVectorizerFileName):
                with open(self.dir_path+self.tfIdfFileName, 'rb') as handle:
                    self.tfIdfMatrix = pickle.load(handle)
                with open(self.dir_path+self.tfIdfVectorizerFileName, 'rb') as handle:
                    self.vectorizer = pickle.load(handle)
                print("[LOG]: The tfidf matrix and tfidf vectorizer have been correctly loaded from the memory.")
            else:
                print("[LOG]: The file "+self.tfIdfFileName+" doesn't exist")
                print("[LOG]: A new tfidf matrix will be built and saved in persistent memory with the name:= "+self.tfIdfFileName)
                self.vectorizer = TfidfVectorizer()
                self.tfIdfMatrix = self.vectorizer.fit_transform(self.corpus)
                #Save the variables on the filesystem
                with open(self.dir_path+self.tfIdfFileName, 'wb') as handle:
                    pickle.dump(self.tfIdfMatrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
                with open(self.dir_path+self.tfIdfVectorizerFileName, 'wb') as handle:
                    pickle.dump(self.vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
            return
        
        print("[LOG]: The tfidf matrix and tfidf vectorizer are already loaded.")
        
    def _termEnc2setup(self):
        #setup for term encoding
        if self.term_enc2 == None:
            #If the file already exists then we load it, instead to compute it another time.
            if isfile(self.dir_path+self.term_enc2FileName):
                with open(self.dir_path+self.term_enc2FileName, 'rb') as handle:
                    self.term_enc2 = pickle.load(handle)
                print("[LOG]: The term encoding have been correctly loaded from the memory")
            else:
                print("[LOG]: The file "+self.term_enc2FileName+" doesn't exist")
                print("[LOG]: A new term encoding will be built and saved in persistent memory with the name:= "+self.term_enc2FileName)
                self.term_enc2 = {k: v for v, k in enumerate(self.vectorizer.get_feature_names())}
                #Save the variables on the filesystem
                with open(self.dir_path+self.term_enc2FileName, 'wb') as handle:
                    pickle.dump(self.term_enc2, handle, protocol=pickle.HIGHEST_PROTOCOL)
            return
        
        print("[LOG]: The term encoding is already loaded")
    
    def _invertedIndex2setup(self):

        if self.RI2 == None:
            #If the file already exists then we load it, instead to compute it another time.
            if isfile(self.dir_path+self.RI2FileName):
                with open(self.dir_path+self.RI2FileName, 'rb') as handle:
                    self.RI2 = pickle.load(handle)
                print("[LOG]: The inverted index have been correctly loaded from the memory")
                return
            else:
                print("[LOG]: The file "+self.RI2FileName+" doesn't exist")
                print("[LOG]: A new inverted index will be built and saved in persistent memory with the name:= "+self.RI2FileName)
                self.RI2 = {}
                #Each word ID is associated to a list of docs which contains the word.
                for term_key in list(self.term_enc2.keys()):
                    self.RI2[term_key] = list()
                
                #Iterating over each document
                for doc_index in range(self.tfIdfMatrix.shape[0]):
                    #get the tfidf vector of the document
                    data = self.tfIdfMatrix[doc_index]
                    #the data is a sparse matrix of scipy package
                    cx = data.tocoo()    
                    #this for allows to iterate only over the non-zero entries of the data
                    for _,j,v in zip(cx.row, cx.col, cx.data):
                        #the previous list of docs associated to the word is retrieved
                        mocklist = self.RI2[j]
                        #REMEMBER:
                        #self.indexToindex maps the index value of the document
                        #in the tfidf matrix to the index of the .tsv file
                        mocklist.append(("doc_"+str(self.indexToindex[doc_index]),v))
                        #Finally assigned in the reverted index data structure
                        self.RI2[j] = mocklist
                        #the mockList is deleted since it is no useful anymore
                        del mocklist

                #The reverted index is saved on the filesystem for further uses.
                with open(self.dir_path+self.RI2FileName, 'wb') as handle:
                    pickle.dump(self.RI2, handle, protocol=pickle.HIGHEST_PROTOCOL)
                
            return
        print("[LOG]: The inverted index is already loaded")
    
    def _rankedQuerySetup(self):
        #Computing the TFIDF values involve creating the corpus of our dataset.
        #
        #The corpus consists of a collection of documents where each document
        #it's represented by the string obtained concatenating the title and 
        #the description field of each record.
        self._buildCorpus()
        #setup for the TfIdf, more details to the function
        self._tfIdfSetup()
        #setup for the term encoding
        self._termEnc2setup()
        #setup for the inverted index
        self._invertedIndex2setup()
    
    
    def rankedQuery(self, queryString, k):
        
        #the dataframe not yet ranked and the query already processed
        (notRanked, q) = self.query(queryString)
        #if the result is empty or it contains only one record
        #it is useless to rank the result
        if notRanked is None or notRanked.shape[0] == 1 :
            return notRanked
    
        #setup the environment to rank each document with respect to the query
        self._rankedQuerySetup()
        
        #Ranking the results
        #
        #get the indexes of each document in the intermediate result
        indexes = np.array(notRanked.index)
        #vectorize function to create the new series containing the 
        func = np.vectorize(lambda doc_index: self._evaluateCosineSimilarity(query = q,doc_index = doc_index))
        #create the new series containing the cosine similarity
        notRanked["Similarity"] = pd.Series(func(indexes), index = indexes)
        #return the records sorted for similarity in descending order
        return notRanked.sort_values(by = "Similarity", ascending = False )   
       
    
    def _evaluateCosineSimilarity(self,query, doc_index):
        
        denominatore = 0
        
        numeratore = 0
        count_element = Counter(query)
        
        for word in set(query):
            #tfidf value of the word for the document @doc_index
            tfidf_DOC = [item[1] for item in self.RI2[self.term_enc2[word]] if "doc_"+str(doc_index) in item][0]
            #tf value for the word in the query
            tf_Q = count_element[word]
            #idf value for the word in the query
            idf_Q = math.log10((1+(self.tfIdfMatrix.shape[0] + 1))/(1+len(self.RI2[self.term_enc2[word]]))) + 1
            #compute the tfidf for the word in the query
            tfidf_Q = tf_Q * idf_Q
            denominatore += tfidf_Q**2
            #compute the dot product
            toAdd = tfidf_DOC * (tfidf_Q)
            #add the value 
            numeratore += toAdd
            
        return numeratore / math.sqrt(denominatore)
    








Creating the object

In [4]:
airbnb = AirBnbPy()

**Code execution**

Load the data, need to return data for debugging purpose.

In [5]:
data = airbnb.loadData()

#1. load the data.scv
#2. calls cleandata() and return
#3. return data

Create the files requested at Step 2 plus preprocessing of step 3.

In [48]:
airbnb.createAllReviews()

#1. calls createAllReviews
#2. calls CreateTSV
#3. calls _nltkProcess
#4. calls _setupNltk
#5. at the end it returns the data pre processed to the search engine.

Step 3.

This step consists in:

1. Build the encoding for the words in the corpus. To each word is associated a unique int ID. 
    
2. Create the reverted index. To each word ID is associated the set of documents which contains that word.
    
3. Realize an interface to perform queries to the search engine. 
    
    3.1. The first type of query will be conjunctive, so the result is the set of documents which contain ***all*** query words.

In [6]:
airbnb.buildEncoding(fileName = "term_encoding.pickle")

In [6]:
airbnb.term_enc

{'grand': 1,
 'restaur': 2,
 'mall': 3,
 'centerra': 4,
 'access': 5,
 'outlet': 6,
 'easi': 7,
 'locat': 8,
 '10': 9,
 'shop': 10,
 'dine': 11,
 'amaz': 12,
 'near': 13,
 'beauti': 14,
 'highway': 15,
 'water': 16,
 'amp': 17,
 'park': 18,
 'typhoon': 19,
 'close': 20,
 'kati': 21,
 'conveni': 22,
 'mill': 23,
 'upscal': 24,
 'center': 25,
 '99': 26,
 'corridor': 27,
 'texa': 28,
 'hous': 29,
 'parkway': 30,
 'energi': 31,
 'travel': 32,
 'furnish': 33,
 'tast': 34,
 'great': 35,
 'solo': 36,
 'slip': 37,
 'kid': 38,
 'refresh': 39,
 'night': 40,
 'marina': 41,
 'travi': 42,
 'island': 43,
 'golf': 44,
 'condo': 45,
 'adventur': 46,
 'love': 47,
 'pool': 48,
 'rentabl': 49,
 'etc': 50,
 'get': 51,
 'lake': 52,
 'luxuri': 53,
 'villa': 54,
 '35': 55,
 'sauna': 56,
 '24': 57,
 'boat': 58,
 'cours': 59,
 'busi': 60,
 'famili': 61,
 'also': 62,
 'breathtak': 63,
 'trail': 64,
 'seren': 65,
 'enjoy': 66,
 'readi': 67,
 'coupl': 68,
 'bike': 69,
 'hike': 70,
 'transport': 71,
 'nrg': 72,
 '

In [7]:
airbnb.createRevertedIndex(fileName = "reverted_index.pickle")

In [140]:
airbnb.RI

{1: ['doc_3224',
  'doc_5143',
  'doc_13543',
  'doc_6522',
  'doc_14116',
  'doc_3116',
  'doc_2732',
  'doc_16959',
  'doc_6721',
  'doc_14332',
  'doc_5573',
  'doc_17699',
  'doc_3565',
  'doc_10471',
  'doc_7776',
  'doc_10882',
  'doc_588',
  'doc_17661',
  'doc_10263',
  'doc_3915',
  'doc_12037',
  'doc_7458',
  'doc_14234',
  'doc_620',
  'doc_8382',
  'doc_1774',
  'doc_9281',
  'doc_6307',
  'doc_17662',
  'doc_8855',
  'doc_4240',
  'doc_4416',
  'doc_3554',
  'doc_2662',
  'doc_10282',
  'doc_2629',
  'doc_6708',
  'doc_6444',
  'doc_7349',
  'doc_14368',
  'doc_17999',
  'doc_3841',
  'doc_9341',
  'doc_14194',
  'doc_6059',
  'doc_6117',
  'doc_5175',
  'doc_15488',
  'doc_7581',
  'doc_14373',
  'doc_6804',
  'doc_10258',
  'doc_6594',
  'doc_11227',
  'doc_5566',
  'doc_5469',
  'doc_13985',
  'doc_16571',
  'doc_14997',
  'doc_2774',
  'doc_12493',
  'doc_10121',
  'doc_296',
  'doc_11161',
  'doc_5967',
  'doc_8264',
  'doc_12211',
  'doc_7338',
  'doc_9924',
  'doc_

In [8]:
intermediate,_  = airbnb.query('a beautiful house')

In [9]:
intermediate.head()

Unnamed: 0,title,description,city,url
17389,RANCH LIFE! Guest House 8 miles N of Weatherford,Guest house on cattle ranch approximately 8 mi...,Weatherford,https://www.airbnb.com/rooms/9228982?location=...
9685,Carriage House,Carriage house located in the city limits of B...,Bandera,https://www.airbnb.com/rooms/7647735?location=...
10947,"Nice new home, Queen, WiFi, TV and refrigerator",A New house in a beautiful safe community. Ope...,Carrollton,https://www.airbnb.com/rooms/17750938?location...
13808,Message for pictures,Beautiful brand new home with 4 bedrooms 3.5 b...,Rosenberg,https://www.airbnb.com/rooms/15903178?location...
9784,San Antonio Lake House on Lake LBJ,The San Antonio Lake House is located in the T...,Kingsland,https://www.airbnb.com/rooms/13634952?location...


In [10]:
prova = airbnb.rankedQuery('a beautiful house', k =  10)

[LOG]: Corpus and indexToindex have been correctly loaded from the memory
[LOG]: The tfidf matrix and tfidf vectorizer have been correctly loaded from the memory.
[LOG]: The term encoding have been correctly loaded from the memory
[LOG]: The inverted index have been correctly loaded from the memory


In [33]:
from heapq import nlargest, nsmallest

In [29]:
pippo = list()
for index, row in prova.iterrows():
    pippo.append((index,row["Similarity"]))

In [30]:
pippo

[(5653, 0.7026467950144443),
 (14156, 0.44496432084442483),
 (12144, 0.3487224296746597),
 (11555, 0.3460384249006663),
 (9055, 0.3460384249006663),
 (8266, 0.34577620715876317),
 (7443, 0.34425050077781005),
 (1623, 0.34425050077781005),
 (16106, 0.34029242675783067),
 (15744, 0.33925663766744724),
 (499, 0.3388788249511112),
 (7564, 0.3367875338904195),
 (6192, 0.3308501179389358),
 (3188, 0.3308349332206224),
 (9485, 0.3308349332206224),
 (7140, 0.32060387950677827),
 (7234, 0.31354268104385224),
 (1832, 0.30649771351786986),
 (8590, 0.30347909456472605),
 (657, 0.30347909456472605),
 (15248, 0.2973135899121256),
 (15341, 0.29166171229414056),
 (7415, 0.29057862234906034),
 (9831, 0.28922215693079817),
 (4855, 0.28581478165073615),
 (16983, 0.27957924020592606),
 (11216, 0.2719602674157985),
 (11177, 0.26956939635585575),
 (2675, 0.26771119812234606),
 (13148, 0.26580440882613665),
 (8589, 0.2657361381856473),
 (2480, 0.2657361381856473),
 (10007, 0.2657361381856473),
 (6905, 0.2619

In [45]:
for t in nlargest(10, pippo, key = lambda x:x[1]):
    index = t[0]
    sim_value = t[1]
    
    print(intermediate.loc[index])
    print("Similarity:= "+str(sim_value))
    print()

title                                                Quiet place
description                                      Beautiful house
city                                                   New Caney
url            https://www.airbnb.com/rooms/16743790?location...
Name: 5653, dtype: object
Similarity:= 0.7026467950144443

title                                 Beautiful, New Beach House
description    Our house is close to restaurants and dining, ...
city                                               Crystal Beach
url            https://www.airbnb.com/rooms/15068378?location...
Name: 14156, dtype: object
Similarity:= 0.44496432084442483

title                                  1. Room in beautiful home
description    A nice clean room in a quiet, beautiful house ...
city                                                 San Antonio
url            https://www.airbnb.com/rooms/13915253?location...
Name: 12144, dtype: object
Similarity:= 0.3487224296746597

title          House in wonderful neighb

In [None]:
data.head()

## Auxiliary functions

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in set(stopwords.words('english'))]
    stemmer = SnowballStemmer('english')
    new_sentence = []
    for word in filtered_words:
        new_sentence.append(stemmer.stem(word))
    return " ".join(new_sentence)


sentence = "At eight o'clock on Thursday. morning Arthur didn't feel very good. French-Fries"
print(preprocess(sentence))

Suggestions: 
- Spelling correction
- Maybe Lemmatization

Trying Lemmatization

In [None]:
from textblob import Word
train['tweet'] = train['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['tweet'].head()

In [None]:
from textblob import Word

In [None]:
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in set(stopwords.words('english'))]
    stemmer = SnowballStemmer('english')
    new_sentence = []
    for word in filtered_words:
        new_sentence.append(stemmer.stem(word))
    return " ".join(new_sentence)


sentence = "At eight o'clock on Thursday. morning Arthur didn't feel very good. French-Fries"
print(preprocess(sentence))

In [None]:
data.isnull().values.any() #Check if NA value are present

In [None]:
data.isnull().sum() #Check NA values for each column

In [None]:
data[data.isnull().any(axis = 1)] #Show the rows where at least one NA value is present

In [None]:
NEXT STEP
build reverted index