In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

from sklearn.metrics.pairwise import cosine_similarity
from heapq import heappush, nlargest

from collections import OrderedDict
import pickle
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole module.
pd.options.mode.chained_assignment = None


# First way
# TF(t)  = (number of times term t appears in a document) / (total number of terms in the document)
# IDF(t) = log_e(total number of documents / number of documents that contain term t)

def calculate_tf_idf(airbnb_data,inverted_idx,vocabulary):
    """
    method that computes the tf-idf weight of every term in every document

    tf(t, d) = (occurrences of t in d) / (number of terms in d)
    idf(t)   = log_e(number of documents / number of documents listed for t in inverted_idx)

    input:  airbnb_data(DataFrame)-only its index (document ids) and its row count are used
            inverted_idx(dictionary, key=term_id, value=list of document_ids)
            vocabulary(dictionary of all unique words, key=term, value=term_id)
    output: tf_idf_dic(dictionary of tf_idf values for all docs, key=tuple(term,doc_id), value=tf_idf value)
    """
    tf_idf_dic = {}
    # number of .tsv files which were made (one per row of airbnb_data)
    total_num_docs = airbnb_data.shape[0]
    for doc_id in airbnb_data.index:
        # take one file; only the first row's description and title are used
        doc = pd.read_csv('data/doc_'+str(doc_id)+'.tsv',sep='\t',usecols=['description','title'],encoding='ISO-8859-1')
        # NOTE(review): assumes both fields are non-empty strings; an empty field
        # is read as NaN and would make this concatenation fail - confirm upstream.
        text = doc.description[0]+' '+doc.title[0]
        # preprocessing (helper defined elsewhere; presumably returns a list of terms)
        filtered_words = preprocessing_text(text)
        # tf of every distinct term, ordered alphabetically by term
        tf_series = (pd.Series(filtered_words).value_counts()/len(filtered_words)).sort_index()
        # term and tf come from the same series, so no positional alignment
        # between separately sorted structures is needed
        for term, tf_value in tf_series.items():
            idf_value = np.log(total_num_docs/len(inverted_idx[vocabulary[term]]))
            # key=tuple(term,doc_id), value=tf_idf value
            tf_idf_dic[term, doc_id] = tf_value*idf_value
    return tf_idf_dic

# Second way -- same computation as the first, stored differently; used to double-check the results
def calculate_tf_idf2(airbnb_data,inverted_idx,vocabulary):
    """
    method that computes tf-idf values (keeps tf and idf in separate dictionaries
    before multiplying them, as a cross-check of calculate_tf_idf)

    input:  airbnb_data(DataFrame)-only its index (document ids) and its row count are used
            inverted_idx(dictionary, key=term_id, value=list of document_ids)
            vocabulary(dictionary of all unique words, key=term, value=term_id)
    output: proba(dictionary of tf_idf values for all docs, key=tuple(term,doc_id), value=tf_idf value)
    """
    #store separately tf and idf values into dictionaries
    idf_dic2 = {}
    tf_dic2 = {}
    #dictionary for tf_idf values
    proba = {}
    total_num_docs = airbnb_data.shape[0]

    for i in airbnb_data.index:
        #take one file; only the first row's description and title are used
        doc = pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title'],encoding='ISO-8859-1')
        text = doc.description[0]+' '+doc.title[0]
        #preprocessed words (helper defined elsewhere; presumably returns a list of terms)
        filtered_words = preprocessing_text(text)
        #tf values calculations
        tf_series = (pd.Series(filtered_words).value_counts()/len(filtered_words)).sort_index()
        #store idf values into dict, one entry per distinct term of this document
        for term in set(filtered_words):
            idf_dic2[term, i] = np.log(total_num_docs/len(inverted_idx[vocabulary[term]]))
        #store tf values into dict; Series.items() replaces iteritems(), which
        #was removed in pandas 2.0
        for term, tf_value in tf_series.items():
            tf_dic2[term, i] = tf_value
            #combine tf and idf for this document only; walking every key
            #accumulated so far on each document would be quadratic
            proba[term, i] = tf_dic2[term, i]*idf_dic2[term, i]
    return proba