## A combined recommender system based on numeric and text listing data 

In [None]:
# import libraries
import json
import math
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import sparse
from scipy.stats import kurtosis, skew
import altair as alt
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)
pd.set_option('display.max_columns', 100)
RANDOM_STATE= 42

from collections import Counter
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from kneed import KneeLocator
from pickle import dump, load

import re
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('vader_lexicon')

from nltk import word_tokenize, pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer




# [cell output, truncated warning] from pandas.core.computation.check import NUMEXPR_INSTALLED


In [None]:
##### filtering listing data 

def _narrow_if_any(df, mask):
    """Return the rows of ``df`` selected by ``mask``, or ``df`` itself when nothing matches."""
    return df.loc[mask] if mask.any() else df


def get_data(price_range, num_of_beds, num_of_bedrooms, num_of_bathrooms, df=None):
    """Load the listing table and narrow it down by the user's filters.

    Each filter is "soft": it is applied only when at least one row satisfies
    it; otherwise that filter is skipped and the rows selected so far are kept.
    Filters are applied in order: price, beds, bedrooms, bathrooms.

    Parameters
    ----------
    price_range : sequence of two numbers
        Inclusive ``(low, high)`` price bounds.
    num_of_beds, num_of_bedrooms, num_of_bathrooms
        Values matched exactly against the ``beds``, ``bedrooms`` and
        ``bathrooms_count`` columns.
    df : pandas.DataFrame, optional
        Already-loaded listing table. When omitted, the table is read from the
        pickled archive under ``../data/data_cleaned``.

    Returns
    -------
    pandas.DataFrame
        The filtered listings (original index preserved).
    """
    if df is None:
        # NOTE: read_pickle unpickles arbitrary objects; only load trusted files.
        df = pd.read_pickle('../data/data_cleaned/cleaned_listing_finalized_for_streamlit.zip')

    low, high = price_range[0], price_range[1]
    # The same inclusive bounds are used for the "does anything match?" check
    # and for the filter itself, so a listing priced exactly at ``low`` counts.
    df_filter = _narrow_if_any(df, (df['price'] >= low) & (df['price'] <= high))

    exact_filters = (
        ('beds', num_of_beds),
        ('bedrooms', num_of_bedrooms),
        ('bathrooms_count', num_of_bathrooms),
    )
    for column, wanted in exact_filters:
        df_filter = _narrow_if_any(df_filter, df_filter[column] == wanted)

    return df_filter





In [None]:
# Example user inputs for the recommender.
# (low, high) price bounds passed to get_data
price_range = (50,500)
# exact-match values for the 'beds', 'bedrooms' and 'bathrooms_count' columns
num_of_beds = 1
num_of_bedrooms = 1
num_of_bathrooms = 1
# free-text request used by the text-based model
input_query = "I like a room with a swimming pool"


## dataframe of the listings that satisfy the filter queries
filter_df = get_data(price_range,num_of_beds,num_of_bedrooms,num_of_bathrooms)




In [None]:
########################################################################################################
##### Cosine similarity

# Cluster label with the most rows among the filtered listings.
major_cluster = (
    filter_df['cluster']
    .value_counts()
    .sort_values(ascending=False)
    .index[0]
)

# Columns fed into the cosine-similarity computation.
cosine_similarity_col = [
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'accommodates',
    'bedrooms',
    'beds',
    'price',
    'minimum_nights',
    'maximum_nights',
    'has_availability',
    'number_of_reviews',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'has_license',
    'instant_bookable',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'reviews_per_month',
    'bathrooms_count',
    'amenities_count',
    'host_operate_years',
    'polarity',
]

# Rows of the dominant cluster, restricted to the similarity columns; missing
# values are replaced by 0 before the pairwise similarity is computed.
similarity_df = filter_df.loc[
    filter_df['cluster'] == major_cluster, cosine_similarity_col
].fillna(0)
num_similarity = cosine_similarity(similarity_df)


########################################################################################################
##### build up num-based model

# Every column name of the filtered listing table, in table order.
model_columns_all = list(filter_df.columns.values)

# Columns shown to the user in a recommendation result.
ui_display_columns = [
    'cluster',
    'listing_id',
    'listing_url',
    'listing_name',
    'price',
    'beds',
    'bedrooms',
    'bathrooms_count',
    'description',
    'room_type',
    'property_type',
    'neighborhood_overview',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'amenities',
    'number_of_reviews',
    'review_scores_rating',
    'host_about',
]

# Positional (iloc) index of each display column within the table.
iloc_cols = list(map(model_columns_all.index, ui_display_columns))


def get_num_recommendations(df, similarity, n, listing_id=None, listing_url=None, query_element=None):
    """Return the ``n`` listings most similar to a reference listing.

    The reference listing is identified by exactly one of ``listing_id``,
    ``listing_url`` or ``query_element`` (checked in that order).

    Parameters
    ----------
    df : pandas.DataFrame
        Listings whose row order matches the rows/columns of ``similarity``.
    similarity : array-like, shape (len(df), len(df))
        Pairwise similarity matrix.
    n : int
        Maximum number of rows to return; fewer are returned when ``df`` is
        shorter than ``n``.
    listing_id, listing_url : optional
        Value looked up in the ``listing_id`` / ``listing_url`` column.
    query_element : int, optional
        Row position of the reference listing, used directly.

    Returns
    -------
    pandas.DataFrame
        The display columns (module-level ``iloc_cols``) of the best matches,
        ordered from most to least similar, with a ``similarity`` column
        inserted at position 2 and a 1-based index.

    Raises
    ------
    ValueError
        If no reference is given, or the id/url is not present in ``df``.
    """
    # Translate the query into a row position of the similarity matrix.
    if listing_id is not None:
        lookup = ('listing_id', listing_id)
    elif listing_url is not None:
        lookup = ('listing_url', listing_url)
    elif query_element is not None:
        lookup = None
    else:
        raise ValueError("one of listing_id, listing_url or query_element is required")

    if lookup is None:
        item_index = query_element
    else:
        column, wanted = lookup
        try:
            item_index = df[column].tolist().index(wanted)
        except ValueError as error:
            # Previously this was printed and ignored, which left the index as
            # None and silently produced a wrongly-shaped lookup further down.
            raise ValueError(f"{column} {wanted!r} not found in the listings") from error

    scores = np.asarray(similarity[item_index])
    # Slicing with [:n] already copes with fewer than n rows.
    top_idx = np.argsort(scores)[::-1][:n]

    result_df = df.iloc[top_idx, iloc_cols].copy()
    # The score column sits at the same position regardless of the result size.
    result_df.insert(loc=2, column='similarity', value=scores[top_idx])

    result_df = result_df.reset_index(drop=True)
    result_df.index = np.arange(1, len(result_df) + 1)

    return result_df





In [None]:
########################################################################################################
##### build up text-based model


##### prepare stopword set
# Extra words removed on top of NLTK's English stop-word list.
added_stopwords = ["can't", 't', 'us', 'say', 'would', 'also', 'within', 'stay', 'since']
nltk_STOPWORDS = set(stopwords.words("english")).union(added_stopwords)


##### preprocess input query

def preprocess_text(text, stopwords = nltk_STOPWORDS, stem=False, lemma=False):
    """Normalise a piece of free text into a space-separated token string.

    Steps: lower-case, strip HTML tags and every character that is not an
    ASCII letter, digit, space or tab, delete the character pair "br",
    tokenise, drop stop words, then optionally stem and/or lemmatise.

    Parameters
    ----------
    text : str
        Raw input text.
    stopwords : collection of str
        Tokens to discard (membership test only).
    stem : bool
        Apply ``PorterStemmer`` when equal to ``True``.
    lemma : bool
        Apply ``WordNetLemmatizer`` when equal to ``True``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    # Tags and disallowed characters are replaced by nothing (not by a space).
    stripped = re.sub("(<.*?>)|([^0-9A-Za-z \t])","",lowered)
    # NOTE(review): this removes "br" anywhere, including inside ordinary words
    # ("bright" -> "ight"). Kept as-is because the precomputed corpus may have
    # been cleaned with the same rule -- confirm before changing either side.
    stripped = re.sub("(br)", '', stripped)

    # Only stop words are filtered here; numeric tokens are kept.
    tokens = [token for token in word_tokenize(stripped) if token not in stopwords]

    if stem == True:
        porter = PorterStemmer()
        tokens = [porter.stem(token) for token in tokens]

    if lemma == True:
        wordnet = WordNetLemmatizer()
        tokens = [wordnet.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

##### Vectorize data

def vectorize_data(corpus):
    """Fit a TF-IDF model on ``corpus`` and return ``(vectorizer, matrix)``.

    The vectorizer uses unigrams and bigrams and scikit-learn's built-in
    English stop-word list. ``matrix`` is whatever ``fit_transform`` returns
    for the corpus; no densification is performed here.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    document_term_matrix = vectorizer.fit_transform(corpus)
    return vectorizer, document_term_matrix

# Fit the TF-IDF model on the 'content' column of the filtered listings.
# Row i of tmatrix corresponds to row position i of filter_df.
corpus = filter_df['content'].values
tfidf_vectorizer, tmatrix = vectorize_data(corpus)

##### get similarity

def extract_best_indices(similarity, top_n, mask=None):
    """Return the positions of the best-scoring documents, best first.

    Parameters
    ----------
    similarity : np.ndarray
        Either a 1-D score vector (one score per document) or a 2-D matrix of
        shape ``(n_query_tokens, n_documents)``; a 2-D input is averaged over
        the token axis.
    top_n : int
        Maximum number of positions to return.
    mask : array-like of bool, optional
        Per-document flags, same shape as the score vector; documents whose
        flag is false are excluded.

    Returns
    -------
    np.ndarray
        Up to ``top_n`` document positions ordered from highest to lowest
        score. Documents with a score of exactly 0 are never returned, so the
        result may be shorter than ``top_n`` (or empty).

    Raises
    ------
    ValueError
        If ``mask`` does not have one entry per document.
    """
    similarity = np.asarray(similarity)
    # Collapse per-token rows into one score per document.
    cos_sim = similarity.mean(axis=0) if similarity.ndim > 1 else similarity

    order = np.argsort(cos_sim)[::-1]
    # Drop documents with no similarity at all. This must be combined with the
    # mask using AND; an OR with an all-true mask would keep everything.
    keep = cos_sim[order] != 0

    if mask is not None:
        mask = np.asarray(mask, dtype=bool)
        if mask.shape != cos_sim.shape:
            raise ValueError(
                f"mask shape {mask.shape} does not match score shape {cos_sim.shape}"
            )
        keep &= mask[order]

    return order[keep][:top_n]


##### get recommendations

def get_text_recommendations(df, input_query, _tfidf_matrix, n=5):
    """Return up to ``n`` listings whose text best matches ``input_query``.

    The query is cleaned with ``preprocess_text`` (lemmatised, not stemmed),
    each remaining token is embedded separately with the module-level
    ``tfidf_vectorizer``, and ``extract_best_indices`` ranks the documents
    from the resulting token-by-document similarity matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings; row position ``i`` must correspond to row ``i`` of
        ``_tfidf_matrix``.
    input_query : str
        Free-text request.
    _tfidf_matrix
        TF-IDF matrix of the listing texts, produced by the same vectorizer.
    n : int
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame
        Display columns of the matching listings with a 1-based index. Empty
        when the query contains no usable token.
    """
    display_columns = [
        'cluster',
        'listing_id',
        'listing_url',
        'listing_name',
        'price',
        'beds',
        'bedrooms',
        'bathrooms_count',
        'description',
        'room_type',
        'property_type',
        'neighborhood_overview',
        'neighbourhood_cleansed',
        'neighbourhood_group_cleansed',
        'amenities',
        'number_of_reviews', 'review_scores_rating', 'host_about',
    ]

    # embed input query, one row per token
    tokens = preprocess_text(input_query, stopwords=nltk_STOPWORDS, stem=False, lemma=True).split()

    if tokens:
        query_vector = tfidf_vectorizer.transform(tokens)
        similarity = cosine_similarity(query_vector, _tfidf_matrix)
        best_index = extract_best_indices(similarity, top_n=n)
    else:
        # A query made only of stop words/punctuation has nothing to match on;
        # return an empty result instead of failing inside the vectorizer.
        best_index = np.array([], dtype=int)

    # best_index holds matrix row positions, so select by position rather
    # than by index label.
    result_df = df.iloc[best_index].loc[:, display_columns]
    result_df = result_df.reset_index(drop=True)
    result_df.index = np.arange(1, len(result_df) + 1)

    return result_df




In [None]:
########################################################################################################
##### build up the combined model

def get_recommendation(df,input_query,_tfidf_matrix, n):
    """Dispatch to the numeric or the text-based recommender.

    With an empty query (``None``, ``""`` or whitespace only) the first
    listing of the dominant cluster is used as the reference for
    ``get_num_recommendations``; otherwise ``get_text_recommendations`` is
    used with the query text.

    Parameters
    ----------
    df : pandas.DataFrame
        Filtered listings.
    input_query : str or None
        Free-text request.
    _tfidf_matrix
        TF-IDF matrix aligned with the rows of ``df``.
    n : int
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame
        The recommended listings.

    Notes
    -----
    The numeric branch relies on the module-level ``major_cluster`` and
    ``num_similarity``.
    NOTE(review): those globals appear to be derived from ``filter_df``, so
    ``df`` presumably has to be that same frame -- confirm.
    """
    if input_query is None or not input_query.strip():
        rec_df = df.loc[df['cluster'] == major_cluster]
        select_listing_id = rec_df['listing_id'].iloc[0]
        return get_num_recommendations(rec_df, num_similarity, n, listing_id=select_listing_id)

    # Give the rows a 0..len-1 index so they line up with the matrix rows;
    # drop=True avoids a clash if an 'index' column already exists.
    df = df.reset_index(drop=True)
    return get_text_recommendations(df, input_query, _tfidf_matrix, n)


# Run the combined model on the filtered listings and request 5 results;
# the bare name on the last line displays the frame as the cell output.
recomended_listings_update = get_recommendation(filter_df,input_query,tmatrix, 5)
recomended_listings_update


########################################################################################################
# add review sentiment plot for the recommended listings #


def get_review_data():
    """Load and return the saved review dataset from its pickled archive.

    Per its file name the archive holds cleaned reviews with polarity and
    topic information.
    """
    return pd.read_pickle('../data/data_cleaned/cleaned_review_with_polarity_and_topic.zip')


review_df = get_review_data()

# make plot
# notice: altair can only take <=5000 rows, so cannot show all listings at once

def plot_listing_sentiment_over_time(df,listing_id = None):
    """Build a line chart of the yearly mean polarity per listing.

    Parameters
    ----------
    df : pandas.DataFrame
        Review data with ``listing_id``, ``date`` and ``polarity`` columns.
    listing_id : scalar or list-like, optional
        Listing id(s) to plot. ``None`` plots every listing in ``df``; note
        that Altair limits the number of rows it accepts by default, so this
        is only practical for a small ``df``.

    Returns
    -------
    alt.Chart
        Interactive chart with one line per listing.
    """
    if listing_id is None:
        # isin(None) raises, so the default means "no filtering".
        sub_df = df
    else:
        if np.isscalar(listing_id):
            listing_id = [listing_id]
        sub_df = df[df['listing_id'].isin(listing_id)]

    return alt.Chart(sub_df, width=500).mark_line().encode(
                x='year(date):T',
                y='mean(polarity)',
                color=alt.Color('listing_id:O', scale=alt.Scale(scheme= 'dark2'))
            ).interactive()

# Plot the yearly sentiment trend for the recommended listings only:
# collect their ids, build the chart, and display it as the cell output.
rec_listing_ids = recomended_listings_update['listing_id'].values
sentiment_plot = plot_listing_sentiment_over_time(review_df, rec_listing_ids)
sentiment_plot

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f2a50dc6-ff6a-45ff-9dbe-d7a35bd1e393' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>