In [7]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import random
#import plotly.graph_objs as go
#import plotly.plotly as py
pd.options.display.max_columns = 30
#from IPython.core.interactiveshell import InteractiveShell
#import plotly.figure_factory as ff
#InteractiveShell.ast_node_interactivity = 'all'
#from plotly.offline import iplot
# Load the review export. The file is read as latin-1, and empty strings are
# treated as missing values (NaN).
df = pd.read_csv('TripAdvReview.csv', encoding="latin-1", na_values='')
# The bare `df.head()` that used to sit here was not the last expression of the
# cell, so its result was silently discarded; it has been dropped.
print('We have ', len(df), 'rentals in the data')

We have  849 rentals in the data


In [8]:
def print_reviews(index, data=None):
    """Print the review text and title of the row labelled ``index``.

    Parameters
    ----------
    index : label
        Row label to look up (compared against the frame's index).
    data : pandas.DataFrame, optional
        Frame holding ``'Reviews'`` and ``'Title'`` columns. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    None
        The review and title are printed; nothing is printed when no row
        carries the requested label.
    """
    frame = df if data is None else data
    matches = frame.loc[frame.index == index, ['Reviews', 'Title']]
    # Test the selection itself for emptiness: indexing the first row before
    # checking would raise IndexError for an unknown label.
    if matches.empty:
        return
    review, title = matches.iloc[0]
    print(review)
    print('Title:', title)

In [9]:
# Show the review text and title stored at row label 10.
print_reviews(10)

2 
This is one of the best private vacation rentals I have found. It
Title: Inn Of The Sea Resort-Newly renovated, Oceanview & Includes the use of two kayak


In [10]:
# Show the review text and title stored at row label 100.
print_reviews(100)

1 
unwind. Thanks for another memorable family vacation!
Title: Red Bay Getaway


### EDA

#### Vacation Rentals Reviews Word Count Distribution

In [12]:
# Word count per review: number of whitespace-delimited tokens in the
# stringified cell value.
df['word_count'] = [len(str(review).split()) for review in df['Reviews']]
reviews_lengths = df['word_count'].tolist()

# Label/value pairs are handed to a single print call so the values are
# space-separated exactly as positional print arguments.
summary_parts = (
    "Number of descriptions:", len(reviews_lengths),
    "\nAverage word count", np.average(reviews_lengths),
    "\nMinimum word count", min(reviews_lengths),
    "\nMaximum word count", max(reviews_lengths),
)
print(*summary_parts)

Number of descriptions: 849 
Average word count 9.089517078916373 
Minimum word count 1 
Maximum word count 21


In [14]:
# cufflinks is presumably what supplies the `.iplot()` call used on the
# word-count Series further down — confirm.
import cufflinks as cf
# NOTE(review): go_offline() is immediately followed by a config write with
# offline=False and world_readable=True; these look contradictory — confirm
# which plotting mode is intended before sharing this notebook.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)


calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.



In [15]:
# Histogram of per-review word counts, drawn through the Series' iplot method.
histogram_options = dict(
    kind='hist',
    bins=50,
    linecolor='black',
    xTitle='word count',
    yTitle='count',
    title='Word Count Distribution in rental reviews',
)
df['word_count'].iplot(**histogram_options)

### Text Preprocessing

In [17]:
# Empty cells were loaded as missing values (na_values=''). Replace them with
# an empty string before the cast so they do not become the literal text "nan"
# and end up as a token in the vectoriser's vocabulary.
df['Reviews'] = df['Reviews'].fillna('').astype(str)

In [18]:
# Punctuation treated as a word separator: every match is replaced by a space.
# Raw string literals keep the regex escapes (\[ \] \|) from being read as
# invalid Python string escapes, which recent interpreters warn about.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, a lowercase ASCII letter, a space, '#',
# '+' or '_' is removed outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop-word list from NLTK, held as a set for the per-word membership
# test performed in clean_text.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available
# locally — confirm it is downloaded in the target environment.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a review string before vectorisation.

    The text is lowercased, characters matched by ``REPLACE_BY_SPACE_RE`` are
    turned into spaces, characters matched by ``BAD_SYMBOLS_RE`` are deleted,
    and words found in ``STOPWORDS`` are dropped.

    text: a string

    return: the cleaned string, with remaining words joined by single spaces
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # Splitting on whitespace and re-joining also collapses repeated spaces.
    kept_words = [word for word in stripped.split() if word not in STOPWORDS]
    return ' '.join(kept_words)
    
# Cleaned copy of every review; the original 'Reviews' column is left as is.
df['reviews_clean'] = df['Reviews'].map(clean_text)

### Modeling

In [19]:
# Index the frame by rental title. Guarded so that re-running this cell does
# not raise KeyError once 'Title' has already been moved into the index.
if df.index.name != 'Title':
    df.set_index('Title', inplace = True)

# TF-IDF over word 1- to 3-grams of the cleaned reviews. min_df=1 keeps every
# term that occurs in at least one document, which is what min_df=0 meant;
# current scikit-learn rejects an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['reviews_clean'])
# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# computed by linear_kernel is the cosine similarity between reviews.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Positional lookup: row number in the similarity matrix -> rental title.
indices = pd.Series(df.index)

def recommendations(Title, cosine_similarities = cosine_similarities, top_n = 10):
    """Return the titles of the rentals whose reviews are most similar to ``Title``.

    Parameters
    ----------
    Title : str
        Title of the rental to find neighbours for. When several rows share
        the title, the first one is used.
    cosine_similarities : 2-D array-like, optional
        Pairwise similarity matrix whose row order matches the notebook-level
        ``indices`` Series. Defaults to the matrix computed in this notebook.
    top_n : int, optional
        Number of titles to return (default 10).

    Returns
    -------
    list
        Up to ``top_n`` titles, ordered from most to least similar, never
        including the queried row itself.

    Raises
    ------
    IndexError
        If ``Title`` is not present in ``indices``.
    """
    # getting the position of the rental that matches the title
    matches = indices[indices == Title].index
    if len(matches) == 0:
        raise IndexError(f"No rental titled {Title!r} in the similarity index")
    idx = matches[0]

    # Remove the queried row explicitly instead of assuming it sorts first:
    # with tied scores (duplicate reviews, or a review that cleans down to
    # nothing) the rental's own entry is not guaranteed to be at position 0.
    score_series = pd.Series(cosine_similarities[idx]).drop(idx)

    # A stable sort keeps tied rentals in their original row order, so the
    # result is deterministic.
    score_series = score_series.sort_values(ascending = False, kind = 'mergesort')

    # positions of the top_n most similar rentals
    top_indexes = score_series.iloc[:top_n].index

    # map matrix positions back to rental titles
    return [indices.iloc[i] for i in top_indexes]

### Recommendations

In [23]:
# List the rentals whose cleaned review text scores highest against the
# review of 'Red Bay Getaway'.
recommendations('Red Bay Getaway')

['McGregor Bay Island Cottage Rental',
 'Riverfront, Central 3 Bedroom/3 Bathroom Condo in the Heart of Old Quebec',
 'Rivers End Cottage',
 'Stunning Lake View | Beachfront Condo + Saltwater Pool + Hot Tub',
 'Beautiful Home on Chesterman Beach',
 'Cobblestone Farm Retreat',
 'Howe Bay Beach House - PEI Oceanfront Vacation Rental',
 'Best Sauble Beach has to offer!!!!',
 'Tulameen Vacation Cabin',
 'Twilly House - Twillingate Vacation Home Rental']