In [1]:
# Notebook setup: imports, display options, offline plotting configuration and
# loading of the review data set into the global DataFrame `df`.
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import random
import plotly.graph_objs as go
# NOTE(review): the output recorded for this cell is an ImportError stating that
# `plotly.plotly` is deprecated and pointing to `chart_studio.plotly`. Nothing
# visible in this notebook uses `py`; until this import is resolved the cell
# aborts here and `df` is never created.
import plotly.plotly as py
import cufflinks
# Show up to 30 columns when a DataFrame is displayed.
pd.options.display.max_columns = 30
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
# Display the value of every expression statement in a cell, not only the last.
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
# Configure cufflinks for offline plotting and store its defaults (theme 'solar').
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='solar')
# The CSV is decoded as latin-1; later cells read its 'Reviews' and 'Title' columns.
df = pd.read_csv('TripAdvReview.csv', encoding="latin-1")
df.head()
# len(df) is the number of rows in the file.
# NOTE(review): rows look like individual reviews rather than rentals — confirm the wording.
print('We have ', len(df), 'rentals in the data')

ImportError: 
The plotly.plotly module is deprecated,
please install the chart-studio package and use the
chart_studio.plotly module instead. 


In [None]:
def print_reviews(index):
    """Print the review text and its title for the row labelled ``index``.

    Parameters
    ----------
    index : label
        Index label to look up in the global DataFrame ``df``.

    Prints nothing when no row carries that label. If several rows share the
    label, only the first one is printed.

    Note: this reads the 'Title' column, so it must run before the modelling
    cell moves 'Title' into the index; after that the lookup raises KeyError.
    """
    rows = df.loc[df.index == index, ['Reviews', 'Title']].values
    # Check for emptiness *before* taking the first row; the original indexed
    # [0] first, so a missing label raised IndexError and the guard never helped.
    if len(rows) == 0:
        return
    review, title = rows[0]
    print(review)
    print('Title:', title)

In [None]:
# Example: show the review text and title stored at index label 10.
print_reviews(10)

In [None]:
# Example: show the review text and title stored at index label 100.
print_reviews(100)

### EDA

#### Token (vocabulary) Frequency Distribution Before Removing Stop Words

In [None]:
def get_top_n_words(corpus, n=None, stop_words=None):
    """Return the ``n`` most frequent tokens in ``corpus`` with their counts.

    Parameters
    ----------
    corpus : iterable of str
        Documents to tokenize with ``CountVectorizer``.
    n : int or None, default None
        How many (word, count) pairs to keep; ``None`` keeps the whole
        vocabulary.
    stop_words : str, list or None, default None
        Forwarded unchanged to ``CountVectorizer(stop_words=...)``. The default
        ``None`` keeps every token, which is what this cell always did; pass
        ``'english'`` to filter stop words instead of redefining the function.

    Returns
    -------
    list of (word, count) tuples, sorted by descending count. Ties keep the
    iteration order of the vectorizer's ``vocabulary_`` mapping.
    """
    vec = CountVectorizer(stop_words=stop_words)
    # One pass over the corpus instead of a separate fit() and transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums of the document-term matrix = total occurrences per token.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# The 20 most frequent tokens in the raw review text, as a horizontal bar chart.
# The 'Reviews' column of df1 holds the token, 'count' its total frequency.
common_words = get_top_n_words(df['Reviews'], n=20)
df1 = pd.DataFrame(data=common_words, columns=['Reviews', 'count'])
(
    df1.groupby('Reviews')['count']
    .sum()
    .sort_values()
    .iplot(
        kind='barh',
        yTitle='Count',
        linecolor='black',
        title='Top 20 words in vacation rental reviews before removing stop words',
    )
)


#### Token (vocabulary) Frequency Distribution After Removing Stop Words

In [None]:
def get_top_n_words(corpus, n=None, stop_words='english'):
    """Return the ``n`` most frequent tokens in ``corpus`` with their counts.

    This definition replaces any earlier ``get_top_n_words`` in the kernel.

    Parameters
    ----------
    corpus : iterable of str
        Documents to tokenize with ``CountVectorizer``.
    n : int or None, default None
        How many (word, count) pairs to keep; ``None`` keeps the whole
        vocabulary.
    stop_words : str, list or None, default 'english'
        Forwarded unchanged to ``CountVectorizer(stop_words=...)``. The default
        keeps this cell's behaviour (English stop words removed); pass ``None``
        to count every token.

    Returns
    -------
    list of (word, count) tuples, sorted by descending count. Ties keep the
    iteration order of the vectorizer's ``vocabulary_`` mapping.
    """
    vec = CountVectorizer(stop_words=stop_words)
    # One pass over the corpus instead of a separate fit() and transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums of the document-term matrix = total occurrences per token.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# The 20 most frequent tokens once English stop words are filtered out (using
# the get_top_n_words defined in this cell), as a horizontal bar chart.
# The 'Reviews' column of df2 holds the token, 'count' its total frequency.
common_words = get_top_n_words(df['Reviews'], n=20)
df2 = pd.DataFrame(data=common_words, columns=['Reviews', 'count'])
(
    df2.groupby('Reviews')['count']
    .sum()
    .sort_values()
    .iplot(
        kind='barh',
        yTitle='Count',
        linecolor='black',
        title='Top 20 words in vacation rental reviews after removing stop words',
    )
)


#### Vacation Rentals Reviews Word Count Distribution

In [None]:
# Whitespace-separated token count per review. str() keeps non-string cells from
# raising; note that a missing value becomes the text 'nan' and counts as 1 word.
df['word_count'] = df['Reviews'].apply(lambda x: len(str(x).split()))
reviews_lengths = list(df['word_count'])
# Label corrected: these statistics describe reviews, not "descriptions".
print("Number of reviews:",len(reviews_lengths),
      "\nAverage word count", np.average(reviews_lengths),
      "\nMinimum word count", min(reviews_lengths),
      "\nMaximum word count", max(reviews_lengths))

In [None]:
# Histogram (50 bins) of the per-review word counts stored in df['word_count'],
# drawn through the `iplot` accessor that cufflinks adds to pandas objects.
df['word_count'].iplot(
    kind='hist',
    bins = 50,
    linecolor='black',
    xTitle='word count',
    yTitle='count',
    title='Word Count Distribution in rental reviews')

### Text Preprocessing

In [None]:
# Raw strings: the patterns are unchanged, but `\[`, `\]` and `\|` are invalid
# escape sequences in a normal string literal and trigger a warning on current
# Python versions.
# Punctuation that is turned into a space: / ( ) { } [ ] | @ , ;
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is NOT a digit, lowercase a-z, space, '#', '+' or '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop-word list from NLTK, held as a set so the membership test in
# clean_text is a hash lookup.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded
# beforehand — confirm for a fresh environment.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one review for vectorization.

    Steps, in order: lowercase; replace the characters matched by
    ``REPLACE_BY_SPACE_RE`` with a space; delete the characters matched by
    ``BAD_SYMBOLS_RE``; drop tokens found in ``STOPWORDS``; re-join the
    remaining tokens with single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw review text. ``None`` and float NaN (what pandas stores for an
        empty CSV cell) are treated as an empty document; any other non-string
        value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing input.
    """
    # `text != text` is true only for NaN. Without this guard a missing cell
    # reaches .lower() as a float and raises AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    # Symbols are deleted, not replaced by a space, so "room.the" becomes
    # "roomthe" and "don't" becomes "dont" before the stop-word filter runs.
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
    
# Store the cleaned text next to the raw review; the TF-IDF model in the
# modelling cell is fitted on this column.
df['reviews_clean'] = df['Reviews'].apply(clean_text)

### Modeling

In [None]:
# Index the rows by review title so recommendations() can map positions back to
# titles. Guarded so that re-running this cell does not fail with KeyError once
# 'Title' has already been moved into the index.
if df.index.name != 'Title':
    df.set_index('Title', inplace = True)

# Word-level TF-IDF over unigrams, bigrams and trigrams of the cleaned reviews.
# min_df=1 keeps every term, exactly as the former min_df=0 did (a vocabulary
# term occurs in at least one document), but recent scikit-learn releases reject
# an integer min_df below 1 during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['reviews_clean'])
# TfidfVectorizer L2-normalizes each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity. The result has one row and
# one column per review.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Positional index (0..n-1) -> title, used to look a title up by name.
indices = pd.Series(df.index)

def recommendations(Title, cosine_similarities = cosine_similarities, top_n = 10):
    """Return the titles of the reviews most similar to the one named ``Title``.

    Parameters
    ----------
    Title : str
        Title to look up in the global ``indices`` Series. When several rows
        share the title, the first one is used.
    cosine_similarities : 2-D array, default: the module-level matrix
        Pairwise similarity matrix whose row/column order matches ``indices``.
        The default is bound when this function is defined, so re-run this cell
        after recomputing the matrix.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` titles from ``df.index``, most similar first.

    Raises
    ------
    IndexError
        If no row is titled ``Title``.
    """
    matches = indices[indices == Title]
    if matches.empty:
        # Same exception type the bare `.index[0]` used to raise, with a usable message.
        raise IndexError('no review titled {!r} in the data'.format(Title))
    idx = matches.index[0]

    # Drop the query row by label instead of assuming it sorts first: another
    # row with an identical review also scores 1.0 and could take position 0,
    # which would leave the query itself in the results.
    scores = pd.Series(cosine_similarities[idx]).drop(idx)
    top_positions = scores.nlargest(top_n).index

    # Index df.index directly rather than rebuilding list(df.index) per item.
    return [df.index[position] for position in top_positions]

### Recommendations

In [None]:
# NOTE(review): '' looks like a placeholder. Replace it with a title that exists
# in df.index; a title with no match makes recommendations() raise IndexError.
recommendations('')