What follows is a variation on a notebook presented by a Thinkful Denver meetup group, with notes and explanations added throughout for my own edification. 

In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import scipy 
import sklearn
import seaborn as sns
import re 
from collections import Counter 
%matplotlib inline

In [1]:
print('hello world')

hello world


In [5]:
# Import the Data Set.
data = pd.read_csv('https://github.com/Thinkful-Ed/data-201-resources/raw/master/hotel-reviews.csv')

In [6]:
data.head()

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,,A Traveler,
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Lugnt l��ge,,Maud,
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,Good location on the Lido.,,Julie,
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,������ ���������������,,sungchul,


In [11]:
# Here we undertake some basic text processing. First, we make sure everything is lowercase,
# because the bag-of-words model that we'll eventually build treats 'Here' as different from 
# 'here', so that'd be a problem. Then we do something akin to regular expressions by removing 
# any special characters like ')'. Again, this would cause problems for our models later on. 

data['reviews.text'] = data['reviews.text'].str.lower()
data['reviews.text'] = data['reviews.text'].str.replace(r'\.|\!\?|\'|,|-|\(|\)',"",)
data['reviews.text'] = data['reviews.text'].fillna('')

In [12]:
# Now, on to the actual machine learning. What we're doing here is vectorizing the text 
# in reviews.text, which is to transform them into a format that our computer can do something 
# with. The CountVectorizer specializes in making these text data into vectors of frequency 
# counts. 

from sklearn.feature_extraction.text import CountVectorizer 

vectorizer = CountVectorizer(max_features = 5000) # We do this to 
# instantiate the CountVectorizer object. 

In [15]:
# Calling .fit_transform() on the reviews.text data makes the words within these sentences 
# into DataFrame columns, each word in its own column. 

X = vectorizer.fit_transform(data['reviews.text'])

# Then we'll make this into a table called 'bag-of-words'. 
bag_of_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [21]:
# This .rename method let's us clean up the titles of the data we care 
# about for the sake of clarity. To the columns positional argument we've
# passed in a dictionary whose keys are the current column name and 
# whose values are the names we want the column to have. 

data.rename(columns = {'address': 'hotel_address', 'city': 'hotel_city', 
                       'country': 'hotel_country', 'name': 'hotel_name'}, 
            inplace=True)

full_df = data.join(bag_of_words) # This isn't required for the current 
# project, but making the full df would allow us to play around with 
# other predictive variables if we chose to do so. 

In [22]:
# With this cell we now have all the data that we want from the dataframes, 
# in two variables; X, our predictive variable, y_hotel, our predicted one.

X = bag_of_words
y_hotel = data['hotel_name']

In [23]:
# And now, we're going to use a random forest model from sklearn.

from sklearn.ensemble import RandomForestClassifier 
rcf = RandomForestClassifier() # As before, we have to instantiate the 
# object.

In [24]:
# This is where the magic happens. When we call .fit(), the random forest
# model is doing the actual 'learning' part of machine learning. 

rcf.fit(X, y_hotel)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
test_review = ['''
    I want to go somewhere cold and wet and dirty.
    ''']

In [27]:
# Before our test review can be of any use we must first vectorize it.

X_test = vectorizer.transform(test_review).toarray()

In [32]:
# Using the random forest predictor built up during the .fit() process above
# we can derive a prediction from the transformed vector of our test 
# review. 

prediction = rcf.predict(X_test)[0]

In [34]:
# And now to get the hotel name, location, etc. corresponding to the 
# hotel predicted by our rcf. 

data[data['hotel_name'] == prediction][['hotel_name', 'hotel_address',
                                        'hotel_city', 'hotel_country']].head(1)

Unnamed: 0,hotel_name,hotel_address,hotel_city,hotel_country
31689,Americas Best Value Inn - Medical Center/airport,450 Raynolds St,El Paso,US
