In [8]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import label_binarize


# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import words
from nltk.corpus import brown
import pickle
import boto3

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# Set execution role
client = boto3.client('s3') #low-level functional API
resource = boto3.resource('s3') #high-level object-oriented API
my_bucket = resource.Bucket('sagemaker-nomadiq-data') #subsitute this for your s3 bucket name. 


# Download data from S3
We download model artifacts and raw Instagram and Wikitravel data from S3.

In [7]:
my_bucket.download_file('instagram_df.csv','instagram_df.csv')
my_bucket.download_file('wikitravel.csv','wikitravel.csv')

# Load Instagram data

In [None]:
# This data contains location id and location name mappings scraped from instagram
insta_df = pd.read_csv('instagram_df.csv')

In [None]:
insta_df.head()

# Get Wikitravel scrape data

In [None]:
wiki_df = pd.read_csv('wikitravel.csv',sep = '\t', index_col= False)
# Remove entries with "errors"
wiki_df = wiki_df[(wiki_df['summary'] != 'error')|(wiki_df['do'] != 'error')|
                  (wiki_df['see'] != 'error')|(wiki_df['eat'] != 'error')]

# Map Locations to Location_IDs
Instagram IDs are often at the city + state + country level and the wiki travel data is at just the city level. We need to normalizes these to map to eachother.

In [None]:
# First we zip the location_names and location_id from the instagram data
zipped_locations = list(zip(insta_df.location_name,insta_df.location_id))

In [None]:
# Get first 5 rows of zipped data from instagram_data
zipped_locations[:5]

In [None]:
# We condense the large instagram scrape data by grouping to only unique combinations of location_name and location_id
insta_df2 = insta_df.groupby(['location_name','location_id']).size().reset_index().rename(columns={0:'count'})
zipped_locations = list(zip(insta_df2.location_name,insta_df2.location_id))

In [None]:
# Get first 5 rows of zipped data from instagram_data
zipped_locations[:5]

In [None]:
# Compile list of cities from wiki scrape data
city_list = list(wiki_df.city)

In [None]:
# Get first 5 rolws of city list
city_list[:5]

In [None]:
# Create dictionary of city names and location ID using city list from wiki_travel and location ID mappings from instagram data.
# Output is a key value pair of city name and location id
city_dict = {}

# Loop through all cities in the wikitravel list of cities
for city in city_list:
    # Inner loop through all location_name/location_id mappings
    for location in zipped_locations:
        # If there is an exactly city match pull write the key/value
        # Else continue
        if city.lower() == location[0].lower():
            city_dict[city] = int(location[1])
            break
        else:
            continue
    # If no exact matches are found, we try soft matches where find the city name within the longer string in location_name
    try:
        city_dict[city]
    except:
        for location in zipped_locations:
            if city.lower() in location[0].lower():
                city_dict[city]= int(location[1])
                break
        else:
            continue
    

# Preprocess data for vectorization
The data from Wikitravel needs to be parsed and normalized prior to vectorization

In [None]:
# Clean string with preprocessing regex rules
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
#     string = re.sub(r'\b\d+(?:\.\d+)?\s+', '', string)
#     string = re.sub(r"\d+", "", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"<b>", " ", string)
    string = re.sub(r"</b>", " ", string)
    string = re.sub(r"<br>", " ", string)
    string = re.sub(r"</br>", " ", string)
    string = re.sub(r"<p>", " ", string)
    string = re.sub(r"</p>", " ", string)
    string = re.sub(r"<ul>", " ", string)
    string = re.sub(r"</ul>", " ", string)
    string = re.sub(r"<li>", " ", string)
    string = re.sub(r"</li>", " ", string)  
    return string.strip().lower()


In [None]:
# Concatenate all text from Wikitravel text into single column for each location
wiki_df['concat_text'] = wiki_df['summary'] + " " + wiki_df["do"] + " " + wiki_df['see'] + " " + wiki_df['eat']
wiki_df['concat_text'] = wiki_df['concat_text'].apply(clean_str)

In [None]:
# Limit the wiki data to only those that are in the location_name/id mappings
wiki_df = wiki_df[wiki_df['city'].isin(list(city_dict.keys()))]
wiki_df.reset_index(drop = True, inplace = True)
# Dedupe the dataframe
wiki_df = wiki_df.drop_duplicates(['city'])
print("Number of Cities: ", len(wiki_df))
wiki_df.head()

# Build vocabulary and stopword list
The problem with using TFIDF for travel data is that location specific data will be heavily weighted. For example, an article about Tokyo will have the word "Toyko" appear many times. Tokyo is likely not included in many other articles except for others that are about Japan (e.g, Osaka, Kyoto, etc.). This would make the recommendation engine overly specific. Therefore, we limit the vocabulary to the english dictionary and remove location-specific strings.

In [None]:
# Create unique location list and city list to include into stopwords
location_name_list = list(insta_df2.location_name.unique().astype(str))
location_name_list = [x.lower() for x in location_name_list]
cities = pd.read_csv('cities.csv', index_col= False)
city_list = list(cities['city'].astype(str)) + list(cities['country'].astype(str))
city_list = [x.lower() for x in city_list]
location_list = set(location_name_list + city_list)
location_list

In [None]:
city_list2 = []
for city in city_list:
    split_city = city.split()
    for split in split_city:
        city_list2.append(split)

location_list = set(location_name_list + city_list2)

In [None]:
# Concatenate NLTK words and brown corpuses for comprehensive vocabulary
english_vocab = set([w.lower() for w in words.words()] + [w.lower() for w in brown.words()])

In [None]:
# Create Custom Stopwords unioning all english words and location-specific strings
my_stop_words = text.ENGLISH_STOP_WORDS.union(location_list)

# Vectorize text using TFIDF

In [None]:
# We instantiate the TfidifVectorizer using our customer stopwords and limiting to the english vocab
vectorizer = TfidfVectorizer(stop_words = my_stop_words, vocabulary = english_vocab)
X_tfidf = vectorizer.fit_transform(wiki_df['concat_text'])

In [None]:
print("Number of Features:", len(vectorizer.get_feature_names()))

In [None]:
# Get number of features
X_tfidf.shape

# Determine Cosine Similarity
We use cosine similarity to determine pairwise similiarities of locations. 

In [None]:
# We pass the TFIDF matrix to get pairwise similarities of all entries. 
cosine_sim = linear_kernel(X_tfidf, X_tfidf)
# example output of cosine similarity for our first city "A Coruna"
# There is a score for every city from 0 to 1 (1 means that the pair is identical).
cosine_sim[0]

# Find most similar cities to seed city

In [None]:
#Construct a reverse map of cities to indices
indices = pd.Series(wiki_df.city)
indices.reset_index()
reverse_indices = pd.Series(wiki_df.index, index=wiki_df['city'])

In [None]:
indices.head()

In [None]:
# Pickle similarity score data, indices, wiki_df, tfidf_vectorizer, x_tfidf
with open('tfidf_artifacts.pickle', 'wb') as f:
    pickle.dump([cosine_sim,indices,reverse_indices,vectorizer,X_tfidf,city_dict], f)

my_bucket.upload_file('tfidf_artifacts.pickle','tfidf_artifacts.pickle')

In [4]:
# Function that takes in city as input and outputs most similar cities
# Pickle sim_scores, indices, wiki_df, tfidf vectorizer, x_tfidf, clean_str?

def get_recommendations_city(city, cosine_sim=cosine_sim):
    city = city.title()
    idx = reverse_indices[city]
    # Get the pairwise similarity scores of all cities
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the cities based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar cities
    sim_scores = sim_scores[1:11]
    # Get the city indices
    city_indices = [i[0] for i in sim_scores]
    city_recs = []
    loop_count = 0
    for city in indices.iloc[city_indices]:
        city_recs.append({"location_id": city_dict[city], "location_name": city, "cosine_similarity": sim_scores[loop_count][1]})
        loop_count += 1
    # Return the top 10 most similar cities
    return json.dumps(city_recs)

In [6]:
# Enter city name to get top 10 city names
get_recommendations_city('Seattle')

'[{"location_id": 282991098, "location_name": "Toyama", "cosine_similarity": 0.33108362726572177}, {"location_id": 246003568, "location_name": "Kusatsu", "cosine_similarity": 0.26432142461217406}, {"location_id": 388032555, "location_name": "Kitakyushu", "cosine_similarity": 0.26155100986898505}, {"location_id": 216289572, "location_name": "Saitama", "cosine_similarity": 0.24143604747506808}, {"location_id": 363044171, "location_name": "Busan", "cosine_similarity": 0.24004022868819272}, {"location_id": 243652040, "location_name": "Sasebo", "cosine_similarity": 0.23542860728034753}, {"location_id": 241120524, "location_name": "Hachinohe", "cosine_similarity": 0.23097162848893407}, {"location_id": 235447538, "location_name": "Sapporo", "cosine_similarity": 0.22906304067391015}, {"location_id": 234915980, "location_name": "Machida", "cosine_similarity": 0.2288593980652679}, {"location_id": 579940741, "location_name": "Sendai", "cosine_similarity": 0.228657650727346}]'

# Find cities most similar to keyword input

In [None]:
def get_recommendations_keywords(doc, cosine_sim=cosine_sim):
    test_tfidf = vectorizer.transform([clean_str(doc)])
    cosine_sim_test = linear_kernel(test_tfidf, X_tfidf)
    sim_scores = list(enumerate(cosine_sim_test[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[0:10]
    city_indices = [i[0] for i in sim_scores]
    city_recs = []
    loop_count = 0
    for city in wiki_df['city'].iloc[city_indices]:
        city_recs.append({"location_id": city_dict[city], "location_name": city, "cosine_similarity": sim_scores[loop_count][1]})
        loop_count += 1
    # Return the top 10 most similar cities
    return json.dumps(city_recs)

In [None]:
# Get similar locations to a given keyword vector
get_recommendations_keywords("scuba dive seafood")

In [None]:
# you only needs to do this once, this is a mapping of index to 
features = vectorizer.get_feature_names()

# get the document that we want to 
# doc = df.concat_text[2101]
 
#generate tf-idf for the given document
tf_idf_vector = vectorizer.transform([doc])
 
#sort the tf-idf vectors by descending order of scores
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

sorted_items = sort_coo(tf_idf_vector.tocoo())
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

#extract only the top n; n here is 10
keywords=extract_topn_from_vector(features,sorted_items,10)
 
# now print the results
print("\n=====Doc=====")
print(doc)
print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])

In [9]:
with open(r'tfidf_artifacts.pickle', 'rb') as f:
    cosine_sim,indices,reverse_indices,vectorizer,X_tfidf,city_dict = pickle.load(f)

In [11]:
X_tfidf[0]

<1x261552 sparse matrix of type '<class 'numpy.float64'>'
	with 132 stored elements in Compressed Sparse Row format>