In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import os

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import label_binarize


# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import words
from nltk.corpus import brown
import pickle
import boto3

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# Set execution role
client = boto3.client('s3') #low-level functional API
resource = boto3.resource('s3') #high-level object-oriented API
my_bucket = resource.Bucket('sagemaker-nomadiq-data') #subsitute this for your s3 bucket name. 




# Get instagram scrape data

In [2]:
my_bucket.download_file('instagram_df_2.csv','instagram_df_2.csv')
my_bucket.download_file('wikitravel.csv','wikitravel.csv')

In [3]:
# This data contains location id and location name mappings scraped from instagram
insta_df = pd.read_csv('instagram_df_2.csv')

In [4]:
insta_df.head()

Unnamed: 0,caption,location_id,location_name,timestamp,user_name,hashtags,month
0,Atlanta has been an awesome Super Bowl host ci...,1014864000.0,Mercedes-Benz Stadium,1549236000.0,LeeAbbamonte,[],2.0
1,Flying to Atlanta for Super Bowl LIII and exci...,307363000000000.0,Flying To Atlanta,1549045000.0,LeeAbbamonte,[],2.0
2,The big battle of east coast vs west coast is ...,214007300.0,"Marina del Rey, California",1548950000.0,LeeAbbamonte,"['premiumrewards', 'preferredrewards', 'ad']",1.0
3,Dawn over Marina del Rey welcomes another beau...,214007300.0,"Marina del Rey, California",1548688000.0,LeeAbbamonte,[],1.0
4,The Sydney Opera House is without question one...,2112249.0,Sydney Opera House,1548345000.0,LeeAbbamonte,[],1.0


In [162]:
len(insta_df)

177034

# Get Wikitravel scrape data

In [163]:
wiki_df = pd.read_csv('wikitravel.csv',sep = '\t', index_col= False)
# Remove entries with "errors"
wiki_df = wiki_df[(wiki_df['summary'] != 'error')|(wiki_df['do'] != 'error')|
                  (wiki_df['see'] != 'error')|(wiki_df['eat'] != 'error')]

# Map Locations to Location_IDs
Instagram IDs are often at the city + state + country level and the wiki travel data is at just the city level. We need to normalizes these to map to eachother.

In [164]:
# First we zip the location_names and location_id from the instagram data
zipped_locations = list(zip(insta_df.location_name,insta_df.location_id))

In [165]:
# Get first 5 rows of zipped data from instagram_data
zipped_locations[:5]

[('Mercedes-Benz Stadium', 1014864419.0),
 ('Flying To Atlanta', 307362966524981.0),
 ('Marina del Rey, California', 214007272.0),
 ('Marina del Rey, California', 214007272.0),
 ('Sydney Opera House', 2112249.0)]

In [166]:
# We condense the large instagram scrape data by grouping to only unique combinations of location_name and location_id
insta_df2 = insta_df.groupby(['location_name','location_id']).size().reset_index().rename(columns={0:'count'})
zipped_locations = list(zip(insta_df2.location_name,insta_df2.location_id))

In [167]:
# Get first 5 rows of zipped data from instagram_data
zipped_locations[:5]

[('"Greetings From Austin" Postcard Mural', 223569558.0),
 ('"Insel Der Jugend"-Treptower Park', 129113.0),
 ('"Los Angeles - City Of Dreams"', 252555922.0),
 ('"Rum Point" Grand Cayman', 213765965.0),
 ('...le Mura di Ferrara...', 463586100.0)]

In [168]:
# Compile list of cities from wiki scrape data
city_list = list(wiki_df.city)

In [169]:
# Get first 5 rows of city list
city_list[:5]

['A Coruña', 'Aachen', 'Aalborg', 'Aarhus', 'Abadan']

In [551]:
city_dict

{'Aachen': 13820180,
 'Aalborg': 215334386,
 'Aarhus': 215007529,
 'Aberdeen': 237795431,
 'Abu Dhabi': 328670654,
 'Acapulco': 3203457,
 'Accra': 216345646,
 'Addis Ababa': 497821515,
 'Adelaide': 318910,
 'Aden': 219516312,
 'Adoni': 215836718,
 'Adıyaman': 247080249,
 'Agra': 490352302,
 'Aguascalientes': 216234886,
 'Ahmedabad': 707577838,
 'Aix-en-Provence': 215175464,
 'Ajmer': 893502067513238,
 'Aksaray': 75862753,
 'Alajuela': 254369336,
 'Alanya': 582497147,
 'Alappuzha': 260558268,
 'Albuquerque': 1028661414,
 'Alexandria': 3224114,
 'Algiers': 219370504,
 'Alicante': 304569408,
 'Almaty': 541318335,
 'Amarillo': 222409773,
 'Ambon': 1147781268644283,
 'Amersfoort': 26394654,
 'Amman': 880303,
 'Amritsar': 559836093,
 'Amsterdam': 11742,
 'Anaco': 254709242,
 'Anaheim': 215556247,
 'Anand': 280428744,
 'Anchorage': 24589076,
 'Anda': 213563319,
 'Andong': 577010232,
 'Andria': 3224114,
 'Angeles City': 213303696,
 'Angers': 943564351,
 'Ankara': 236300643,
 'Annaba': 10091448

In [170]:
# Create dictionary of city names and location ID using city list from wiki_travel and location ID mappings from instagram data.
# Output is a key value pair of city name and location id
city_dict = {}

# Loop through all cities in the wikitravel list of cities
for city in city_list:
    # Inner loop through all location_name/location_id mappings
    for location in zipped_locations:
        # If there is an exactly city match pull write the key/value
        # Else continue
        if city.lower() == location[0].lower():
            city_dict[city] = int(location[1])
            break
        else:
            continue
    # If no exact matches are found, we try soft matches where find the city name within the longer string in location_name
    try:
        city_dict[city]
    except:
        for location in zipped_locations:
            if city.lower() in location[0].lower():
                city_dict[city]= int(location[1])
                break
        else:
            continue
            

In [172]:
# Get city name into dataframe 
def city_map(loc_id):
    for key, value in city_dict.items():
        try:
            if int(loc_id) == value:
                return key
            else:
                continue
        except:
            break


insta_df['city_name'] = insta_df['location_id'].apply(city_map)

In [173]:
insta_df_cities = insta_df[insta_df['city_name'].notnull()]

In [174]:
insta_df_cities.head()

Unnamed: 0,caption,location_id,location_name,timestamp,user_name,hashtags,month,city_name
59,"The enormous, carved out Leshan Giant Buddha i...",216278196.0,Leshan Giant Buddha,1542119000.0,LeeAbbamonte,[],11.0,Leshan
66,Somewhere in the world places like this do exi...,213616116.0,"Diani Beach, Mombasa, Kenya",1541338000.0,LeeAbbamonte,[],11.0,Mombasa
67,Extra points if you can spot me as I completel...,213616116.0,"Diani Beach, Mombasa, Kenya",1541209000.0,LeeAbbamonte,[],11.0,Mombasa
68,"Back in Diani Beach, Kenya, one of my all time...",213616116.0,"Diani Beach, Mombasa, Kenya",1541159000.0,LeeAbbamonte,[],11.0,Mombasa
80,"Al Balad, the old city of Jeddah, Saudi Arabia...",10410823.0,"Jeddah, Saudi Arabia",1539784000.0,LeeAbbamonte,[],10.0,Jeddah


In [175]:
len(insta_df_cities)

2526

In [176]:
# Create dataframe with user_name, city_name, and whether or not the user visited
insta_df_condensed = insta_df_cities[['user_name','city_name']]
insta_df_condensed = insta_df_condensed.drop_duplicates()
insta_df_condensed['visited'] = 1

In [178]:
insta_df_condensed.head()

Unnamed: 0,user_name,city_name,visited
59,LeeAbbamonte,Leshan,1
66,LeeAbbamonte,Mombasa,1
80,LeeAbbamonte,Jeddah,1
86,LeeAbbamonte,Riyadh,1
131,LeeAbbamonte,Astana,1


In [329]:
insta_df_condensed['city_name'].value_counts()

Hong Kong                      34
Singapore                      30
Marrakech                      26
Vancouver, British Columbia    19
Osaka                          14
Helsinki                       13
Magelang                        8
Jodhpur                         8
Awasa                           8
Mostar                          7
Valencia                        7
Philadelphia, Pennsylvania      7
Valparaíso                      7
Yangon                          6
Beijing                         6
Jena                            6
Bariloche                       5
Rio de Janeiro                  5
Phnom Penh                      5
Las Vegas                       5
Ruse                            5
Dresden                         5
Kandy                           5
Esfahan                         5
Yogyakarta                      5
Zhangjiajie                     5
Nuremberg                       5
Palma de Mallorca               5
Tehran                          5
Dunedin       

# Creating a sparse matrix for cosine similiarity calculations
We create an n x m matrix where n are all cities and m are all instagram users. We us this to determine pairwise cosine similarity between all cities based on the user-vector.

In [503]:
from scipy.sparse import csr_matrix

# Reshape matrix as an m x n array where m is city names and n is the users
# Values represent whether or not the individual has visited
df_travel_features = insta_df_condensed.pivot(
    index='city_name',
    columns='user_name',
    values='visited'
).fillna(0)

# Convert to sparse matrix
sparse_travel_features = csr_matrix(df_travel_features.values)

In [505]:
df_travel_features.head()

user_name,2straws,AlexStrohl,Anna.Everywhere,Bemytravelmuse,BrendanVanSon,Chelseakauai,Chloe_T,Danielkordan,DrewBinsky,Eljackson,...,theblondeabroad,thejetsetterdiaries,theplanetd,theupbeatpath,theworldpursuit,tiffpenguin,travelbabbo,uncornered_market,vancitywild,worldwanderlust
city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aachen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Aalborg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aarhus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aberdeen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abu Dhabi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Executing User-Based Collaborative Filtering
With the first approach, we create an n x m matrix where n is the instagram user and m is the city name. We then calculate the similarity between users to eachother. We then predict whether a given user would like to visit a location based on this similarity.


In [5]:
df_travel_features_user = insta_df_condensed.pivot(
    index='user_name',
    columns='city_name',
    values='visited'
).fillna(0)

NameError: name 'insta_df_condensed' is not defined

In [497]:
df_travel_features_user.head()

city_name,Aachen,Aalborg,Aarhus,Aberdeen,Abu Dhabi,Acapulco,Accra,Addis Ababa,Adelaide,Aden,...,Zhangjiajie,Zhangye,Zhuhai,Zipaquirá,Zonguldak,Zürich,Çanakkale,İstanbul,İzmir,Şanlıurfa
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2straws,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AlexStrohl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Anna.Everywhere,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bemytravelmuse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BrendanVanSon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [601]:
# Create vector of user visits
user_visits = ['Osaka']
user_vector = []

for location in df_travel_features.index:
    if location in user_visits:
        user_vector.append(1)
    else:
        user_vector.append(0)
        
cosine_sim_user = linear_kernel(user_vector, df_travel_features_user)

indices_user = sorted(range(len(cosine_sim_user[0])), key=lambda i: cosine_sim_user[0][i], reverse=True)[:2]
k = 1/sum(sorted((cosine_sim_user[0]), reverse=True)[:2])

ratings_user = []

for index in range(len(df_travel_features)):
    ratings2_user = []
    for index2 in range(len(indices_user)):
#         print(cosine_sim[0][indices[index2]])
        ratings2.append(cosine_sim_user[0][indices_user[index2]] * df_travel_features_user.iloc[indices_user[index2]][index])
    ratings_user.append(sum(ratings2_user)*k)

user_results = [df_travel_features.index[index] for index in sorted(range(len(ratings_user)), key=lambda i: ratings_user[i], reverse=True) if df_travel_features.index[index] not in user_visits]
user_results[:10]




['Aachen',
 'Aalborg',
 'Aarhus',
 'Aberdeen',
 'Abu Dhabi',
 'Acapulco',
 'Accra',
 'Addis Ababa',
 'Adelaide',
 'Aden']

# Executing Item-Based Collaborative Filtering
This follows multiple steps:
- Calculating the pairwise similarity between all items.
- Predict locations that the new user may want to go to
- Return top n locations that they have not visited yet

In [562]:
# Get frequency counts of most popular cities
popular_cities = list(insta_df_condensed['city_name'].value_counts().index)[:10]

In [544]:
df_travel_features_sum = df_travel_features.sum(axis=1)

In [545]:
# Get pairwise similarity between all items and create item-item cosine similarity matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(df_travel_features)

In [555]:
# Create vector of user visits
user_visits = ['Tokyo']
user_vector = []

for location in df_travel_features.index:
    if location in user_visits:
        user_vector.append(1)
    else:
        user_vector.append(0)
        
user_rating = []

for index in range(len(cosine_sim)):
    # If the user has not visited the location
    # Rating(user, san francisco) = similarity(san francisco, tokyo)*rating(user, tokyo) + sim(san francisco, New York) * rating(user, new york)/
    # (similarity(san francisco, tokyo) + similarity(san francisco, new york) + sim(san francisco, london)
    numerator = []
    denominator = []
    if user_vector[index] == 0:
        for index2 in range(len(cosine_sim[index])):
            numerator.append(cosine_sim[index][index2] * user_vector[index2])
            denominator.append(cosine_sim[index][index2])
        user_rating.append(sum(numerator)/sum(denominator))

    else:
        user_rating.append(0)

indices = sorted(range(len(user_rating)), key=lambda i: user_rating[i], reverse=True)[:10]
for index in indices:
    print(df_travel_features.index[index])
    
sorted(user_rating,reverse=True) 
    
    

Melbourne
Olinda
Jena
Guanajuato
Aachen
Aalborg
Aarhus
Aberdeen
Abu Dhabi
Acapulco


[0.24299239861190464,
 0.24299239861190464,
 0.026303205910118026,
 0.01699757593704191,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,

In [594]:
# Pickle similarity score data, indices, wiki_df, tfidf_vectorizer, x_tfidf
with open('collab_filter_artifacts.pickle', 'wb') as f:
    pickle.dump([df_travel_features,cosine_sim,city_dict,popular_cities]) 
my_bucket.upload_file('collab_filter_artifacts.pickle','collab_filter_artifacts.pickle')

In [602]:
def get_recommendations_city(places_visited):
    '''Enter a list of cities and get recommendations back'''
    places_visited = [place.title() for place in places_visited]
    print(places_visited)
    user_vector = []
    for location in df_travel_features.index:
        if location in places_visited:
            user_vector.append(1)
        else:
            user_vector.append(0)
    
    # Append travel predictions to a list
    user_rating = []
    # Loop through each location in the cosine sim matrix
    for index in range(len(cosine_sim)):
        numerator = []
        denominator = []
        # If the user has not visited the location
        if user_vector[index] == 0:
            # Loop through pairwise similarities of the given locations
            for index2 in range(len(cosine_sim[index])):
                # Pairwise similarity times whether the user has visited the location
                numerator.append(cosine_sim[index][index2] * user_vector[index2])
                denominator.append(cosine_sim[index][index2])
            user_rating.append(sum(numerator)/sum(denominator))

        else:
            user_rating.append(0)
    # Get indices of top 10 ranked cities
    indices = sorted(range(len(user_rating)), key=lambda i: user_rating[i], reverse=True)[:10]
    print(indices)
    ratings_list = sorted(user_rating,reverse=True) 
    recommendations = []
    for index in range(len(indices)):
        if ratings_list[index] > 0:
            recommendations.append(df_travel_features.index[indices[index]])
        else:
            print(popular_cities[:10-index])
            recommendations = recommendations + popular_cities[:10-index]
            break
    city_recs = []
    loop_count = 0
    for city in recommendations:
        city_recs.append({"location_id": city_dict[city], "location_name": city, "cosine_similarity": ratings_list[loop_count]})
        loop_count += 1
    
    return city_recs

get_recommendations_city(['London'])   

['London']
[91, 99, 225, 235, 253, 316, 413, 440, 463, 480]


[{'cosine_similarity': 0.028687099218015363,
  'location_id': 870906703,
  'location_name': 'Belgrade'},
 {'cosine_similarity': 0.028687099218015363,
  'location_id': 9750512,
  'location_name': 'Bilbao'},
 {'cosine_similarity': 0.028687099218015363,
  'location_id': 480007,
  'location_name': 'Edinburgh'},
 {'cosine_similarity': 0.028687099218015363,
  'location_id': 234483875,
  'location_name': 'Enugu'},
 {'cosine_similarity': 0.028687099218015363,
  'location_id': 3000420,
  'location_name': 'Fort Worth'},
 {'cosine_similarity': 0.028687099218015363,
  'location_id': 1999995,
  'location_name': 'Heidelberg'},
 {'cosine_similarity': 0.028687099218015363,
  'location_id': 238056270,
  'location_name': 'Kurashiki'},
 {'cosine_similarity': 0.028687099218015363,
  'location_id': 236278051,
  'location_name': 'Limassol'},
 {'cosine_similarity': 0.028687099218015363,
  'location_id': 252052804,
  'location_name': 'Madrid'},
 {'cosine_similarity': 0.028687099218015363,
  'location_id': 247

# Miscellaneous

In [None]:
# Rating(user, san francisco) = similarity(san francisco, tokyo) + sim(san francisco, New York)/
# (similarity(san francisco, tokyo) + similarity(san francisco, new york) + sim(san francisco, london)

In [85]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
model_knn.fit(sparse_travel_features)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=20, p=2, radius=1.0)

In [59]:
query_index = np.random.choice(df_travel_features.shape[0])
query_index

689

In [107]:
df_travel_features.head()

user_name,2straws,AlexStrohl,Anna.Everywhere,Bemytravelmuse,BrendanVanSon,Chelseakauai,Chloe_T,Danielkordan,DrewBinsky,Eljackson,...,theblondeabroad,thejetsetterdiaries,theplanetd,theupbeatpath,theworldpursuit,tiffpenguin,travelbabbo,uncornered_market,vancitywild,worldwanderlust
city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aachen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Aalborg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aarhus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aberdeen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abu Dhabi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [367]:
# query_index = np.random.choice(df_travel_features.shape[0])
query_index = 573

distances, indices = model_knn.kneighbors(df_travel_features.iloc[query_index].reshape(1, -1), n_neighbors = 800)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(df_travel_features.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, df_travel_features.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Okinawa:

1: Matsuyama, with distance of 0.0:
2: Jos, with distance of 0.0:
3: Minoh, with distance of 0.0:
4: Jiayuguan, with distance of 0.0:
5: Astrakhan, with distance of 0.0:
6: Chongqing, with distance of 0.0:
7: Baku, with distance of 0.0:
8: Sarajevo, with distance of 0.0:
9: Moscow, with distance of 0.0:
10: Nagasaki, with distance of 0.0:
11: Cartago, with distance of 0.0:
12: Nakhon Si Thammarat, with distance of 0.0:
13: Busan, with distance of 0.0:
14: Okinawa, with distance of 0.0:
15: Hakodate, with distance of 0.0:
16: Datong, with distance of 0.0:
17: Manzanillo, with distance of 0.0:
18: Tabriz, with distance of 0.0:
19: Kumamoto, with distance of 0.0:
20: Lanzhou, with distance of 0.0:
21: Kuching, with distance of 0.0:
22: Yokohama, with distance of 0.0:
23: Kota Kinabalu, with distance of 0.0:
24: Konya, with distance of 0.0:
25: Zhangye, with distance of 0.29289321881345254:
26: Hikone, with distance of 0.29289321881345254:
27: Anshun, with dis

  after removing the cwd from sys.path.


In [None]:
def(city):
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(df_travel_features.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, df_travel_features.index[indices.flatten()[i]], distances.flatten()[i]))