In [41]:
#Import packages

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from gensim.models import Word2Vec
import geopy.distance
from geopy import Nominatim
from tqdm import tqdm
from geopy.extra.rate_limiter import RateLimiter

In [71]:
# load data

checkin= pd.read_csv('/home/ubuntu/sina/checkins.txt', sep='\t')
checkin.head()

#rename columns
checkin.columns = ['userID', 'tweetID', 'latitude',
                   'longitude', 'time', 'placeID', 'contentInfo']

#replace | with spaces
checkin['contentInfo'] = [w.replace('|', ' ') for w in checkin['contentInfo']]


In [131]:
#load train data
train = pd.read_csv('/home/ubuntu/sina/Foursquare_train.txt', header=None, names = ['userID', 'placeID','freq'], sep='\t')

In [73]:
train.head()

Unnamed: 0,userID,placeID,freq
0,16,193,13
1,16,194,4
2,16,197,1
3,16,198,3
4,16,199,2


In [74]:
# merge train data to get contentInfo
train_poi = train.merge(checkin[['userID', 'placeID', 'contentInfo']], on = ['userID', 'placeID'], how='left')

In [75]:
train_poi.head()

Unnamed: 0,userID,placeID,freq,contentInfo
0,16,193,13,ice cream shop shops
1,16,193,13,ice cream shop shops
2,16,193,13,ice cream shop shops
3,16,193,13,ice cream shop shops
4,16,193,13,ice cream shop shops


In [118]:
# read in test and tune datasets
test = pd.read_csv('/home/ubuntu/sina/Foursquare_test.txt', header=None, names = ['userID', 'placeID','freq'], sep='\t')
tune = pd.read_csv('/home/ubuntu/sina/Foursquare_tune.txt', sep='\t')

In [120]:
# combine test and tune data
test_poi = test.append(tune,ignore_index=True)

In [119]:
tune.head()

Unnamed: 0,userID,placeID,count
0,16,206,1
1,16,210,4
2,16,211,2
3,16,214,1
4,16,222,1


## Content Based Recommendation

In [79]:
# aggregate content information by places

train_poi = train_poi.groupby('placeID').agg({'contentInfo': ' '.join}).reset_index()

In [80]:
# clean content data to remove duplicates
from collections import OrderedDict

train_poi['cleanContent'] = (train_poi['contentInfo'].str.split()
                              .apply(lambda x: OrderedDict.fromkeys(x).keys())
                              .str.join(' '))

In [81]:
# Create tf-Idf matrix

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3),
                     min_df=1, stop_words='english')

tfidf_matrix = tf.fit_transform(train_poi['cleanContent'])


(16289, 143387)

In [82]:
#Calculate cosine similarity for POIs

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [84]:
# reset index
train_poi = train_poi.reset_index(drop=True)

In [85]:
# Find 20 similar POIs for each place

results = {}
for idx, row in train_poi.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-20:-1]
    similar_items = [(cosine_similarities[idx][i], train_poi['placeID'][i]) for i in similar_indices]
    results[row['placeID']] = similar_items[1:]

In [86]:
# create a function to recommend POIs using similarity matrix
def item(id):  
  return train_poi.loc[train_poi['placeID'] == id]['contentInfo'].tolist()[0].split(' - ')[0] 


# Just reads the results out of the dictionary.def 
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + str(item_id) + "...")   
    print("-------")
    recs = results[item_id][:num]   
    for rec in recs: 
       print("Recommended: " + str(rec[1]) + 
             #" "+ item(rec[1]) +
             " (score:" +      str(rec[0]) + ")")


In [87]:
# Test recommendations
recommend(item_id = 11, num = 5)

Recommending 5 products similar to 11...
-------
Recommended: 6811 (score:0.6638827651810162)
Recommended: 15980 (score:0.6590080385338191)
Recommended: 1418 (score:0.6340125821934379)
Recommended: 1166 (score:0.5975030309152985)
Recommended: 6805 (score:0.5947515318108647)


# EVALUATION

In [132]:
# function to return recommended placeID
def recommend_eval(item_id, num):
    recs = results[item_id][:num]   
    return [r[1] for r in recs]


In [133]:
# Calculate hits 

final=[]
hits= 0
recs = 0

for idx, row in test_poi.iterrows():
    #print ("user", row['userID'])
    try:
        rec = recommend_eval(row['placeID'], 10)
    except:
        continue
    test = test_poi.loc[test_poi.userID == row['userID'], "placeID"]
    hit = len(set(rec).intersection(set(test)))
    hits += hit
    recs += len(rec)


In [146]:
# Precision, Recall, F1-Score
print ("Precision {} \nRecall {} \nF1-score {}" .format(hits/recs, 
       hits/len(test_poi), 
       ((2 * (hits/recs) * hits/len(test_poi)) / ((hits/recs)+ hits/len(test_poi)))))

Precision 0.01304494382022472 
Recall 0.05704318773645163 
F1-score 0.02123398535019615
