# Recommendation Engine

### Collaborative Filtering using Matrix Factorization with Implicit Data

# Import & Clean Data

In [298]:
import pandas as pd
import numpy as np
import sys
import random
import string

In [299]:
# Load the raw user flight records.
user_data = pd.read_csv('user_data.txt')

# Trim surrounding whitespace from the airport codes. The vectorised `.str`
# accessor is used instead of `apply(lambda ...)` so that a missing code stays
# missing rather than raising AttributeError on a float NaN.
user_data['orig'] = user_data['orig'].str.strip()
user_data['dest'] = user_data['dest'].str.strip()

# Origin-destination key: origin code immediately followed by destination code.
user_data['od'] = user_data['orig'] + user_data['dest']

# Region DataFrame: one row per distinct (od, region) combination.
region_data = user_data[['od', 'region']].drop_duplicates()

In [300]:
# Encode each origin-destination pair and each user as a group number
# (the index of its group in the corresponding groupby).
od_codes = user_data.groupby('od').ngroup()
user_codes = user_data.groupby('user_id').ngroup()

user_data['od_id'] = od_codes
# Note: this overwrites the original user_id column with its encoded value.
user_data['user_id'] = user_codes

In [301]:
# Number of rows (with a non-null `dest`) per (user, region); exposed as `freq`.
counts = (
    user_data
    .groupby(['user_id', 'region'], as_index = False)['dest']
    .count()
    .rename(columns = {'dest': 'freq'})
)

# Attach the frequency to every row. Any `freq` column left over from a
# previous run of this cell is dropped first; otherwise the merge would emit
# `freq_x` / `freq_y` and later cells that read `user_data['freq']` would fail.
user_data = pd.merge(
    user_data.drop(columns = ['freq'], errors = 'ignore'),
    counts,
    on = ['user_id', 'region'],
    how = 'left',
)

# User Inputs

In [302]:
# Ask for the trip and the user to build recommendations for. The airport
# codes are stripped so they compare equal to the (already stripped) codes
# stored in user_data.
origin = input('Where are you flying from: ').strip()
destination = input('Where do you want to go: ').strip()
user_id = int(input('Enter a User ID to get recommendations: '))

# Keep only flights departing from the requested origin.
user_data = user_data[user_data.orig == origin]

# Region of the requested origin-destination pair. When the pair is not in
# the data, `reg` is explicitly set to None instead of being left undefined
# (or stale from an earlier run) behind a bare `except: pass`.
region_matches = user_data.loc[user_data.od == origin + destination, 'region']
reg = region_matches.iloc[0] if not region_matches.empty else None

Where are you flying from: EWR
Where do you want to go: CDG
Enter a User ID to get recommendations: 66


# Alternating Least Squares (ALS) Algorithm

In [303]:
import implicit
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler

'''
The implicit library expects user_data as an item-user matrix, so we create two matrices:
1.) Fitting the Model (item-user) 
2.) Recommendations (user-item)
'''

# Both matrices hold the same `freq` values; only the row/column roles swap.
# NOTE(review): csr_matrix sums entries that share a (row, col) position, so
# repeated user/OD rows add their `freq` values together — confirm intended.
freq_values = user_data['freq'].astype(float)
od_index = user_data['od_id']
user_index = user_data['user_id']

# Rows are OD ids, columns are user ids.
sparse_item_user = sparse.csr_matrix((freq_values, (od_index, user_index)))
# Rows are user ids, columns are OD ids.
sparse_user_item = sparse.csr_matrix((freq_values, (user_index, od_index)))

In [304]:
# Scaling factor applied to the raw frequencies to obtain confidence values.
alpha_value = 27

# ALS model configuration.
# NOTE(review): 5000 iterations is unusually high for ALS — confirm intended.
model = implicit.als.AlternatingLeastSquares(
    factors = 10,
    regularization = 0.1,
    iterations = 5000,
)

# Confidence matrix: item-user frequencies multiplied by alpha, as doubles.
data_conf = (sparse_item_user * alpha_value).astype('double')

# Train the model on the confidence matrix.
model.fit(data_conf)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




#### Compute the n Most Similar Items for the User-Defined Origin Destination Pair ID

In [305]:
# How many similar items to request from the model.
n_similar = 20

# OD id of the first row whose destination matches the user's choice
# (user_data has already been restricted to the chosen origin).
item_id = user_data.loc[user_data.dest == destination, 'od_id'].values[0]

# Ask implicit for the items most similar to that OD pair.
similar = model.similar_items(item_id, n_similar)

# Print the destination code behind each returned OD id.
for od_idx, similarity in similar:
    print(user_data.loc[user_data.od_id == od_idx, 'dest'].iloc[0])

CDG
STL
UVF
PLS
BZE
SDQ
MSY
BNA
FLL
BDL
BRU
MEX
CMH
POS
LHR
PBI
TLV
YYZ
SRQ
IND


#### Create Recommendations for a Unique User ID

In [306]:
# Ask the implicit recommender for (od_id, score) pairs for the chosen user.
recommended = model.recommend(user_id, sparse_user_item)

# Translate each recommended OD id into its destination code.
airports = [
    user_data.loc[user_data.od_id == od_idx, 'dest'].iloc[0]
    for od_idx, _ignored in recommended
]
# Scores rounded to three decimals, in the same order as `airports`.
scores = [raw_score.round(3) for _ignored, raw_score in recommended]

# Table of recommended airports and their scores.
recommendations = pd.DataFrame({'destination': airports, 'score': scores})

#### Clean Data and Join Tables - to provide more information about the recommendations

In [307]:
# Lookup table read from city_code.csv; it is joined on `destination` and
# supplies the city/country columns selected below.
cities = pd.read_csv('city_code.csv', engine = 'python')

# Enrich the recommendations: add city/country, rebuild the OD key from the
# chosen origin, then attach the region for that OD pair.
# (A filter on the requested region, `recs[recs.region == reg]`, was left
# disabled in the original and remains disabled here.)
recs = (
    recommendations
    .merge(cities, on = 'destination', how = 'left')
    .assign(origin = origin)
    .assign(od = lambda frame: frame['origin'] + frame['destination'])
    .merge(region_data, on = 'od', how = 'left')
    [['origin', 'destination', 'region', 'city', 'country', 'score']]
    .reset_index(drop = True)
)
recs

Unnamed: 0,origin,destination,region,city,country,score
0,EWR,BOS,49S,Boston,United States,0.42
1,EWR,ZRH,ATL,Zurich,Switzerland,0.372
2,EWR,BTV,49S,Burlington,United States,0.332
3,EWR,BCN,ATL,Barcelona,Spain,0.289
4,EWR,FCO,ATL,Rome,Italy,0.289
5,EWR,IAH,49S,Houston,United States,0.284
6,EWR,DUB,ATL,Dublin,Ireland,0.28
7,EWR,ORF,49S,Norfolk,United States,0.279
8,EWR,EDI,ATL,Edinburgh,United Kingdom,0.275
9,EWR,TLV,ATL,Tel Aviv Yafo,Israel,0.27
