# Hybrid Recommendation System using LightFM

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
import numpy as np
from lightfm.cross_validation import random_train_test_split
import os
from scipy.sparse import csr_matrix

# Create lightFM Dataset
## Load data (users, movies, rating)

In [2]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'action', 'adventure', 'animation', 'children','comedy', 'crime','documentary', 'drama', 'fantasy', 'film_noir','horror', 'musical', 'mystery', 'romance',' scifi', 'thriller', 'war', 'western', 'no_genre']
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
df = []
for i,c,s in zip(['user', 'item', 'data'], [u_cols,m_cols,r_cols], ['|','|','\t']):
    filename = 'u.'+i
    path = os.path.join('data','ml-100k',filename)
    temp = pd.read_csv(path, sep=s, names=c,
                    encoding='latin-1')
    df.append(temp)
user, item, rating = df[0].copy(),df[1].copy(),df[2].copy()

In [3]:
user.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


## Create binning for Age
### Check Distribution

In [4]:
user['age'].describe()

count    943.000000
mean      34.051962
std       12.192740
min        7.000000
25%       25.000000
50%       31.000000
75%       43.000000
max       73.000000
Name: age, dtype: float64

### Check quantiles to create four groups proportionally

In [5]:
pd.qcut(user['age'],4)

0      (6.999, 25.0]
1       (43.0, 73.0]
2      (6.999, 25.0]
3      (6.999, 25.0]
4       (31.0, 43.0]
           ...      
938     (25.0, 31.0]
939     (31.0, 43.0]
940    (6.999, 25.0]
941     (43.0, 73.0]
942    (6.999, 25.0]
Name: age, Length: 943, dtype: category
Categories (4, interval[float64, right]): [(6.999, 25.0] < (25.0, 31.0] < (31.0, 43.0] < (43.0, 73.0]]

### Create adjusted binning

In [6]:
user['age_bin'] = pd.cut(user['age'], bins=[0,25,30,45,np.inf], labels= ['<= 25', '26 - 30', '31 - 45', '>= 45'])

# Check Data

In [7]:
user.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,age_bin
0,1,24,M,technician,85711,<= 25
1,2,53,F,other,94043,>= 45
2,3,23,M,writer,32067,<= 25
3,4,24,M,technician,43537,<= 25
4,5,33,F,other,15213,31 - 45


In [8]:
item.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,action,adventure,animation,children,comedy,...,film_noir,horror,musical,mystery,romance,scifi,thriller,war,western,no_genre
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
rating.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [10]:
rating['liked'] = np.where(rating['rating'] >=3,1,0)

### User Features Data Preparation

In [11]:
user = pd.get_dummies(user.drop(columns = ['age','zip_code']))
user_features_col = user.drop(columns =['user_id']).columns.values
user_feat = user.drop(columns =['user_id']).to_dict(orient='records')

user.head()

Unnamed: 0,user_id,sex_F,sex_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,...,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,age_bin_<= 25,age_bin_26 - 30,age_bin_31 - 45,age_bin_>= 45
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
1,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
3,4,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
4,5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Movie features data preparation

In [12]:
item_features = item.drop(columns=['title', 'release_date', 'video_release_date', 'imdb_url'])
item_features_col = item_features.drop(columns=['movie_id']).columns.values
item_feat = item_features.drop(columns =['movie_id']).to_dict(orient='records')

### Fit users, items, user features, item features into lightFM Dataset() object to create mappings

In [13]:
dataset = Dataset()
dataset.fit(users=[x for x in user['user_id']], items=[x for x in item['movie_id']], item_features=item_features_col, user_features=user_features_col)

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 943, num_items 1682.


### Build item features to be fitted into the model

In [14]:
item_features = dataset.build_item_features((x,y) for x,y in zip(item_features['movie_id'],item_feat))

### Build user features to be fitted into the model

In [15]:
user_features = dataset.build_user_features((x,y) for x,y in zip(user['user_id'],user_feat))

### Build interactions (user - item) and its respective weights (in this case each user's movie rating score)

In [16]:
(interactions, weights) = dataset.build_interactions((x, y)
                                                      for x,y in zip(rating['user_id'], rating['movie_id']))

# Model training
## Split train test

In [17]:
train, test = random_train_test_split(interactions,test_percentage=0.2, random_state=779)
train_w, test_w = random_train_test_split(weights, test_percentage=0.2, random_state=779)

## Create Model

In [18]:
n_components = 30
loss = 'warp'
epoch = 30
num_thread = 4
model = LightFM(no_components= n_components, loss=loss, random_state = 1616)
model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)

<lightfm.lightfm.LightFM at 0x29013323670>

## Model Evaluation

In [19]:
train_precision = precision_at_k(model, train, k=10,item_features=item_features, user_features=user_features).mean()
test_precision = precision_at_k(model, test,train_interactions=train, k=10,item_features=item_features, user_features=user_features).mean()

train_recall = recall_at_k(model, train, k=10,item_features=item_features, user_features=user_features).mean()
test_recall = recall_at_k(model, test,train_interactions=train, k=10,item_features=item_features, user_features=user_features).mean()

train_auc = auc_score(model, train,item_features=item_features, user_features=user_features).mean()
test_auc = auc_score(model, test, train_interactions=train,item_features=item_features, user_features=user_features).mean()

print('Precision: train %.2f' % (train_precision))
print('Precision: test %.2f' % (test_precision))

print('Recall: train %.2f' % (train_recall))
print('Recall: test %.2f' % (test_recall))

print('AUC: train %.2f' % (train_auc))
print('AUC: test %.2f' % (test_auc))

Precision: train 0.49
Precision: test 0.25
Recall: train 0.09
Recall: test 0.13
AUC: train 0.90
AUC: test 0.89


# Recommendation

### Predict scores for sample user (lightFM index =  3, user_id = 4)

In [20]:
scores = model.predict(3, np.arange(1682))
top_items = item.iloc[np.argsort(-scores)]
known_positives = item.iloc[interactions.tocsr()[3].indices]

## Recommendation result

In [21]:
top_items[0:10][['title','movie_id']]

Unnamed: 0,title,movie_id
330,"Edge, The (1997)",331
318,Everyone Says I Love You (1996),319
303,Fly Away Home (1996),304
1292,Star Kid (1997),1293
302,Ulee's Gold (1997),303
896,Time Tracers (1995),897
988,Cats Don't Dance (1997),989
352,Deep Rising (1998),353
343,"Apostle, The (1997)",344
902,Afterglow (1997),903


### Movies in top 10 recommendation that already rated by sample user

In [22]:
known_positives_rating = rating[(rating['user_id']==user['user_id'][3])][['movie_id','rating']].merge(item[['movie_id','title']], on = 'movie_id')
known_positives_rating[known_positives_rating['movie_id'].isin(top_items['movie_id'][0:10])]

Unnamed: 0,movie_id,rating,title
1,303,5,Ulee's Gold (1997)


### Known ratings by sample user sorted descending from highest rating

In [23]:
known_positives_rating.sort_values(by=['rating'], ascending = False)

Unnamed: 0,movie_id,rating,title
23,301,5,In & Out (1997)
19,359,5,"Assignment, The (1997)"
17,327,5,Cop Land (1997)
15,329,5,Desperate Measures (1998)
20,362,5,Blues Brothers 2000 (1998)
13,258,5,Contact (1997)
1,303,5,Ulee's Gold (1997)
11,300,5,Air Force One (1997)
9,354,5,"Wedding Singer, The (1998)"
8,50,5,Star Wars (1977)


## Similar item calculation from item features

In [24]:
def similar_items(item_id, model, N=10, norm = True):
    item_bias ,item_representations = model.get_item_representations(features=item_features)

    # Cosine similarity
    scores = item_representations.dot(item_representations[item_id, :])
    item_norms = np.linalg.norm(item_representations, axis=1)

    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]/ item_norms[item_id] ), key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] ), key=lambda x: -x[1])
    return similar

In [25]:
similar_item_list = similar_items(302, model)
similar_idx = [x[0] for x in similar_item_list ]
item.iloc[similar_idx][['title']]

Unnamed: 0,title
302,Ulee's Gold (1997)
886,Eve's Bayou (1997)
304,"Ice Storm, The (1997)"
873,Career Girls (1997)
899,Kundun (1997)
895,"Sweet Hereafter, The (1997)"
339,Boogie Nights (1997)
123,Lone Star (1996)
897,"Postman, The (1997)"
1100,Six Degrees of Separation (1993)


### Similar users calculation from user_features

In [26]:
def similar_users(user_id, model, N=10, norm = True):
    user_bias ,user_representations = model.get_user_representations(features= user_features)

    # Cosine similarity
    scores = user_representations.dot(user_representations[user_id, :])
    item_norms = np.linalg.norm(user_representations, axis=1)
    
    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] / item_norms[user_id]), 
                    key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]), 
                    key=lambda x: -x[1])
    return similar

In [27]:
similar_item_list = similar_users(3,model)
similar_idx = [x[0] for x in similar_item_list]
cols = ['user_id', 'sex_M', 'occupation_writer', 'age_bin_<= 25']
user[user['user_id'].isin(similar_idx)].loc[:,cols]

Unnamed: 0,user_id,sex_M,occupation_writer,age_bin_<= 25
2,3,1,1,1
151,152,0,0,0
175,176,1,0,0
247,248,1,0,1
412,413,1,0,0
549,550,0,0,1
715,716,0,0,0
779,780,1,0,0
810,811,0,0,0
830,831,1,0,1


# Recommendation for new user (cold start problem)

In [28]:
new_user = pd.DataFrame(np.zeros(len(user_features_col))).T
new_user.columns = user_features_col
new_user['sex_M'] = 1
new_user['occupation_lawyer'] = 1
new_user['age_bin_<= 25'] = 1
new_user = csr_matrix(new_user)
scores_new_user = model.predict(user_ids = 0,item_ids = np.arange(interactions.shape[1]), user_features=new_user)
top_items_new_user = item.iloc[np.argsort(-scores_new_user)]
top_items_new_user[0:10][['title']]

Unnamed: 0,title
49,Star Wars (1977)
0,Toy Story (1995)
99,Fargo (1996)
180,Return of the Jedi (1983)
287,Scream (1996)
312,Titanic (1997)
150,Willy Wonka and the Chocolate Factory (1971)
126,"Godfather, The (1972)"
301,L.A. Confidential (1997)
422,E.T. the Extra-Terrestrial (1982)
