# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Now load the data

In [26]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")


user_features_df["key"] = 0
user_features_df["user_id"] = range(0,user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(0,item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df,left_index=True,on="key")
merged_df[["item_id", "user_id"]]




merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples())

train = merged_df.dropna()

test = merged_df[merged_df.isnull().any(axis=1)]

print (test)



Empty DataFrame
Columns: [Sex,  Over60, key, user_id, Critic0,  Critic1, item_id, rating]
Index: []


In [11]:
n_latent_features = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_latent_features))

user_features = user_features_df.values
item_features = item_features_df.values

print (item_features_df)


user_features = np.concatenate([np.ones(shape = (user_features.shape[0],1)), user_features], axis = 1)
item_features = np.concatenate([np.ones(shape = (item_features.shape[0],1)), item_features], axis = 1)



user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1] ))
item_features_weights = np.random.random((user_ratings.shape[1],item_features.shape[1] ))



# print user_features

   Critic0   Critic1  key  item_id
0      0.3       0.9    0        0
1      0.9       0.3    0        1
2      0.6       0.4    0        2
3      0.2       0.1    0        3
4      0.7       0.8    0        4


In [13]:
def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print user_preference.dot(item_preference), user_score, item_score
    return user_preference.dot(item_preference) + user_score + item_score

def train(user_id, item_id, rating,alpha = 0.001, 
                                   latent_feature_weight_decay = 0.1, 
                                   user_weight_decay = 0.01,
                                   item_weight_decay = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating - rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err *  ( latent_item_features[item_id] + latent_feature_weight_decay*latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * ( user_pref_values + latent_feature_weight_decay*latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err *(  user_features[user_id] + user_weight_decay* user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * ( item_features_weights[item_id] + item_weight_decay* item_features_weights[item_id])
    
    
    return err
    


def sgd(iterations = 30000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print (mse)
                    
                    
    
                    
                    
    


In [14]:
for _ in range(0,10): 
    sgd()

0.2784242195113409
0.27457411516785285
0.2734557631312004
0.272984786015086
0.27276553046412283
0.2726712316698812
0.27264979045339194
0.2726764719275502
0.27273848144777674
0.27282892589865115


In [16]:
predictions = np.zeros(shape = (latent_user_preferences.shape[0], latent_item_features.shape[0]) )
#print latent_user_preferences
print (user_features_weights)
print (item_features_weights)
for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                predictions[user_id,item_id] =  predict_rating(user_id,item_id)

  

[[ 0.561  0.923  0.024  0.293  0.73 ]
 [-1.009  0.199 -0.556  0.722 -0.166]
 [ 0.236  0.757  0.411  0.312  1.009]
 [ 0.147  0.945  0.677  0.755  0.809]
 [-0.02   0.597  0.572  0.36  -0.075]
 [ 0.823  0.536  0.587  0.959  3.078]
 [ 0.195  0.572  0.274  0.764 -0.041]
 [ 0.593  0.945  0.165  0.247  0.031]
 [ 0.352  0.51   0.548  0.899 -0.468]
 [ 0.209  0.219  0.89   0.38  -0.224]]
[[1.787e+00 2.528e+00 8.516e-01 2.105e+00 1.965e+00]
 [1.506e-02 2.658e-02 2.449e-02 1.398e-03 3.811e-03]
 [3.064e-02 8.489e-02 1.594e-01 9.492e-02 2.900e-02]
 [8.432e-01 2.898e-02 3.734e-01 2.156e-01 5.472e-01]
 [1.867e+00 2.256e-01 2.670e-01 2.443e+00 1.924e+00]]


In [21]:
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = user_ratings_df.columns
#comparison_data.applymap(lambda (x,y): "(%2.3f|%2.3f)"%(x,y))

In [22]:
comparison_data

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 7.937341203335815)","(2.0, 2.2309664323723006)","(nan, 21.289816529023664)","(5.0, 4.85439863316201)","(4.0, 3.9792628899830875)"
1,"(3.0, 2.9029806170556514)","(2.0, 2.362459736783734)","(nan, -32.395347021965314)","(7.0, 6.7706950618795645)","(7.0, 6.967595624672963)"
2,"(9.0, 8.936303662596352)","(nan, 4.631064863172122)","(7.0, 7.007501387385744)","(8.0, 8.053118736756023)","(5.0, 4.999866179808517)"
3,"(nan, 8.911641568662445)","(nan, 4.961565790785496)","(7.0, 7.000027036596634)","(8.0, 7.9993272788732614)","(9.0, 8.999469435517353)"
4,"(nan, 4.912595230661494)","(1.0, 0.6839478091217791)","(8.0, 8.006028406196476)","(3.0, 3.2663237144659556)","(7.0, 7.027706104724524)"
5,"(2.0, 2.0047986568797347)","(3.0, 2.9948101165210272)","(5.0, 4.999681329443879)","(nan, -0.01654406052371682)","(nan, 62.88859888550325)"
6,"(4.0, 4.445379857770931)","(2.0, 0.3867126008108789)","(nan, 6.2437819925185085)","(2.0, 2.9817010269793736)","(7.0, 7.140341584044863)"
7,"(7.0, 6.441383648781425)","(1.0, 2.9213908445383185)","(2.0, 2.066139906216127)","(7.0, 5.868454513259691)","(9.0, 8.836160242463166)"
8,"(3.0, 3.160048442238635)","(3.0, 2.427839201259117)","(nan, -34.08833468928435)","(7.0, 7.336158149878218)","(3.0, 3.048031221713254)"
9,"(4.0, 4.16691719703112)","(nan, -0.14517078232080602)","(5.0, 4.993335000830787)","(3.0, 2.8687510013746493)","(3.0, 2.999121878712298)"


In [23]:
d = comparison_data.to_latex()
text_file = open("comparison.txt", "w")
text_file.write(d)
text_file.close()