# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Now load the data

In [2]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")


user_features_df["key"] = 0
user_features_df["user_id"] = range(0,user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(0,item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df,left_index=True,on="key")
merged_df[["item_id", "user_id"]]




merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples())

train = merged_df.dropna()

test = merged_df[merged_df.isnull().any(axis=1)]

print(test.to_latex())



\begin{tabular}{lrrrrrrrl}
\toprule
Empty DataFrame
Columns: Index(['Sex', ' Over60', 'key', 'user\_id', 'Critic0', ' Critic1', 'item\_id',
       'rating'],
      dtype='object')
Index: Int64Index([], dtype='int64') \\
\bottomrule
\end{tabular}



In [3]:
n_latent_features = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_latent_features))

user_features = user_features_df.values
item_features = item_features_df.values

print(item_features_df.to_latex())


user_features = np.concatenate([np.ones(shape = (user_features.shape[0],1)), user_features], axis = 1)
item_features = np.concatenate([np.ones(shape = (item_features.shape[0],1)), item_features], axis = 1)



user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1] ))
item_features_weights = np.random.random((user_ratings.shape[1],item_features.shape[1] ))



# print user_features

\begin{tabular}{lrrrr}
\toprule
{} &  Critic0 &   Critic1 &  key &  item\_id \\
\midrule
0 &      0.3 &       0.9 &    0 &        0 \\
1 &      0.9 &       0.3 &    0 &        1 \\
2 &      0.6 &       0.4 &    0 &        2 \\
3 &      0.2 &       0.1 &    0 &        3 \\
4 &      0.7 &       0.8 &    0 &        4 \\
\bottomrule
\end{tabular}



In [4]:
def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print user_preference.dot(item_preference), user_score, item_score
    return user_preference.dot(item_preference) + user_score + item_score

def train(user_id, item_id, rating,alpha = 0.001, 
                                   latent_feature_weight_decay = 0.1, 
                                   user_weight_decay = 0.01,
                                   item_weight_decay = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating - rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err *  ( latent_item_features[item_id] + latent_feature_weight_decay*latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * ( user_pref_values + latent_feature_weight_decay*latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err *(  user_features[user_id] + user_weight_decay* user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * ( item_features_weights[item_id] + item_weight_decay* item_features_weights[item_id])
    
    
    return err
    


def sgd(iterations = 30000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print(mse)
                    
                    
    
                    
                    
    


In [5]:
for _ in range(0,10): 
    sgd()

0.2931272535455705
0.2849261260120561
0.28196944081753644
0.28048081525235463
0.2795891023269882
0.27899959986178596
0.278585715312818
0.27828407907841585
0.2780594661187231
0.2778907706635918


In [6]:
predictions = np.zeros(shape = (latent_user_preferences.shape[0], latent_item_features.shape[0]) )
#print latent_user_preferences
print(user_features_weights)
print(item_features_weights)
for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                predictions[user_id,item_id] =  predict_rating(user_id,item_id)

[[-1.28  -1.483  0.342  0.092  0.126]
 [ 2.255  0.262  2.218  0.638  2.293]
 [ 0.816  0.88   0.125  0.759  0.52 ]
 [ 0.047  0.468  0.48   0.291  0.843]
 [ 0.509  0.76   0.288  0.104 -0.276]
 [ 0.543  0.496  0.628  0.192  1.11 ]
 [-0.102  0.397  0.62   0.996 -0.238]
 [ 0.095  0.534  0.85   0.49   0.065]
 [ 0.895  0.643  0.916  0.636  0.627]
 [ 0.492  0.555  0.732  0.078 -0.329]]
[[2.214e-01 1.382e+00 2.100e+00 9.092e-01 1.010e+00]
 [4.628e-04 1.025e-03 8.193e-04 5.922e-04 8.507e-04]
 [1.286e+00 4.013e+00 2.513e+00 1.483e+00 3.446e+00]
 [5.245e-01 1.720e-01 2.202e+00 1.915e+00 5.835e-01]
 [6.167e-01 5.718e-01 4.536e-01 7.796e-01 1.858e+00]]


In [7]:
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = user_ratings_df.columns
#comparison_data.applymap(lambda (x,y): "(%2.3f|%2.3f)"%(x,y))
comparison_data.applymap(lambda x: "(%2.3f|%2.3f)"%(x[0],x[1]))

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,(8.000|7.990),(2.000|2.029),(nan|-20.961),(5.000|4.987),(4.000|3.996)
1,(3.000|2.900),(2.000|2.560),(nan|65.261),(7.000|6.618),(7.000|6.924)
2,(9.000|9.045),(nan|4.359),(7.000|7.003),(8.000|7.950),(5.000|5.001)
3,(nan|8.967),(nan|4.929),(7.000|7.000),(8.000|8.000),(9.000|9.000)
4,(nan|3.805),(1.000|0.569),(8.000|7.990),(3.000|3.371),(7.000|7.054)
5,(2.000|2.002),(3.000|2.994),(5.000|5.000),(nan|2.298),(nan|32.354)
6,(4.000|4.308),(2.000|0.436),(nan|-3.680),(2.000|3.000),(7.000|7.205)
7,(7.000|6.762),(1.000|2.906),(2.000|2.082),(7.000|5.662),(9.000|8.760)
8,(3.000|3.101),(3.000|2.518),(nan|73.846),(7.000|7.291),(3.000|3.061)
9,(4.000|3.890),(nan|-0.148),(5.000|4.984),(3.000|3.112),(3.000|2.997)


In [8]:
comparison_data

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 7.989602873024069)","(2.0, 2.0291029128940883)","(nan, -20.96102991173411)","(5.0, 4.987020693742167)","(4.0, 3.9962986410473373)"
1,"(3.0, 2.900361687539126)","(2.0, 2.5597259518651727)","(nan, 65.26112672135585)","(7.0, 6.6175197072609695)","(7.0, 6.924387908287665)"
2,"(9.0, 9.045213352914493)","(nan, 4.359252353129258)","(7.0, 7.003133526725603)","(8.0, 7.950326143402647)","(5.0, 5.000995060021739)"
3,"(nan, 8.967238728955103)","(nan, 4.929342596193918)","(7.0, 6.9999989767697866)","(8.0, 8.000192350514219)","(9.0, 8.999562485763736)"
4,"(nan, 3.8045879946841787)","(1.0, 0.5685577534842259)","(8.0, 7.989594478325586)","(3.0, 3.370756098386226)","(7.0, 7.054013181609645)"
5,"(2.0, 2.0018662731013603)","(3.0, 2.9941025264709253)","(5.0, 4.999989981817055)","(nan, 2.2984482440077305)","(nan, 32.3535258930083)"
6,"(4.0, 4.308150725423932)","(2.0, 0.4362127180416085)","(nan, -3.679604481776364)","(2.0, 3.0000437304236076)","(7.0, 7.205286841305711)"
7,"(7.0, 6.762415307260097)","(1.0, 2.905867011909204)","(2.0, 2.081969188916272)","(7.0, 5.661924954157573)","(9.0, 8.759785253421503)"
8,"(3.0, 3.1014499631808277)","(3.0, 2.517904775569736)","(nan, 73.84588701061976)","(7.0, 7.2912955215817465)","(3.0, 3.061476092345093)"
9,"(4.0, 3.8898193867389255)","(nan, -0.1484848918282188)","(5.0, 4.983745828599943)","(3.0, 3.112152300133614)","(3.0, 2.9966619153949114)"


In [9]:
d = comparison_data.to_latex()
text_file = open("comparison.txt", "w")
text_file.write(d)
text_file.close()