In [5]:
import pandas as pd
import urllib.request
import zipfile
import os

from scipy.sparse import csr_matrix
import numpy as np
# import pandas as pd # already imported
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool, cpu_count

In [16]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

class EASE:
    def __init__(self):
        self.user_enc = LabelEncoder()
        self.meme_enc = LabelEncoder()

    def _get_users_and_memes(self, df):
        users = self.user_enc.fit_transform(df.loc[:, 'user_id'])
        memes = self.meme_enc.fit_transform(df.loc[:, 'meme_id'])
        return users, memes

    def fit(self, df, lambda_: float = 0.5):
        """
        df: pandas.DataFrame with columns user_id, meme_id
        lambda_: l2-regularization term
        """
        users, memes = self._get_users_and_memes(df)
        values = np.ones(df.shape[0])

        X = csr_matrix((values, (users, memes)))
        self.X = X

        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        self.B = B
        self.pred = X.dot(B)

    def get_unseen_memes(self, user_id, df):
        if user_id not in df['user_id'].unique():
            return []

        watched = set(df[df['user_id'] == user_id]['meme_id'])
        all_meme_ids = set(df['meme_id'].unique())
        unseen_memes = list(all_meme_ids - watched)

        return unseen_memes

    def predict(self, user_id, meme_ids, k):
        if user_id not in self.user_enc.classes_:
            return {}

        user_enc_id = self.user_enc.transform([user_id])[0]
        meme_ids = [m for m in meme_ids if m in self.meme_enc.classes_]

        if not meme_ids:
            return {}

        meme_enc_ids = self.meme_enc.transform(meme_ids)
        pred = self.pred[user_enc_id, :]

        pred = np.take(pred, meme_enc_ids)
        res = np.argpartition(pred, -k)[-k:]
        scores = np.take(pred, res)
        result = {self.meme_enc.inverse_transform([meme_enc_ids[i]])[0]: scores[i] for i in range(len(res))}
        
        return dict(sorted(result.items(), key=lambda item: item[1], reverse=True))

In [39]:
# SQL I used to extract the data (likes) 
# """
#     SELECT 
#         UMR.user_id, UMR.meme_id
#     FROM user_meme_reaction UMR
#     INNER JOIN meme_stats MS
#         ON MS.meme_id = UMR.meme_id
#     INNER JOIN meme M 
#         ON M.id = UMR.meme_id
#     WHERE 1=1 
#         AND M.status = 'ok'
#         AND UMR.reaction_id = 1
#         AND MS.nlikes >= 20
#         AND MS.sec_to_react <= 20
#         AND MS.raw_impr_rank <= 2;
# """

In [40]:
df = pd.read_csv("/Users/ohld/Documents/ffmemes/user_meme_reaction_ones.csv")
# df.rename(columns={"user_id": "user_id", "meme_id": "item_id", "reaction": "rating"}, inplace=True)
print("likes: ", df.shape)

print("users: ", df.user_id.nunique(), "items: ", df.meme_id.nunique())

likes:  (855851, 2)
users:  7334 items:  14337


In [41]:
meme_oracle = EASE()
meme_oracle.fit(df)

In [42]:
meme_ids_to_score = """6646
797711
112674
1200023
1194152
8881929
2159073
7276945
3864276
359973
1816614
1726605
689
1194977
2551418
9039833
8613774
8881936
8774141
1194954
19506
2510197
28619
2159118
1194882
7543930
8406034
5991637
289311
6154878
7169354
289312
252004
7227573
7493953
9258404
2159056
7226839
5991868
6422570
1501700
6311805
252003
6423857
7063657
7703209
6589949
6805981
6368231
6423735
874851
874852
6590188
6590187
6590132
5882699
6906066
6906067
6904973
5936118
6906065
6906063
6906064
5991182
5934774
5936151
5936154
5936152
5936130
5935383
578883
7543267
16761
2226667
1601043
6156342
874
2159055
4771929
4745303
4252346
5140961
894461
558061
96629
1200033
1568430
1200032
4161586
5690207
4974612
3738670
495244
2912730
300566
2713458
495238
1200028
1200035
2400118
5816130
4161587
6677
5592507
6437523
6656
389252
5901473
4419709
8875967
9410755
7639932
5474862
1489506
4567590
8172755
1586924
373233
1393273
861412
2527796
1489508
5103943
6706
7822678
1200085
16819
894463
6797931
5349965
5772481
9166067
8029709
19746
321902
4852411
4171712
20121
1699074
1097951
1375045
2320879
648225
2609314
1332109
16816
876817
1746871
3692793
9745
710194
7226916
322645
1094258
8881758
5981983
4682600
785170
2176726
2979330
6733
1200078
5038996
1887988
188483
6725
2414818
5999118
619239
7760123
6476702
8989070
4067353
6155442
1200069
1347768
1238933
6712
2162053
814642
3184199
558051
6700
1160
504905
578821
288745
288634
322176
793
504999
212865
1000
578999
1194950
504832
764
505021
911826
1539379""".split("\n")

meme_ids_to_score = [int(m) for m in meme_ids_to_score]
print("meme_ids_to_score: ", len(meme_ids_to_score))

meme_ids_to_score:  200


In [34]:
user_id = 49820636

In [35]:
unseen_memes = meme_oracle.get_unseen_memes(user_id, df)
print("unseen_memes: ", len(unseen_memes))

unseen_memes:  19043


In [36]:
predictions = meme_oracle.predict(user_id, unseen_memes, k=10)
print(predictions)  # Output: {unseen_meme_id1: score, unseen_meme_id2: score, ...}
print(predictions.keys())


{6422539: 0.003571542535377316, 8: 0.002374616133317173, 6: 0.002340672780740991, 3538950: 0.0023217072253927246, 5111813: 0.0023189169575870026, 5: 0.0022935506565535634, 4: 0.002201195643095507, 2: 0.002175844114981368, 3: 0.0021557388145241043, 7208960: 0.0020668815919213865}
dict_keys([6422539, 8, 6, 3538950, 5111813, 5, 4, 2, 3, 7208960])


In [37]:
predictions = meme_oracle.predict(user_id, meme_ids_to_score, k=10)
print(predictions)  # Output: {unseen_meme_id1: score, unseen_meme_id2: score, ...}
print(predictions.keys())

{1194977: 0.9993295027293092, 1726605: 0.9991520436391277, 689: 0.9986334393195366, 1816614: 0.9984742922072279, 8881929: 0.001156169862222325, 359973: 0.00090822578516097, 2159073: 0.0005116735737032809, 3864276: 0.0004893994073651743, 1200023: 0.00046704125281132273, 6646: 0.00041922682585014085}
dict_keys([1194977, 1726605, 689, 1816614, 8881929, 359973, 2159073, 3864276, 1200023, 6646])


In [None]:
# Example usage:
# likes_df = pd.DataFrame({'user_id': [...], 'meme_id': [...], 'liked': [...]})
# liked_memes_ids = [...]  # List of meme_ids that the user likes
# disliked_memes_ids = [...]  # List of meme_ids that the user dislikes
# meme_oracle = MemeOracle(likes_df, liked_memes_ids, disliked_memes_ids)
# predictions = meme_oracle.get_predictions(user_ids=[user_id], meme_ids=[meme_id], best=True)
