In [10]:
import pandas as pd
import numpy as np
from lightfm.data import Dataset
from lightfm import LightFM

In [2]:
df = pd.DataFrame([
    ('u1', 'i1'), ('u1', 'i2'), ('u2', 'i1')
], columns=['uid', 'iid'])

df

Unnamed: 0,uid,iid
0,u1,i1
1,u1,i2
2,u2,i1


In [4]:
data = Dataset()
data.fit(
    users=df['uid'].unique().tolist(),
    items=df['iid'].unique().tolist()
)

interactions, weights = data.build_interactions(df[['uid', 'iid']].values.tolist())

In [5]:
rec = LightFM()
rec.fit(
    interactions=interactions,
    sample_weight=weights
)

<lightfm.lightfm.LightFM at 0x7f9788d28e80>

In [78]:
np.argsort(
    -rec.predict(
        user_ids=np.array([0, 0, 1, 1], dtype=np.int32),
        item_ids=np.array([0, 1, 0, 1], dtype=np.int32),
    ).reshape(2, -1)
).tolist()

[[0, 1], [0, 1]]

In [67]:
(
    rec.predict(user_ids=0, item_ids=list(iid.values())),
    rec.predict(user_ids=1, item_ids=list(iid.values()))
)

(array([0.09381282, 0.07361589], dtype=float32),
 array([0.07186899, 0.05039217], dtype=float32))

In [32]:
uid, _, iid, _ = data.mapping()

In [33]:
iid

{'i1': 0, 'i2': 1}

In [44]:
iid_reverted = {v: k for k, v in iid.items()}
iid_reverted

{0: 'i1', 1: 'i2'}

In [35]:
iid.values()

dict_values([0, 1])

In [47]:
iid.keys()
[iid_reverted[k] for k in np.argsort(
    -rec.predict(user_ids=1, item_ids=list(iid.values()))
)[:2]]

['i1', 'i2']

In [51]:
rec.get_params()

{'loss': 'logistic',
 'learning_schedule': 'adagrad',
 'no_components': 10,
 'learning_rate': 0.05,
 'k': 5,
 'n': 10,
 'rho': 0.95,
 'epsilon': 1e-06,
 'max_sampled': 10,
 'item_alpha': 0.0,
 'user_alpha': 0.0,
 'random_state': RandomState(MT19937) at 0x7F9788254D40}

In [53]:
pd.Series([1, 2, 3]).tolist()

[1, 2, 3]

In [57]:
recs = []
N=2
for uid_ in ['u1', 'u2']:
    recs.append([
        iid_reverted[k]
        for k in np.argsort(-rec.predict(
            user_ids=uid.get(uid_),
            item_ids=list(iid.values())
        )[:N])
    ])

In [64]:
pd.Series(recs[0]).tolist()

['i1', 'i2']

In [80]:
test = pd.read_csv('../data/raw/sample_submission.csv')
interact = pd.read_csv('../data/raw/interactions.csv')
users = pd.read_csv('../data/raw/users.csv')

In [83]:
test_ = test['user_id'].unique().tolist()
interact_ = interact['user_id'].unique().tolist()
users_ = users['user_id'].unique().tolist()

In [86]:
len(list(set(test_).intersection(set(interact_)))) / len(test_)

0.6637046703225573

In [87]:
len(list(set(test_).intersection(set(users_)))) / len(test_)

0.7764780206407648

In [89]:
users[['sex', 'kids_flg']]

Unnamed: 0,sex,kids_flg
0,М,1
1,М,0
2,Ж,0
3,Ж,0
4,Ж,0
...,...,...
840192,Ж,0
840193,Ж,1
840194,,0
840195,Ж,0


In [94]:
import numpy as np

y = np.random.randint(0, 2, users.shape[0])
y

array([0, 1, 1, ..., 1, 1, 0])

In [91]:
from xgboost import XGBClassifier

In [95]:
m = XGBClassifier()
m.fit(users[['kids_flg']], y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [104]:
m.predict_proba(users[['kids_flg']])[:, 1]

array([0.5012169, 0.4990798, 0.4990798, ..., 0.4990798, 0.4990798,
       0.4990798], dtype=float32)

In [101]:
c = pd.DataFrame([
    ['a', [1, 2, 3]],
    ['b', [5, 6, 8]]
],  columns=['u', 'r'])

c.explode('r')

Unnamed: 0,u,r
0,a,1
0,a,2
0,a,3
1,b,5
1,b,6
1,b,8


In [98]:
c

Unnamed: 0,0
0,a
1,"[1, 2, 3]"
2,"[b, [5, 6, 8]]"


In [116]:
a = pd.DataFrame([[1, 1], [1, 2], [2,  3]], columns=['uid', 'iid'])

import numpy as np

class UnusedItems:
    
    def __init__(self, ids, r):
        self.ids = set(list(ids))
        self.r = r
    
    def __call__(self, x):
        l = list(set(list(x)) ^ self.ids)
        return np.random.choice(l, min([len(l), self.r]))

(
    a
    .groupby('uid')['iid']
    .apply(UnusedItems(
        ids=a['iid'].unique().tolist(),
        r=3
    ))
    .reset_index()
)

Unnamed: 0,uid,iid
0,1,[3]
1,2,"[1, 1]"


In [131]:
a = pd.DataFrame([
    (1, 1, 2),
    (1, 2, 3),
    (1, 3, 4),
    (2, 1, 2)
], columns=['uid', 'iid', 'score'])

N = 2

a = a.groupby('uid').apply(lambda x: pd.Series({'iid': x.sort_values('score', ascending=False)['iid'].tolist()[:N]})).reset_index()

In [137]:
a.set_index('uid').loc[[2, 1], 'iid']

uid
2       [1]
1    [3, 2]
Name: iid, dtype: object