In [1]:
import pandas as pd
import numpy as np
from lightfm.data import Dataset
from lightfm import LightFM



In [41]:
df = pd.DataFrame([
    ('u1', 'i1'), ('u1', 'i2'), ('u2', 'i1')
], columns=['uid', 'iid'])

u_f = pd.DataFrame([
    ('u1', 'uf1'),
    ('u2', 'uf2')
], columns=['uid', 'features'])
u_f['f'] = u_f['features'].map(lambda x: [x])

i_f = pd.DataFrame([
    ('i1', 'if1'),
    ('i2', 'if2'),
], columns=['iid', 'features'])
i_f['f'] = i_f['features'].map(lambda x: [x])

df

Unnamed: 0,uid,iid
0,u1,i1
1,u1,i2
2,u2,i1


In [43]:
i_f[['iid', 'f']].values

array(['i1', list(['if1'])], dtype=object)

In [44]:
data = Dataset()
data.fit(
    users=df['uid'].unique().tolist(),
    items=df['iid'].unique().tolist(),
    user_features=u_f['features'].unique().tolist(),
    item_features=i_f['features'].unique().tolist()
)

interactions, weights = data.build_interactions(df[['uid', 'iid']].values.tolist())

if_ = data.build_item_features(i_f[['iid', 'f']].values)
uf_ = data.build_user_features(u_f[['uid', 'f']].values)

In [45]:
rec = LightFM()
rec.fit(
    interactions=interactions,
    user_features=uf_,
    item_features=if_,
    sample_weight=weights
)

<lightfm.lightfm.LightFM at 0x7f9305ae1af0>

In [53]:
rec.predict(
    user_ids=0,
    item_ids=list(iid.values()),
    user_features=uf_,
    item_features=if_
)

array([0.04881844, 0.037433  ], dtype=float32)

In [54]:
rec.predict(
    user_ids=1,
    item_ids=list(iid.values()),
    user_features=uf_,
    item_features=if_
)

array([0.03847828, 0.02552994], dtype=float32)

In [46]:
(
    rec.predict(user_ids=0, item_ids=list(iid.values())),
    rec.predict(user_ids=1, item_ids=list(iid.values()))
)

(array([0.04716425, 0.03877868], dtype=float32),
 array([0.03675271, 0.02239857], dtype=float32))

In [55]:
uid, _, iid, _ = data.mapping()

In [56]:
iid

{'i1': 0, 'i2': 1}

In [9]:
iid_reverted = {v: k for k, v in iid.items()}
iid_reverted

{0: 'i1', 1: 'i2'}

In [10]:
iid.values()

dict_values([0, 1])

In [11]:
iid.keys()
[iid_reverted[k] for k in np.argsort(
    -rec.predict(user_ids=1, item_ids=list(iid.values()))
)[:2]]

['i1', 'i2']

In [12]:
rec.get_params()

{'loss': 'logistic',
 'learning_schedule': 'adagrad',
 'no_components': 10,
 'learning_rate': 0.05,
 'k': 5,
 'n': 10,
 'rho': 0.95,
 'epsilon': 1e-06,
 'max_sampled': 10,
 'item_alpha': 0.0,
 'user_alpha': 0.0,
 'random_state': RandomState(MT19937) at 0x7F933051FC40}

In [13]:
pd.Series([1, 2, 3]).tolist()

[1, 2, 3]

In [57]:
recs = []
N=2
for uid_ in ['u1', 'u2']:
    recs.append([
        iid_reverted[k]
        for k in np.argsort(-rec.predict(
            user_ids=uid.get(uid_),
            item_ids=list(iid.values())
        )[:N])
    ])

In [64]:
pd.Series(recs[0]).tolist()

['i1', 'i2']

In [18]:
test = pd.read_csv('../data/raw/sample_submission.csv')
interact = pd.read_csv('../data/raw/interactions.csv')
users = pd.read_csv('../data/raw/users.csv')

In [83]:
test_ = test['user_id'].unique().tolist()
interact_ = interact['user_id'].unique().tolist()
users_ = users['user_id'].unique().tolist()

In [86]:
len(list(set(test_).intersection(set(interact_)))) / len(test_)

0.6637046703225573

In [87]:
len(list(set(test_).intersection(set(users_)))) / len(test_)

0.7764780206407648

In [89]:
users[['sex', 'kids_flg']]

Unnamed: 0,sex,kids_flg
0,М,1
1,М,0
2,Ж,0
3,Ж,0
4,Ж,0
...,...,...
840192,Ж,0
840193,Ж,1
840194,,0
840195,Ж,0


In [19]:
import numpy as np

y = np.random.randint(0, 2, users.shape[0])
y

array([0, 0, 0, ..., 0, 0, 1])

In [22]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
...,...,...,...,...,...
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,,,,0
840195,590706,,,Ж,0


In [20]:
from xgboost import XGBClassifier

In [24]:
users['age'] = users['age'].astype('category')

In [27]:
users[['age']]

age    category
dtype: object

In [28]:
m = XGBClassifier()
m.fit(users[['age']], y, enable_categorical=True)

TypeError: fit() got an unexpected keyword argument 'enable_categorical'

In [104]:
m.predict_proba(users[['kids_flg']])[:, 1]

array([0.5012169, 0.4990798, 0.4990798, ..., 0.4990798, 0.4990798,
       0.4990798], dtype=float32)

In [101]:
c = pd.DataFrame([
    ['a', [1, 2, 3]],
    ['b', [5, 6, 8]]
],  columns=['u', 'r'])

c.explode('r')

Unnamed: 0,u,r
0,a,1
0,a,2
0,a,3
1,b,5
1,b,6
1,b,8


In [98]:
c

Unnamed: 0,0
0,a
1,"[1, 2, 3]"
2,"[b, [5, 6, 8]]"


In [116]:
a = pd.DataFrame([[1, 1], [1, 2], [2,  3]], columns=['uid', 'iid'])

import numpy as np

class UnusedItems:
    
    def __init__(self, ids, r):
        self.ids = set(list(ids))
        self.r = r
    
    def __call__(self, x):
        l = list(set(list(x)) ^ self.ids)
        return np.random.choice(l, min([len(l), self.r]))

(
    a
    .groupby('uid')['iid']
    .apply(UnusedItems(
        ids=a['iid'].unique().tolist(),
        r=3
    ))
    .reset_index()
)

Unnamed: 0,uid,iid
0,1,[3]
1,2,"[1, 1]"


In [30]:
a = pd.DataFrame([
    (1, 1, 2),
    (1, 2, 3),
    (1, 3, 4),
    (2, 1, 2)
], columns=['uid', 'iid', 'score'])


In [36]:
a.groupby('uid')['iid'].apply(list).reset_index().values

array([[1, list([1, 2, 3])],
       [2, list([1])]], dtype=object)