# Курсовой проект по курсу Рекомендательные системы

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
data = pd.read_csv('raw_data/retail_train.csv')
item_features = pd.read_csv('raw_data/product.csv')
user_features = pd.read_csv('raw_data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [4]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [5]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)
result_lvl_1=result_lvl_1[:100]

In [11]:
n=50
result_lvl_1['own_recommendations']=result_lvl_1['user_id'].apply(lambda x: recommender.get_als_recommendations(
    x, N=n))

In [12]:
def recall_at_k1(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)
    
    return recall

In [14]:
recall_k=[]
for i in range(0,result_lvl_1.shape[0]):
    recall_k.append(recall_at_k1(result_lvl_1['own_recommendations'][i],result_lvl_1['actual'][i],50))
print('OWN',np.mean(recall_k))    

OWN 0.048230167681955954


In [15]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2070,1105426,1
0,2070,1097350,1
0,2070,879194,1
0,2070,948640,1


In [16]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [17]:
targets_lvl_2=targets_lvl_2.merge(item_features,how='left',left_on='item_id',right_on='item_id')
targets_lvl_2=targets_lvl_2.merge(user_features,how='left',left_on='user_id',right_on='user_id')

In [18]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [47]:
X_train

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,879194,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,948640,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,928263,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111308,1745,903454,1216,MEAT-PCKGD,National,FROZEN MEAT,OTHER - FULLY COOKED,32 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
111309,1745,9419888,759,GROCERY,National,YOGURT,YOGURT MULTI-PACKS,48 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
111310,1745,1076769,3859,DELI,National,DELI MEATS,MEAT: LUNCHMEAT BULK,,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
111311,1745,1092588,709,GROCERY,National,FLUID MILK PRODUCTS,MISCELLANEOUS MILK,32 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown


In [19]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict(X_train)

  return f(**kwargs)




In [20]:
result_level_2=X_train[['user_id','item_id']].copy()
result_level_2['actual']=y_train
result_level_2['predict']=train_preds

In [21]:
df1 = result_level_2[result_level_2['actual']==1]
df2 = result_level_2[result_level_2['predict']==1]

df1=df1.groupby('user_id')['item_id'].unique().reset_index()
df2=df2.groupby('user_id')['item_id'].unique().reset_index()
df1.columns=['user_id', 'actual']
df2.columns=['user_id', 'predict']
result_level_2=df1.merge(df2,how='inner',left_on='user_id',right_on='user_id')

In [23]:
data = pd.read_csv('raw_data/retail_test1.csv')
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [25]:
data_test = prefilter_items(data, item_features=item_features, take_n_popular=5000)

n_items_after = data_test['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [26]:
recommender = MainRecommender(data_test)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [27]:
result_lvl_1 = data_test.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[825123, 999999, 845307, 852014, 856942, 99102..."
1,2,"[930118, 999999, 5567582, 5568489, 5569230, 93..."


In [32]:
n=50
result_lvl_1['item_id']=result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(
    x, N=n))

In [33]:
result_lvl_1

Unnamed: 0,user_id,actual,own_recommendations,item_id
0,1,"[825123, 999999, 845307, 852014, 856942, 99102...","[856942, 9297615, 5577022, 8293439, 9655212, 8...","[856942, 9297615, 5577022, 8293439, 9655212, 8..."
1,2,"[930118, 999999, 5567582, 5568489, 5569230, 93...","[1103898, 911974, 1076580, 5567582, 1007414, 1...","[1103898, 911974, 1076580, 5567582, 1007414, 1..."
2,3,"[866211, 1089954, 1118235, 1121321, 12132312, ...","[1092937, 1008714, 12132312, 1075979, 998206, ...","[1092937, 1008714, 12132312, 1075979, 998206, ..."
3,4,"[891423, 907631, 910109, 999999, 939907, 94453...","[6391541, 1052294, 936470, 891423, 1137010, 83...","[6391541, 1052294, 936470, 891423, 1137010, 83..."
4,5,"[999999, 825538, 1002499, 870826, 889509, 9417...","[6552318, 1055403, 994577, 1065017, 829621, 99...","[6552318, 1055403, 994577, 1065017, 829621, 99..."
...,...,...,...,...
2494,2496,"[999999, 871756, 899624, 916122, 998239, 99927...","[872826, 983665, 12452939, 991546, 1134296, 74...","[872826, 983665, 12452939, 991546, 1134296, 74..."
2495,2497,"[999999, 1037840, 1052294, 5569230, 8090537, 1...","[870515, 1102207, 1117219, 1103513, 1010950, 1...","[870515, 1102207, 1117219, 1103513, 1010950, 1..."
2496,2498,"[901776, 1070820, 9677846, 1044500, 5576075, 9...","[1022066, 1100379, 1076580, 931579, 5565356, 9...","[1022066, 1100379, 1076580, 931579, 5565356, 9..."
2497,2499,"[838186, 883665, 932949, 933835, 1067695, 1132...","[7168055, 1128395, 6904613, 5570048, 889989, 8...","[7168055, 1128395, 6904613, 5570048, 889989, 8..."


In [48]:
users_lvl_2 = pd.DataFrame(result_lvl_1['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)


users_lvl_2.head(4)

Unnamed: 0,user_id,item_id
0,1,856942
0,1,9297615
0,1,5577022
0,1,8293439


In [49]:
users_lvl_2=users_lvl_2.merge(item_features,how='left',left_on='item_id',right_on='item_id')
users_lvl_2=users_lvl_2.merge(user_features,how='left',left_on='user_id',right_on='user_id')

In [54]:
users_lvl_2[cat_feats] = users_lvl_2[cat_feats].astype('category')

In [55]:
test_preds = lgb.predict(users_lvl_2)

In [60]:
result_level_2=users_lvl_2[['user_id','item_id']].copy()
#result_level_2['actual']=y_train
result_level_2['predict']=test_preds

In [70]:
df = result_level_2[result_level_2['predict']==1]

In [97]:
df = result_level_2[result_level_2['predict']==1]
df=df.groupby('user_id')['item_id'].unique().reset_index()
df.columns=['user_id', 'rec']
df.head()

Unnamed: 0,user_id,rec
0,1,"[8293439, 9655212]"
1,7,[1122358]
2,13,"[9488065, 1038985, 862070, 9803545, 882604, 10..."
3,15,[1098248]
4,16,"[1075368, 1101010]"


In [102]:
len_rec=[]
for i in df['rec']:
    len_rec.append(len(i))
print(max(len_rec))    
for i in range(1,max(len_rec)+1):
    df[f'rec_{i}']=np.NaN

12


In [121]:
for i in df.index:
    for j in range(0,len(df['rec'][i])):
        df[f'rec_{j+1}'][i]=df['rec'][i][j]
df.drop('rec',inplace=True,axis=1)       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'rec_{j+1}'][i]=df['rec'][i][j]


In [122]:
df.to_csv('recommendations.csv')

In [124]:
df.head()

Unnamed: 0,user_id,rec_1,rec_2,rec_3,rec_4,rec_5,rec_6,rec_7,rec_8,rec_9,rec_10,rec_11,rec_12
0,1,8293439.0,9655212.0,,,,,,,,,,
1,7,1122358.0,,,,,,,,,,,
2,13,9488065.0,1038985.0,862070.0,9803545.0,882604.0,1016709.0,,,,,,
3,15,1098248.0,,,,,,,,,,,
4,16,1075368.0,1101010.0,,,,,,,,,,
