# A Hybrid Recommendation Model


## Model Outline:

**1. Data Preparation**
<br>
**2. User Similarity**
<br>
**3. Make predictions**
<br>
**4. Item similarity from tf-idf** 
<br>
**5. Maximize similarity**

### 1. Data Preparation

- Data amount is reduced and data is splitted into train, test, validation.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from datetime import date, datetime

In [2]:
df_transactions_indexed = pd.read_csv('derived_data/transactions_smaller.csv')
df_transactions = df_transactions_indexed.drop('Unnamed: 0',axis=1)
df_transactions.t_dat = pd.to_datetime(df_transactions.t_dat)
lastdate = df_transactions.iloc[-1]['t_dat']
df_transactions['count_days_ago'] = (lastdate - df_transactions.t_dat).dt.days +1
df_transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,count_days_ago
0,2018-09-20,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,652075001,0.011847,2,734
1,2018-09-20,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,670295001,0.010153,2,734
2,2018-09-20,00401a367c5ac085cb9d4b77c56f3edcabf25153615db9...,613456009,0.016932,2,734
3,2018-09-20,00401a367c5ac085cb9d4b77c56f3edcabf25153615db9...,633675001,0.010153,2,734
4,2018-09-20,00401a367c5ac085cb9d4b77c56f3edcabf25153615db9...,648719001,0.025407,2,734


In [3]:
df_transactions.tail()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,count_days_ago
1664982,2020-09-22,ffd4cf2217de4a0a3f9f610cdec334c803692a18af08ac...,701472004,0.010153,2,1
1664983,2020-09-22,ffd4cf2217de4a0a3f9f610cdec334c803692a18af08ac...,456163087,0.033881,2,1
1664984,2020-09-22,ffd4cf2217de4a0a3f9f610cdec334c803692a18af08ac...,832505003,0.025407,2,1
1664985,2020-09-22,ffd4cf2217de4a0a3f9f610cdec334c803692a18af08ac...,902288001,0.022017,2,1
1664986,2020-09-22,ffd4cf2217de4a0a3f9f610cdec334c803692a18af08ac...,856440002,0.042356,2,1


In [4]:
df_transactions.shape

(1664987, 6)

In [5]:
## Take only the last 1 year

df_input = df_transactions[df_transactions.t_dat > lastdate + pd.DateOffset(years=-1)]


## Train - Valid - Test Split
## last 2 weeks are selected as valid & test data
df_train = df_input[df_input.t_dat <= lastdate + pd.DateOffset(weeks=-2)]
df_valid = df_input[(df_input.t_dat <= lastdate + pd.DateOffset(weeks=-1)) & 
                    (df_input.t_dat > lastdate + pd.DateOffset(weeks=-2))]
df_test = df_input[df_input.t_dat > lastdate + pd.DateOffset(weeks=-1)]

s1,s2,s3 = df_train.shape[0], df_valid.shape[0], df_test.shape[0]
print('train-valid-test set lengths: '+str(s1)+' - '+str(s2)+' - '+str(s3))

train-valid-test set lengths: 622471 - 10574 - 9666


### 2. User Similarity

* #### Customer category choices

In [6]:
df_articles = pd.read_csv('data/articles.csv.gz', compression='gzip',
                   error_bad_lines=False)
df_articles = df_articles[['article_id', 'product_group_name']]
df_transactions1 = df_train.merge(df_articles, on='article_id',how='left')
pgn_counts = pd.DataFrame(df_transactions1.groupby('customer_id')['product_group_name'].value_counts(sort = True, normalize=True))
pgn_counts.rename({'product_group_name':'prod_grp_perc'},axis=1,inplace=True)
pgn_counts.reset_index(inplace=True)
pgn_counts = pd.pivot_table(pgn_counts, values = 'prod_grp_perc', index=['customer_id'], columns = 'product_group_name').reset_index()
pgn_counts = pgn_counts.fillna(0)

del df_transactions1
pgn_counts.head()

product_group_name,customer_id,Accessories,Bags,Cosmetic,Furniture,Garment Full body,Garment Lower body,Garment Upper body,Garment and Shoe care,Items,Nightwear,Shoes,Socks & Tights,Stationery,Swimwear,Underwear,Underwear/nightwear,Unknown
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,0.056213,0.0,0.0,0.0,0.180473,0.275148,0.331361,0.0,0.0,0.014793,0.044379,0.005917,0.0,0.050296,0.032544,0.0,0.008876
1,000400dcdcf45d8d1ad1bd63be6fbd79a49167adf0cf28...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.166667,0.0,0.0,0.333333,0.0,0.0,0.166667,0.0,0.0
2,00058592fc65afabbb00b1bb7d33c6b221d00c6a98c621...,0.0,0.0,0.0,0.0,0.076923,0.076923,0.307692,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.461538,0.0,0.0
3,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,0.0625,0.0,0.0,0.0,0.125,0.333333,0.34375,0.0,0.0,0.0,0.03125,0.0,0.0,0.083333,0.010417,0.0,0.010417
4,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,0.029197,0.007299,0.0,0.0,0.218978,0.277372,0.357664,0.0,0.0,0.0,0.014599,0.0,0.0,0.014599,0.080292,0.0,0.0


* #### Customer Behaviour

In [7]:
## row --> bir userin bir gunde yaptigi alisveris bilgileri
df_grpd = df_train.groupby(['customer_id','count_days_ago'])['article_id','price'].agg(lambda x: list(x))
df_grpd.reset_index(inplace=True)
df_grpd.sort_values('count_days_ago', ascending=False, inplace=True)


## row --> bir userin toplam alisveris history'si
df_days_grpd = df_grpd.groupby(['customer_id'], as_index=False)['count_days_ago','article_id','price'].agg(lambda x: list(x))
df_days_grpd['article_count_perday'] = df_days_grpd['article_id'].apply(lambda rw: [len(rw[i]) for i in range(len(rw))])
df_days_grpd['total_shop_time'] = df_days_grpd.count_days_ago.apply(lambda x: len(x))

del df_grpd

## kac kez alisveris yapmis, en son ne zaman yapmis, ne sıklıkta yapmıs, ort. kac urun almis , ort. ne kadar harcamis

df_days_grpd['total_shop_time'] = df_days_grpd.count_days_ago.apply(lambda x: len(x))
df_days_grpd['last_shop_time'] = df_days_grpd.count_days_ago.apply(lambda x: x[-1])

df_days_grpd['shop_freq'] = df_days_grpd['count_days_ago'].apply(lambda x: 
                                                                 [x[i]-x[i+1] for i in range(len(x)-1)] 
                                                                 if len(x)>1 else 0)

df_days_grpd['time_weights'] = df_days_grpd['count_days_ago'].apply(lambda rw: [1/x for x in rw])

df_days_grpd['mean_freq'] = df_days_grpd.apply(lambda x: np.dot(x['shop_freq'] , x['time_weights'][1:]) 
                                               / np.sum(x['time_weights'][1:])
                                               if x['shop_freq'] != 0 else 0, axis=1)

df_days_grpd['mean_article_count'] = df_days_grpd.apply(lambda x: np.dot(x['article_count_perday'] , x['time_weights']) 
                                                         / np.sum(x['time_weights'])
                                               if x['article_count_perday'] != 0 else 0, axis=1)


df_days_grpd['price'] = df_days_grpd['price'].apply(lambda x: [np.mean(i) for i in x])
df_days_grpd['mean_price'] = df_days_grpd.apply(lambda x: np.dot(x['price'] , x['time_weights']) 
                                                / np.sum(x['time_weights'])
                                               if x['price'] != 0 else 0, axis=1)

## sales channel id choice as percentage
sci_perc = pd.DataFrame(df_train.groupby('customer_id')['sales_channel_id'].value_counts(sort = True, normalize=True))
sci_perc.rename({'sales_channel_id':'sales_ch_perc'},axis=1,inplace=True)
sci_perc.reset_index(inplace=True)
sci_perc = pd.pivot_table(sci_perc, values = 'sales_ch_perc', index=['customer_id'], columns = 'sales_channel_id').reset_index()
sci_perc = sci_perc.fillna(0)
sci_perc.rename({1:'schi_1', 2:'schi_2'},axis=1,inplace=True)


## herseyi birlestir
df_customer_info = df_days_grpd[['customer_id', 'total_shop_time','last_shop_time','mean_freq', 'mean_article_count', 'mean_price']]
df_customer_info = df_customer_info.merge(pgn_counts, on='customer_id')

del sci_perc
del df_days_grpd
del pgn_counts

df_customer_info

Unnamed: 0,customer_id,total_shop_time,last_shop_time,mean_freq,mean_article_count,mean_price,Accessories,Bags,Cosmetic,Furniture,...,Garment and Shoe care,Items,Nightwear,Shoes,Socks & Tights,Stationery,Swimwear,Underwear,Underwear/nightwear,Unknown
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,80,16,4.016379,4.087228,0.035334,0.056213,0.000000,0.0,0.0,...,0.0,0.000000,0.014793,0.044379,0.005917,0.0,0.050296,0.032544,0.0,0.008876
1,000400dcdcf45d8d1ad1bd63be6fbd79a49167adf0cf28...,2,62,216.000000,4.270588,0.011205,0.000000,0.000000,0.0,0.0,...,0.0,0.166667,0.000000,0.000000,0.333333,0.0,0.000000,0.166667,0.0,0.000000
2,00058592fc65afabbb00b1bb7d33c6b221d00c6a98c621...,4,44,103.363202,2.630421,0.034367,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.076923,0.461538,0.0,0.000000
3,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,13,20,16.472736,5.167646,0.031743,0.062500,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.031250,0.000000,0.0,0.083333,0.010417,0.0,0.010417
4,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,8,35,39.915852,20.806462,0.033231,0.029197,0.007299,0.0,0.0,...,0.0,0.000000,0.000000,0.014599,0.000000,0.0,0.014599,0.080292,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16061,fff2c4204fac63f93aec10ed657958d372efe948de1492...,12,52,18.786095,4.033250,0.020336,0.020833,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.166667,0.0,0.000000
16062,fff4b145d7469e023b147b0f8375c565b1be4394498779...,4,83,40.416254,3.406508,0.024297,0.076923,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.153846,0.000000,0.0,0.000000,0.000000,0.0,0.000000
16063,fff7f145e356557541af211bb11aa9d78d7edf51bd3c40...,26,48,11.074113,2.098135,0.021384,0.053571,0.000000,0.0,0.0,...,0.0,0.017857,0.000000,0.035714,0.000000,0.0,0.142857,0.125000,0.0,0.000000
16064,fff969b13a1c848d53ae3f08f111bfebcdcf6cd27e3815...,8,134,20.940368,8.141583,0.043275,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.138462,0.000000,0.0,0.000000


* #### Combine all customer info

In [8]:
def get_dummies(df_, col, colname_key, drop=False):
    df = df_.copy()
    uniq_values = df[col].unique()
    uniq_values = [x for x in uniq_values if str(x) != 'nan']
    
    for i in uniq_values:
        i_ = i.replace('-','').replace(' ','')
        df.loc[df[col] == i, colname_key+'_'+i_] = 1
        df[colname_key+'_'+i_] = df[colname_key+'_'+i_].fillna(0)
    if drop==True:
        df = df.drop(col,axis=1)       
    return df

In [10]:
df_customer_ = pd.read_csv('derived_data/customers_smaller.csv')
df_customer = df_customer_.drop('Unnamed: 0',axis=1)
df_customer.fashion_news_frequency = df_customer.fashion_news_frequency.replace({'NONE':np.nan}).replace({'None':np.nan})
df_all_customers = get_dummies(df_customer, 'club_member_status','CMS', drop=True)
df_all_customers = get_dummies(df_all_customers, 'fashion_news_frequency','FNF', drop=True)
df_all_customers = df_all_customers.merge(df_customer_info, on='customer_id', how='left')
df_all_customers.age = df_all_customers.age.fillna(0)
df_all_customers.dropna(inplace=True)
df_all_customers.head(3)

Unnamed: 0,customer_id,age,CMS_ACTIVE,CMS_PRECREATE,CMS_LEFTCLUB,FNF_Regularly,FNF_Monthly,total_shop_time,last_shop_time,mean_freq,...,Garment and Shoe care,Items,Nightwear,Shoes,Socks & Tights,Stationery,Swimwear,Underwear,Underwear/nightwear,Unknown
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,44.0,1.0,0.0,0.0,1.0,0.0,80.0,16.0,4.016379,...,0.0,0.0,0.014793,0.044379,0.005917,0.0,0.050296,0.032544,0.0,0.008876
2,000400dcdcf45d8d1ad1bd63be6fbd79a49167adf0cf28...,21.0,1.0,0.0,0.0,1.0,0.0,2.0,62.0,216.0,...,0.0,0.166667,0.0,0.0,0.333333,0.0,0.0,0.166667,0.0,0.0
3,00058592fc65afabbb00b1bb7d33c6b221d00c6a98c621...,24.0,1.0,0.0,0.0,0.0,0.0,4.0,44.0,103.363202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.461538,0.0,0.0


* #### Normalize continuous columns

In [11]:
dummy_cols = ['customer_id','CMS_ACTIVE','CMS_PRECREATE','CMS_LEFTCLUB','FNF_Regularly','FNF_Monthly']

for col in df_all_customers.columns.difference(dummy_cols):
    mean_val = df_all_customers[col].mean()
    std_val = df_all_customers[col].std()
    df_all_customers[col] = (df_all_customers[col] - mean_val) / std_val
df_all_customers = df_all_customers.fillna(0)

In [12]:
df_all_customers

Unnamed: 0,customer_id,age,CMS_ACTIVE,CMS_PRECREATE,CMS_LEFTCLUB,FNF_Regularly,FNF_Monthly,total_shop_time,last_shop_time,mean_freq,...,Garment and Shoe care,Items,Nightwear,Shoes,Socks & Tights,Stationery,Swimwear,Underwear,Underwear/nightwear,Unknown
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,0.567871,1.0,0.0,0.0,1.0,0.0,6.919286,-0.847701,-0.886651,...,-0.019295,-0.052868,0.030854,0.374360,-0.256392,-0.009118,-0.104884,-0.387217,-0.011361,0.152966
2,000400dcdcf45d8d1ad1bd63be6fbd79a49167adf0cf28...,-1.210572,1.0,0.0,0.0,1.0,0.0,-0.727379,-0.305332,3.471899,...,-0.019295,39.804328,-0.278510,-0.364453,4.157377,-0.009118,-0.482017,0.548457,-0.011361,-0.192010
3,00058592fc65afabbb00b1bb7d33c6b221d00c6a98c621...,-0.978602,1.0,0.0,0.0,0.0,0.0,-0.531310,-0.517564,1.155998,...,-0.019295,-0.052868,-0.278510,-0.364453,-0.336159,-0.009118,0.094775,2.605563,-0.011361,-0.192010
4,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,-0.050718,1.0,0.0,0.0,0.0,0.0,0.350997,-0.800539,-0.630538,...,-0.019295,-0.052868,-0.278510,0.155794,-0.336159,-0.009118,0.142842,-0.541586,-0.011361,0.212858
6,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,-0.205365,1.0,0.0,0.0,0.0,0.0,-0.139174,-0.623679,-0.148530,...,-0.019295,-0.052868,-0.278510,-0.121418,-0.336159,-0.009118,-0.372553,-0.054117,-0.011361,-0.192010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20371,fff2c4204fac63f93aec10ed657958d372efe948de1492...,0.026606,1.0,0.0,0.0,1.0,0.0,0.252963,-0.423239,-0.582974,...,-0.019295,-0.052868,-0.278510,-0.364453,-0.336159,-0.009118,-0.482017,0.548457,-0.011361,-0.192010
20372,fff4b145d7469e023b147b0f8375c565b1be4394498779...,1.495754,1.0,0.0,0.0,0.0,0.0,-0.531310,-0.057729,-0.138241,...,-0.019295,-0.052868,-0.278510,2.196765,-0.336159,-0.009118,-0.482017,-0.614256,-0.011361,-0.192010
20374,fff7f145e356557541af211bb11aa9d78d7edf51bd3c40...,-0.823954,1.0,0.0,0.0,1.0,0.0,1.625441,-0.470401,-0.741538,...,-0.019295,4.217546,-0.278510,0.230115,-0.336159,-0.009118,0.589169,0.257779,-0.011361,-0.192010
20375,fff969b13a1c848d53ae3f08f111bfebcdcf6cd27e3815...,1.186460,1.0,0.0,0.0,0.0,0.0,-0.139174,0.543593,-0.538680,...,-0.019295,-0.052868,-0.278510,-0.364453,-0.336159,-0.009118,0.556210,-0.614256,-0.011361,-0.192010


* #### Cosine similarity between users

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
user_sim = cosine_similarity(df_all_customers.drop('customer_id',axis=1))

In [15]:
user_sim.shape

(16066, 16066)

In [16]:
user_sim

array([[ 1.        , -0.03917817, -0.19101062, ...,  0.35836175,
         0.07445464,  0.02948939],
       [-0.03917817,  1.        ,  0.04017677, ...,  0.81750463,
        -0.06385042, -0.01366258],
       [-0.19101062,  0.04017677,  1.        , ...,  0.06651583,
        -0.24558283, -0.13571355],
       ...,
       [ 0.35836175,  0.81750463,  0.06651583, ...,  1.        ,
        -0.04362333,  0.09157998],
       [ 0.07445464, -0.06385042, -0.24558283, ..., -0.04362333,
         1.        ,  0.2290423 ],
       [ 0.02948939, -0.01366258, -0.13571355, ...,  0.09157998,
         0.2290423 ,  1.        ]])

* #### List K-nearest neihgbours of users

In [17]:
def find_knn(sim , user, k=10):
    """
    sim : similarity matrix for users
    k : neighbour count
    user: the user of which k nearest neighbours to be found
    
    """
    
    similar_user_list = user_sim[user,:].argsort()[::-1][1:k+1]
    neighbors = df_all_customers.iloc[similar_user_list].customer_id.tolist()
    
    similarity_values = user_sim[user,:][user_sim[user,:].argsort()[::-1][1:k+1]]
    knn_dic = {neighbors[i]: similarity_values[i] for i in range(len(neighbors))}
    return knn_dic

In [18]:
df_sim_all = pd.DataFrame(columns=['customer_id','sim_users','sim_ratios'])
df_sim_all['customer_id'] = df_all_customers.customer_id

timer = 0
for i in range(len(df_all_customers.customer_id)):   
    if timer %1000 == 0:
        print('iteration no: '+str(timer))
    dic_user_sim = find_knn(user_sim, user=i, k=10)
    df_sim_all.loc[i, 'sim_users'] = list(dic_user_sim.keys())
    df_sim_all.loc[i, 'sim_ratios'] = list(dic_user_sim.values())
    timer += 1

iteration no: 0
iteration no: 1000
iteration no: 2000
iteration no: 3000
iteration no: 4000
iteration no: 5000
iteration no: 6000
iteration no: 7000
iteration no: 8000
iteration no: 9000
iteration no: 10000
iteration no: 11000
iteration no: 12000
iteration no: 13000
iteration no: 14000
iteration no: 15000
iteration no: 16000


In [19]:
df_sim_all.head()

Unnamed: 0,customer_id,sim_users,sim_ratios
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,[b79de13a78298e5dfd735eacd18eb8abab46a09d295ed...,"[0.9915425921943544, 0.9884639649663551, 0.984..."
2,000400dcdcf45d8d1ad1bd63be6fbd79a49167adf0cf28...,[910acbf1c52714ce2b32f4af08fd8d89580ebbcc76e40...,"[0.9666359711072442, 0.9443823678323113, 0.935..."
3,00058592fc65afabbb00b1bb7d33c6b221d00c6a98c621...,[3de1b6ccd0b7650e44d341b0631658068f929f1bf0fc9...,"[0.8690108411667624, 0.8442721879083999, 0.837..."
4,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,[864ffed75c1496fcfafb728890025727e575dce2f44e8...,"[0.9778483661303669, 0.9745963062102099, 0.965..."
6,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,[7530642ff46b73fe0b1b9d1386504082b8da8b89c707e...,"[0.9211743508769599, 0.8847168764622639, 0.873..."


###  3. Bring all predictions together

In [20]:
df_train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,count_days_ago
1022276,2019-09-23,000fb6e772c5d0023892065e659963da90b1866035558e...,771368001,0.118627,2,366
1022277,2019-09-23,000fb6e772c5d0023892065e659963da90b1866035558e...,591334019,0.016932,2,366
1022278,2019-09-23,000fb6e772c5d0023892065e659963da90b1866035558e...,810831004,0.033881,2,366
1022279,2019-09-23,00f1c3895749444bd89d21b2892f26e87efbe93464d068...,688558001,0.016932,2,366
1022280,2019-09-23,00f1c3895749444bd89d21b2892f26e87efbe93464d068...,688558001,0.016932,2,366


In [21]:
df_sim_all = df_sim_all.merge(df_train.groupby('customer_id')[['article_id']].agg(lambda x: list(x)).reset_index(), on='customer_id')
df_sim_all.article_id = df_sim_all.article_id.apply(lambda x: x[-5:] if len(x) > 5 else x)

In [22]:
df_sim_all.head()

Unnamed: 0,customer_id,sim_users,sim_ratios,article_id
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,[b79de13a78298e5dfd735eacd18eb8abab46a09d295ed...,"[0.9915425921943544, 0.9884639649663551, 0.984...","[572998013, 909869004, 923134003, 935858001, 8..."
1,000400dcdcf45d8d1ad1bd63be6fbd79a49167adf0cf28...,[910acbf1c52714ce2b32f4af08fd8d89580ebbcc76e40...,"[0.9666359711072442, 0.9443823678323113, 0.935...","[877961022, 877961022, 719655001, 717490060, 8..."
2,00058592fc65afabbb00b1bb7d33c6b221d00c6a98c621...,[3de1b6ccd0b7650e44d341b0631658068f929f1bf0fc9...,"[0.8690108411667624, 0.8442721879083999, 0.837...","[557599022, 758034001, 611415001, 829152002, 8..."
3,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,[864ffed75c1496fcfafb728890025727e575dce2f44e8...,"[0.9778483661303669, 0.9745963062102099, 0.965...","[806766001, 903225002, 751471043, 915529001, 9..."
4,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,[7530642ff46b73fe0b1b9d1386504082b8da8b89c707e...,"[0.9211743508769599, 0.8847168764622639, 0.873...","[640021019, 868038003, 868038003, 927865001, 9..."


* #### Items that are bought by the similar customers within last 6 months

In [23]:
def split_data_date(data, last_x = 1, months=False, weeks=False):
    lastdate = data.iloc[-1]['t_dat']
    if months:
        treshold = lastdate + pd.DateOffset(months=-last_x)
    if weeks:
        treshold = lastdate + pd.DateOffset(weeks=-last_x)
        
    data1 = data[data.t_dat <= treshold]
    data2 = data[data.t_dat > treshold]
    return data1, data2

In [24]:
_, last6months = split_data_date(df_train, last_x = 6, months=True)

In [25]:
df_sim_all = df_sim_all.dropna()

In [26]:
## Items that are bought by the similar customers within last 6 months

df_sim_all['simusers_items'] = df_sim_all.sim_users.apply(lambda x: list(last6months[last6months.customer_id.isin(x)].article_id.value_counts()[:15].index))

In [27]:
df_sim_all.head()

Unnamed: 0,customer_id,sim_users,sim_ratios,article_id,simusers_items
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,[b79de13a78298e5dfd735eacd18eb8abab46a09d295ed...,"[0.9915425921943544, 0.9884639649663551, 0.984...","[572998013, 909869004, 923134003, 935858001, 8...","[875227001, 861712001, 817352006, 861847004, 9..."
1,000400dcdcf45d8d1ad1bd63be6fbd79a49167adf0cf28...,[910acbf1c52714ce2b32f4af08fd8d89580ebbcc76e40...,"[0.9666359711072442, 0.9443823678323113, 0.935...","[877961022, 877961022, 719655001, 717490060, 8...","[723469001, 741356002, 878794001, 866383008, 8..."
2,00058592fc65afabbb00b1bb7d33c6b221d00c6a98c621...,[3de1b6ccd0b7650e44d341b0631658068f929f1bf0fc9...,"[0.8690108411667624, 0.8442721879083999, 0.837...","[557599022, 758034001, 611415001, 829152002, 8...","[799365011, 865073002, 562245103, 870611001, 8..."
3,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,[864ffed75c1496fcfafb728890025727e575dce2f44e8...,"[0.9778483661303669, 0.9745963062102099, 0.965...","[806766001, 903225002, 751471043, 915529001, 9...","[857621001, 895610003, 842062001, 658298007, 8..."
4,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,[7530642ff46b73fe0b1b9d1386504082b8da8b89c707e...,"[0.9211743508769599, 0.8847168764622639, 0.873...","[640021019, 868038003, 868038003, 927865001, 9...","[845790002, 579541086, 803757005, 803757013, 5..."


* #### Items that are bought together with the items the current user bought

In [28]:
def get_items_b2g(data, items, item_count = 15):
    
    best_selling = data['article_id'].value_counts()
    users = data.loc[data.article_id.isin(items), 'customer_id'].unique()
    best_selling_similars = data.loc[(data['customer_id'].isin(users)) &             
                                (~data['article_id'].isin(items)), 'article_id'].value_counts()
    return list(best_selling_similars[:item_count].keys())

In [29]:
df_sim_all['items_b2g'] = df_sim_all.article_id.apply(lambda x: get_items_b2g(df_train, x))

In [31]:
df_sim_all

Unnamed: 0,customer_id,sim_users,sim_ratios,article_id,simusers_items,items_b2g
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,[b79de13a78298e5dfd735eacd18eb8abab46a09d295ed...,"[0.9915425921943544, 0.9884639649663551, 0.984...","[572998013, 909869004, 923134003, 935858001, 8...","[875227001, 861712001, 817352006, 861847004, 9...","[923134005, 933838002, 706016001, 864668002, 9..."
1,000400dcdcf45d8d1ad1bd63be6fbd79a49167adf0cf28...,[910acbf1c52714ce2b32f4af08fd8d89580ebbcc76e40...,"[0.9666359711072442, 0.9443823678323113, 0.935...","[877961022, 877961022, 719655001, 717490060, 8...","[723469001, 741356002, 878794001, 866383008, 8...","[841383002, 706016001, 253448003, 464297007, 7..."
2,00058592fc65afabbb00b1bb7d33c6b221d00c6a98c621...,[3de1b6ccd0b7650e44d341b0631658068f929f1bf0fc9...,"[0.8690108411667624, 0.8442721879083999, 0.837...","[557599022, 758034001, 611415001, 829152002, 8...","[799365011, 865073002, 562245103, 870611001, 8...","[706016001, 803986001, 614854005, 720125001, 7..."
3,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,[864ffed75c1496fcfafb728890025727e575dce2f44e8...,"[0.9778483661303669, 0.9745963062102099, 0.965...","[806766001, 903225002, 751471043, 915529001, 9...","[857621001, 895610003, 842062001, 658298007, 8...","[751471001, 915529003, 883033002, 822946001, 9..."
4,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,[7530642ff46b73fe0b1b9d1386504082b8da8b89c707e...,"[0.9211743508769599, 0.8847168764622639, 0.873...","[640021019, 868038003, 868038003, 927865001, 9...","[845790002, 579541086, 803757005, 803757013, 5...","[640021011, 448509014, 706016002, 720125001, 7..."
...,...,...,...,...,...,...
12663,c7d0a74f470362734ea5d7338b30f56112ce74cd02f246...,[1d4c5c55a5ef6fb3094ac523cbbc0e5e0013a2a89c870...,"[0.9181617328517787, 0.894128596785859, 0.8866...","[697564046, 599718002, 848308006, 848308006, 7...","[763275001, 767869001, 767834001, 800691007, 8...","[599718001, 706016001, 572797001, 599718015, 8..."
12664,c7d292861646204b23c43349bdfaba47ceda019ef77be5...,[52c9d41462300e4fc52f55d242da8b58e436e4ddd0dff...,"[0.922252307176197, 0.9177083496286959, 0.8946...","[791587018, 875329002, 905518001, 905518001, 8...","[636323002, 761501009, 572797001, 887949001, 7...","[720125001, 810838010, 875329001, 706016001, 4..."
12665,c7dbda0e6fd654592d3e25bba513467e9c4329edf3e6db...,[e17d216ef51110b702a99e665c941481fa1d08d9053a2...,"[0.9372008774678847, 0.9229436160614617, 0.917...","[853510005, 580684001, 808684002, 744272002, 7...","[772773002, 685816041, 214844003, 841402001, 8...","[693911011, 576897002, 706016001, 862970001, 8..."
12666,c7e650a70d37326dd6e669372b136620dccc271dfc30de...,[0f63dc96e47fac15dbd8766d8e2e63b72f91149385824...,"[0.9222290786453305, 0.9200677580584992, 0.915...","[907188001, 907188001, 897189001, 876151001, 8...","[856527002, 573085043, 827359003, 832166002, 7...","[751471001, 706016001, 706016025, 850259004, 7..."


* #### Recently most popular items

In [32]:
## most popular items

_, df_last6months = split_data_date(df_train, last_x = 6, months=True)
df_last6months['year_month'] = df_last6months.t_dat.dt.year.astype(str) + '-' + df_last6months.t_dat.dt.month.astype(str)
item_popularity = df_last6months.groupby(['article_id','year_month']).agg({'price':'count', 'count_days_ago':'min'}).reset_index()
item_popularity = item_popularity.groupby('article_id')['count_days_ago','price'].agg(lambda x: list(x))
item_popularity['time_weights'] = item_popularity.apply(lambda x: [1/w for w in x['count_days_ago']] , axis=1)
item_popularity = item_popularity.apply(lambda x: np.dot(x.price, x.time_weights)*len(x.price) , axis=1).sort_values(ascending=False)
item_popularity = pd.DataFrame(item_popularity, columns=['ppty']).reset_index()
item_popularity.ppty = (item_popularity.ppty - item_popularity.ppty.mean()) / (item_popularity.ppty.max() - item_popularity.ppty.min())
item_popularity

Unnamed: 0,article_id,ppty
0,751471001,0.985643
1,706016001,0.965464
2,610776002,0.771896
3,841383002,0.724208
4,768912001,0.685490
...,...,...
27866,660187002,-0.014357
27867,643145014,-0.014357
27868,638048001,-0.014357
27869,709334002,-0.014357


In [33]:
most_pupular_lately = list(item_popularity[:15].article_id)
most_pupular_lately

[751471001,
 706016001,
 610776002,
 841383002,
 768912001,
 610776001,
 448509014,
 827968001,
 706016003,
 806388002,
 783346001,
 372860002,
 760084003,
 841383003,
 800691008]

###  4. Item similarity from tf-idf

In [34]:
def prep_tfidf_data(df):
    data = df.fillna('').agg(''.join, axis=1).to_frame('comb')
    data.comb = data.comb.apply(lambda x: np.char.lower(x))
    symbols = "!\"#$%&()*+-.,/:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data.comb = data.comb.apply(lambda x:np.char.replace(x, i, ' '))

    data.comb = data.comb.apply(lambda x:np.char.replace(x, 'and', ''))
    data.comb = data.comb.apply(lambda x:np.char.replace(x, 'also', ''))
    data.comb = data.comb.apply(lambda x:np.char.replace(x, 'with', ''))
    data.comb = data.comb.apply(lambda x:np.char.replace(x, 'the', ''))

    numbers_dic = {0: 'zero',
                   1: 'one',
                   2: 'two',
                   3: 'three',
                   4: 'four',
                   5: 'five',
                   6: 'six',
                   7: 'seven',
                   8: 'eight',
                   9: 'nine'}

    for key, value in numbers_dic.items():
        data.comb = data.comb.apply(lambda x:np.char.replace(x, str(key), value))
        
    data.comb = data.comb.apply(lambda x: x.split())
    data.comb = data.comb.apply(lambda x: [i for i in x if len(i)<=10 and len(i)>2])
    data['comb'] = data['comb'].agg(lambda x: ' '.join(map(str, x)))
    return data

In [35]:
df_articles = pd.read_csv('data/articles.csv.gz', compression='gzip', error_bad_lines=False)
df_articles = df_articles[['article_id', 'prod_name','product_type_name', 'product_group_name',
                           'graphical_appearance_name', 'colour_group_name','perceived_colour_value_id', 
                           'perceived_colour_value_name','perceived_colour_master_name', 'department_name', 
                            'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc']]
df_articles.set_index('article_id',inplace=True)
df_articles.perceived_colour_value_id = df_articles.perceived_colour_value_id.astype('str')
df_articles.head(2)

Unnamed: 0_level_0,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,detail_desc
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
108775015,Strap top,Vest top,Garment Upper body,Solid,Black,4,Dark,Black,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
108775044,Strap top,Vest top,Garment Upper body,Solid,White,3,Light,White,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.


In [36]:
df_articles = prep_tfidf_data(df_articles)
df_articles.reset_index(inplace=True)
df_articles.head(2)

Unnamed: 0,article_id,comb
0,108775015,strap topvest topgarment upper everyday top na...
1,108775044,strap topvest topgarment upper everyday top na...


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_articles.comb)

res_dict = {key: [] for key in X.nonzero()[0]}
for row, col in zip(*X.nonzero()):
    res_dict[row].append(col)
    
    
def item_similarity(item1,item2):
    """
    Brings the cosine similarity value between item1 and item2, calculated from tf-idf values.
    item1 & item2 : article_id of items (int)
    """
    id1 = df_articles[df_articles.article_id == item1].index[0]
    id2 = df_articles[df_articles.article_id == item2].index[0]
    
    dic_item1 = { (id1,kee): X[(id1,kee)] for kee in res_dict[id1] }
    dic_item2 = { (id2,kee): X[(id2,kee)] for kee in res_dict[id2] }
    
    df1 = pd.DataFrame(dic_item1.keys(), columns=['item_id','kees'])
    df1['values'] = dic_item1.values()
    df1 = df1.pivot_table(index=['item_id'], columns='kees', values='values')
    df1.reset_index(inplace=True)
    
    df2 = pd.DataFrame(dic_item2.keys(), columns=['item_id','kees'])
    df2['values'] = dic_item2.values()
    df2 = df2.pivot_table(index=['item_id'], columns='kees', values='values')
    df2.reset_index(inplace=True)
    
    common_cols = list(set(df1.columns).intersection(set(df2.columns)))
    df_merged = df1.merge(df2,on=common_cols,how='outer').fillna(0)
    df_merged.drop('item_id',axis=1,inplace=True)
    row1 = df_merged.loc[0].values
    row2 = df_merged.loc[1].values

    cosine_sim = np.dot(row1,row2) / (np.linalg.norm(row1) * np.linalg.norm(row2))    

    return cosine_sim

### 5. Measure similarity between recommendations and items in the valid set and try to maximize it

In [38]:
df_sim_all

Unnamed: 0,customer_id,sim_users,sim_ratios,article_id,simusers_items,items_b2g
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,[b79de13a78298e5dfd735eacd18eb8abab46a09d295ed...,"[0.9915425921943544, 0.9884639649663551, 0.984...","[572998013, 909869004, 923134003, 935858001, 8...","[875227001, 861712001, 817352006, 861847004, 9...","[923134005, 933838002, 706016001, 864668002, 9..."
1,000400dcdcf45d8d1ad1bd63be6fbd79a49167adf0cf28...,[910acbf1c52714ce2b32f4af08fd8d89580ebbcc76e40...,"[0.9666359711072442, 0.9443823678323113, 0.935...","[877961022, 877961022, 719655001, 717490060, 8...","[723469001, 741356002, 878794001, 866383008, 8...","[841383002, 706016001, 253448003, 464297007, 7..."
2,00058592fc65afabbb00b1bb7d33c6b221d00c6a98c621...,[3de1b6ccd0b7650e44d341b0631658068f929f1bf0fc9...,"[0.8690108411667624, 0.8442721879083999, 0.837...","[557599022, 758034001, 611415001, 829152002, 8...","[799365011, 865073002, 562245103, 870611001, 8...","[706016001, 803986001, 614854005, 720125001, 7..."
3,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,[864ffed75c1496fcfafb728890025727e575dce2f44e8...,"[0.9778483661303669, 0.9745963062102099, 0.965...","[806766001, 903225002, 751471043, 915529001, 9...","[857621001, 895610003, 842062001, 658298007, 8...","[751471001, 915529003, 883033002, 822946001, 9..."
4,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,[7530642ff46b73fe0b1b9d1386504082b8da8b89c707e...,"[0.9211743508769599, 0.8847168764622639, 0.873...","[640021019, 868038003, 868038003, 927865001, 9...","[845790002, 579541086, 803757005, 803757013, 5...","[640021011, 448509014, 706016002, 720125001, 7..."
...,...,...,...,...,...,...
12663,c7d0a74f470362734ea5d7338b30f56112ce74cd02f246...,[1d4c5c55a5ef6fb3094ac523cbbc0e5e0013a2a89c870...,"[0.9181617328517787, 0.894128596785859, 0.8866...","[697564046, 599718002, 848308006, 848308006, 7...","[763275001, 767869001, 767834001, 800691007, 8...","[599718001, 706016001, 572797001, 599718015, 8..."
12664,c7d292861646204b23c43349bdfaba47ceda019ef77be5...,[52c9d41462300e4fc52f55d242da8b58e436e4ddd0dff...,"[0.922252307176197, 0.9177083496286959, 0.8946...","[791587018, 875329002, 905518001, 905518001, 8...","[636323002, 761501009, 572797001, 887949001, 7...","[720125001, 810838010, 875329001, 706016001, 4..."
12665,c7dbda0e6fd654592d3e25bba513467e9c4329edf3e6db...,[e17d216ef51110b702a99e665c941481fa1d08d9053a2...,"[0.9372008774678847, 0.9229436160614617, 0.917...","[853510005, 580684001, 808684002, 744272002, 7...","[772773002, 685816041, 214844003, 841402001, 8...","[693911011, 576897002, 706016001, 862970001, 8..."
12666,c7e650a70d37326dd6e669372b136620dccc271dfc30de...,[0f63dc96e47fac15dbd8766d8e2e63b72f91149385824...,"[0.9222290786453305, 0.9200677580584992, 0.915...","[907188001, 907188001, 897189001, 876151001, 8...","[856527002, 573085043, 827359003, 832166002, 7...","[751471001, 706016001, 706016025, 850259004, 7..."


* We have 3 lists containing 15 recommended items for each. We should derive the optimum weights to give to these lists in order to have the max similarity with the items in the validation set.

In [42]:
df_valid.groupby('customer_id')[['t_dat']].count().sort_values('t_dat').tail()

Unnamed: 0_level_0,t_dat
customer_id,Unnamed: 1_level_1
0785996d681149d11a659aaee406948b789c022dcef7cee5da937c8692c5f473,35
d435400e8f614b5380bc63aded2255cb193a3ae78d63cc3fb56c4e1b60f31935,36
9703f9fc33a3e2c606fcd8a5995f21fc976128a6d524d7db81a43b68ca212612,37
4588b496b9ac643a27cadf419cb615645abb577449106069dbc0b97e70a6c515,40
28624a2362e70adc94ae52713c08d65db1c53d0c45033a1737ecc51f0c038b34,61


In [49]:
# a customer who has 35 transactions in the valid set

test_id = df_valid.groupby('customer_id')[['t_dat']].count().sort_values('t_dat')[-5:].index[0]
test_id

'0785996d681149d11a659aaee406948b789c022dcef7cee5da937c8692c5f473'

In [50]:
def flatten_listoflists(sim_u_ch):
    s = []
    for sublist in sim_u_ch:
        if type(sublist)==int:
            s.append(sublist)
        elif type(sublist)==list:
            for i in sublist:
                s.append(i)
    return s

In [51]:
row = df_sim_all.loc[df_sim_all.customer_id == test_id]
row = row.loc[row.index[0]]
row

customer_id       0785996d681149d11a659aaee406948b789c022dcef7ce...
sim_users         [1cb96d364209ba01dc17a5ab9c09356fb6a65993676ce...
sim_ratios        [0.9487077895942798, 0.9344624352450429, 0.926...
article_id        [858833001, 748355011, 878794001, 864755009, 8...
simusers_items    [399223001, 639672001, 851167001, 717251001, 7...
items_b2g         [706016001, 464297007, 803986001, 758034001, 5...
Name: 482, dtype: object

In [52]:
user_u = test_id

## items u bought in the valid set
items_useru_bought = df_valid[df_valid.customer_id == user_u].article_id.tolist()

## popular items that are bought buy similar users to current user u
sim_user_choices = flatten_listoflists(row.simusers_items)

## items that are bought together with the items user u have bought
items_b2g = flatten_listoflists(row.items_b2g)

## recently most popular items: most_pupular_lately

In [53]:
df_itemsim = pd.DataFrame(index = sim_user_choices, columns = items_useru_bought)
for i in sim_user_choices:
    for j in items_useru_bought:
        sim = item_similarity(i,j)
        df_itemsim.loc[i,j] = sim
df_itemsim

Unnamed: 0,921380001,921380001.1,924645001,875272011,875272011.1,875272011.2,822389001,906296001,868680007,817047001,...,906639003,906639003.1,736870001,736870001.1,857163001,857163001.1,682550002,868874006,768912001,768912001.1
399223001,0.182558,0.182558,0.143751,0.094793,0.094793,0.094793,0.058299,0.112596,0.012568,0.096755,...,0.011905,0.011905,0.0,0.0,0.142439,0.142439,0.061348,0.065349,0.029833,0.029833
639672001,0.077519,0.077519,0.030089,0.019589,0.019589,0.019589,0.022042,0.052598,0.048407,0.008233,...,0.048422,0.048422,0.096089,0.096089,0.0,0.0,0.010392,0.018598,0.087247,0.087247
851167001,0.074228,0.074228,0.038309,0.039481,0.039481,0.039481,0.162105,0.083684,0.192426,0.010482,...,0.059532,0.059532,0.285189,0.285189,0.0,0.0,0.03513,0.049775,0.116637,0.116637
717251001,0.068805,0.068805,0.013096,0.011649,0.011649,0.011649,0.054418,0.013393,0.016914,0.059043,...,0.04738,0.04738,0.021059,0.021059,0.025505,0.025505,0.026378,0.022119,0.0,0.0
798579009,0.13632,0.13632,0.071136,0.033232,0.033232,0.033232,0.02494,0.088628,0.008449,0.033888,...,0.0,0.0,0.010519,0.010519,0.115013,0.115013,0.01826,0.007294,0.125842,0.125842
811907003,0.018989,0.018989,0.037193,0.011307,0.011307,0.011307,0.07965,0.013,0.081593,0.0,...,0.0,0.0,0.10159,0.10159,0.0,0.0,0.035013,0.010735,0.0,0.0
887830003,0.213772,0.213772,0.055068,0.089211,0.089211,0.089211,0.008022,0.109624,0.009264,0.03716,...,0.0,0.0,0.011535,0.011535,0.126116,0.126116,0.034348,0.01827,0.037236,0.037236
706016001,0.201256,0.201256,0.077251,0.113079,0.113079,0.113079,0.040813,0.133101,0.010766,0.095349,...,0.010198,0.010198,0.0,0.0,0.032644,0.032644,0.075334,0.05598,0.025556,0.025556
832320001,0.039271,0.039271,0.0,0.007701,0.007701,0.007701,0.059767,0.008854,0.051704,0.031137,...,0.008151,0.008151,0.064376,0.064376,0.0,0.0,0.0,0.014623,0.03133,0.03133
811925009,0.020745,0.020745,0.0,0.015179,0.015179,0.015179,0.049626,0.017451,0.058962,0.008025,...,0.007969,0.007969,0.062938,0.062938,0.0,0.0,0.010128,0.021559,0.03063,0.03063


In [58]:
pd.DataFrame(df_itemsim.max(axis=0),columns=['max_similarity'])

Unnamed: 0,max_similarity
921380001,0.213772
921380001,0.213772
924645001,0.188962
875272011,0.113079
875272011,0.113079
875272011,0.113079
822389001,0.162105
906296001,0.133101
868680007,0.211093
817047001,0.096755
