In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt

import seaborn as sns
sns.set()
from pandas.io.json import json_normalize
from pprint import pprint
from pathlib import Path
from tqdm import tqdm
tqdm.pandas()

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load the raw purchase log. `article_id` is read as a string so that ids
# keep their leading zeros (they are later joined into the submission text).
transactions = pd.read_csv('./data/rawdata/transactions_train.csv', dtype={'article_id': str})
# Parse the purchase date so it can be compared against date cut-offs below.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
# `t_dat` is intentionally left as a regular column (not the index).


In [3]:
# Customer and article master tables; they define the full user / item
# universe used for the matrix dimensions further down.
customers = pd.read_csv('./data/rawdata/customers.csv')
# `article_id` is kept as a string, matching the dtype used for transactions.
articles = pd.read_csv('./data/rawdata/articles.csv', dtype={'article_id': str})

# Implicit

In [5]:
# Keep only purchases from 2020-09-01 onward. `.copy()` makes this an
# independent frame, so adding columns to it later never writes into (or
# warns about) a slice of `transactions`.
transsept = transactions[transactions['t_dat'] >= '2020-09-01'].copy()

In [6]:
# Map the raw customer / article ids to contiguous integer indices, which
# serve as row / column positions of the sparse interaction matrix.

ALL_USERS = customers['customer_id'].unique().tolist()
ALL_ITEMS = articles['article_id'].unique().tolist()

# index -> raw id (used when turning recommendations back into raw ids)
user_ids = dict(enumerate(ALL_USERS))
item_ids = dict(enumerate(ALL_ITEMS))

# raw id -> index (inverse of the dictionaries above)
user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

# Attach the integer indices. `.assign` returns a new frame, so the cell can be
# re-run safely and never assigns into a slice of `transactions`.
transsept = transsept.assign(
    user_id=transsept['customer_id'].map(user_map),
    item_id=transsept['article_id'].map(item_map),
)

# `.map` yields NaN for ids missing from the master tables, which would turn
# the index columns into floats and break the sparse-matrix construction.
# Fail here with a clear message instead.
_unmapped = transsept[['user_id', 'item_id']].isna().sum()
if _unmapped.any():
    raise ValueError(f"Transactions reference unknown ids: {_unmapped.to_dict()}")

In [7]:
# Preview the filtered frame, including the new `user_id` / `item_id` columns.
transsept.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,user_id,item_id
30990055,2020-09-01,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,777148006,0.013542,1,38,74547
30990056,2020-09-01,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,835801001,0.018627,1,38,89049
30990057,2020-09-01,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,923134005,0.012695,1,38,104450
30990058,2020-09-01,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,865929003,0.016932,1,38,95251
30990059,2020-09-01,0005ed68483efa39644c45185550a82dd09acb07622acb...,863646004,0.033881,1,143,94699


In [8]:
import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k

# Build the user x item interaction matrix: one entry of 1.0 per purchase row,
# positioned at (user index, item index).
row = transsept['user_id'].to_numpy()
col = transsept['item_id'].to_numpy()
data = np.ones(len(transsept))
coo_train = coo_matrix(
    (data, (row, col)),
    shape=(len(ALL_USERS), len(ALL_ITEMS)),  # full universe, not just seen ids
)
coo_train

<1371980x105542 sparse matrix of type '<class 'numpy.float64'>'
	with 798269 stored elements in COOrdinate format>

In [None]:
# Printing the matrix itself emits one line per stored element (hundreds of
# thousands of lines here), so show only the first few (row, col, value)
# triplets instead.
pd.DataFrame({
    'user_id': coo_train.row[:10],
    'item_id': coo_train.col[:10],
    'value': coo_train.data[:10],
})

In [15]:
%%time
#使用交替最小二乘法（Alternating Least Squares）算法来训练一个推荐模型

model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2)
model.fit(coo_train)

  0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 397 ms, sys: 284 ms, total: 681 ms
Wall time: 696 ms


In [16]:
# Convert a transactions DataFrame into a COO sparse user x item matrix.
def to_user_item_coo(df):
    """Build a user x item interaction matrix from a transactions frame.

    Args:
        df: DataFrame with integer-index columns ``user_id`` and ``item_id``;
            every row contributes one entry with value 1.0.

    Returns:
        A ``scipy.sparse.coo_matrix`` of shape
        ``(len(ALL_USERS), len(ALL_ITEMS))``.
    """
    user_idx = df['user_id'].values
    item_idx = df['item_id'].values
    ones = np.ones(df.shape[0])
    # The shape comes from the global id universe so that matrices built from
    # different subsets of the data are dimension-compatible.
    n_users, n_items = len(ALL_USERS), len(ALL_ITEMS)
    return coo_matrix((ones, (user_idx, item_idx)), shape=(n_users, n_items))

# Split the transactions into a training set and a validation set by date.
def split_data(df, validation_days=7):
    """Hold out the last ``validation_days`` calendar days as validation data.

    The cut-off is derived from the latest date in ``df['t_dat']`` instead of
    being hard-coded, so ``validation_days`` is actually honoured. For data
    whose last day is 2020-09-22, the default of 7 gives a cut-off of
    2020-09-16.

    Args:
        df: DataFrame with a datetime column ``t_dat``.
        validation_days: Number of trailing calendar days (including the last
            day present in ``df``) that go to the validation set. Must be >= 1.

    Returns:
        ``(df_train, df_val)``: rows strictly before the cut-off, and rows on
        or after it.

    Raises:
        ValueError: if ``validation_days`` is smaller than 1.
    """
    if validation_days < 1:
        raise ValueError("validation_days must be >= 1")

    # Normalise to midnight so a time-of-day component cannot shift the cut.
    last_day = df['t_dat'].max().normalize()
    validation_cut = last_day - pd.Timedelta(days=validation_days - 1)

    df_train = df[df['t_dat'] < validation_cut]
    df_val = df[df['t_dat'] >= validation_cut]
    return df_train, df_val

def get_val_matrices(df, validation_days=7):
    """Split ``df`` by date and build the sparse matrices used for validation.

    Args:
        df: Transactions frame accepted by ``split_data`` and
            ``to_user_item_coo``.
        validation_days: Forwarded to ``split_data``.

    Returns:
        Dict with keys ``'coo_train'`` (COO training matrix), ``'csr_train'``
        (the same matrix in CSR format) and ``'csr_val'`` (CSR validation
        matrix).
    """
    train_part, val_part = split_data(df, validation_days=validation_days)

    train_coo = to_user_item_coo(train_part)
    val_coo = to_user_item_coo(val_part)

    # Convert to Compressed Sparse Row format.
    return {
        'coo_train': train_coo,
        'csr_train': train_coo.tocsr(),
        'csr_val': val_coo.tocsr(),
    }


def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """Fit ALS on the training matrix and report MAP@12 on the validation matrix.

    Args:
        matrices: Dict as returned by ``get_val_matrices``; the ``'csr_train'``
            and ``'csr_val'`` entries are used.
        factors: Number of latent factors.
        iterations: Number of ALS iterations.
        regularization: Regularization strength.
        show_progress: Whether to show progress bars during fit / evaluation.

    Returns:
        The MAP@12 score (also printed together with the hyper-parameters).
    """
    csr_train, csr_val = matrices['csr_train'], matrices['csr_val']

    model = implicit.als.AlternatingLeastSquares(factors=factors,
                                                 iterations=iterations,
                                                 regularization=regularization,
                                                 random_state=42)
    # Fit on the CSR matrix that is already available: it holds the same
    # interactions as the COO one, and it avoids a fresh COO -> CSR conversion
    # on every call of a hyper-parameter search.
    model.fit(csr_train, show_progress=show_progress)

    map12 = mean_average_precision_at_k(model, csr_train, csr_val, K=12, show_progress=show_progress, num_threads=4)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@12: {map12:6.5f}")
    return map12

In [17]:
# Build the train / validation sparse matrices from the September transactions.
matrices = get_val_matrices(transsept)

In [19]:
# Inspect the shapes and number of stored elements of the three matrices.
matrices

{'coo_train': <1371980x105542 sparse matrix of type '<class 'numpy.float64'>'
 	with 557958 stored elements in COOrdinate format>,
 'csr_train': <1371980x105542 sparse matrix of type '<class 'numpy.float64'>'
 	with 496094 stored elements in Compressed Sparse Row format>,
 'csr_val': <1371980x105542 sparse matrix of type '<class 'numpy.float64'>'
 	with 213728 stored elements in Compressed Sparse Row format>}

In [20]:
%%time
#寻找最优参数
best_map12 = 0
for factors in [100, 150 ,200]:
    for iterations in [3, 12, 14, 15, 20]:
        for regularization in [0.01]:
            map12 = validate(matrices, factors, iterations, regularization, show_progress=False)
            if map12 > best_map12:
                best_map12 = map12
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best MAP@12 found. Updating: {best_params}")

Factors: 100 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00334
Best MAP@12 found. Updating: {'factors': 100, 'iterations': 3, 'regularization': 0.01}
Factors: 100 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00398
Best MAP@12 found. Updating: {'factors': 100, 'iterations': 12, 'regularization': 0.01}
Factors: 100 - Iterations: 14 - Regularization: 0.010 ==> MAP@12: 0.00399
Best MAP@12 found. Updating: {'factors': 100, 'iterations': 14, 'regularization': 0.01}
Factors: 100 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00401
Best MAP@12 found. Updating: {'factors': 100, 'iterations': 15, 'regularization': 0.01}
Factors: 100 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00400
Factors: 150 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00353
Factors: 150 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00391
Factors: 150 - Iterations: 14 - Regularization: 0.010 ==> MAP@12: 0.00390
Factors: 150 - Iterations: 15 - Regularizatio

In [21]:
# Rebuild the interaction matrix from ALL September transactions (no
# validation hold-out) for the final model fit.
coo_train = to_user_item_coo(transsept)
# CSR copy, used for row slicing when generating recommendations.
csr_train = coo_train.tocsr()

In [22]:
def train(coo_train, factors=100, iterations=12, regularization=0.01, show_progress=True):
    """Fit an ALS model on the given interaction matrix and return it.

    Args:
        coo_train: Sparse user x item interaction matrix.
        factors: Number of latent factors.
        iterations: Number of ALS iterations.
        regularization: Regularization strength.
        show_progress: Whether to show a progress bar while fitting.

    Returns:
        The fitted ``implicit.als.AlternatingLeastSquares`` model.
    """
    als_options = dict(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=42,  # fixed seed for repeatable fits
    )
    als = implicit.als.AlternatingLeastSquares(**als_options)
    als.fit(coo_train, show_progress=show_progress)
    return als

In [23]:
# Show the winning hyper-parameters explicitly: a bare expression is only
# rendered when it is the last statement of a cell. Then refit on all of the
# September data with those parameters.
display(best_params)
model = train(coo_train, **best_params)

  0%|          | 0/15 [00:00<?, ?it/s]

In [24]:
def submit(model, csr_train, submission_name="submissions.csv"):
    """Generate 12 recommendations per user and write them to a CSV file.

    Args:
        model: Fitted model exposing ``recommend(userids, user_items, N=...,
            filter_already_liked_items=...)`` that returns ``(ids, scores)``.
        csr_train: CSR user x item matrix; rows are sliced per batch.
        submission_name: Path of the CSV file to write.

    Returns:
        DataFrame with columns ``customer_id`` and ``prediction`` (the
        space-separated article ids), one row per entry of ``ALL_USERS``.

    NOTE(review): the rendered output shows several users receiving the exact
    same run of article ids; presumably these are users without September
    purchases. Confirm whether a fallback (e.g. most popular items) is wanted.
    """
    batch_size = 2000
    # Indices of every known user, processed in fixed-size batches.
    all_user_idx = np.arange(len(ALL_USERS))

    rows = []
    for start in range(0, len(all_user_idx), batch_size):
        chunk = all_user_idx[start : start + batch_size]
        # Already-purchased items are deliberately NOT filtered out.
        rec_ids, _scores = model.recommend(chunk, csr_train[chunk], N=12, filter_already_liked_items=False)
        # Translate integer indices back to the raw customer / article ids.
        rows.extend(
            (user_ids[uidx], ' '.join(item_ids[iidx] for iidx in recs))
            for uidx, recs in zip(chunk, rec_ids)
        )

    df_preds = pd.DataFrame(rows, columns=['customer_id', 'prediction'])
    df_preds.to_csv(submission_name, index=False)

    display(df_preds.head())
    print(df_preds.shape)

    return df_preds

In [25]:
%%time
# Generate recommendations for every user and write the submission CSV
# (default file name "submissions.csv").
df_preds = submit(model, csr_train)

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601006 0762846031 0568601044 0568597006 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0187949032 0187949031 0187949030 0187949029 01...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0805000001 0804992014 0804992017 0740519002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0187949032 0187949031 0187949030 0187949029 01...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0187949032 0187949031 0187949030 0187949029 01...


(1371980, 2)
CPU times: user 43.2 s, sys: 1.76 s, total: 45 s
Wall time: 46.5 s
