In [None]:
!pip install -q kaggle 
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download --force h-and-m-personalized-fashion-recommendations -f transactions_train.csv

Downloading transactions_train.csv.zip to /content
 99% 577M/584M [00:16<00:00, 38.5MB/s]
100% 584M/584M [00:16<00:00, 36.4MB/s]


In [None]:
! unzip transactions_train.csv.zip

Archive:  transactions_train.csv.zip
  inflating: transactions_train.csv  


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever name this installation provides.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')
sns.set_style("whitegrid")
# No sns.despine() here: with no figure open it only created an empty figure.
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlesize=14, titlepad=10)

import matplotlib as mpl

# Hide all four axes spines and render text and axis labels in bold.
mpl.rcParams.update({
    'axes.spines.left': False,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.spines.bottom': False,
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
})

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

import datetime
from tqdm import tqdm

<Figure size 432x288 with 0 Axes>

In [None]:
# Load the purchase log; article ids are read as strings rather than parsed as numbers.
# Rows that are exact duplicates of one another are dropped.
df_transactions = (
    pd.read_csv(
        "transactions_train.csv",
        encoding="ISO-8859-1",
        dtype={"article_id": str},
        header=0,
    )
    .drop_duplicates()
)

In [None]:
# Preview the first rows of the loaded transactions.
df_transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [None]:
# (rows, columns) after dropping duplicate rows.
df_transactions.shape

(28813419, 5)

## RFM Analysis

In [None]:
# Parse the purchase date into a datetime column named InvoiceDate, keep only the
# columns used by the analysis (the raw t_dat string is dropped) and de-duplicate.
df_transactions = (
    df_transactions
    .assign(InvoiceDate=pd.to_datetime(df_transactions['t_dat'], format='%Y-%m-%d'))
    .loc[:, ['InvoiceDate', 'customer_id', 'article_id', 'price', 'sales_channel_id']]
    .drop_duplicates()
)

In [None]:
# (rows, columns) after the date conversion and second de-duplication.
df_transactions.shape

(28813419, 5)

In [None]:
# Share of missing values per column, in percent (two decimals).
missing_per_column = df_transactions.isnull().sum()
df_null = (100 * missing_per_column / len(df_transactions)).round(2)
df_null

InvoiceDate         0.0
customer_id         0.0
article_id          0.0
price               0.0
sales_channel_id    0.0
dtype: float64

In [None]:
import datetime as dt

In [None]:
# Reference date for recency: the day after the last recorded purchase.
# Series.max() is vectorised; the builtin max() iterates over every row in Python.
analysis_date = df_transactions['InvoiceDate'].max() + dt.timedelta(days=1)
print(analysis_date.date())

2020-09-23


In [None]:
# Copy of the purchase date; the RFM aggregation counts this column to get the purchase frequency.
df_transactions['date'] = df_transactions['InvoiceDate']

In [None]:
# Per-customer building blocks of the RFM table. Named aggregation ties each output
# column to its source explicitly instead of relying on dict order plus a rename.
customer_stats = df_transactions.groupby('customer_id').agg(
    last_purchase=('InvoiceDate', 'max'),
    Frequency=('date', 'count'),
    Monetary=('price', 'sum'),
)

# Recency = whole days between the analysis date and the customer's last purchase,
# computed in one vectorised step rather than with a Python lambda per customer.
rfm = pd.DataFrame({
    'Recency': (analysis_date - customer_stats['last_purchase']).dt.days,
    'Frequency': customer_stats['Frequency'],
    'Monetary': customer_stats['Monetary'],
})

In [None]:
# Keep customers with a positive total spend. .copy() makes rfm an independent frame,
# so the score columns added afterwards never trigger SettingWithCopyWarning.
rfm = rfm[rfm['Monetary'] > 0].copy()

In [None]:
# Preview the Recency / Frequency / Monetary table.
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,18,19,0.543932
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,77,78,2.412237
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,8,15,0.606525
00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,472,2,0.060983
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,42,13,0.469695


In [None]:
# Quintile scores from 1 to 5 for each RFM dimension.
ascending_labels = [1, 2, 3, 4, 5]

# Recency: the most recent buyers get 5 and the least recent get 1, so the labels are reversed.
rfm['recency_score'] = pd.qcut(rfm['Recency'], q=5, labels=ascending_labels[::-1])

# Frequency: values are ranked with method="first" before binning; fewest purchases -> 1, most -> 5.
frequency_rank = rfm['Frequency'].rank(method="first")
rfm['frequency_score'] = pd.qcut(frequency_rank, q=5, labels=ascending_labels)

# Monetary: lowest total spend -> 1, highest -> 5.
rfm['monetary_value'] = pd.qcut(rfm['Monetary'], q=5, labels=ascending_labels)
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary,recency_score,frequency_score,monetary_value
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,18,19,0.543932,5,4,4
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,77,78,2.412237,4,5,5
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,8,15,0.606525,5,4,4
00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,472,2,0.060983,1,1,1
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,42,13,0.469695,4,3,4


In [None]:
# RFM_SCORE is the two-character code "<recency_score><frequency_score>".
recency_digit = rfm['recency_score'].astype(str)
frequency_digit = rfm['frequency_score'].astype(str)
rfm['RFM_SCORE'] = recency_digit + frequency_digit

In [None]:
# Segment name for each "<recency_score><frequency_score>" code, keyed by a regex over the code.
seg_map = dict([
    (r'[1-2][1-2]', 'hibernating'),
    (r'[1-2][3-4]', 'at_Risk'),
    (r'[1-2]5', 'cant_loose'),
    (r'3[1-2]', 'about_to_sleep'),
    (r'33', 'need_attention'),
    (r'[3-4][4-5]', 'loyal_customers'),
    (r'41', 'promising'),
    (r'51', 'new_customers'),
    (r'[4-5][2-3]', 'potential_loyalists'),
    (r'5[4-5]', 'champions'),
])

# Translate every RFM code into its segment name.
rfm['segment'] = rfm['RFM_SCORE'].replace(to_replace=seg_map, regex=True)
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary,recency_score,frequency_score,monetary_value,RFM_SCORE,segment
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,18,19,0.543932,5,4,4,54,champions
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,77,78,2.412237,4,5,5,45,loyal_customers
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,8,15,0.606525,5,4,4,54,champions
00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,472,2,0.060983,1,1,1,11,hibernating
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,42,13,0.469695,4,3,4,43,potential_loyalists


In [None]:
# Mean, customer count and maximum of each RFM metric per segment, rounded to whole numbers.
(
    rfm
    .groupby('segment')[['Recency', 'Frequency', 'Monetary']]
    .agg(['mean', 'count', 'max'])
    .round()
)

Unnamed: 0_level_0,Recency,Recency,Recency,Frequency,Frequency,Frequency,Monetary,Monetary,Monetary
Unnamed: 0_level_1,mean,count,max,mean,count,max,mean,count,max
segment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
about_to_sleep,161.0,106645,231,3.0,106645,6,0.0,106645,1.0
at_Risk,432.0,182709,734,12.0,182709,31,0.0,182709,4.0
cant_loose,342.0,16727,734,50.0,16727,691,1.0,16727,20.0
champions,16.0,199755,35,62.0,199755,1641,2.0,199755,49.0
hibernating,500.0,345298,734,3.0,345298,6,0.0,345298,2.0
loyal_customers,96.0,266159,231,40.0,266159,642,1.0,266159,22.0
need_attention,155.0,63654,231,9.0,63654,13,0.0,63654,2.0
new_customers,17.0,15946,35,1.0,15946,2,0.0,15946,1.0
potential_loyalists,48.0,143348,96,7.0,143348,13,0.0,143348,2.0
promising,68.0,22040,96,2.0,22040,2,0.0,22040,0.0


In [None]:
import plotly.express as px

In [None]:
# Number of customers in each RFM segment.
x = rfm.segment.value_counts()
# Treemap with one tile per segment, sized by its customer count.
fig = px.treemap(x, path=[x.index], values=x)
fig.update_layout(title_text="Distribution of the RFM Segments", title_x=0.5, title_font=dict(size=20))
# Show the segment label, its count and its share of the total on every tile.
fig.update_traces(textinfo='label+value+percent root')
fig.show()

## Recommend Items Frequently Purchased Together.
Item-Item Based Collaborative Filtering
- Objective - To produce item recommendations for a Hibernating customer - User 5 (from RFM) - for their upcoming purchase.

## Item-based collaborative filtering - using probabilistic matrix factorization.
We need to restrict the data with respect to a minimum transaction date. This way, we reduce the dimensionality of the problem and discard transactions that matter little under time-decaying popularity.
Also, we get rid of articles that have not been bought often enough.

In [None]:
start_date = datetime.datetime(2020, 9, 1)

# Filter transactions by date, then add the t_dat column on the result.
# Filtering first means the conversion touches only the retained rows, and .assign()
# returns an independent frame rather than a slice of the full transaction table.
is_recent = df_transactions['InvoiceDate'] > start_date
df_transactions = df_transactions.loc[is_recent].assign(
    t_dat=lambda frame: pd.to_datetime(frame['InvoiceDate'])
)

In [None]:
# Count how many transactions each article appears in.
article_bought_count = df_transactions.groupby('article_id').size().reset_index(name='count')

# Keep only articles bought more than 10 times.
most_bought_articles = article_bought_count.loc[article_bought_count['count'] > 10, 'article_id'].values

# .assign() builds a new frame, so adding the positive label no longer writes into a
# slice of the previous frame (which raised SettingWithCopyWarning).
df_transactions = (
    df_transactions[df_transactions['article_id'].isin(most_bought_articles)]
    .assign(bought=1)
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Due to the large number of items, we cannot train on the whole customer-item matrix. Therefore, we need to generate some negative samples: transactions that have never occurred.

In [None]:
# Generate negative samples.
np.random.seed(0)

# Draw as many random (article, customer) pairs as there are positive transactions.
n_candidates = df_transactions.shape[0]
candidate_pairs = pd.DataFrame({
    'article_id': np.random.choice(df_transactions['article_id'].unique(), n_candidates),
    'customer_id': np.random.choice(df_transactions['customer_id'].unique(), n_candidates),
    'bought': np.zeros(n_candidates),
})

# A random pair can coincide with a real purchase; drop those so that no
# (customer, article) pair ends up labelled both 1 and 0.
observed_pairs = df_transactions[['customer_id', 'article_id']].drop_duplicates()
negative_samples = (
    candidate_pairs
    .merge(observed_pairs, on=['customer_id', 'article_id'], how='left', indicator=True)
    .loc[lambda frame: frame['_merge'] == 'left_only']
    .drop(columns='_merge')
    .reset_index(drop=True)
)

The model combines two sources of recommendations: time-decaying popularity, and the items most similar to the item each user has bought the most times. Similarity among items is computed through cosine similarity of their latent vectors.

In [None]:
class ItemBased_RecSys:
  """Item-based recommender built on an SGD matrix factorization.

  Customer and article latent vectors are learned from positive (bought = 1) and
  negative (bought = 0) samples. Recommendations interleave a time-decayed
  popularity list with the articles whose latent vectors have the highest cosine
  similarity to the article a customer bought most often.
  """

  # Number of article ids written into every recommendation string.
  N_RECOMMENDATIONS = 12

  def __init__(self, pos_trans, neg_trans, num_components=10):
    '''Store the training samples and build id -> row-index lookups.

    Args:
      pos_trans: DataFrame of observed purchases; the class reads its
        customer_id, article_id, bought and t_dat columns.
      neg_trans: DataFrame of negative samples with customer_id, article_id
        and bought columns.
      num_components: Size of each latent vector.
    '''
    self.pos_trans = pos_trans
    self.df_transactions = pd.concat([pos_trans, neg_trans])
    self.customers = self.df_transactions.customer_id.values
    self.articles = self.df_transactions.article_id.values
    self.bought = self.df_transactions.bought.values
    self.num_components = num_components

    # Sorted unique ids: position i is the latent-matrix row that belongs to that id.
    self.customer_ids = np.unique(self.customers)
    self.article_ids = np.unique(self.articles)
    self.customer_id2index = {c: i for i, c in enumerate(self.customer_ids)}
    self.article_id2index = {a: i for i, a in enumerate(self.article_ids)}

  def __sdg__(self):
    '''Run one stochastic-gradient pass over self.training_indices.'''
    for idx in tqdm(self.training_indices):
      # Latent-matrix rows of the current sample.
      customer_index = self.customer_id2index[self.customers[idx]]
      article_index = self.article_id2index[self.articles[idx]]

      # Prediction error for this (customer, article) pair.
      error = self.bought[idx] - self.predict_single(customer_index, article_index)

      # Snapshot the customer vector so that both updates use the values from
      # before this step; otherwise the article update would see the
      # already-updated customer vector.
      customer_vector = self.customers_latent_matrix[customer_index].copy()
      article_vector = self.articles_latent_matrix[article_index]

      self.customers_latent_matrix[customer_index] += self.learning_rate * \
          (error * article_vector - self.lmbda * customer_vector)
      self.articles_latent_matrix[article_index] += self.learning_rate * \
          (error * customer_vector - self.lmbda * article_vector)

  def fit(self, n_epochs=30, learning_rate=1e-5, lmbda=0.1):
    '''Compute the matrix factorization R = P x Q.

    Args:
      n_epochs: Number of full passes over the training samples.
      learning_rate: SGD step size.
      lmbda: L2 regularisation weight applied to both latent matrices.
    '''
    self.learning_rate = learning_rate
    self.lmbda = lmbda
    n_samples = self.df_transactions.shape[0]

    # Initialize latent matrices: one row per unique customer / article.
    self.customers_latent_matrix = np.random.normal(
        scale=1., size=(len(self.customer_ids), self.num_components))
    self.articles_latent_matrix = np.random.normal(
        scale=1., size=(len(self.article_ids), self.num_components))

    for epoch in range(n_epochs):
      print(f'Epoch: {epoch}')
      self.training_indices = np.arange(n_samples)

      # Shuffle training samples and follow stochastic gradient descent.
      np.random.shuffle(self.training_indices)
      self.__sdg__()

  def predict_single(self, customer_index, article_index):
    '''Return the predicted purchase score, clipped to [0, 1].

    Args:
      customer_index: Row of the customer in customers_latent_matrix.
      article_index: Row of the article in articles_latent_matrix.
    '''
    prediction = np.dot(self.customers_latent_matrix[customer_index],
                        self.articles_latent_matrix[article_index])
    return np.clip(prediction, 0, 1)

  def default_recommendation(self, reference_date=None, top_k=12):
    '''Return the top_k article ids ranked by time-decayed popularity.

    Every positive transaction contributes 1 / (days before reference_date), so
    recently bought items weigh more. The caller's pos_trans frame is not
    modified.

    Args:
      reference_date: Date the purchase age is measured from. Defaults to the
        day after the latest purchase in pos_trans.
      top_k: Number of article ids to return.

    Returns:
      Array of article ids, most popular first.
    '''
    purchase_dates = pd.to_datetime(self.pos_trans['t_dat'])
    if reference_date is None:
      reference_date = purchase_dates.max() + pd.Timedelta(days=1)

    # Ages below one day are clipped so the weight is always finite and positive.
    age_in_days = (pd.Timestamp(reference_date) - purchase_dates).dt.days.clip(lower=1)
    pop_factor = 1.0 / age_in_days

    popularity = pop_factor.groupby(self.pos_trans['article_id'].values).sum()
    return popularity.sort_values(ascending=False).index.values[:top_k]

  @staticmethod
  def _merge_recommendations(default_recs, similar_recs, n_total):
    '''Interleave the first halves of both lists, drop repeats, cap at n_total.'''
    half = n_total // 2
    merged = []
    for rank in range(half):
      if rank < len(default_recs):
        merged.append(default_recs[rank])
      if rank < len(similar_recs):
        merged.append(similar_recs[rank])

    # The remaining candidates only fill slots freed by removed duplicates.
    merged.extend(default_recs[half:])
    merged.extend(similar_recs[half:])
    return list(dict.fromkeys(merged))[:n_total]

  def predict(self, customers):
    '''Make recommendations.

    Args:
      customers: Iterable of customer ids.

    Returns:
      DataFrame with columns customer_id and prediction, where prediction is a
      space-separated string of article ids. Customers without a positive
      transaction receive the popularity-based default list.
    '''
    n_total = self.N_RECOMMENDATIONS

    # Work on a NaN-free copy of the article factors.
    article_factors = np.where(np.isnan(self.articles_latent_matrix), 0.0,
                               self.articles_latent_matrix)

    # Compute similarity matrix (cosine); -inf on the diagonal keeps an article
    # from being recommended as similar to itself.
    similarity = np.asarray(
        cosine_similarity(article_factors, article_factors, dense_output=False),
        dtype=float)
    np.fill_diagonal(similarity, -np.inf)

    # Row i holds the indices of the articles most similar to article i, best first.
    n_similar = max(min(n_total, similarity.shape[0] - 1), 0)
    top_similar = np.argsort(similarity, axis=1)[:, ::-1][:, :n_similar]

    # Get default recommendation (time decay popularity).
    default_recs = [str(a) for a in self.default_recommendation(top_k=n_total)]
    default_text = ' '.join(default_recs)

    # Article bought most often by each customer, keyed by customer id.
    purchase_counts = (self.pos_trans
                       .groupby(['customer_id', 'article_id'])
                       .size()
                       .reset_index(name='n_bought'))
    top_rows = purchase_counts.loc[
        purchase_counts.groupby('customer_id')['n_bought'].idxmax()]
    favourite_article = dict(zip(top_rows['customer_id'].values,
                                 top_rows['article_id'].values))

    # Many customers share a favourite article, so build each string only once.
    text_by_article = {}
    recommendations = []
    for customer in tqdm(customers):
      article_id = favourite_article.get(customer)
      article_index = None if article_id is None else self.article_id2index.get(article_id)

      if article_index is None:
        # Unknown customer or article: return the default recommendation.
        recommendations.append(default_text)
        continue

      if article_id not in text_by_article:
        similar_recs = [str(a) for a in self.article_ids[top_similar[article_index]]]
        text_by_article[article_id] = ' '.join(
            self._merge_recommendations(default_recs, similar_recs, n_total))
      recommendations.append(text_by_article[article_id])

    return pd.DataFrame({
        'customer_id': customers,
        'prediction': recommendations
    })

Defining the hyperparameters and fitting the model.

In [None]:
# Build the recommender from the positive and negative samples, then train it for 30 epochs.
rec = ItemBased_RecSys(
    pos_trans=df_transactions,
    neg_trans=negative_samples,
    num_components=1000,
)
rec.fit(n_epochs=30)

Epoch: 0


100%|██████████| 1274524/1274524 [00:48<00:00, 26120.16it/s]


Epoch: 1


100%|██████████| 1274524/1274524 [00:48<00:00, 26339.84it/s]


Epoch: 2


100%|██████████| 1274524/1274524 [00:48<00:00, 26109.59it/s]


Epoch: 3


100%|██████████| 1274524/1274524 [00:48<00:00, 26529.38it/s]


Epoch: 4


100%|██████████| 1274524/1274524 [00:47<00:00, 26985.54it/s]


Epoch: 5


100%|██████████| 1274524/1274524 [00:46<00:00, 27291.68it/s]


Epoch: 6


100%|██████████| 1274524/1274524 [00:46<00:00, 27350.79it/s]


Epoch: 7


100%|██████████| 1274524/1274524 [00:46<00:00, 27429.63it/s]


Epoch: 8


100%|██████████| 1274524/1274524 [00:46<00:00, 27272.60it/s]


Epoch: 9


100%|██████████| 1274524/1274524 [00:46<00:00, 27406.95it/s]


Epoch: 10


100%|██████████| 1274524/1274524 [00:46<00:00, 27526.96it/s]


Epoch: 11


100%|██████████| 1274524/1274524 [00:46<00:00, 27524.16it/s]


Epoch: 12


100%|██████████| 1274524/1274524 [00:46<00:00, 27616.21it/s]


Epoch: 13


100%|██████████| 1274524/1274524 [00:46<00:00, 27671.35it/s]


Epoch: 14


100%|██████████| 1274524/1274524 [00:45<00:00, 27750.48it/s]


Epoch: 15


100%|██████████| 1274524/1274524 [00:46<00:00, 27342.92it/s]


Epoch: 16


100%|██████████| 1274524/1274524 [00:46<00:00, 27575.36it/s]


Epoch: 17


100%|██████████| 1274524/1274524 [00:46<00:00, 27579.69it/s]


Epoch: 18


100%|██████████| 1274524/1274524 [00:46<00:00, 27597.35it/s]


Epoch: 19


100%|██████████| 1274524/1274524 [00:46<00:00, 27347.23it/s]


Epoch: 20


100%|██████████| 1274524/1274524 [00:46<00:00, 27672.66it/s]


Epoch: 21


100%|██████████| 1274524/1274524 [00:46<00:00, 27140.28it/s]


Epoch: 22


100%|██████████| 1274524/1274524 [00:46<00:00, 27697.99it/s]


Epoch: 23


100%|██████████| 1274524/1274524 [00:45<00:00, 27794.46it/s]


Epoch: 24


100%|██████████| 1274524/1274524 [00:46<00:00, 27671.07it/s]


Epoch: 25


100%|██████████| 1274524/1274524 [00:46<00:00, 27499.24it/s]


Epoch: 26


100%|██████████| 1274524/1274524 [00:46<00:00, 27633.91it/s]


Epoch: 27


100%|██████████| 1274524/1274524 [00:45<00:00, 27852.54it/s]


Epoch: 28


100%|██████████| 1274524/1274524 [00:46<00:00, 27482.39it/s]


Epoch: 29


100%|██████████| 1274524/1274524 [00:45<00:00, 27826.15it/s]


In [None]:
# Fetch the sample submission file of the competition.
! kaggle competitions download --force h-and-m-personalized-fashion-recommendations -f sample_submission.csv

Downloading sample_submission.csv.zip to /content
 81% 41.0M/50.3M [00:03<00:00, 12.6MB/s]
100% 50.3M/50.3M [00:03<00:00, 16.4MB/s]


In [None]:
! unzip sample_submission.csv.zip

Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   


In [None]:
# Only the customer ids are needed here, so load just that column instead of the whole file.
customers = pd.read_csv(
    'sample_submission.csv',
    encoding='ISO-8859-1',
    usecols=['customer_id'],
    header=0,
).customer_id.unique()

In [None]:
# Build the recommendation string for every customer id loaded from the sample submission.
recommendations = rec.predict(customers)

100%|██████████| 1371980/1371980 [00:03<00:00, 435601.98it/s]


In [None]:
# Preview the first recommendation strings.
recommendations.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243001 0584483003 0918522001 0874754001 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0918522001 0751471001 0924243002 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0924243001 0919786001 0918522001 0916000003 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0918522001 0751471001 0924243002 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0918522001 0751471001 0924243002 09...


In [None]:
# Write the submission file without the DataFrame index column.
recommendations.to_csv("rfm_collaborative_filtering.csv", index=False)

In [None]:
# Upload the CSV to the competition through the Kaggle CLI.
! kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f rfm_collaborative_filtering.csv -m "rfm collaborative filtering 1"

100% 258M/258M [00:17<00:00, 15.6MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations