## Imports initiaux

In [1]:
import os
# os.environ["TF_USE_LEGACY_KERAS"] = "1"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import pickle
import time
import mlflow
warnings.filterwarnings('ignore')

# --- Plot appearance ---
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# --- Input / output locations (relative to the notebook's working directory) ---
ARTICLES_PATH = "Datas/articles_metadata.csv"
CLICKS_SAMPLE_PATH = "Datas/clicks_sample.csv"
CLICKS__PATH = "Datas/clicks/"
EMBEDING_PATH = "Datas/articles_embeddings.pickle"
RATING_PREPROC_DF_PATH = 'Datas/rating_preprocess_df.pkl'
CANDIDATE_PREPROC_DF_PATH = 'Datas/candidate_preprocess_df.pkl'

# When True, the clicks cell reads only the small sample file.
debug = False

# --- Console / DataFrame display ---
warnings.simplefilter('ignore', FutureWarning)
pd.options.display.width = 2000
pd.options.display.precision = 1
pd.options.display.max_colwidth = 30
pd.options.display.expand_frame_repr = True
pd.options.display.float_format = '{:,.2f}'.format


# I. Exploration


## A. Articles_metadata


In [9]:
# Read the article metadata table, then print a structural summary of it.
articles_df = pd.read_csv(ARTICLES_PATH)
article_row_count, article_column_count = articles_df.shape
print(f"Articles shape: {articles_df.shape}")

# Size and schema
print("\nARTICLES METADATA:")
print(f"Number of articles: {article_row_count}")
print(f"Number of columns: {article_column_count}")
print("\nColumns:")
for column_name in articles_df.columns:
    print(f"  - {column_name}")

# Preview of the first rows
print("\narticles data head:")
print(articles_df.head())

# Column dtypes
print("Articles dataset:")
print(articles_df.dtypes)

# Null count per column
print("Missing Values:")
print(articles_df.isnull().sum())

# Number of fully duplicated rows
duplicate_article_rows = articles_df.duplicated().sum()
print(f"Duplicate articles: {duplicate_article_rows}")
    

Articles shape: (364047, 5)

ARTICLES METADATA:
Number of articles: 364047
Number of columns: 5

Columns:
  - article_id
  - category_id
  - created_at_ts
  - publisher_id
  - words_count

articles data head:
   article_id  category_id  created_at_ts  publisher_id  words_count
0           0            0  1513144419000             0          168
1           1            1  1405341936000             0          189
2           2            1  1408667706000             0          250
3           3            1  1408468313000             0          230
4           4            1  1407071171000             0          162
Articles dataset:
article_id       int64
category_id      int64
created_at_ts    int64
publisher_id     int64
words_count      int64
dtype: object
Missing Values:
article_id       0
category_id      0
created_at_ts    0
publisher_id     0
words_count      0
dtype: int64
Duplicate articles: 0


La plupart des informations vont nous être inutiles :
    - l'id servira au merge avec les autres df
    - on peut conserver la catégorie, même si elle sera probablement redondante avec l'embedding
    - words_count pourrait servir à pondérer le temps de rétention, si on peut le calculer à partir de la longueur de l'article

## B. Clicks_metadata

### Load datas

In [10]:
# Load the click logs: the small sample in debug mode, otherwise every hourly
# CSV found in CLICKS__PATH (os is imported in the first cell of the notebook).
if debug:
    clicks_df = pd.read_csv(CLICKS_SAMPLE_PATH)
else:
    # Fail with an explicit message instead of a NameError on clicks_df further down.
    if not os.path.isdir(CLICKS__PATH):
        raise FileNotFoundError(f"Clicks directory not found: {CLICKS__PATH}")

    # os.listdir returns entries in arbitrary order; sort so that rows are
    # concatenated in file-name order on every run (later cells use
    # groupby().shift(), which depends on row order).
    click_file_paths = sorted(
        os.path.join(CLICKS__PATH, file)
        for file in os.listdir(CLICKS__PATH)
        if file.endswith('.csv')
    )
    if not click_file_paths:
        raise FileNotFoundError(f"No .csv click file found in: {CLICKS__PATH}")

    clicks_files = []
    for file_path in click_file_paths:
        print(f"Loading: {file_path}")
        clicks_files.append(pd.read_csv(file_path))

    clicks_df = pd.concat(clicks_files, ignore_index=True)
    print(f"Combined {len(clicks_files)} click files")

# Rename click_article_id so the column matches articles_df for the merge;
# errors="raise" fails loudly if the source column is missing.
clicks_df = clicks_df.rename(columns={'click_article_id':'article_id'}, errors="raise")

Loading: Datas/clicks/clicks_hour_000.csv
Loading: Datas/clicks/clicks_hour_001.csv
Loading: Datas/clicks/clicks_hour_002.csv
Loading: Datas/clicks/clicks_hour_003.csv
Loading: Datas/clicks/clicks_hour_004.csv
Loading: Datas/clicks/clicks_hour_005.csv
Loading: Datas/clicks/clicks_hour_006.csv
Loading: Datas/clicks/clicks_hour_007.csv
Loading: Datas/clicks/clicks_hour_008.csv
Loading: Datas/clicks/clicks_hour_009.csv
Loading: Datas/clicks/clicks_hour_010.csv
Loading: Datas/clicks/clicks_hour_011.csv
Loading: Datas/clicks/clicks_hour_012.csv
Loading: Datas/clicks/clicks_hour_013.csv
Loading: Datas/clicks/clicks_hour_014.csv
Loading: Datas/clicks/clicks_hour_015.csv
Loading: Datas/clicks/clicks_hour_016.csv
Loading: Datas/clicks/clicks_hour_017.csv
Loading: Datas/clicks/clicks_hour_018.csv
Loading: Datas/clicks/clicks_hour_019.csv
Loading: Datas/clicks/clicks_hour_020.csv
Loading: Datas/clicks/clicks_hour_021.csv
Loading: Datas/clicks/clicks_hour_022.csv
Loading: Datas/clicks/clicks_hour_

In [11]:
# Headline numbers for the click log
n_interactions = len(clicks_df)
n_users = clicks_df['user_id'].nunique()
n_clicked_articles = clicks_df['article_id'].nunique()
click_times = clicks_df['click_timestamp']

print(f"Clicks shape: {clicks_df.shape}")

print(f"Number of interactions: {n_interactions}")
print(f"Number of unique users: {n_users}")
print(f"Number of unique articles: {n_clicked_articles}")

print(f"Date range: {click_times.min()} to {click_times.max()}")

print("\nFirst few rows of clicks data:")
print(clicks_df.head())

# Null count per column
print("\n MISSING VALUES:")
print(clicks_df.isnull().sum())

# Number of fully duplicated rows
print("\n DUPLICATES:")
print(clicks_df.duplicated().sum())

# Column dtypes
print("\n DATA TYPES:")
print(clicks_df.dtypes)

Clicks shape: (2988181, 12)
Number of interactions: 2988181
Number of unique users: 322897
Number of unique articles: 46033
Date range: 1506826800026 to 1510603454886

First few rows of clicks data:
  user_id        session_id  ... click_region click_referrer_type
0       0  1506825423271737  ...           20                   2
1       0  1506825423271737  ...           20                   2
2       1  1506825426267738  ...           16                   2
3       1  1506825426267738  ...           16                   2
4       2  1506825435299739  ...           24                   2

[5 rows x 12 columns]

 MISSING VALUES:
user_id                0
session_id             0
session_start          0
session_size           0
article_id             0
click_timestamp        0
click_environment      0
click_deviceGroup      0
click_os               0
click_country          0
click_region           0
click_referrer_type    0
dtype: int64

 DUPLICATES:
0

 DATA TYPES:
user_id              

clicks_{}.csv contient 12 colonnes :

    user_id : user ID
    session_id : Session ID
    session_start : Début de session (timestamp)
    session_size : nombre d'articles vus sur la session
    click_article_id : article ID user clicked
    click_timestamp : When user clicked (timestamp)
    click_environment : user env
    click_deviceGroup : user device
    click_os : user OS
    click_country : localisation (country)
    click_region : localisation (region)
    click_referrer_type : ?

On peut drop 
    click_environment      
    click_deviceGroup      
    click_os               
    click_country          
    click_region           
    click_referrer_type    

On peut merge les metadatas conservées




On récupère la différence entre le précédent clic et le clic actuel pour avoir le temps passé sur chaque article

In [12]:
# Apply pd.to_numeric to every column of the click log, column by column.
clicks_df = clicks_df.apply(pd.to_numeric)

In [13]:
# Time spent on an article = gap between a click and the previous click of the
# same session, or the session start for the first click of a session.
# shift(1) follows row order, so the previous click is looked up on a view
# sorted chronologically within each session; results are re-aligned to
# clicks_df through the index, leaving the row order of clicks_df untouched.
clicks_chronological = clicks_df[['session_id', 'click_timestamp']].sort_values(
    ['session_id', 'click_timestamp'], kind='stable'
)
previous_click = clicks_chronological.groupby('session_id')['click_timestamp'].shift(1)

# First click of each session has no predecessor: fall back to session_start.
clicks_df['prev_click_timestamp'] = previous_click.fillna(clicks_df['session_start'])
clicks_df['time_spend_on_article'] = clicks_df['click_timestamp']-clicks_df['prev_click_timestamp']

clicks_df['time_spend_on_article'].describe()

count       2,988,181.00
mean          569,876.01
std         5,442,533.48
min                 0.00
25%            30,000.00
50%            39,903.00
75%           202,575.00
max     1,212,149,256.00
Name: time_spend_on_article, dtype: float64

On comprend qu'un timestamp automatique à +30000 a été appliqué à toutes les dernières entrées de session, pour lesquelles on n'a pas de timestamp de sortie.

On peut soit essayer de corriger ça en remplaçant par exemple par la durée moyenne de lecture pour une meilleure approximation, soit abandonner la pondération par temps de lecture

In [14]:
# Mean and median reading time, leaving out rows whose duration is exactly
# 30000 (the value identified above as an automatic placeholder).
observed_durations = clicks_df.loc[
    clicks_df['time_spend_on_article'].ne(30000), 'time_spend_on_article'
]
mean_timespend = observed_durations.mean()
med_timespend = observed_durations.median()

print(f'temps moyen pass√© sur un article hors inf√©rence {mean_timespend}')
print(f'temps median pass√© sur un article hors inf√©rence {med_timespend}')

temps moyen pass√© sur un article hors inf√©rence 861752.947533867
temps median pass√© sur un article hors inf√©rence 130404.0


In [15]:

# Overwrite the placeholder durations (exactly 30000) with the median computed
# on the remaining rows.
placeholder_rows = clicks_df['time_spend_on_article'].eq(30000)
clicks_df.loc[placeholder_rows, 'time_spend_on_article'] = med_timespend

In [16]:
# Remove the session bookkeeping columns and the helper column used to compute
# the reading time. The click_* context columns (environment, device group, OS,
# country, region, referrer type) are deliberately kept.
columns_to_drop = ['session_id', 'session_size', 'prev_click_timestamp']
clicks_df = clicks_df.drop(columns=columns_to_drop)

On va avoir besoin de récupérer certaines metadata des articles pour créer notre rating. On merge donc les DF

In [17]:
# Attach the article metadata (category_id, created_at_ts, publisher_id,
# words_count) to every click. The left join keeps all clicks;
# validate='many_to_one' makes pandas raise if article_id is duplicated in
# articles_df, which would otherwise silently multiply click rows.
clicks_df = clicks_df.merge(
    articles_df,
    on='article_id',
    how='left',
    validate='many_to_one',
)
print(f"Merged shape: {clicks_df.shape}")





Merged shape: (2988181, 15)


#### Rating


In [19]:
# Distribution of article length (words_count) over all clicks.
clicks_df['words_count'].describe()

count   2,988,181.00
mean          208.63
std            81.60
min             0.00
25%           173.00
50%           198.00
75%           232.00
max         6,690.00
Name: words_count, dtype: float64

In [20]:
# Articles with a non-positive word count would make time_per_word divide by
# zero; give them the median length (198, the 50% value of describe()).
MEDIAN_WORDS_COUNT = 198
non_positive_length = clicks_df['words_count'].le(0)
clicks_df.loc[non_positive_length, 'words_count'] = MEDIAN_WORDS_COUNT

In [21]:
# Reading time normalised by article length.
clicks_df['time_per_word'] = clicks_df['time_spend_on_article'].div(clicks_df['words_count'])

## C. articles_embeding

- Le pickle article_embedding est une représentation abstraite des articles qui va nous servir pour la recommandation :
    Pickle (Python 3) of a NumPy matrix containing the Article Content Embeddings (250-dimensional vectors), trained upon articles' text and metadata by the CHAMELEON's ACR module (see paper for details) for 364047 published articles.
    P.s. The full text of news articles could not be provided due to license restrictions, but those embeddings can be used by Neural Networks to represent their content. See this paper for a t-SNE visualization of these embeddings, colored by category.



In [22]:

# Read the pickled article embedding matrix and wrap it in a DataFrame
# (one row per article, one column per embedding dimension).
# NOTE: pickle.load executes arbitrary code; only use it on trusted files.
with open(EMBEDING_PATH, 'rb') as embedding_file:
    article_embeddings = pickle.load(embedding_file)

embeding_df = pd.DataFrame(article_embeddings)

print(f"Embeding shape: {embeding_df.shape}")


Embeding shape: (364047, 250)


In [23]:
# Label the row index as article_id (the row position is used as the article
# id) instead of materialising it as a column.
embeding_df.index.name = 'article_id'



In [None]:

# # Merge datasets
# merged_df = merged_df.merge(
#     embeding_df, 
#     on='article_id', 
#     how='left'
# )
# print(f"Merged shape: {merged_df.shape}")



 La data ne semble pas fiable pour déterminer le temps passé sur les articles :
    Sur les sessions de 2 articles, la différence de click_timestamp est toujours identique.
    il n'existe aucune session de 1 article
    l

In [24]:

# Persist the preprocessed frames so that part II can start from these files:
# the enriched click log becomes the rating table, the embeddings the candidate table.
clicks_df.to_pickle(RATING_PREPROC_DF_PATH)
embeding_df.to_pickle(CANDIDATE_PREPROC_DF_PATH)

# II. Model de recommandation

In [2]:

# Reload the two frames written at the end of part I.
with open(RATING_PREPROC_DF_PATH, 'rb') as rating_file:
    rating_df = pickle.load(rating_file)

with open(CANDIDATE_PREPROC_DF_PATH, 'rb') as candidate_file:
    candidate_df = pickle.load(candidate_file)

## Librairie : implicit vs surprise vs tensorflow recommenders :

Implicit n'est pas en V1 et n'a pas vu de commit depuis plus d'un an. Il n'y a pas de documentation hormis le git.

Surprise est mieux maintenu et plus cité mais plus orienté explicit qu'implicit

tfrs est ~~bien maintenu~~(non), adossé à tensorflow, et adapté au problème :
https://www.tensorflow.org/recommenders/examples/basic_retrieval





## Popularity (Dummy)  

In [3]:
from popularity_model import PopularityArticleRecommender

## A. IMPLICIT (for explicit/implicit collaborative filtering)

### 1. Model architecture


In [3]:
from implicit_model import ArticleRetrievalImplicit





In [None]:

# Build the retrieval model from the preprocessed click table and the article
# embedding table, then fit it.
# NOTE(review): factors=64 presumably sets the latent dimension and 'BAY'
# presumably selects the Bayesian variant — confirm in implicit_model.
retrieval_model = ArticleRetrievalImplicit(rating_df, 
                                           candidate_df, 
                                           factors=64,
                                           model_type='BAY'
                                           )
retrieval_model.train()


### 2. Metrics

In [4]:
from implicit_model import implicit_evaluation

#### test de recommendation

In [78]:
# Recommend articles for a user: ask for 5 recommendations for user "8"
# (ids are passed as strings) and print them.
result = retrieval_model.recommend("8", N=5)
print(result)

# Find similar articles
# print(retrieval_model.similar_items('42', N=5))

[{'article_id': '331242', 'score': 7.332470417022705}, {'article_id': '331664', 'score': 6.8674774169921875}, {'article_id': '10253', 'score': 6.760776042938232}, {'article_id': '353786', 'score': 6.688801288604736}, {'article_id': '36160', 'score': 6.6756181716918945}]


### 3. Entrainement

#### Initialisation mlflow

In [5]:
# Set up MLflow tracking through the project helper.
# NOTE(review): presumably expects the local MLflow server to be running
# already — confirm in mlflow_tools.
from mlflow_tools import start_local_experiment
start_local_experiment()

mlflow server --host 127.0.0.1  --port 8080 

                mlflow ui --backend-store-uri /mlruns


#### Experimentation mlflow

In [8]:
from mlflow_tools import mlflow_experiment

##### Dummy

In [7]:
# Popularity baseline with "time_per_word" as the rating target.
# The dict is unpacked as keyword arguments of mlflow_experiment in the Launch
# cell; every experiment_params cell rebinds the same name, so only the one
# executed last takes effect.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : "time_per_word",
    'train_test_split_perc' : 0.8,
    'model_class': PopularityArticleRecommender,
    'evaluation_type' : 'CB' ,  
    
}

In [9]:
# Popularity baseline without a rating target (rating_target=None).
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : None,
    'train_test_split_perc' : 0.8,
    'model_class': PopularityArticleRecommender,
    'evaluation_type' : 'CB' ,  
    
}

##### Reference

In [7]:
# Reference run: no model_class / model_type is given, so the defaults of
# mlflow_experiment apply (defined in mlflow_tools, not visible here).
# rating_target lists both time_per_word and category_id.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : ["time_per_word","category_id"],
    'train_test_split_perc' : 0.8,
    
}

##### ALS

In [10]:
# ALS model type, without a rating target.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : None,
    'train_test_split_perc' : 0.8,
    'model_type':'ALS'
}

In [6]:
# ALS model type, with time_per_word as the rating target (category_id is not kept).
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : 'time_per_word',
    'train_test_split_perc' : 0.8,
    'model_type':'ALS'
}

##### Bayesian

In [12]:
# BAY model type, with time_per_word as the rating target.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : "time_per_word",
    'train_test_split_perc' : 0.8,
    'model_type':'BAY'
    
}

In [16]:
# BAY model type, without a rating target; only category_id is kept besides the ids.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : None,
    'train_test_split_perc' : 0.8,
    'model_type':'BAY'
    
}

In [15]:
# Default model, without a rating target; keeps the raw time_spend_on_article
# and words_count columns instead of the derived time_per_word.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","category_id",'time_spend_on_article','words_count'],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : None,
    'train_test_split_perc' : 0.8,
    
}

##### User metadata

In [32]:
# User-metadata run: additionally keeps the click context columns
# (environment, device group, country, region).
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","category_id",'time_spend_on_article','words_count',
                      'click_environment','click_deviceGroup','click_country','click_region'],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : None,
    'train_test_split_perc' : 0.8,
    
}

##### Article embedding

In [11]:
# Article-embedding run: add_embeding_vector enabled, embeding_alpha=0.9.
# NOTE(review): embeding_alpha presumably weights the embedding contribution —
# confirm in mlflow_tools.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : None,
    'train_test_split_perc' : 0.8,
    'add_embeding_vector': True,
    'embeding_alpha':0.9
    
}

In [9]:
# Article-embedding run with use_pca enabled, embeding_alpha=0.9.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : None,
    'train_test_split_perc' : 0.8,
    'add_embeding_vector': True,
    'use_pca' : True,
    'embeding_alpha':0.9
    
}

In [13]:
# Article-embedding run with use_pca enabled, embeding_alpha=0.5.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : None,
    'train_test_split_perc' : 0.8,
    'add_embeding_vector': True,
    'use_pca' : True,
    'embeding_alpha':0.5
    
}

In [21]:
# Run with add_embeding_vector disabled and embeding_alpha=0.
# NOTE(review): use_pca=True is still set although the embedding vector is
# off — presumably a no-embedding control run; confirm.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : None,
    'train_test_split_perc' : 0.8,
    'add_embeding_vector': False,
    'use_pca' : True,
    'embeding_alpha':0
    
}

In [7]:
# Embedding run with embedding_type='USER_MEAN', PCA enabled, embeding_alpha=0;
# rating_target lists both time_per_word and category_id.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : ["time_per_word","category_id"],
    'train_test_split_perc' : 0.8,
    'add_embeding_vector': True,
    'use_pca' : True,
    'embeding_alpha':0,
    'embedding_type' : 'USER_MEAN',
    
}

In [9]:
# Same USER_MEAN embedding settings, run with the ArticleRetrievalRecommenders
# model class and model_type='SAR'.
from recommenders_models import ArticleRetrievalRecommenders as ARR_recommenders
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : ["time_per_word","category_id"],
    'train_test_split_perc' : 0.8,
    'add_embeding_vector': True,
    'use_pca' : True,
    'embeding_alpha':0,
    'embedding_type' : 'USER_MEAN',
    'model_class' : ARR_recommenders,
    'model_type' : 'SAR'
    
}

##### ContentBased

In [5]:
# Content-based recommender with an empty rating_target list, evaluated with
# evaluation_type 'CB'. The 'sample' option is commented out.
from content_base_model import ContentBasedRecommender
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    #'sample': 1000,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : [],
    'train_test_split_perc' : 0.8,
    'model_class' : ContentBasedRecommender,
    'evaluation_type' : 'CB' ,  
    
}



In [None]:
# Content-based recommender with time_per_word as the rating target.
from content_base_model import ContentBasedRecommender
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    #'sample': 1000,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : ["time_per_word"],
    'train_test_split_perc' : 0.8,
    'model_class' : ContentBasedRecommender,
    'evaluation_type' : 'CB' ,  

    
}

In [10]:
# Content-based recommender with time_per_word as the rating target and
# seen_candidate_only enabled.
# NOTE(review): seen_candidate_only presumably restricts candidates to
# articles that appear in the click data — confirm in the model code.
from content_base_model import ContentBasedRecommender
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    #'sample': 1000,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : ["time_per_word"],
    'train_test_split_perc' : 0.8,
    'model_class' : ContentBasedRecommender,
    'evaluation_type' : 'CB' ,  
    'seen_candidate_only' : True,
    
}

In [8]:
# Content-based recommender, seen_candidate_only, with 'sample' set to 1000.
# ContentBasedRecommender must already have been imported by an earlier cell.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'sample': 1000,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : ["time_per_word"],
    'train_test_split_perc' : 0.8,
    'model_class' : ContentBasedRecommender,
    'evaluation_type' : 'CB' ,  
    'seen_candidate_only' : True,
    
    
}

In [10]:
# Content-based recommender, seen_candidate_only, with an empty rating_target list.
# ContentBasedRecommender must already have been imported by an earlier cell.
experiment_params = {
    'rating_df' : rating_df,
    'candidate_df' : candidate_df,
    'ratings_keep' : ["user_id","article_id","time_per_word","category_id"],
    'candidates_keep' : ["article_id"],                      
    'rating_target' : [],
    'train_test_split_perc' : 0.8,
    'model_class' : ContentBasedRecommender,
    'evaluation_type' : 'CB' ,  
    'seen_candidate_only' : True,
    
    
}

##### Launch

In [17]:
# Run the experiment described by the experiment_params cell executed last;
# the dict is unpacked into keyword arguments and the result is kept in `model`.
print('')
model = mlflow_experiment(**experiment_params)


Training BAY model...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:05<00:00,  8.86it/s, train_auc=98.51%, skipped=4.02%]


After fit:
Training complete.



100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 210253/210253 [00:19<00:00, 10898.68it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 210253/210253 [00:18<00:00, 11069.23it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 210253/210253 [00:18<00:00, 11369.14it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 210253/210253 [00:18<00:00, 11263.11it/s]


üèÉ View run grandiose-duck-222 at: http://127.0.0.1:8080/#/experiments/362647451172403616/runs/885797c9f5f94cfda81bcdb9a8079dd0
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/362647451172403616


### 5. Precomputing

In [3]:
from implicit_model import precompute_model_results

# Reload the pickled model from disk, then precompute its results with N=5.
# NOTE: pickle.load executes arbitrary code; only use it on trusted files.
MODEL_PATH = 'model_assets/model.pkl'
with open(MODEL_PATH, 'rb') as model_file:
    model = pickle.load(model_file)

precompute_dic = precompute_model_results(model, N=5)

In [None]:
# Spot check: look up the precomputed entry stored under the string key '100000'.
result = precompute_dic['100000']


In [6]:
# Persist the precomputed results next to the model.
with open('model_assets/precompute_dic.pkl', 'wb') as file:
    pickle.dump(precompute_dic, file)

## B- Content-based

### 1. model architecture

In [3]:
from content_base_model import ContentBasedRecommender

In [4]:
# Instantiate the content-based recommender from the click table and the
# article embedding table (this rebinds `model`).
model = ContentBasedRecommender(rating_df,candidate_df)


In [5]:

# Fit the content-based recommender (timings are noted right after this cell).
model.train()

12 minutes with the slow Python loop

12 min Python loop, 1 min 30 vectorized, 12 s FAISS sans précalcul, 2 min 18 s precompute 105, 

In [12]:
# FAISS-based recommendation for user 100011; the output is a pair
# (article ids, scores).
model.faiss_recommend_items(100011)

(Index([157536, 157122, 161915, 105687, 155573, 149603, 162026, 160453, 156623,
        267015],
       dtype='int64'),
 array([0.7607803 , 0.757375  , 0.7505505 , 0.74798596, 0.74087423,
        0.7383653 , 0.7380524 , 0.73552585, 0.7344037 , 0.7330377 ],
       dtype=float32))

In [7]:
# Same FAISS query for user 100011, captured from a different execution
# (the returned ids differ from the other captured output of this call).
model.faiss_recommend_items(100011)

(Index([157728, 153421, 284362, 150436, 155662, 160711, 157000, 159539, 289045,
        346062],
       dtype='int64'),
 array([0.80296916, 0.7828106 , 0.7755932 , 0.7748253 , 0.76989317,
        0.7691962 , 0.76218307, 0.76191413, 0.7615298 , 0.7610241 ],
       dtype=float32))

In [6]:
# Non-FAISS recommendation for user 100011; the output is a table with
# article_id and recStrength columns.
model.recommend_items(100011)

Unnamed: 0,article_id,recStrength
0,157728.0,0.802969
1,153421.0,0.782811
2,284362.0,0.775593
3,150436.0,0.774825
4,155662.0,0.769893
5,160711.0,0.769196
6,157000.0,0.762183
7,159539.0,0.761915
8,289045.0,0.76153
9,346062.0,0.761024


In [6]:
# Repeat of the non-FAISS recommendation for user 100011.
model.recommend_items(100011)

Unnamed: 0,article_id,recStrength
0,157728.0,0.802969
1,153421.0,0.782811
2,284362.0,0.775593
3,150436.0,0.774825
4,155662.0,0.769893
5,160711.0,0.769196
6,157000.0,0.762183
7,159539.0,0.761915
8,289045.0,0.76153
9,346062.0,0.761024


In [18]:
# Location of the pickled content-based model.
# NOTE(review): the file is named 'cf_model' while the constant says CB
# (content-based) — confirm the intended file name.
CB_MODEL_PATH = 'model_assets/cf_model.pkl'


In [19]:
# Pickle the trained content-based model to CB_MODEL_PATH.
with open(CB_MODEL_PATH, 'wb') as file:
    pickle.dump(model, file)

In [14]:

# Reload the content-based model from CB_MODEL_PATH.
# NOTE: pickle.load executes arbitrary code; only use it on trusted files.
with open(CB_MODEL_PATH, 'rb') as f:
    model = pickle.load(f)

### 2. Evaluation


In [10]:
from model_evaluator import RecommendationModelEvaluator

In [None]:
# Evaluate the content-based model: evaluate_model returns global metrics and
# a per-row detail frame, whose first 10 rows are displayed.
# NOTE(review): the model is passed both to the constructor and to
# evaluate_model — confirm both are required by model_evaluator.
model_evaluator = RecommendationModelEvaluator(model)

print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(model)
# %-formatting kept as is: its result depends on the type of cb_global_metrics.
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)



## C. Keras+tfrs (abandonné car mal maintenu)

In [4]:
from keras import Sequential,layers,Model

import tensorflow_recommenders as tfrs


# Lookup layers that translate raw string ids into integer indices.
known_user_ids = rating_df["user_id"].astype(str).unique()
known_article_ids = candidate_df.index.astype(str).unique()

user_ids_vocabulary = layers.StringLookup(vocabulary=known_user_ids, mask_token=None)
article_ids_vocabulary = layers.StringLookup(vocabulary=known_article_ids, mask_token=None)

# Define the model
# Imported here so the class does not depend on a later cell having run.
import tensorflow as tf


class ArticleRetrievalModel(tfrs.models.Model):
    """Two-tower retrieval model over users and articles.

    The user tower embeds the user id. The article tower concatenates a
    learned id embedding with a projection of the precomputed content
    embedding found under ``features["embedding"]``.

    Parameters
    ----------
    user_ids_vocabulary : lookup layer mapping raw user ids to indices;
        defaults to the notebook-level ``user_ids_vocabulary``.
    article_ids_vocabulary : lookup layer mapping raw article ids to indices;
        defaults to the notebook-level ``article_ids_vocabulary``.
    """

    def __init__(self,
                 user_ids_vocabulary=user_ids_vocabulary,
                 article_ids_vocabulary=article_ids_vocabulary,
                 ):
        super().__init__()

        embedding_dim = 64  # width of each learned embedding block

        # User tower. article_embedding() concatenates two embedding_dim-wide
        # blocks, so the user vector must be 2 * embedding_dim wide for the
        # retrieval task to compare user and article vectors.
        self.user_model = Sequential([
            user_ids_vocabulary,
            layers.Embedding(
                input_dim=user_ids_vocabulary.vocabulary_size(),
                output_dim=2 * embedding_dim)
        ])

        # Article tower, part 1: embedding learned from the article id
        self.article_id_model = Sequential([
            article_ids_vocabulary,
            layers.Embedding(article_ids_vocabulary.vocabulary_size(),
                             embedding_dim)
        ])

        # Article tower, part 2: projection of the precomputed content embedding
        self.article_embedding_projector = Sequential([
            layers.Dense(128, activation="relu"),
            layers.Dense(embedding_dim)
        ])

        # Retrieval task
        self.task = tfrs.tasks.Retrieval()

    def article_embedding(self, features):
        """Return the article vector: id embedding concatenated with the
        projected content embedding (width 2 * embedding_dim)."""
        id_emb = self.article_id_model(features["article_id"])
        content_emb = self.article_embedding_projector(tf.convert_to_tensor(features["embedding"]))
        return tf.concat([id_emb, content_emb], axis=1)

    def compute_loss(self, features, training=False):
        """Compute the retrieval loss for one batch.

        ``features`` must provide "user_id", "article_id" and "embedding".
        """
        user_embeddings = self.user_model(features["user_id"])
        article_embeddings = self.article_embedding(features)

        return self.task(user_embeddings, article_embeddings)


In [5]:
from typing import Dict, Text
import tensorflow as tf

class ArticldeModel(tfrs.Model):
  """Retrieval wrapper pairing a user tower with an article (candidate) tower.

  Args:
    user_model: model applied to ``features["user_id"]``.
    candidate_model: model applied to ``features["article_id"]``.
    task: layer that turns the two embedding batches into a loss.
  """

  def __init__(self, user_model, candidate_model, task):
    super().__init__()
    self.candidate_model: Model = candidate_model
    self.user_model: Model = user_model
    self.task: layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embed the users and their clicked articles, then return the task loss."""
    # Query side: user ids go through the user tower.
    query_vectors = self.user_model(features["user_id"])
    # Candidate side: the clicked article ids go through the article tower.
    candidate_vectors = self.candidate_model(features["article_id"])

    # The task computes the loss and the metrics from both embedding batches.
    return self.task(query_vectors, candidate_vectors)

#### tfrs

In [None]:
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
from keras.optimizers import Adagrad
from keras.losses import CategoricalCrossentropy

def mlflow_experiment(rating_df = rating_df,
                      candidate_df = candidate_df,
                      ratings_keep= ["user_id","article_id","time_per_word"],
                      candidates_keep = ["article_id"],
                      rating_target = 'time_per_word',
                      num_epochs = 5,
                      **kwargs):
    """Train a two-tower TFRS retrieval model and record the run in MLflow.

    Args:
        rating_df: interactions frame; must contain ``user_id`` and
            ``article_id`` plus every column listed in ``ratings_keep``.
        candidate_df: articles frame; ``article_id`` must be a column or the
            index (the frame is ``reset_index()``-ed before use).
        ratings_keep: columns of ``rating_df`` fed to the model as features.
        candidates_keep: kept for interface compatibility; currently unused
            because the top-K metrics that consumed the candidate dataset are
            disabled.
        rating_target: kept for interface compatibility; currently unused
            (the retrieval task is trained on implicit interactions).
        num_epochs: maximum number of training epochs.
        **kwargs: optional settings:
            ``monitor`` (default ``'val_loss'``) quantity watched by callbacks;
            ``mode`` (default ``'min'``) direction of improvement for ``monitor``;
            ``patience`` (default ``2``) early-stopping patience;
            ``use_tensorboard`` (default ``False``) write TensorBoard logs to ``logs``;
            ``use_checkpoint`` (default ``False``) save best weights to
            ``models-dr/pdilated.weights.h5``.

    Returns:
        tuple: ``(model, history)`` - the trained ``ArticldeModel`` and the
        Keras ``History`` returned by ``fit``.
    """
    start_time = time.time()
    id_columns = ("user_id", "article_id")

    # Presumably article_id may live in the index; reset_index() exposes it as
    # a column either way.
    candidate_df = candidate_df.reset_index()

    def _feature_dict(frame, columns):
        # IDs are cast to str so the tensors fed to the StringLookup layers
        # have the same dtype as the (string) vocabularies built below.
        return {
            col: (frame[col].astype(str) if col in id_columns else frame[col]).to_numpy()
            for col in columns
        }

    ratings_tf = tf.data.Dataset.from_tensor_slices(_feature_dict(rating_df, ratings_keep))

    # String vocabularies for both towers.
    unique_candidate_id = candidate_df["article_id"].astype(str).unique()
    unique_user_ids = rating_df["user_id"].astype(str).unique()

    # Deterministic 80/20 train/test split.
    tf.random.set_seed(42)
    n_ratings = len(rating_df)
    n_train = int(n_ratings * 0.8)
    shuffled = ratings_tf.shuffle(n_ratings, seed=42, reshuffle_each_iteration=False)
    train = shuffled.take(n_train)
    test = shuffled.skip(n_train)

    embedding_dimension = 32

    def _id_tower(vocabulary):
        # One extra embedding row accounts for the out-of-vocabulary token.
        return Sequential([
            layers.StringLookup(vocabulary=vocabulary, mask_token=None),
            layers.Embedding(len(vocabulary) + 1, embedding_dimension),
        ])

    user_model = _id_tower(unique_user_ids)
    candidate_model = _id_tower(unique_candidate_id)

    # FactorizedTopK metrics are left out (see the tfrs issue linked in the
    # notebook), so the task only provides the retrieval loss.
    task = tfrs.tasks.Retrieval()
    model = ArticldeModel(user_model, candidate_model, task)

    optimizer = 'adam'
    model.compile(optimizer=optimizer)

    monitor = kwargs.get('monitor', 'val_loss')
    # A single direction shared by every callback: the checkpoint used to be
    # hard-coded to 'max' while early stopping used 'min' on the same monitor.
    mode = kwargs.get('mode', 'min')

    callbacks = []
    if kwargs.get('use_tensorboard', False):
        callbacks.append(TensorBoard(log_dir='logs', write_graph=True))
    if kwargs.get('use_checkpoint', False):
        callbacks.append(ModelCheckpoint(
            filepath='models-dr/pdilated.weights.h5',
            monitor=monitor,
            mode=mode,
            save_best_only=True,
            save_weights_only=True,
            verbose=1))
    callbacks.append(EarlyStopping(
        monitor=monitor,
        mode=mode,
        patience=kwargs.get('patience', 2),
        verbose=1))

    cached_train = train.batch(8192).cache()
    cached_test = test.batch(4096).cache()

    with mlflow.start_run():
        history = model.fit(
            cached_train,
            epochs=num_epochs,
            validation_data=cached_test,
            # The callbacks were built but never handed to fit(), which made
            # patience / use_checkpoint / use_tensorboard silently ineffective.
            callbacks=callbacks,
        )

        process_time = time.time() - start_time

        # Log run information first so it is recorded even if saving the model
        # fails afterwards.
        mlflow.log_params({"Process_Time": process_time,
                           'optimizer': optimizer,
                           'monitor': monitor,
                           'Epochs': num_epochs,
                           })
        # Last recorded value of every quantity tracked by Keras (replaces the
        # reference to an undefined `metrics` variable that raised NameError).
        mlflow.log_metrics({
            metric_name: float(values[-1])
            for metric_name, values in history.history.items()
            if values
        })

        mlflow.keras.log_model(
            model=model,
            name=model.name,
            signature=None,
            input_example=None,
            registered_model_name=f"{model.name}",
        )

    return model, history

#### tfrs


Bug connu dans la bibliothèque tfrs, voir : https://github.com/tensorflow/recommenders/issues/712

In [None]:

# Run the retrieval experiment; every argument is spelled out explicitly.
mlflow_experiment(
    rating_df=rating_df,
    candidate_df=candidate_df,
    ratings_keep=["user_id", "article_id", "time_per_word"],
    candidates_keep=["article_id"],
    rating_target='time_per_word',
    num_epochs=5,
)



##### Get recommendations

In [None]:
# Build (article_id, embedding) pairs so the index can map scores back to ids.
# NOTE(review): `model` and `tf_df` are not defined in the cells shown here;
# `model` must expose `article_embedding` and `user_model` - confirm which
# cell creates them before running on a fresh kernel.
def _id_with_embedding(batch):
    return batch["article_id"], model.article_embedding(batch)

article_dataset = tf_df.batch(128).map(_id_with_embedding)

# Exhaustive (brute-force) scoring of every indexed article for a query user.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(article_dataset)

user_id = "1"
scores, ids = index(tf.constant([user_id]))
# Show the first five ids returned for this user.
print("Top recommendations for user", user_id, ":", ids[0, :5].numpy())


## D. Recommenders

### LightFM

LightFM plante sans raison apparente, voir : https://github.com/lyst/lightfm/issues/687

https://github.com/recommenders-team/recommenders/blob/main/examples/02_model_collaborative_filtering/lightfm_deep_dive.ipynb

Incompatible with Python 3.13 and 3.12.9 (?). Installed from a fork that uses the new ctype for the build.

In [4]:
import os
import sys
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k as lightfm_prec_at_k
from lightfm.evaluation import recall_at_k as lightfm_recall_at_k

from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.models.lightfm.lightfm_utils import (
    track_model_metrics,
    prepare_test_df,
    prepare_all_predictions,
    compare_metric,
    similar_users,
    similar_items,
)
from recommenders.utils.notebook_utils import store_metadata

# Record the interpreter and LightFM versions used for this run.
print(f"System version: {sys.version}")
print(f"LightFM version: {lightfm.__version__}")

System version: 3.12.9 | packaged by Anaconda, Inc. | (main, Feb  6 2025, 18:49:16) [MSC v.1929 64 bit (AMD64)]
LightFM version: 1.17


In [5]:
# LightFM experiment configuration
K = 10                   # default number of recommendations
TEST_PERCENTAGE = 0.25   # fraction of the interactions held out for testing
LEARNING_RATE = 0.25     # model learning rate
NO_COMPONENTS = 20       # number of latent factors
NO_EPOCHS = 20           # number of epochs used to fit the model
NO_THREADS = 4           # number of threads intended for fitting
ITEM_ALPHA = 1e-6        # regularisation strength for item features
USER_ALPHA = 1e-6        # regularisation strength for user features

SEED = 42                # seed for pseudo-random number generation



#### Prepared dataset

In [6]:
# Fit the LightFM Dataset on the user and article ids found in rating_df.
dataset = Dataset()
dataset.fit(
    users=rating_df["user_id"],
    items=rating_df["article_id"],
)

In [7]:
# Column(s) appended after (user_id, article_id) when building the interactions.
ratings_target = ['time_per_word']

num_users, num_articles = dataset.interactions_shape()
# NOTE: the label reads "num_topics" but the value printed is the article count.
print(f'Num users: {num_users}, num_topics: {num_articles}.')

# One row per interaction: user_id, article_id, then the ratings_target column(s).
(interactions, weights) = dataset.build_interactions(
    rating_df[['user_id', 'article_id', *ratings_target]].to_numpy()
)

Num users: 322897, num_topics: 46033.


#### Train/test split

In [8]:

# Seeded random split of the interaction matrix into train and test parts.
holdout_rng = np.random.RandomState(SEED)
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions,
    test_percentage=TEST_PERCENTAGE,
    random_state=holdout_rng,
)

print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")



Shape of train interactions: (322897, 46033)
Shape of test interactions: (322897, 46033)


#### Fit model

In [9]:


# LightFM model trained with the WARP loss, seeded via SEED.
# NOTE(review): ITEM_ALPHA / USER_ALPHA are defined in the config cell but not
# passed here - confirm whether regularisation was left out on purpose.
model1 = LightFM(
    loss='warp',
    no_components=NO_COMPONENTS,
    learning_rate=LEARNING_RATE,
    random_state=np.random.RandomState(SEED),
)


In [10]:


# Fit on the training interactions only.
# NOTE(review): NO_THREADS and the `weights` matrix built earlier are not
# passed to fit() - confirm this is intended.
model1.fit(interactions=train_interactions, epochs=NO_EPOCHS)





: 

### DKN? https://github.com/recommenders-team/recommenders/blob/main/examples/00_quick_start/dkn_MIND.ipynb