In [138]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import islice,product,chain,zip_longest
from tqdm.contrib.itertools import product as tqdm_product
from sklearn.model_selection import train_test_split as train_test_split_rows
import re
import time
import implicit
import operator
import optuna
from implicit.evaluation import train_test_split
from implicit.cpu.bpr import BayesianPersonalizedRanking as BPR
from implicit.cpu.lmf import LogisticMatrixFactorization as LogisticMF
from implicit.als import AlternatingLeastSquares as ALS
from implicit.nearest_neighbours import CosineRecommender
from implicit.nearest_neighbours import BM25Recommender
from implicit.nearest_neighbours import TFIDFRecommender
from scipy.sparse import coo_matrix
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from scipy.sparse import csr_matrix
from plotly import graph_objs as go

## 1. Загружаем датасеты, подготовленные ранее

In [2]:
# Load the previously prepared tables. The two course tables use '^' as the
# field separator; in all three files the first column becomes the index.
courses_features_with_embeddings = pd.read_csv(
    'courses_features.csv',
    sep='^',
    index_col=0,
)
courses_features_temp = pd.read_csv(
    'courses_features_1.csv',
    sep='^',
    index_col=0,
)
interactions_matrix_with_weights = pd.read_csv(
    'interactions_matrix.csv',
    index_col=0,
)

In [3]:
# Reorder the interaction columns by their sorted labels.
# NOTE(review): the labels are only parsed to int in a later cell, so at this
# point they are presumably strings and this sort is lexicographic rather than
# numeric — confirm that the resulting item order is the intended one.
interactions_matrix_with_weights = interactions_matrix_with_weights.reindex(sorted(interactions_matrix_with_weights.columns), axis=1)

In [4]:
# Order the course feature rows by the 'id' column.
courses_features_with_embeddings = courses_features_with_embeddings.sort_values(by = 'id')

In [5]:
# Keep the human-readable columns in a small lookup frame and everything else
# in a separate feature frame, then release the temporary table.
id_title_description = courses_features_temp.loc[:, ['id', 'title', 'description']].copy()
courses_features_without_embeddings = courses_features_temp.drop(columns=['title', 'description']).copy()
del courses_features_temp

In [6]:
# Replace every column label with the first run of digits it contains, as int.
interactions_matrix_with_weights.columns = [
    int(re.findall('[0-9]+', label)[0])
    for label in interactions_matrix_with_weights.columns
]
# Unweighted view: 1.0 wherever the weight is positive, 0.0 elsewhere.
interactions_matrix_without_weights = (interactions_matrix_with_weights > 0).astype(float)

## 2. Подбираем лучшую коллаборативную модель

Поделим выборку на тренировочную и тестовую, на тестовой выборке будем оценивать итоговое качество модели с наилучшими гиперпараметрами, которые подобраны кросс-валидацией

In [7]:
train_percent = 0.8
short_train_percent = 0.5

# `train_test_split` is implicit's splitter; `train_test_split_rows` is
# scikit-learn's (see the imports), which is used here to keep only a
# `short_train_percent` share of the matrix for the hyper-parameter search.
# The same seed is used for the weighted and unweighted matrices.
train_interactions_without_weights, test_interactions_without_weights = train_test_split(
    coo_matrix(interactions_matrix_without_weights),
    train_percentage=train_percent,
    random_state=122333,
)
short_train_interactions_without_weights, _ = train_test_split_rows(
    coo_matrix(interactions_matrix_without_weights),
    test_size=1 - short_train_percent,
    random_state=122333,
)
train_interactions_with_weights, test_interactions_with_weights = train_test_split(
    coo_matrix(interactions_matrix_with_weights),
    train_percentage=train_percent,
    random_state=122333,
)
short_train_interactions_with_weights, _ = train_test_split_rows(
    coo_matrix(interactions_matrix_with_weights),
    test_size=1 - short_train_percent,
    random_state=122333,
)

In [55]:
# Search grids for the collaborative models. Every value is an iterable of
# candidates; single-element lists pin a setting for all trials.
als_params = {
    'factors' : range(30,200,10),
    'regularization' : [0,0.0001,0.001,0.005,0.01,0.05,0.1,0.5,1,1.5,2],
    'iterations' : [50],
    'random_state' : [122333],
    'use_native' : [True],
    'use_cg' : [True]
}
bpr_params = {
    'factors' : range(30,200,10),
    'regularization' : [0,0.0001,0.001,0.003,0.005,0.007,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.1],
    # A learning rate of 0 performs no gradient updates at all, so it is not a
    # meaningful candidate and only wastes search budget; the grid starts at 1e-4.
    'learning_rate' :[0.0001,0.001,0.003,0.005,0.007,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.1],
    'iterations' : [100],
    'random_state' : [122333],
    'verify_negative_samples' : [True]
}
logistic_params = {
    'factors' : range(30,200,10),
    'regularization' : [0,0.0001,0.001,0.005,0.01,0.05,0.1,0.5,1,1.5,2],
    'learning_rate' : [0.0001,0.001,0.005,0.01,0.05,0.1,0.5,1,1.5,2,3,4],
    'iterations' : [200],
    'random_state' : [122333],
    'neg_prop' : range(5,50,1)
}

In [56]:
# Short model name -> model class, and short model name -> its search grid.
collaborative_models = dict(als=ALS, bpr=BPR, logistic=LogisticMF)
models_params = dict(als=als_params, bpr=bpr_params, logistic=logistic_params)

In [42]:
def get_discrete_params(dict_params,size,rng=None):
    """Yield ``size`` random combinations drawn from a parameter grid.

    Parameters
    ----------
    dict_params : dict
        Maps a parameter name to an indexable collection of candidate values
        (list, ``range``, 1-D array).
    size : int
        Number of combinations to generate.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so ``np.random.seed`` still controls the draws.

    Yields
    ------
    list
        One value per key, in the iteration order of ``dict_params``.

    Values are picked by index, so they keep their original type (``int``
    stays ``int``, ``True`` stays ``bool``) instead of being coerced to NumPy
    scalars of a common dtype.
    """
    draw_index = np.random.randint if rng is None else rng.integers
    grids = [list(candidates) for candidates in dict_params.values()]
    for _ in range(size):
        yield [grid[draw_index(len(grid))] for grid in grids]

In [8]:
def get_evaluations(model,train,test,K=5):
    """Compute ranking metrics at ``K`` and suffix the metric names with the cut-off.

    Calls ``implicit.evaluation.ranking_metrics_at_k`` (progress bar disabled)
    and renames the ``precision``/``map``/``ndcg``/``auc`` entries of its
    result to ``p_at_K``/``map_at_K``/``ndcg_at_K``/``auc_at_K``.
    """
    metrics = implicit.evaluation.ranking_metrics_at_k(model, train, test, K=K, show_progress=False)
    renames = (('precision', 'p'), ('map', 'map'), ('ndcg', 'ndcg'), ('auc', 'auc'))
    for source_key, prefix in renames:
        metrics[f'{prefix}_at_{K}'] = metrics.pop(source_key)
    return metrics

In [9]:
def delete_users_without_interactions(train,test):
    """Keep only the rows that are non-empty in both matrices.

    Rows whose sum in ``test`` is zero are removed from both matrices first;
    then rows whose sum in the remaining ``train`` is zero are removed from
    both. Returns the filtered ``(train, test)`` pair with rows still aligned.
    """
    def rows_with_data(matrix):
        # Per-row sums flattened to a 1-D boolean mask.
        return np.asarray(matrix.sum(axis=1)).ravel() != 0

    keep = rows_with_data(test)
    train, test = train[keep, :], test[keep, :]
    keep = rows_with_data(train)
    return train[keep, :], test[keep, :]

In [45]:
def cross_validation(model_name,params,interactions, cv):
    """Evaluate one parameter set of a collaborative model over ``cv`` random splits.

    For each seed in ``range(cv)`` the interactions are split 80/20, users left
    without interactions on either side are dropped, a fresh model from
    ``collaborative_models[model_name]`` is fitted on the train part and the
    metrics at k=7 are collected. Returns a ``defaultdict`` mapping each metric
    name to the median across the splits.
    """
    top_k = 7
    fold_metrics = defaultdict(list)
    for seed in range(cv):
        fold_train, fold_test = train_test_split(interactions, train_percentage = 0.8, random_state = seed)
        fold_train, fold_test = delete_users_without_interactions(fold_train, fold_test)
        candidate = collaborative_models[model_name](**params)
        candidate.fit(fold_train, show_progress = False)
        for name, value in get_evaluations(candidate, fold_train, fold_test, top_k).items():
            fold_metrics[name].append(value)
    # Collapse the per-split lists into a single median per metric.
    for name in list(fold_metrics):
        fold_metrics[name] = np.median(fold_metrics[name])
    return fold_metrics

def tuning_hyperparams (model_name, metric, train, cv = 3, number_of_combinations = None, type = 'no weight'):
    """Search the grid of a collaborative model and refit the best candidate.

    Parameters
    ----------
    model_name : str
        Key into ``models_params`` / ``collaborative_models``.
    metric : str
        Name of the cross-validation metric to maximise (e.g. ``'p_at_7'``).
    train : sparse matrix
        Interactions used both for cross-validation and for the final fit.
    cv : int
        Number of random splits per candidate.
    number_of_combinations : int or None
        ``None`` runs the full Cartesian product of the grid; otherwise that
        many random combinations are drawn.
    type : str
        Free-form tag that is only used in the name of the results file.

    Returns
    -------
    dict
        ``best_model`` (fitted on ``train``), ``best_params`` and ``metrics``
        (cross-validation metrics of the best candidate).

    Raises
    ------
    KeyError
        If ``metric`` is not among the metrics produced by cross-validation.
    ValueError
        If no candidate produced a comparable score (empty grid, zero
        combinations requested, or every score is NaN).
    """
    import json  # local import so the cell does not depend on the imports cell being edited

    params_dict = models_params[model_name]
    if number_of_combinations is None:
        gen = tqdm_product(*params_dict.values())
    else:
        gen = tqdm(get_discrete_params(params_dict,number_of_combinations),total = number_of_combinations)

    # Start below any possible score so that a candidate scoring exactly 0 is
    # still recorded instead of leaving ``best_params`` undefined.
    best_metric = float('-inf')
    best_params = None
    best_params_info = None
    for params in gen:
        params = dict(zip(params_dict.keys(),params))
        print(params)
        curr_info = cross_validation(model_name,params,train, cv)
        # cross_validation returns a defaultdict: a membership test avoids
        # silently creating an empty list for a misspelled metric name.
        if metric not in curr_info:
            raise KeyError(f'metric {metric!r} is not produced by cross_validation; available: {sorted(curr_info)}')
        if curr_info[metric] > best_metric:
            best_metric = curr_info[metric]
            best_params_info = curr_info
            best_params = params
            with open(f'./best_parameters_collaborative_{model_name}_{type}.json', 'w') as f:
                # Write real JSON; NumPy scalars are converted through .item().
                json.dump({'metric': metric, 'value': best_metric, 'params': best_params}, f,
                          default=lambda o: o.item() if hasattr(o, 'item') else str(o))
    if best_params is None:
        raise ValueError(f'no parameter combination could be evaluated for model {model_name!r}')
    best_model = collaborative_models[model_name](**best_params)
    best_model.fit(train)

    return dict(best_model = best_model, best_params = best_params, metrics = best_params_info)

### 2.1 Коллаборативные модели без весов взаимодействий

In [14]:
als_without_weights = tuning_hyperparams('als','p_at_7',short_train_interactions_without_weights,number_of_combinations = 1)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [15]:
als_without_weights

{'best_model': <implicit.cpu.als.AlternatingLeastSquares at 0x7f4432041210>,
 'best_params': {'factors': 50,
  'regularization': 0.0001,
  'iterations': 50,
  'random_state': 122333,
  'use_native': True,
  'use_cg': True},
 'metrics': defaultdict(list,
             {'p_at_7': 0.30266961401122255,
              'map_at_7': 0.1802774607765173,
              'ndcg_at_7': 0.2241737735394443,
              'auc_at_7': 0.6535726049856614})}

In [48]:
bpr_without_weights = tuning_hyperparams('bpr','p_at_7',short_train_interactions_without_weights,number_of_combinations = 1)

  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 170, 'regularization': 0.0001, 'learning_rate': 0.0001, 'iterations': 100, 'random_state': 122333, 'verify_negative_samples': True}


  0%|          | 0/100 [00:00<?, ?it/s]

In [49]:
bpr_without_weights

{'best_model': <implicit.cpu.bpr.BayesianPersonalizedRanking at 0x7f8a5c9f6830>,
 'best_params': {'factors': 170,
  'regularization': 0.0001,
  'learning_rate': 0.0001,
  'iterations': 100,
  'random_state': 122333,
  'verify_negative_samples': True},
 'metrics': defaultdict(list,
             {'p_at_7': 0.09395340870161015,
              'map_at_7': 0.037766739397478086,
              'ndcg_at_7': 0.054777575741494416,
              'auc_at_7': 0.5405232786175533})}

In [35]:
model = BPR()
model.fit(short_train_interactions_without_weights)

  0%|          | 0/100 [00:00<?, ?it/s]

In [32]:
bpr_without_weights

{'best_model': <implicit.cpu.bpr.BayesianPersonalizedRanking at 0x7f8a59327d00>,
 'best_params': {'factors': 50,
  'regularization': 0.001,
  'learning_rate': 0.05,
  'iterations': 100,
  'random_state': 122333,
  'verify_negative_samples': True},
 'metrics': defaultdict(list,
             {'p_at_7': 0.3123618432239415,
              'map_at_7': 0.18797396019073673,
              'ndcg_at_7': 0.23551372139904223,
              'auc_at_7': 0.6676031803755293})}

In [57]:
logistic_without_weights = tuning_hyperparams('logistic','p_at_7',short_train_interactions_without_weights,number_of_combinations = 1)

  0%|          | 0/1 [00:00<?, ?it/s]

{'factors': 150, 'regularization': 0.01, 'learning_rate': 2.0, 'iterations': 500, 'random_state': 122333, 'neg_prop': 42}


  0%|          | 0/500 [00:00<?, ?it/s]

In [58]:
logistic_without_weights

{'best_model': <implicit.cpu.lmf.LogisticMatrixFactorization at 0x7f8a591e5a80>,
 'best_params': {'factors': 150,
  'regularization': 0.01,
  'learning_rate': 2.0,
  'iterations': 500,
  'random_state': 122333,
  'neg_prop': 42},
 'metrics': defaultdict(list,
             {'p_at_7': 0.0711716341212744,
              'map_at_7': 0.02794082279251652,
              'ndcg_at_7': 0.04120989782850661,
              'auc_at_7': 0.5293613683399583})}

Как дообучать, как работать с новыми юзерами в рекомендациях и в дообучении?

In [None]:
model = als_without_weights['best_model']

Если надо составить рекомендацию юзеру, а у нас нет информации про него, то можно делать так

In [None]:
recommendations = model.recommend(userid = [None],user_items = short_train_interactions_without_weights[0],N = 5, recalculate_user=True)

Когда у нас появилась новая информация про какого-то старого пользователя

In [None]:
model.recommend(userid = 0, user_items=short_train_interactions_without_weights[0],N = 5, recalculate_user=False)

(array([ 95, 261, 133, 174,  88], dtype=int32),
 array([0.19836809, 0.14409204, 0.12629929, 0.12010314, 0.06894191],
       dtype=float32))

In [None]:
# Synthetic interaction row for user 0: a dense vector with one slot per item
# (width taken from the training matrix instead of a hard-coded constant) and
# two interactions switched on. The previous `.copy()` of the user's real row
# was overwritten immediately and has been dropped.
temp = np.zeros(short_train_interactions_without_weights.shape[1])
temp[2] = 1
temp[3] = 1
# Recompute only this user's factors from the new row.
model.partial_fit_users(userids = [0],user_items = coo_matrix(temp).tocsr())

In [None]:
model.recommend(userid = 0, user_items=coo_matrix(temp).tocsr(),N = 5, recalculate_user=False)

(array([ 55, 401, 305, 436, 211], dtype=int32),
 array([0.05278216, 0.04426131, 0.0324229 , 0.02991808, 0.02367844],
       dtype=float32))

Раз в какое-то время когда у нас появились новые пользователи с какими-то взаимодействиями 

In [None]:
model.partial_fit_users(userids = [short_train_interactions_without_weights.shape[0] + 5],user_items = coo_matrix(temp).tocsr())

In [None]:
model.recommend(userid = short_train_interactions_without_weights.shape[0] + 5, user_items=coo_matrix(temp).tocsr(),N = 5, recalculate_user=False)

(array([ 55, 401, 305, 436, 211], dtype=int32),
 array([0.05278216, 0.04426131, 0.0324229 , 0.02991808, 0.02367844],
       dtype=float32))

### 2.2 Коллаборативные модели с весами взаимодействий

In [None]:
als_with_weights = tuning_hyperparams('als','p_at_7',short_train_interactions_with_weights,number_of_combinations = 1,type = 'weight')

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
als_with_weights

{'best_model': <implicit.cpu.als.AlternatingLeastSquares at 0x7f4d048039d0>,
 'best_params': {'factors': 60,
  'regularization': 0.1,
  'iterations': 50,
  'random_state': 122333,
  'use_native': True,
  'use_cg': True},
 'metrics': defaultdict(list,
             {'p_at_7': 0.13342186462523922,
              'map_at_7': 0.06970110324192948,
              'ndcg_at_7': 0.08560353824939826})}

In [None]:
logistic_with_weights = tuning_hyperparams('logistic','p_at_7',short_train_interactions_with_weights,number_of_combinations = 1, type = 'weight')

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
logistic_with_weights

{'best_model': <implicit.cpu.lmf.LogisticMatrixFactorization at 0x7f4d047ea410>,
 'best_params': {'factors': 180,
  'regularization': 0.05,
  'learning_rate': 1.5,
  'iterations': 100,
  'random_state': 122333,
  'neg_prop': 100},
 'metrics': defaultdict(list,
             {'p_at_7': 0.051913309812739894,
              'map_at_7': 0.01812444928524995,
              'ndcg_at_7': 0.02733086405871815})}

## 3. Подбираем лучшую контентную модель по взаимодействиям

In [8]:
# Search grids for the item-item models: K is the neighbourhood size,
# K1 and B are only searched for BM25.
cos_params = dict(K=range(10, 200, 10))
tf_idf_params = dict(K=range(10, 200, 10))
bm25_params = dict(
    K=range(10, 200, 10),
    K1=np.arange(0.5, 2, 0.2),
    B=np.arange(0.3, 0.9, 0.05),
)

In [9]:
# Short model name -> recommender class, and short model name -> its search grid.
content_models = dict(zip(
    ('cos', 'bm25', 'tf-idf'),
    (CosineRecommender, BM25Recommender, TFIDFRecommender),
))
models_params_content = dict(zip(
    ('cos', 'bm25', 'tf-idf'),
    (cos_params, bm25_params, tf_idf_params),
))

In [10]:
def cross_validation_content(model_name,model,params,interactions, cv,type):
    """Evaluate one parameter set of an item-item model over ``cv`` random splits.

    For each seed in ``range(cv)`` the interactions are split 80/20 and users
    without interactions on either side are dropped. Unless ``type`` is
    ``'features_similarity'``, a fresh ``content_models[model_name]`` is fitted
    on the train part of the split; in the ``'features_similarity'`` case the
    ``model`` passed in is evaluated as is. Returns a ``defaultdict`` mapping
    each metric name (at k=7) to its median across the splits.
    """
    top_k = 7
    refit_on_each_split = type != 'features_similarity'
    fold_metrics = defaultdict(list)
    for seed in range(cv):
        fold_train, fold_test = train_test_split(interactions, train_percentage = 0.8, random_state = seed)
        fold_train, fold_test = delete_users_without_interactions(fold_train, fold_test)
        if refit_on_each_split:
            model = content_models[model_name](**params)
            model.fit(fold_train, show_progress = False)
        for name, value in get_evaluations(model, fold_train, fold_test, top_k).items():
            fold_metrics[name].append(value)
    # Collapse the per-split lists into a single median per metric.
    for name in list(fold_metrics):
        fold_metrics[name] = np.median(fold_metrics[name])
    return fold_metrics

def tuning_hyperparams_content (model_name, metric, features,interactions, cv = 3, number_of_combinations = None, type = 'interactions_similarity'):
    """Search the grid of an item-item model and refit the best candidate.

    Parameters
    ----------
    model_name : str
        Key into ``models_params_content`` / ``content_models``.
    metric : str
        Name of the cross-validation metric to maximise (e.g. ``'p_at_7'``).
    features : matrix-like
        Data the final model is fitted on. With
        ``type == 'features_similarity'`` every candidate is also fitted on it
        before being evaluated on ``interactions``.
    interactions : sparse matrix
        Interactions used for cross-validation.
    cv : int
        Number of random splits per candidate.
    number_of_combinations : int or None
        ``None`` runs the full Cartesian product of the grid; otherwise that
        many random combinations are drawn.
    type : str
        ``'features_similarity'`` or any other tag; also used in the name of
        the results file.

    Returns
    -------
    dict
        ``best_model``, ``best_params`` and ``metrics``.

    Raises
    ------
    KeyError
        If ``metric`` is not among the metrics produced by cross-validation.
    ValueError
        If no candidate produced a comparable score.
    """
    import json  # local import so the cell does not depend on the imports cell being edited

    params_dict = models_params_content[model_name]
    if number_of_combinations is None:
        gen = tqdm_product(*params_dict.values())
    else:
        gen = tqdm(get_discrete_params(params_dict,number_of_combinations),total = number_of_combinations)

    # Converted once: the same CSR matrix is used by every candidate and by the
    # final fit, so the search and the returned model see identical input.
    features_csr = coo_matrix(features).tocsr()

    # Start below any possible score so that a candidate scoring exactly 0 is
    # still recorded instead of leaving ``best_params`` undefined.
    best_metric = float('-inf')
    best_params = None
    best_params_info = None
    for params in gen:
        params = dict(zip(params_dict.keys(),params))
        model = None
        if type == 'features_similarity':
            model = content_models[model_name](**params)
            model.fit(features_csr,show_progress = False)
        curr_info = cross_validation_content(model_name,model,params,interactions, cv,type)
        # cross_validation_content returns a defaultdict: a membership test
        # avoids silently creating an empty list for a misspelled metric name.
        if metric not in curr_info:
            raise KeyError(f'metric {metric!r} is not produced by cross-validation; available: {sorted(curr_info)}')
        if curr_info[metric] > best_metric:
            best_metric = curr_info[metric]
            best_params_info = curr_info
            best_params = params
            with open(f'./best_parameters_content_{model_name}_{type}.json', 'w') as f:
                # Write real JSON; NumPy scalars are converted through .item().
                json.dump({'metric': metric, 'value': best_metric, 'params': best_params}, f,
                          default=lambda o: o.item() if hasattr(o, 'item') else str(o))
    if best_params is None:
        raise ValueError(f'no parameter combination could be evaluated for model {model_name!r}')
    best_model = content_models[model_name](**best_params)
    best_model.fit(features_csr)

    return dict(best_model = best_model, best_params = best_params, metrics = best_params_info)

In [10]:
# Course ids that appear both in the feature table and among the interaction columns.
s1 = set(courses_features_with_embeddings['id'])
s2 = set(interactions_matrix_without_weights.columns)
common_courses = s1.intersection(s2)

In [None]:
# Index the course features by course id, standardise every column with a
# scaler fitted on all courses, then keep only the courses that also occur in
# the interaction matrix (in the iteration order of `common_courses`).
train_courses_features_with_embeddings = courses_features_with_embeddings.set_index('id')
scaler = StandardScaler()
train_courses_features_with_embeddings = pd.DataFrame(
    scaler.fit_transform(train_courses_features_with_embeddings),
    index=train_courses_features_with_embeddings.index,
    columns=train_courses_features_with_embeddings.columns,
).loc[list(common_courses)]

In [13]:
# Same split settings as for the collaborative models, restricted to the
# columns of the courses that have features.
train_interactions_with_weights_for_content, test_interactions_with_weights_for_content = train_test_split(
    coo_matrix(interactions_matrix_with_weights[list(common_courses)]),
    train_percentage=train_percent,
    random_state=122333,
)
train_interactions_without_weights_for_content, test_interactions_without_weights_for_content = train_test_split(
    coo_matrix(interactions_matrix_without_weights[list(common_courses)]),
    train_percentage=train_percent,
    random_state=122333,
)

In [14]:
list(interactions_matrix_without_weights[list(common_courses)].columns) == list(train_courses_features_with_embeddings.index)

True

In [15]:
def get_course_id(list_of_courses):
    """Map positional course indices to ``{course_id: title}``.

    ``list_of_courses`` holds row positions in
    ``train_courses_features_with_embeddings``, whose index is the course id
    (assumes these are the item indices returned by a recommender fitted on
    that table — TODO confirm against the callers). Titles are looked up in
    ``id_title_description`` by its ``id`` column.

    Raises ``KeyError`` if a course id has no row in ``id_title_description``.
    """
    # The frame was re-indexed by 'id', so the ids live in the index, not in a column.
    ids = train_courses_features_with_embeddings.index[np.atleast_1d(list_of_courses)]
    # One title per id; duplicates are dropped so each lookup returns a scalar.
    titles = id_title_description.drop_duplicates('id').set_index('id')['title']
    return {course_id: titles[course_id] for course_id in ids}

Будем выбирать между 4 моделями: Cosine Similarity по матрице признаков курсов, Cosine Similarity по матрице взаимодействий с весами, TF-IDF (матрица взаимодействий) и BM25 (матрица взаимодействий).

In [None]:
cos_model_with_features = tuning_hyperparams_content('cos','p_at_7',train_courses_features_with_embeddings.T,train_interactions_with_weights_for_content,number_of_combinations=1,type = 'features_similarity')

In [None]:
cos_model_with_interactions = tuning_hyperparams_content('cos','p_at_7',train_interactions_with_weights_for_content,train_interactions_with_weights_for_content,number_of_combinations=1)

In [None]:
bm25_model_with_interactions = tuning_hyperparams_content('bm25','p_at_7',train_interactions_without_weights_for_content,train_interactions_with_weights_for_content,number_of_combinations=1)

In [None]:
tf_idf_model_with_interactions = tuning_hyperparams_content('tf-idf','p_at_7',train_interactions_without_weights_for_content,train_interactions_with_weights_for_content,number_of_combinations=1)

## 4. Гибридные модели

In [25]:
# Base models and their constructor parameters for the hybrids. These four
# module-level names are read directly (not passed as arguments) by
# cross_validation_hybrid, so they must be defined before the hybrid search runs.
model_collaborative = als_without_weights['best_model']
collaborative_params = als_without_weights['best_params']
model_content = cos_model_with_features['best_model']
content_params = cos_model_with_features['best_params']

### 4.1 Выбирает одна модель, из получившихся выбирает другая

In [10]:
class LinearHybrid:
    """Hybrid recommender that blends two models linearly.

    The final score of an item is ``a * collaborative + (1 - a) * content``;
    an item proposed by only one of the models gets a score of 0 from the other.
    """

    def __init__(self,model_collaborative,collaborative_params,model_content,content_params,a):
        # The *_params dicts are kept so that fit() can rebuild fresh models.
        self.model_collaborative = model_collaborative
        self.collaborative_params = collaborative_params
        self.model_content = model_content
        self.content_params = content_params
        self.a = a

    def recommend(self,userid, user_items, N=10, filter_already_liked_items=True, filter_items=None, recalculate_user=False, items=None):
        """Return ``(item_ids, scores)`` arrays holding the top-``N`` blended items per row of ``user_items``.

        Both base models are asked for ``min(5*N, number of items)`` candidates
        with the same arguments; the candidates are merged per user, re-scored
        and sorted by the blended score in descending order.
        """
        weight = self.a
        n_users = user_items.shape[0]
        pool_size = min(5*N, user_items.shape[1])
        shared_args = (userid, user_items, pool_size, filter_already_liked_items, filter_items, recalculate_user, items)
        collab_recs, collab_scores = self.model_collaborative.recommend(*shared_args)
        content_recs, content_scores = self.model_content.recommend(*shared_args)

        top_items, top_scores = [], []
        for row in range(n_users):
            collab_by_item = dict(zip(collab_recs[row], collab_scores[row]))
            content_by_item = dict(zip(content_recs[row], content_scores[row]))
            blended = {
                item: collab_by_item.get(item, 0)*weight + (1-weight)*content_by_item.get(item, 0)
                for item in (set(collab_by_item) | set(content_by_item))
            }
            ranked = sorted(blended.items(), key = operator.itemgetter(1), reverse=True)[:N]
            top_items.append([item for item, _ in ranked])
            top_scores.append([score for _, score in ranked])
        return np.array(top_items), np.array(top_scores)

    def fit(self,content_features, user_items, show_progress=True, callback=None):
        """Rebuild and fit the base models; an input given as ``None`` leaves that model untouched."""
        if user_items is not None:
            collaborative_cls = self.model_collaborative.__class__
            self.model_collaborative = collaborative_cls(**self.collaborative_params)
            self.model_collaborative.fit(user_items,show_progress,callback)
        if content_features is not None:
            content_cls = self.model_content.__class__
            self.model_content = content_cls(**self.content_params)
            self.model_content.fit(content_features,show_progress,callback)

In [42]:
class NfirstHybrid:
    """Hybrid recommender that mixes the top items of two models.

    A share ``n`` of the ``N`` requested items comes from the collaborative
    model; the remaining slots are filled with the best content-model items
    that the collaborative model did not already propose. The two lists are
    interleaved (collaborative first).
    """

    def __init__(self,model_collaborative,collaborative_params, model_content, content_params, n):
        # The *_params dicts are kept so that fit() can rebuild fresh models.
        self.n = n
        self.model_collaborative = model_collaborative
        self.model_content = model_content
        self.collaborative_params = collaborative_params
        self.content_params = content_params

    def recommend(self,userid, user_items, N=10, filter_already_liked_items=True, filter_items=None, recalculate_user=False, items=None):
        """Return ``(item_ids, scores)`` arrays with the interleaved recommendations per user.

        The scores are the base models' own scores and are not comparable
        between the two models, so the score rows are not sorted.
        """
        def interleave(first, second):
            # a0, b0, a1, b1, ...; the padding of the shorter list is skipped.
            return [value for pair in zip_longest(first, second) for value in pair if value is not None]

        # At least one collaborative item, but never more than N: without the
        # upper bound the content quota below would become negative and the
        # slice would cut from the end instead of returning nothing.
        n_collab = min(max(int(self.n * N), 1), N)
        n_content = N - n_collab

        collab_items, collab_scores = self.model_collaborative.recommend(
            userid, user_items, n_collab, filter_already_liked_items, filter_items, recalculate_user, items)
        content_items, content_scores = self.model_content.recommend(
            userid, user_items, N, filter_already_liked_items, filter_items, recalculate_user, items)

        result_items, result_scores = [], []
        for row in range(len(content_items)):
            already_picked = set(collab_items[row])
            # Content recommendations not proposed by the collaborative model.
            extra = [
                (item, content_scores[row][pos])
                for pos, item in enumerate(content_items[row])
                if item not in already_picked
            ][:n_content]
            result_items.append(interleave(collab_items[row], [item for item, _ in extra]))
            result_scores.append(interleave(collab_scores[row], [score for _, score in extra]))
        return np.array(result_items), np.array(result_scores)

    def fit(self,content_features, user_items, show_progress=True, callback=None):
        """Rebuild and fit the base models; an input given as ``None`` leaves that model untouched."""
        # Identity checks: `!= None` is evaluated element-wise for arrays and
        # matrices, which is not a valid truth value.
        if user_items is not None:
            self.model_collaborative = self.model_collaborative.__class__(**self.collaborative_params)
            self.model_collaborative.fit(user_items,show_progress,callback)
        if content_features is not None:
            self.model_content = self.model_content.__class__(**self.content_params)
            self.model_content.fit(content_features,show_progress,callback)

In [51]:
# Short hybrid name -> hybrid class.
hybrid_models = dict(linear=LinearHybrid, n_first=NfirstHybrid)

# Short hybrid name -> search grid of its mixing parameter.
hybrid_params = {
    'linear' : {'a' : np.arange(0.1,1,0.1)},
    'n_first' : {'n' : np.arange(0.1,0.9,0.1)},
}

In [52]:
def cross_validation_hybrid(model_name, params, interactions, interactions_for_content, features, cv, number_for_metric):
    """Evaluate one parameter set of a hybrid model over ``cv`` random splits.

    Parameters
    ----------
    model_name : str
        Key into ``hybrid_models``.
    params : dict
        Extra constructor arguments of the hybrid (its mixing parameter).
    interactions : sparse matrix
        Interactions used to fit the collaborative part and to evaluate.
    interactions_for_content : sparse matrix or None
        Interactions the content part is refitted on when ``features`` is false.
    features : bool
        When true, the content model taken from the module-level
        ``model_content`` is used as is and only the collaborative part is
        refitted on each split.
    cv : int
        Number of random 80/20 splits.
    number_for_metric : int
        Cut-off K of the ranking metrics.

    Returns
    -------
    defaultdict
        Metric name -> median value across the splits.

    Raises
    ------
    ValueError
        If ``features`` is false and ``interactions_for_content`` is ``None``:
        there would be nothing to fit the content part on.

    The base models and their parameters are read from the module-level names
    ``model_collaborative``, ``collaborative_params``, ``model_content`` and
    ``content_params``.
    """
    if not features and interactions_for_content is None:
        raise ValueError('interactions_for_content is required when features is False')

    cv_info = defaultdict(list)
    for i in range(cv):
        train_interactions, test_interactions = train_test_split(interactions, train_percentage = 0.8, random_state = i)
        train_interactions,test_interactions = delete_users_without_interactions(train_interactions,test_interactions)

        # The content split is only needed when the content part is refitted.
        train_interactions_content = None
        if not features:
            train_interactions_content, test_interactions_content = train_test_split(interactions_for_content, train_percentage = 0.8, random_state = i)
            train_interactions_content,test_interactions_content = delete_users_without_interactions(train_interactions_content,test_interactions_content)

        model = hybrid_models[model_name](model_collaborative,collaborative_params,model_content,content_params,**params)
        # Passing None as content input makes the hybrid keep its content model.
        model.fit(train_interactions_content,train_interactions,show_progress = False)

        eval_k = get_evaluations(model,train_interactions,test_interactions,number_for_metric)
        for metric in eval_k:
            cv_info[metric].append(eval_k[metric])
    # Collapse the per-split lists into a single median per metric.
    for metric in cv_info:
        cv_info[metric] = np.median(cv_info[metric])
    return cv_info


def find_best_params_hybrid(hybrid_name, number_for_metric, interactions, interactions_for_content = None, features = False, cv = 3, number_of_combinations = None):
    """Search the mixing parameter of a hybrid model by cross-validation.

    Parameters
    ----------
    hybrid_name : str
        Key into ``hybrid_params`` / ``hybrid_models``.
    number_for_metric : int
        Cut-off K; the search maximises ``p_at_K``.
    interactions, interactions_for_content, features, cv
        Forwarded to ``cross_validation_hybrid``.
    number_of_combinations : int or None
        ``None`` runs the full grid; otherwise that many random values are drawn.

    Returns
    -------
    dict
        ``best_params`` and ``metrics`` (cross-validation metrics of the best candidate).

    Raises
    ------
    ValueError
        If no candidate produced a comparable score.
    """
    import json  # local import so the cell does not depend on the imports cell being edited

    metric = f'p_at_{number_for_metric}'
    params_dict = hybrid_params[hybrid_name]
    if number_of_combinations is None:
        gen = tqdm_product(*params_dict.values())
    else:
        gen = tqdm(get_discrete_params(params_dict,number_of_combinations),total = number_of_combinations)

    # Start below any possible score so that a candidate scoring exactly 0 is
    # still recorded instead of leaving ``best_params`` undefined.
    best_metric = float('-inf')
    best_params = None
    best_params_info = None
    for params in gen:
        params = dict(zip(params_dict.keys(),params))
        curr_info = cross_validation_hybrid(hybrid_name,params,interactions,interactions_for_content, features, cv, number_for_metric)
        if curr_info[metric] > best_metric:
            best_metric = curr_info[metric]
            best_params_info = curr_info
            best_params = params
            # This function has no `type` argument: the old file name
            # interpolated the builtin `type` and ended in "<class 'type'>".
            with open(f'./best_parameters_hybrid_{hybrid_name}.json', 'w') as f:
                # Write real JSON; NumPy scalars are converted through .item().
                json.dump({'metric': metric, 'value': best_metric, 'params': best_params}, f,
                          default=lambda o: o.item() if hasattr(o, 'item') else str(o))
    if best_params is None:
        raise ValueError(f'no parameter combination could be evaluated for hybrid {hybrid_name!r}')
    return dict(best_params = best_params, metrics = best_params_info)

In [150]:
find_best_params_hybrid('n_first',10,short_train_interactions_without_weights,features = True, number_of_combinations=1)

  0%|          | 0/1 [00:00<?, ?it/s]



{'p_at_10': 0.49266962154790317, 'map_at_10': 0.22574409467938863, 'ndcg_at_10': 0.3079805776137272, 'auc_at_10': 0.7542418374688671}




{'p_at_10': 0.4843921834024194, 'map_at_10': 0.22403616173293592, 'ndcg_at_10': 0.30570836182592037, 'auc_at_10': 0.7514298300251816}




{'p_at_10': 0.4892344497607656, 'map_at_10': 0.2243626463891742, 'ndcg_at_10': 0.3067249678294615, 'auc_at_10': 0.7530574803005478}


{'best_params': {'n': 0.4},
 'metrics': defaultdict(list,
             {'p_at_10': 0.4892344497607656,
              'map_at_10': 0.2243626463891742,
              'ndcg_at_10': 0.3067249678294615,
              'auc_at_10': 0.7530574803005478})}

In [47]:
find_best_params_hybrid('linear',10,short_train_interactions_without_weights,features = True, number_of_combinations=1)

  0%|          | 0/1 [00:00<?, ?it/s]



{'best_params': {'a': 0.2},
 'metrics': defaultdict(list,
             {'p_at_10': 0.5105946684894054,
              'map_at_10': 0.2849268639625873,
              'ndcg_at_10': 0.3592013514390457,
              'auc_at_10': 0.7635841716968537})}

## Оптуна для уточнения гиперпараметров

In [None]:
def objective_als (trial):
    """Optuna objective: cross-validated ``p_at_7`` of ALS on the weighted short train set.

    Samples ``factors`` (30..200), ``regularization`` (0..3) and ``iterations``
    (50..300) from ``trial``; the remaining ALS settings are fixed.
    """
    params = dict(
        factors = trial.suggest_int('factors',30,200),
        regularization = trial.suggest_float('regularization',0,3),
        iterations = trial.suggest_int('iterations',50,300),
        random_state = 122333,
        use_native = True,
        use_cg = True,
    )
    return cross_validation('als',params,short_train_interactions_with_weights, 3)['p_at_7']

In [None]:
# Maximise objective_als with a seeded TPE sampler and print the best trial.
# NOTE(review): n_jobs = -1 asks Optuna to run trials in parallel; presumably the
# fixed sampler seed then no longer guarantees identical results between runs —
# confirm against the Optuna documentation before relying on reproducibility.
study_als  = optuna.create_study(direction="maximize",sampler = optuna.samplers.TPESampler(seed=122333))
study_als.optimize(objective_als, n_trials = 1, n_jobs = -1)
print(study_als.best_trial)

In [None]:
def objective_bm25 (trial):
    """Optuna objective: cross-validated ``p_at_7`` of BM25 on the unweighted content train set.

    Samples ``K`` (10..200), ``K1`` (0.5..2) and ``B`` (0.3..0.9) from ``trial``.
    """
    params = dict(
        K = trial.suggest_int('K',10,200),
        K1 = trial.suggest_float('K1',0.5,2),
        B = trial.suggest_float('B',0.3,0.9),
    )
    scores = cross_validation_content(
        'bm25', None, params, train_interactions_without_weights_for_content, 3, 'interactions_similarity')
    return scores['p_at_7']

In [None]:
# Maximise objective_bm25 with a seeded TPE sampler and print the best trial.
# NOTE(review): n_jobs = -1 asks Optuna to run trials in parallel; presumably the
# fixed sampler seed then no longer guarantees identical results between runs —
# confirm against the Optuna documentation before relying on reproducibility.
study_bm25  = optuna.create_study(direction="maximize",sampler = optuna.samplers.TPESampler(seed=122333))
study_bm25.optimize(objective_bm25, n_trials = 1, n_jobs = -1)
print(study_bm25.best_trial)

Тестирование выбранной модели

In [11]:
# Boolean row masks: rows of the test / train matrices whose sum is non-zero
# (and, for test_ind_null, rows whose test sum is zero).
test_ind_not_null = np.array((test_interactions_with_weights.sum(axis = 1) != 0).T)[0]
# NOTE(review): test_ind_null is not referenced again in the cells visible here.
test_ind_null = np.array((test_interactions_with_weights.sum(axis = 1) == 0).T)[0]
train_ind_not_null = np.array((train_interactions_with_weights.sum(axis = 1) != 0).T)[0]
# Row indices that are non-empty in both train and test. The list comes from a
# set, so its order is whatever the set yields; the same list is reused later to
# select the matching rows of the user table, which keeps the rows aligned.
not_null = list(set(np.nonzero(train_ind_not_null)[0]) & set(np.nonzero(test_ind_not_null)[0]))
test = test_interactions_with_weights[not_null,:]
# not_null is a subset of the non-empty train rows, so this symmetric difference
# is the set of rows that have train interactions but an empty test row.
null_only_test = list(set(np.nonzero(train_ind_not_null)[0]) ^ set(not_null))
# Final train matrix: the evaluated rows first (same order as `test`), followed
# by the train-only rows.
train = coo_matrix(np.concatenate((train_interactions_with_weights[not_null,:].toarray(), train_interactions_with_weights[null_only_test,:].toarray())))

In [12]:
# Final hybrid: LinearHybrid.fit rebuilds both base models from their class and
# the parameter dicts below, so the default-constructed instances only supply
# the model classes.
model_als = ALS()
collab_params = dict(factors=170,regularization=1.765,alpha = 14.592, iterations = 137, use_cg=True, random_state = 122333)
model_bm = BM25Recommender()
# NOTE(review): this rebinds the module-level `content_params` that
# cross_validation_hybrid reads, and `model` below rebinds the name used by the
# earlier ALS demo cells — re-running those cells afterwards will see these values.
content_params = dict(K = 180, K1 = 0.513, B = 0.39)
# 0.6 is the weight `a` of the collaborative score in the blend.
model = LinearHybrid(model_als,collab_params,model_bm,content_params,0.6)
# The same matrix is used as content input and as collaborative input.
model.fit(train,train)



  0%|          | 0/137 [00:00<?, ?it/s]



  0%|          | 0/570 [00:00<?, ?it/s]

In [15]:
model.model_collaborative.save('collab_model')
model.model_content.save('content_model')

In [16]:
get_evaluations(model,train,test,K = 7)

{'p_at_7': 0.473700042973786,
 'map_at_7': 0.30848094724496256,
 'ndcg_at_7': 0.370450150652833,
 'auc_at_7': 0.7510539846331753}

In [45]:
get_evaluations(model,train,test,K = 10)

{'p_at_10': 0.5344481033229269,
 'map_at_10': 0.31690517520714134,
 'ndcg_at_10': 0.39075125109139675,
 'auc_at_10': 0.780216086451921}

In [58]:
# Per-user AUC of the final model: every user is evaluated separately on a
# one-row slice, with K equal to the total number of items.
test = csr_matrix(test)
train = csr_matrix(train)
K = test.shape[1]
auc = [
    get_evaluations(model, train[i,:], test[i,:], K = K)[f'auc_at_{K}']
    for i in range(test.shape[0])
]

In [59]:
print('Среднее значение auc = ',np.mean(auc))
fig = px.histogram(auc)
fig.update_layout(showlegend = False,title = 'Распределение auc_score итоговой модели у разных пользователей')
fig.show()

Среднее значение auc =  0.9361711728290821


In [61]:
users_type = pd.read_csv('./user_to_split_type.csv',index_col=0)

In [68]:
(users_type.index == interactions_matrix_with_weights.index).all()

True

In [93]:
# One row per evaluated user, in the same order as the rows of `test`
# (both are selected with `not_null`). `.copy()` makes the selection an
# independent frame before new columns are added to it.
users = users_type.reset_index().loc[not_null,:].copy()
users['auc'] = auc
# include_lowest=True keeps users whose AUC is exactly 0: with the default
# left-open first interval they would fall outside every bin and disappear
# from the grouped counts.
users['auc_score_bins'] = pd.cut(users.auc,[0,0.3,0.5,0.7,0.8,0.9,0.93,0.95,0.97,0.98,0.99,1],include_lowest=True)
# Positional rename: assumes users_type contributes exactly two columns after
# reset_index (the former index and the user class) — TODO confirm.
users.columns = ['id','class','auc','auc_score_bins']

In [127]:
def build_histogram_with_classes():
    """Build a bar chart of the user-class shares inside each AUC bin.

    Reads the module-level ``users`` frame (columns ``id``, ``class``,
    ``auc_score_bins``). For every bin the number of users per class is divided
    by the total number of users in that bin and shown as a percentage rounded
    to two decimals. Returns the Plotly figure without displaying it.
    """
    # Users per bin.
    totals = (
        users.groupby(['auc_score_bins'],as_index = False)['id'].agg('count')
        .set_axis(['auc_score_bins','total_count'], axis = 1)
        .assign(auc_score_bins = lambda frame: frame['auc_score_bins'].astype(str))
    )
    # Users per (class, bin); both keys become strings for merging and plotting.
    per_class = (
        users.groupby(['class','auc_score_bins'],as_index=False)['id'].agg('count')
        .rename(columns = {'id':'count'})
        .assign(**{
            'auc_score_bins': lambda frame: frame['auc_score_bins'].astype(str),
            'class': lambda frame: frame['class'].astype(str),
        })
    )
    shares = per_class.merge(totals, on = 'auc_score_bins')
    shares['percent'] = np.round(shares['count'] * 100/ shares['total_count'],2)

    palette = ['#0d0887','#46039f','#7201a8','#9c179e','#bd3786','#d8576b','#ed7953','#fb9f3a','#fdca26','#f0c821','#faf921']
    fig = px.bar(
        shares,
        x = 'auc_score_bins',
        y = 'percent',
        template = 'plotly_dark',
        color = 'class',
        title = 'Auc_score с разбивкой на классы пользователей',
        color_discrete_sequence = palette,
        text_auto = True,
    )
    fig.update_layout(width = 1400, height = 700)
    return fig

In [128]:
build_histogram_with_classes().show()

In [162]:
# Metrics of the final model for every cut-off k from 1 to 29.
info = {name: [] for name in ('auc', 'precision', 'map', 'ndcg')}
for k in tqdm(range(1,30)):
    res = get_evaluations(model,train,test,K = k)
    # Series name -> prefix of the corresponding key in the evaluation result.
    for name, prefix in (('auc', 'auc'), ('precision', 'p'), ('ndcg', 'ndcg'), ('map', 'map')):
        info[name].append(res[f'{prefix}_at_{k}'])

  0%|          | 0/29 [00:00<?, ?it/s]

In [163]:
# One line per metric, with the cut-off k on the x axis.
fig = go.Figure()
for key, series in info.items():
    fig.add_trace(go.Scatter(x = list(range(1,30)), y = series, mode='lines+markers',name = key))
fig.update_layout(title = 'Метрики при разных k',xaxis_title = 'K')
fig.show()