In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
from models.ContentBasedRecommender import ContentBasedRecommender
from models.CollaborativeFilteringSVD import CollaborativeFilteringSVD
from models.CollaborativeFilteringALS import CollaborativeFilteringALS
from models.ContentBasedDoc2Vec import ContentBasedDoc2Vec
from models.Mixin import Mixin
from models.Random import Random
from evaluation.evaluator import ModelEvaluator

[nltk_data] Downloading package punkt to /home/artem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
interactions_train = pd.read_csv('processed_data/interactions_train.csv', index_col='personId')
interactions_test = pd.read_csv('processed_data/interactions_test.csv', index_col='personId')
interactions_full = pd.read_csv('processed_data/interactions_full.csv', index_col='personId')
articles_df = pd.read_csv('processed_data/articles_df.csv', index_col='contentId')

In [5]:
evaluator = ModelEvaluator()
evaluator.fit(interactions_train[['contentId']], interactions_test[['contentId']])

# ContentBased

In [115]:
model = ContentBasedRecommender(articles_df[['title', 'url', 'lang']])
model.fit(articles_df[['content']], interactions_train)

In [69]:
global_metrics, detailed_results_df = evaluator.evaluate_model(model, verbose=True)

Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed


In [70]:
global_metrics

{'modelName': 'Content-Based',
 'recall@5': 0.16642441860465115,
 'recall@10': 0.26468023255813955,
 'precision@3': 0.03223949337938975,
 'mean_average_precision': 0.0396447062504095}

# CollaborativeFilteringSVD

In [73]:
model = CollaborativeFilteringSVD(articles_df[['title', 'url', 'lang']])

In [77]:
model.fit(interactions_train)

In [78]:
global_metrics, detailed_results_df = evaluator.evaluate_model(model, verbose=True)

Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed


In [79]:
global_metrics

{'modelName': 'Collaborative Filtering',
 'recall@5': 0.31962209302325584,
 'recall@10': 0.46075581395348836,
 'precision@3': 0.09383995394358102,
 'mean_average_precision': 0.06304196957559945}

# CollaborativeFilteringALS

In [106]:
model = CollaborativeFilteringALS(articles_supplementary_information=articles_df[['title', 'url', 'lang']])

In [107]:
model.fit(interactions_train, iterations=10, verbose=True)

iteration 1 of 10
iteration 2 of 10
iteration 3 of 10
iteration 4 of 10
iteration 5 of 10
iteration 6 of 10
iteration 7 of 10
iteration 8 of 10
iteration 9 of 10
iteration 10 of 10


In [113]:
global_metrics, detailed_results_df = evaluator.evaluate_model(model, verbose=True)

Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed


In [114]:
global_metrics

{'modelName': 'Collaborative Filtering',
 'recall@5': 0.3021802325581395,
 'recall@10': 0.45247093023255813,
 'precision@3': 0.042602187679907866,
 'mean_average_precision': 0.04690866891686961}

# Random

In [34]:
model = Random()
model.fit(interactions_train)

In [35]:
global_metrics, detailed_results_df = evaluator.evaluate_model(model, verbose=True)

Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed


In [36]:
global_metrics

{'modelName': 'Random',
 'recall@5': 0.1001453488372093,
 'recall@10': 0.18517441860465117,
 'precision@3': 0.00863557858376511,
 'mean_average_precision': 0.00893206658901561}

Просто хотелось посмотреть какие скоры получит модель, которая делает рандомные рекомендации 

# Mixin

In [43]:
interactions_train = pd.read_csv('processed_data/interactions_train_with_validation.csv', index_col='personId')
interactions_validation = pd.read_csv('processed_data/interactions_validation.csv', index_col='personId')
interactions_test = pd.read_csv('processed_data/interactions_test_with_validation.csv', index_col='personId')

In [44]:
model1 = ContentBasedRecommender(articles_df[['title', 'url', 'lang']])
model1.fit(articles_df[['content']], interactions_train)

In [45]:
model2 = CollaborativeFilteringSVD(articles_df[['title', 'url', 'lang']])
model2.fit(interactions_train)

In [46]:
model3 = CollaborativeFilteringALS(articles_supplementary_information=articles_df[['title', 'url', 'lang']])
model3.fit(interactions_train, iterations=10, verbose=True)

iteration 1 of 10
iteration 2 of 10
iteration 3 of 10
iteration 4 of 10
iteration 5 of 10
iteration 6 of 10
iteration 7 of 10
iteration 8 of 10
iteration 9 of 10
iteration 10 of 10


In [10]:
evaluator = ModelEvaluator()
evaluator.fit(interactions_train[['contentId']], interactions_validation[['contentId']])

In [62]:
def target_function(x):
    model = Mixin(param1=x[0], param2=x[1], param3=1 - x[0] - x[1])
    model.fit(model1=model1, model2=model2, model3=model3)
    global_metrics, detailed_results_df = evaluator.evaluate_model(model, verbose=True)
    return global_metrics['mean_average_precision']  

In [59]:
seed = 42

In [73]:
points = []
evaluations = []
best_ind = None
num_iters=10

In [74]:
for i in range(num_iters):
    x = [0, 0]
    random.seed(i + seed)
    # Here I don't sample uniform distribution all that well 
    x[0] = random.uniform(0, 1)
    x[1] = random.uniform(0, 1 - x[0])
    y = target_function(x)
    
    points.append(x)
    evaluations.append(y)
    
    if best_ind is None or y > evaluations[best_ind]:
        best_ind = len(points) - 1

Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed
Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed
Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed
Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed
Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed
Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed
Running evaluati

In [89]:
best_params = points[best_ind].copy()
best_params += [1 - best_params[0] - best_params[1]]
print("best_params:  ", best_params)
print("best validation score mean average precision: ", evaluations[best_ind])

best_params:   [0.038551839337380045, 0.6693835944079729, 0.29206456625464705]
best validation score mean average precision:  0.06865458599005099


##### Test score

In [90]:
evaluator = ModelEvaluator()
evaluator.fit(interactions_train[['contentId']], interactions_test[['contentId']])

In [91]:
model = Mixin(param1=best_params[0], param2=best_params[1], param3=best_params[2])
model.fit(model1=model1, model2=model2, model3=model3)

In [92]:
global_metrics, detailed_results_df = evaluator.evaluate_model(model, verbose=True)

Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed


In [93]:
global_metrics

{'modelName': 'Mixin',
 'recall@5': 0.2643895348837209,
 'recall@10': 0.3640988372093023,
 'precision@3': 0.06332757628094418,
 'mean_average_precision': 0.06449281913521829}

# ContentBasedDoc2Vec

In [10]:
model = ContentBasedDoc2Vec(articles_supplementary_information=articles_df[['title', 'url', 'lang']], 
                            size_of_embedings=200)

In [11]:
model.fit(articles_df[['content']], interactions_train, verbose=True)

Word2Vec is being trained...
Word2Vec's training has been finished.
TF_IDF matrix is being built...
TF_IDF matrix's building has been finished.
Doc2Vec maxtrix is being built...
Doc2Vec matrix building has been finished.


In [12]:
global_metrics, detailed_results_df = evaluator.evaluate_model(model, verbose=True)

Running evaluation for users
100 of 579 users processed
200 of 579 users processed
300 of 579 users processed
400 of 579 users processed
500 of 579 users processed


In [13]:
global_metrics

{'modelName': 'Content-BasedDoc2Vec',
 'recall@5': 0.12354651162790697,
 'recall@10': 0.22034883720930232,
 'precision@3': 0.020725388601036277,
 'mean_average_precision': 0.02254794394159169}