### Train model, test and predict using Vopal Vabbit library

Find d3_categories, viewed by users in prediction week, but not previous three ones

In [1]:
from classeslib import train_calendar
from classeslib import persistence_files
from classeslib.predictions_users import PredictionUser, SilentWeeksReader, FutureWeekReader

In [2]:
silent_weeks_files = [persistence_files.days_split_catalog + "train." + str(day) + ".csv" 
                      for day in train_calendar.target_silent_weeks]
silent_weeks_reader = SilentWeeksReader()

In [None]:
for day_file in silent_weeks_files:
    silent_weeks_reader.read_input_file(day_file)

In [6]:
silent_weeks_users = silent_weeks_reader.users

794168


In [7]:
target_week_files = [persistence_files.days_split_catalog + "train." + str(day) + ".csv" 
                     for day in train_calendar.target_week]
target_week_reader = FutureWeekReader(silent_weeks_users)

In [8]:
for day_file in target_week_files:
    target_week_reader.read_input_file(day_file)

In [9]:
prediction_users = {user_id: user for (user_id, user) in target_week_reader.users.items() 
                    if len(user.silent_weeks_d3_categories) > 0 and len(user.target_categories) > 0}

In [10]:
from classeslib.statistics import StatisticsDumper
from classeslib import persistence_files
prediction_users_interests = StatisticsDumper.get_users_interests(
    prediction_users.keys(),
    persistence_files.public_train_statistics_db_file)

Restore clusters info

In [29]:
from classeslib import persistence_files
from classeslib.clusterization import UsersClaster

In [30]:
import pickle
with open(persistence_files.cluster_model_file, "r") as cluster_file:
    clusterization = pickle.load(cluster_file)

In [None]:
print sorted(set(clusterization.labels_))

In [32]:
import shelve
cluster_categories = {}
clusters_categories_db = shelve.open(persistence_files.clusters_categories_db_file)
for key in clusters_categories_db.keys():
    cluster_categories[key] = clusters_categories_db[key]
clusters_categories_db.close()

In [None]:
print persistence_files.clusters_categories_db_file
print sorted(cluster_categories.keys())

In [34]:
(d1_level_statistics,
 d2_level_statistics,
 d3_level_statistics) = StatisticsDumper.restore_statistics(persistence_files.public_train_statistics_db_file)

d1_categories_sorted = sorted(d1_level_statistics.categories_statistics.keys(),
                       key = lambda c : d1_level_statistics.categories_statistics[c],
                       reverse=True)
d2_categories_sorted = sorted(d2_level_statistics.categories_statistics.keys(),
                       key = lambda c : d2_level_statistics.categories_statistics[c],
                       reverse=True)
d3_categories_sorted = sorted(d3_level_statistics.categories_statistics.keys(),
                       key = lambda c : d3_level_statistics.categories_statistics[c],
                       reverse=True)
print len(d3_categories_sorted)

924


Form train dataset for Vopal Wabbit

In [36]:
from classeslib.features import build_categories_features_matrix

In [18]:
from numpy import random

def find_negative_sample(user_cluster, d3_categories, positive_categories):
    d3_categories_count = len(d3_categories)
    cluster_d3_categories = cluster.d3_categories_popularity.keys()
    start_category_index = random.randint(0, d3_categories_count)
    index = start_category_index
    while True:
        d3_category = d3_categories[index]
        if d3_category not in cluster_d3_categories and d3_category not in positive_categories:
            return d3_category
        index = (index + 1) % d3_categories_count
        if index == start_category_index:
            break
    
    without_positive_categories = list(set(cluster_d3_categories) - set(positive_categories))
    if len(without_positive_categories) < 4 :
        return without_positive_categories[len(without_positive_categories)-1]
    
    last_third_index = 3 * len(without_positive_categories) / 4
    least_popular_category_index = random.randint(last_third_index, len(without_positive_categories) - 1)
    return without_positive_categories[least_popular_category_index]

In [19]:
positive_class = 1
negative_class = -1

In [20]:
vw_train_file = "./vw_train_users_dataset"

In [27]:
def find_user_cluster(user_id, user_interests, cluster_model, clusters_categories):
    user_features = build_categories_features_matrix(
        d1_categories_sorted, d2_categories_sorted, d3_categories_sorted, {user_id: user_interests})[0]
    return cluster_model.predict(user_features)[0]

In [25]:
from classeslib.vw_datasets import VwDatasetWriter
from tqdm import tqdm
#from imp import reload
#import classeslib.vw_datasets
#VwDatasetWriter = reload(classeslib.vw_datasets)
#from classeslib.vw_datasets import VwDatasetWriter

In [23]:
with open(vw_train_file, 'w') as train_dataset:
    
    dataset_writer = VwDatasetWriter(train_dataset)
    
    for (user_id, user) in tqdm(prediction_users.items()):  
        #print user_id
        silent_week_categories = ' '.join(user.silent_weeks_d3_categories)
        if silent_week_categories == "":
            print user_id, "no silent weeks categories"
            continue
            
        cluster_label = find_user_cluster(
            user_id, prediction_users_interests[user_id], clusterization, cluster_categories)
        
        #if not cluster_categories.has_key(cluster_label):
        #    continue
            
        cluster = cluster_categories[str(cluster_label)]
        
         # write "positive" samples (categories form silent week) -> viewed category
        for target_category in user.target_categories:   
            dataset_writer.write(positive_class, user_id, target_category, silent_week_categories)
            
         # write "negative" samples (categories form silent week) -> not viewed category    
        for s in range(0, 5):
            negative_sample = find_negative_sample(cluster, d3_categories_sorted, user.target_categories)
            dataset_writer.write(negative_class, user_id, negative_sample, silent_week_categories)

100%|██████████| 339657/339657 [1:45:27<00:00, 53.68it/s]


Train Vopal Wabbit model

In [None]:
#OSError: [Errno 12] Cannot allocate memory - run from separate console
#rm vw_train_users_dataset.cache
#vw -d vw_train_users_dataset -c --passes 2 -f vw.track1_model -q DC --quiet --binary

Read test week users and their categories in test week and three weeks before

In [4]:
test_silent_weeks_files = [persistence_files.days_split_catalog + "train." + str(day) + ".csv" 
                      for day in train_calendar.test_silent_weeks]
test_silent_weeks_reader = SilentWeeksReader()

for day_file in test_silent_weeks_files:
    test_silent_weeks_reader.read_input_file(day_file)

In [5]:
test_silent_weeks_users = silent_weeks_reader.users

In [6]:
test_week_files = [persistence_files.days_split_catalog + "train." + str(day) + ".csv" 
                     for day in train_calendar.test_week]
test_week_reader = FutureWeekReader(test_silent_weeks_users)

In [14]:
for day_file in test_week_files:
    test_week_reader.read_input_file(day_file)

In [15]:
test_week_users = {user_id: user for (user_id, user) in test_week_reader.users.items() 
                    if len(user.silent_weeks_d3_categories) > 0 and len(user.target_categories) > 0}

In [16]:
from classeslib.statistics import StatisticsDumper

In [17]:
test_users_interests = StatisticsDumper.get_users_interests(
    test_week_users.keys(),
    persistence_files.public_train_statistics_db_file)

In [19]:
print len(test_week_reader.users)
print len(test_users_interests)

1010997
304953


Form test dataset for Vopal Wabbit

In [50]:
vw_test_file = "./vw_test_dataset"

In [51]:
from classeslib.candidates import get_candidate_categories
#from imp import reload
#import classeslib
#reload(classeslib.candidates)
#from classeslib.candidates import get_candidate_categories

In [52]:
with open(vw_test_file, 'w') as test_dataset:
    
    dataset_writer = VwDatasetWriter(test_dataset)
    
    for user_id in test_users_interests.keys():        
        
        user = test_week_users[user_id]
        silent_week_categories = ' '.join(user.silent_weeks_d3_categories)
        if silent_week_categories == "":
            continue
            
        cluster_label = find_user_cluster(
            user_id, test_users_interests[user_id], clusterization, cluster_categories)  
        
        cluster = cluster_categories[str(cluster_label)]
            
        candidates = get_candidate_categories(cluster, user_id, user, test_users_interests, d3_categories_sorted)    
        
        # write candidates rows to test dataset
        for candidate in candidates:
            dataset_writer.write(1, user_id, candidate, silent_week_categories)

Run VW on test dataset in test mode

In [None]:
#OSError: [Errno 12] Cannot allocate memory - run in separate terminal
#!vw -i vw.track1_model -t vw_test_dataset -p vw_test_dataset.out --quiet

Calculate test predictions score

In [58]:
from classeslib.candidates import get_users_predicted_categories 
import numpy as np

In [None]:
y_true = dict((user_id, list(test_week_users[user_id].target_categories)) for user_id in test_week_users.keys())
num_users = len(y_true)

y_pred = get_users_predicted_categories(vw_test_file, "./vw_test_dataset.out", d3_categories_sorted)
num_users_5p = int(np.ceil(0.05 * num_users))

In [66]:
print (num_users)
print (num_users_5p)
print len(y_pred)

329451
16473
304953


In [67]:
most_active_test_users = sorted(
    y_pred.items(),
    key = lambda (user_id, u): test_week_users[user_id].views,
    reverse = True)[:num_users_5p]

y_pred = dict(most_active_test_users)

In [70]:
#from scorer import scorer
#score = scorer(y_true, y_pred, num_users)
#print "Score: %f" % score

Score: 2477.994294
