The task is to choose exactly 53,979 users (user_id, 5% of all users in the dataset) and for each select ﬁve third-level product categories (id3) that they have not viewed in the last three weeks and which will be of interest to them in the next seven days. The resulting score is based on the number of users for which at least one product category is correctly nominated. Accurate predictions of two or more categories for one user will not improve your score.

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import matplotlib.pyplot as plt
%matplotlib inline

# Custom metric is implemented here
from scorer import scorer
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression

# Load data

# Use custom dtypes for efficiency
dtypes = {'id1': np.int16, 'id2': np.int16, 'id3': np.int16, 'user_id': np.int32, 'date': np.int16}

train = pd.read_csv('train.csv.zip', dtype=dtypes)
train.tail(5)

Unnamed: 0,id3,user_id,id2,date,id1
46996823,69,44533,71,54,15
46996824,145,575754,37,54,6
46996825,333,608981,117,54,10
46996826,283,69644,85,54,7
46996827,142,348032,103,54,8


In [2]:
import time
time.sleep(30)

In [2]:

for column in train.columns:
    if train[column].nunique() < 60:
        print(column, sorted(train[column].unique()))
    else:
        print(column, train[column].nunique())

id3 924
user_id 1079572
id2 122
date [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
id1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [3]:
date_validation_start = train.date.max() - 6

def calculate_target(data, date_test_start):
    '''
        This function returns a dictionary of type {user: items_list}
        Such that user viewed an item in testing period, 
        but did not view it within the last 3 weeks of train period.
    '''
    
    test_mask = (data.date >= date_test_start) & (data.date < date_test_start + 7)
    last_3weeks_mask = (data.date >= date_test_start - 21 + 1) & (data.date < date_test_start)
    
    # Items that used viewed during test period
    items_test = data[test_mask].groupby('user_id').id3.apply(set)
    
    # Items, that user viewd in last 3 weeks
    user_last_3weeks = data[last_3weeks_mask].groupby('user_id').id3.apply(set)
    
    # Get table, where for each `user_id` we have both items from test period and 3 weeks
    joined = items_test.reset_index().merge(user_last_3weeks.reset_index(), on=['user_id'], how='left')
    joined.set_index('user_id', inplace=True)
    
    # Remove the items, which the user viewed during last 3 weeks 
    target = {}
    for user_id, (id3_x, id3_y) in joined.iterrows():   
        items = id3_x if id3_y is np.nan else id3_x - id3_y
        if items != set(): target.update({user_id: items})

    return target

# This function may take several minutes to finish
# y_val_dict = calculate_target(train, date_validation_start)
# np.save('y_val_dict', y_val_dict)
y_val_dict = np.load('y_val_dict.npy').item()


## Ideas for features:

* id2 id3 for last 3 weeks
* all past history
* https://official.contest.yandex.com/idao/contest/6919/

## Todo:

* ~~is there correlation between id2 and no picks in id3~~
* ~~predict id1, then id2, only then id3~~
* ~~https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst~~
* https://github.com/apple/turicreate/blob/master/userguide/recommender/README.md
* neural nets
* https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101


In [4]:
num = 53979
# # Let's take not random users, but the ones who viewed a lot 
mask_train = (train.date < date_validation_start - 7) & (train.date > date_validation_start - 7*4)
mask_test = (train.date < date_validation_start) & (train.date >= date_validation_start - 7*3)

# users = train.loc[mask_test].user_id.value_counts().index[:num]
# mask_users = train.user_id.isin(users)

# mask_train = mask_train & mask_users
# mask_test = mask_test & mask_users

In [5]:

def get_feats(data):
    '''
        Builds sparse matrix using users' history.
    '''
    return scipy.sparse.coo_matrix(([1] * data.shape[0], (data.user_id, data.id3)), 
                                    shape =[data.user_id.max()+1, data.id3.max()+1]).tocsr()

def get_target_matrix(X, target_dict):
    '''
        Builds sparse matrix using dictionary.
    '''
    indptr = [0]
    indices = []
    data = []
    vocabulary = {}

    ks = []
    for k in tqdm(range(X.user_id.max()+1)):
        d = target_dict.get(k, [])
        for y in d:
            indices.append(y)
            data.append(1)
        indptr.append(len(indices))
    return scipy.sparse.csr_matrix((data, indices, indptr), dtype=int, shape =[X.user_id.max()+1, X.id3.max()+1])

X_train = get_feats(train.loc[mask_train])
X_test = get_feats(train.loc[mask_test])

y_train_dict = calculate_target(train, date_validation_start - 7)
y_train = get_target_matrix(train.loc[mask_train], y_train_dict)
y_test = get_target_matrix(train.loc[mask_test], y_val_dict)

100%|██████████| 1179572/1179572 [00:00<00:00, 1421805.48it/s]
100%|██████████| 1179572/1179572 [00:00<00:00, 1498985.79it/s]


In [6]:
# # del Y_test, Y_train
# Y_train = np.ndarray((1179572))
# for key in tqdm(y_train_dict):
#     Y_train[key] = len(y_train_dict[key])

# Y_test = np.ndarray((1179572))
# for key in tqdm(y_val_dict):
#     Y_test[key] = len(y_val_dict[key])

# Y_train = Y_train.astype('int32')
# Y_test = Y_test.astype('int32')


# import lightgbm as lgb

# verbose = 1
# plot = 1
# params = {
# #     "max_bin": 1024,
# #     "learning_rate": 0.1,
#     "boosting_type": "goss",
#     "objective": "regression",
# #     'num_iterations':1000,
# #     "metric": "auc",
#     "num_leaves": 10000,
#     "verbose": 1,
# #     "min_data": 100,
# #     "boost_from_average": True
# }

# d_train = lgb.Dataset(X_train.toarray(), Y_train)
# # d_valid = lgb.Dataset(X_test, label=y_test)
# model = lgb.train(params, d_train)


# model.save_model('regressin')

# y_pred = model.predict(X_test.toarray())
# from sklearn.metrics import mean_squared_error
# # print(roc_auc_score(y_test.values, y_pred))
# print(mean_squared_error(y_pred ,Y_test)**0.5)

# df_fi = pd.DataFrame(model.feature_name(), columns=['feature'])
# df_fi['importance'] = list(model.feature_importance('gain'))
# df_fi.sort_values('importance', ascending=False, inplace=True)
# # print(df_fi)
# if plot:
#     plt.figure()
#     df_fi.head(10).plot(kind='barh', x='feature', y='importance')
#     plt.title('LightGBM Feature Importance')
#     plt.xlabel('relative importance')
#     plt.show()

# idx = (y_pred).argsort()[:53979]
# idx # ids to test main task

In [10]:
# %who
# del y_train_np

y_train_np = y_train.toarray().ravel()
X_train_np = X_train.toarray()


In [11]:
import lightgbm as lgb

verbose = 1
plot = 1
params = {
#     "max_bin": 1024,
#     "learning_rate": 0.1,
    "boosting_type": "goss",
    "objective": "lambdarank",
#     'num_iterations':1000,
#     "metric": "auc",
    "num_leaves": 10000,
    "verbose": 1
#     'num_class': 931
#     "min_data": 100,
#     "boost_from_average": True
}

d_train = lgb.Dataset(X_train_np, label=y_train_np)
model = lgb.train(params, d_train)

# model = lgb.LGBMClassifier(objective='multiclass', num_class=931, verbose=1)
# model.fit(X_train, X_test)

LightGBMError: b'len of label is not same with #data'

In [None]:
import gc
gc.collect()