In [65]:
import numpy as np
import pandas as pd
import sys
import itertools 
import pickle
import matplotlib.pylab as plt
import time
from scipy.io import loadmat, mmread

In [67]:
import rankingmethod
import rankingmeasure
import rank

In [69]:
from  experiment import *

In [66]:
%matplotlib inline

In [68]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# loading data sets

In [16]:
# Load the tab-separated Epinions file and keep it as a plain NumPy array.
# `.values` replaces `DataFrame.as_matrix()`, which newer pandas releases no longer provide.
# NOTE(review): header=4 turns row 4 (0-based) into the column header, so that row is not
# part of the returned data — confirm this matches the length of the file's preamble.
data = pd.read_csv("../datasets/soc-Epinions1.txt", sep='\t', header=4)
data = data.values

In [17]:
# Convert the raw Epinions array into the experiment format; the second return value is
# what later cells pass to create_list_cf.
# make_valid_data is not defined in this notebook — presumably it comes from
# `from experiment import *`; verify.
epinion, epinion_shape = make_valid_data(data)

In [18]:
# Load the tab-separated Slashdot file and keep it as a plain NumPy array.
# `.values` replaces `DataFrame.as_matrix()`, which newer pandas releases no longer provide.
# NOTE(review): header=4 turns row 4 (0-based) into the column header, so that row is not
# part of the returned data — confirm this matches the length of the file's preamble.
data = pd.read_csv("../datasets/Slashdot0811.txt", sep='\t', header=4)
data = data.values

In [19]:
# Convert the raw Slashdot array (the `data` loaded in the previous cell) into the experiment
# format; `slashdot_shape` is what later cells pass to create_list_cf.
slashdot, slashdot_shape = make_valid_data(data)

In [70]:
# Load MovieLens-100k (tab-separated, no header row) and keep only the first two columns.
# `.iloc[:, :2]` selects the same columns as the former `.ix[:, :1]` (label slice 0..1 inclusive
# on the default integer column labels); `.values` replaces `.as_matrix()`. Both `.ix` and
# `.as_matrix()` are gone from current pandas.
data = pd.read_csv("../datasets/ml-100k/u.data", sep='\t', header=None).iloc[:, :2]
data = data.values

In [71]:
# Convert the two-column MovieLens-100k array into the experiment format; `ml100k_shape`
# is what later cells pass to create_list_cf.
ml100k, ml100k_shape = make_valid_data(data)

In [None]:
# Load MovieLens-1M ('::'-separated, no header row) and keep only the first two columns.
# The multi-character separator is why engine='python' is requested.
# `.iloc[:, :2]` selects the same columns as the former `.ix[:, :1]`; `.values` replaces
# `.as_matrix()`. Both `.ix` and `.as_matrix()` are gone from current pandas.
# NOTE(review): this cell has no execution count in the saved notebook although `ml1m` is
# used further down — re-run from a fresh kernel to confirm the pipeline is reproducible.
data = pd.read_csv("../datasets/ml-1m/ratings.dat", sep='::', header=None, engine='python').iloc[:, :2]
data = data.values

In [23]:
# Convert the two-column MovieLens-1M array into the experiment format; `ml1m_shape`
# is what later cells pass to create_list_cf.
ml1m, ml1m_shape = make_valid_data(data)

### Зададим параметры, которые были выбраны при помощи валидации

In [72]:
def create_list_cf(user_item):
    """Instantiate the fixed line-up of rankers compared in this notebook.

    ``user_item`` is forwarded unchanged as the first positional argument of every
    constructor (the notebook passes the ``*_shape`` value returned by
    ``make_valid_data``).

    Returns a list in this order: PopRec, CLiMF (maxiter=15), CLiMF (maxiter=150),
    BPR_MF, iMF, TFMAP.  The order is significant because later cells pick base
    models by position (``create_list_cf(...)[1:5]``).
    """
    # Settings shared by the two CLiMF variants; only the iteration budget differs.
    climf_common = dict(K=10, reg=0.005, lrate=0.001, verbose=0)

    popularity = rankingmethod.PopRec(user_item)
    climf_short = rank.CLiMF(user_item, maxiter=15, **climf_common)
    climf_long = rank.CLiMF(user_item, maxiter=150, **climf_common)
    bpr = rank.BPR_MF(
        user_item,
        K=10,
        lrate=0.05,
        regIpos=0.0025,
        regIneg=0.0025,
        regU=0.0025,
        maxiter=300,
        verbose=0,
    )
    implicit_mf = rankingmethod.iMF(user_item, K=10, lmbd=0.015, alpha=1, maxiter=15, verbose=0)
    tfmap = rank.TFMAP(user_item, lrate=0.01, maxiter=20)

    return [popularity, climf_short, climf_long, bpr, implicit_mf, tfmap]

## Сравнение работы алгоритмов

In [73]:
def climf_experiment(data, list_cf, K=5, maxiter=1, topK=5, verbose=0, validation_type=1):
    """Fit every ranker in ``list_cf`` on fresh splits of ``data`` and average six metrics.

    Parameters
    ----------
    data
        Data set handed unchanged to the splitting helper.
    list_cf
        Sequence of rankers; each is fitted with ``cf.fit(train)`` and then scored
        through ``rankingmeasure.get_Kmetrics`` and ``rankingmeasure.get_listmetrics``.
    K
        Second argument of the splitting helper (``givenK_train_test`` or
        ``ratio_train_test``).
    maxiter
        Number of splits to average over.
    topK
        Cut-off passed to ``rankingmeasure.get_Kmetrics``; also used in the row labels.
    verbose
        When equal to 1, the running average of each ranker is printed after scoring.
    validation_type
        0 selects ``givenK_train_test``, 1 selects ``ratio_train_test``.

    Returns
    -------
    pandas.DataFrame
        One column per ranker (labelled with the ranker's class name) and the rows
        ``prec@topK``, ``1recal@topK``, ``NDCG@topK``, ``MAP@topK``, ``MRR``, ``AUC``.

    Raises
    ------
    ValueError
        If ``validation_type`` is neither 0 nor 1.  Without this check the split
        variables would stay unbound and the failure would surface later as an
        unrelated-looking error inside the loop.
    """
    if validation_type not in (0, 1):
        raise ValueError(
            "validation_type must be 0 (givenK_train_test) or 1 (ratio_train_test), got %r"
            % (validation_type,)
        )

    res = np.zeros((6, len(list_cf)))
    for t in range(maxiter):
        if validation_type == 0:
            test, train, _ = givenK_train_test(data, K)
        else:
            test, train, _ = ratio_train_test(data, K)

        for i, cf in enumerate(list_cf):
            cf.fit(train)
            prec_k, one_recal_k, ngdc, mp = rankingmeasure.get_Kmetrics(train, test, cf, K=topK)
            mrr, auc = rankingmeasure.get_listmetrics(train, test, cf)
            res[:, i] += np.array([prec_k, one_recal_k, ngdc, mp, mrr, auc])
            if verbose == 1:
                # Running mean over the splits processed so far.
                print(t, i, res[:, i] / (t + 1))
                sys.stdout.flush()

    # Turn the accumulated sums into means over all splits.
    res /= maxiter
    res = pd.DataFrame(res)

    res.columns = [type(cf).__name__ for cf in list_cf]
    res.index = [name + str(topK) for name in ['prec@', '1recal@', 'NDCG@', 'MAP@']] + ['MRR', 'AUC']
    return res

In [74]:
# MovieLens-100k comparison of every ranker from create_list_cf: K=0.1 is forwarded to
# ratio_train_test (validation_type defaults to 1), one split, metrics at top-5.
climf_experiment(ml100k, create_list_cf(ml100k_shape), K =0.1, maxiter=1, topK=5,  verbose=0)

Unnamed: 0,PopRec,CLiMF,CLiMF.1,BPR_MF,iMF,TFMAP
prec@5,0.146154,0.143176,0.011663,0.256328,0.298015,0.146898
1recal@5,0.495037,0.471464,0.058313,0.671216,0.753102,0.467742
NDCG@5,0.158041,0.162355,0.018914,0.278594,0.325244,0.165907
MAP@5,0.295272,0.30951,0.054942,0.454999,0.518848,0.309374
MRR,0.339705,0.358258,0.063969,0.5079,0.574299,0.350316
AUC,0.856558,0.843774,0.354636,0.931878,0.932253,0.847722


In [53]:
# Same MovieLens-100k comparison with K=0.2.
# NOTE(review): the saved table has a single CLiMF column, so it appears to predate the
# second CLiMF entry in create_list_cf — re-run to refresh.
climf_experiment(ml100k, create_list_cf(ml100k_shape), K =0.2, maxiter=1, topK=5,  verbose=0)

Unnamed: 0,PopRec,CLiMF,BPR_MF,iMF,TFMAP
prec@5,0.240199,0.246898,0.394541,0.447643,0.257568
1recal@5,0.662531,0.655087,0.846154,0.888337,0.615385
NDCG@5,0.259162,0.261185,0.41181,0.47822,0.274834
MAP@5,0.4245,0.417957,0.581512,0.665104,0.416967
MRR,0.480649,0.471118,0.633542,0.720579,0.462428
AUC,0.856661,0.841716,0.929072,0.928343,0.847104


In [54]:
# Epinions comparison: K=0.1 forwarded to ratio_train_test, one split, metrics at top-5.
climf_experiment(epinion, create_list_cf(epinion_shape), K =0.1, maxiter=1, topK=5,  verbose=0)

Unnamed: 0,PopRec,CLiMF,BPR_MF,iMF,TFMAP
prec@5,0.031782,0.030375,0.055755,0.076686,0.029694
1recal@5,0.141203,0.133712,0.226788,0.28899,0.13076
NDCG@5,0.036236,0.035332,0.058648,0.08455,0.034052
MAP@5,0.083203,0.081563,0.121041,0.171857,0.078353
MRR,0.106883,0.106103,0.154131,0.203232,0.102533
AUC,0.869996,0.83347,0.919266,0.896619,0.867311


In [55]:
# Epinions comparison with K=0.2; verbose=1 prints "<split> <ranker index> <running means>"
# after each ranker is scored.
climf_experiment(epinion, create_list_cf(epinion_shape), K =0.2, maxiter=1, topK=5,  verbose=1)

0 0 [ 0.0600227   0.2492622   0.06807169  0.14936594  0.18106551  0.87154629]
0 1 [ 0.058479    0.24245176  0.06726779  0.14920545  0.18087421  0.82344455]
0 2 [ 0.0999773   0.35323496  0.10529264  0.19731429  0.23854742  0.91856299]
0 3 [ 0.13593644  0.43950057  0.14624658  0.26657901  0.3050251   0.89413977]
0 4 [ 0.05471056  0.23087401  0.06345845  0.1434027   0.17469081  0.866994  ]


Unnamed: 0,PopRec,CLiMF,BPR_MF,iMF,TFMAP
prec@5,0.060023,0.058479,0.099977,0.135936,0.054711
1recal@5,0.249262,0.242452,0.353235,0.439501,0.230874
NDCG@5,0.068072,0.067268,0.105293,0.146247,0.063458
MAP@5,0.149366,0.149205,0.197314,0.266579,0.143403
MRR,0.181066,0.180874,0.238547,0.305025,0.174691
AUC,0.871546,0.823445,0.918563,0.89414,0.866994


In [56]:
# Slashdot comparison: K=0.1 forwarded to ratio_train_test, one split, metrics at top-5.
climf_experiment(slashdot, create_list_cf(slashdot_shape), K=0.1, maxiter=1, topK=5,  verbose=0)

Unnamed: 0,PopRec,CLiMF,BPR_MF,iMF,TFMAP
prec@5,0.01496,0.015475,0.024743,0.043965,0.014531
1recal@5,0.069937,0.072511,0.104548,0.166476,0.066505
NDCG@5,0.017404,0.017257,0.026554,0.048168,0.015411
MAP@5,0.041657,0.040385,0.056634,0.097097,0.034294
MRR,0.059544,0.057325,0.07754,0.121921,0.050586
AUC,0.849921,0.795534,0.871066,0.853451,0.843581


In [57]:
# Slashdot comparison with K=0.2, otherwise identical to the previous cell.
climf_experiment(slashdot, create_list_cf(slashdot_shape), K=0.2, maxiter=1, topK=5,  verbose=0)

Unnamed: 0,PopRec,CLiMF,BPR_MF,iMF,TFMAP
prec@5,0.029863,0.029348,0.042048,0.074485,0.028175
1recal@5,0.13115,0.129148,0.160898,0.255578,0.124142
NDCG@5,0.033434,0.032729,0.043991,0.080004,0.03064
MAP@5,0.07474,0.073377,0.08674,0.149358,0.067775
MRR,0.100142,0.097755,0.11501,0.181893,0.091801
AUC,0.843574,0.778659,0.863101,0.846316,0.836296


In [58]:
# MovieLens-1M comparison: K=0.1 forwarded to ratio_train_test, one split, metrics at top-5;
# verbose=1 prints the running means per ranker.
climf_experiment(ml1m, create_list_cf(ml1m_shape), K=0.1, maxiter=1, topK=5, verbose=1)

0 0 [ 0.1336457   0.41124527  0.14399806  0.2564864   0.29679144  0.85737817]
0 1 [ 0.11573256  0.36925572  0.11985524  0.20916582  0.25224184  0.84890599]
0 2 [ 0.22375203  0.61416471  0.24006127  0.39429927  0.44244401  0.92443239]
0 3 [ 0.2600829   0.65200937  0.27999383  0.44248588  0.48957763  0.91715365]
0 4 [ 0.07925752  0.28707875  0.08380195  0.16032644  0.19862019  0.84915509]


Unnamed: 0,PopRec,CLiMF,BPR_MF,iMF,TFMAP
prec@5,0.133646,0.115733,0.223752,0.260083,0.079258
1recal@5,0.411245,0.369256,0.614165,0.652009,0.287079
NDCG@5,0.143998,0.119855,0.240061,0.279994,0.083802
MAP@5,0.256486,0.209166,0.394299,0.442486,0.160326
MRR,0.296791,0.252242,0.442444,0.489578,0.19862
AUC,0.857378,0.848906,0.924432,0.917154,0.849155


In [59]:
# MovieLens-1M comparison with K=0.2, silent mode.
climf_experiment(ml1m, create_list_cf(ml1m_shape), K=0.2, maxiter=1, topK=5, verbose=0)

Unnamed: 0,PopRec,CLiMF,BPR_MF,iMF,TFMAP
prec@5,0.226563,0.212507,0.334547,0.397225,0.14053
1recal@5,0.577942,0.55956,0.753109,0.801045,0.442062
NDCG@5,0.238727,0.218131,0.350127,0.417528,0.150439
MAP@5,0.37808,0.33991,0.508567,0.576701,0.268975
MRR,0.418982,0.385665,0.558877,0.625551,0.314291
AUC,0.857999,0.852355,0.922871,0.915748,0.850259


# Построение ансамбля

In [60]:
from sklearn import cross_validation
import rankensemble
import rankingmeasure

In [38]:
def netflix_ensemble_experiment(data, list_cf, list_ensemble, Ktrain=0.1, Kvalidation=0.1,
                                maxiter=1, topK=5, verbose=0):
    """Score base rankers and ensembles built on them, averaged over ``maxiter`` splits.

    Per split, ``ratio_train_test(data, Ktrain)`` yields the test part, and a second call
    on the remaining train part (ratio ``Kvalidation / (1 - Ktrain)``,
    ``data_is_xlist=True``) yields the validation part.  Every base ranker is fitted on
    the final train part; every ensemble is fitted with
    ``ens.fit(list_cf, validation, train, trainvalidation)``.  All models are scored with
    ``rankingmeasure.get_Kmetrics(trainvalidation, test, model, K=topK)``.

    Returns a DataFrame with rows ``prec@topK``, ``1recal@topK``, ``NDCG@topK``,
    ``MAP@topK``.  Its first column, ``best_result``, is the per-metric maximum over the
    averaged base-ranker scores; the remaining columns hold one ensemble each, labelled
    with the ensemble's class name.  When ``verbose == 1`` running means are printed.
    """
    n_base = len(list_cf)
    totals = np.zeros((4, n_base + len(list_ensemble)))
    chatty = (verbose == 1)

    for run in range(maxiter):
        test, train, traintest = ratio_train_test(data, Ktrain)
        validation, train, trainvalidation = ratio_train_test(
            train, Kvalidation / (1 - Ktrain), data_is_xlist=True
        )

        # Base rankers occupy the leading columns of the accumulator.
        for col, model in enumerate(list_cf):
            model.fit(train)
            p_at_k, hit_at_k, ndcg_at_k, map_at_k = rankingmeasure.get_Kmetrics(
                trainvalidation, test, model, K=topK
            )
            totals[:, col] += np.array([p_at_k, hit_at_k, ndcg_at_k, map_at_k])
            if chatty:
                print(run, col, totals[:, col] / (run + 1))
                sys.stdout.flush()

        # Ensembles follow, offset by the number of base rankers.
        for offset, ens in enumerate(list_ensemble):
            col = n_base + offset
            ens.fit(list_cf, validation, train, trainvalidation)
            p_at_k, hit_at_k, ndcg_at_k, map_at_k = rankingmeasure.get_Kmetrics(
                trainvalidation, test, ens, K=topK
            )
            totals[:, col] += np.array([p_at_k, hit_at_k, ndcg_at_k, map_at_k])
            if chatty:
                print(run, col, totals[:, col] / (run + 1))

    totals /= maxiter

    # Collapse the base-ranker columns into one "best per metric" column, stored in the
    # last base slot so that the kept slice is contiguous with the ensemble columns.
    totals[:, n_base - 1] = np.max(totals[:, :n_base], axis=1)
    kept = totals[:, n_base - 1:]

    row_labels = [prefix + str(topK) for prefix in ('prec@', '1recal@', 'NDCG@', 'MAP@')]
    col_labels = ["best_result"] + [type(ens).__name__ for ens in list_ensemble]
    return pd.DataFrame(kept, index=row_labels, columns=col_labels)

In [39]:
import copy
from sklearn import linear_model, ensemble
from sklearn.kernel_ridge import KernelRidge
import rankensemble

In [41]:
# Ensemble strategies evaluated by netflix_ensemble_experiment; that function fits each one
# with ens.fit(list_cf, validation, train, trainvalidation) and labels its column with the
# class name.
list_ensemble = [
    # Ridge regressor (alpha=100) wrapped by RegressionEnsemble.
    rankensemble.RegressionEnsemble(linear_model.Ridge(alpha=100)),
    rankensemble.InnerValueEnsemble(),
    # NOTE(review): [3,0,1,2] and [[3, 0],[1,2]] look like positions within the list_cf passed
    # to fit (flat order vs. pairwise grouping) — confirm against rankensemble.
    rankensemble.BoostValEns(rankingmeasure.get_NDCG,[3,0,1,2], num_weights=11),
    rankensemble.TreeValEns(rankingmeasure.get_NDCG, [[3, 0],[1,2]], num_weights=11)
]

In [42]:
# Ensemble run on MovieLens-100k: base rankers are positions 1-4 of create_list_cf,
# Ktrain=0.1 and Kvalidation=0.1, verbose=1 prints running means per model.
# NOTE(review): with the current create_list_cf, [1:5] selects both CLiMF variants, BPR_MF
# and iMF; the saved output (snake_case ensemble column names) seems to come from an
# older state of the notebook — re-run before relying on the table.
netflix_ensemble_experiment(ml100k, create_list_cf(ml100k_shape)[1:5], list_ensemble, Ktrain=0.1, Kvalidation=0.1, verbose=1, maxiter=1)

0 0 [ 0.14665012  0.4764268   0.16461605  0.31256376]
0 1 [ 0.25558313  0.69478908  0.26735163  0.42890819]
0 2 [ 0.29652605  0.73449132  0.3201774   0.50352392]
0 3 [ 0.15533499  0.46774194  0.17802119  0.32637855]
0 4 [ 0.16029777  0.49503722  0.17781655  0.32390233]
0 5 [ 0.2528536   0.67121588  0.27830066  0.45995313]
0 6 [ 0.30694789  0.75806452  0.33076854  0.51851392]
0 7 [ 0.30198511  0.75558313  0.32575543  0.51528639]


Unnamed: 0,best_result,regression_ensemble,inner_value_ensemble,boost_val_ens,tree_val_ens
prec@5,0.296526,0.160298,0.252854,0.306948,0.301985
1recal@5,0.734491,0.495037,0.671216,0.758065,0.755583
NDCG@5,0.320177,0.177817,0.278301,0.330769,0.325755
MAP@5,0.503524,0.323902,0.459953,0.518514,0.515286


In [43]:
# MovieLens-100k ensemble run with Ktrain=0.2 (Kvalidation stays 0.1); verbose output on.
netflix_ensemble_experiment(ml100k, create_list_cf(ml100k_shape)[1:5], list_ensemble, Ktrain=0.2, Kvalidation=0.1, verbose=1, maxiter=1)

0 0 [ 0.2369727   0.61662531  0.25952715  0.42063344]
0 1 [ 0.37121588  0.82133995  0.38483334  0.55116487]
0 2 [ 0.44516129  0.87965261  0.4705842   0.64806314]
0 3 [ 0.25682382  0.59181141  0.2775557   0.42044217]
0 4 [ 0.25310174  0.62034739  0.27384985  0.42610456]
0 5 [ 0.38312655  0.80397022  0.40708563  0.5763751 ]
0 6 [ 0.44987593  0.87841191  0.47700751  0.6555883 ]
0 7 [ 0.44640199  0.87344913  0.47523242  0.65628619]


Unnamed: 0,best_result,regression_ensemble,inner_value_ensemble,boost_val_ens,tree_val_ens
prec@5,0.445161,0.253102,0.383127,0.449876,0.446402
1recal@5,0.879653,0.620347,0.80397,0.878412,0.873449
NDCG@5,0.470584,0.27385,0.407086,0.477008,0.475232
MAP@5,0.648063,0.426105,0.576375,0.655588,0.656286


In [44]:
# Epinions ensemble run: base rankers at positions 1-4, Ktrain=0.1, Kvalidation=0.1, one split.
netflix_ensemble_experiment(epinion, create_list_cf(epinion_shape)[1:5], list_ensemble, Ktrain=0.1, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,regression_ensemble,inner_value_ensemble,boost_val_ens,tree_val_ens
prec@5,0.0716,0.029557,0.053485,0.076459,0.07605
1recal@5,0.271056,0.130306,0.218388,0.292168,0.290806
NDCG@5,0.077573,0.033425,0.057291,0.083081,0.082269
MAP@5,0.156124,0.075919,0.119397,0.168882,0.167201


In [45]:
# Epinions ensemble run with Ktrain=0.2 (Kvalidation stays 0.1).
netflix_ensemble_experiment(epinion, create_list_cf(epinion_shape)[1:5], list_ensemble, Ktrain=0.2, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,regression_ensemble,inner_value_ensemble,boost_val_ens,tree_val_ens
prec@5,0.128309,0.055074,0.100068,0.132894,0.13639
1recal@5,0.417934,0.224291,0.359591,0.43723,0.443814
NDCG@5,0.138436,0.061518,0.106281,0.142505,0.146894
MAP@5,0.253614,0.131608,0.203544,0.260715,0.268619


In [46]:
# MovieLens-1M ensemble run: base rankers at positions 1-4, Ktrain=0.1, Kvalidation=0.1, one split.
netflix_ensemble_experiment(ml1m, create_list_cf(ml1m_shape)[1:5], list_ensemble, Ktrain=0.1, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,regression_ensemble,inner_value_ensemble,boost_val_ens,tree_val_ens
prec@5,0.260155,0.106506,0.237412,0.262858,0.263975
1recal@5,0.654172,0.342404,0.61867,0.656695,0.659398
NDCG@5,0.279527,0.11048,0.24766,0.283079,0.283604
MAP@5,0.438469,0.193646,0.385993,0.444041,0.443896


In [47]:
# MovieLens-1M ensemble run with Ktrain=0.2 (Kvalidation stays 0.1).
netflix_ensemble_experiment(ml1m, create_list_cf(ml1m_shape)[1:5], list_ensemble, Ktrain=0.2, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,regression_ensemble,inner_value_ensemble,boost_val_ens,tree_val_ens
prec@5,0.393657,0.197441,0.361687,0.398378,0.394485
1recal@5,0.798522,0.5255,0.77149,0.807533,0.806632
NDCG@5,0.412783,0.200736,0.371032,0.41927,0.415761
MAP@5,0.572967,0.312733,0.516635,0.583956,0.582231


In [48]:
# Slashdot ensemble run: base rankers at positions 1-4, Ktrain=0.1, Kvalidation=0.1, one split.
netflix_ensemble_experiment(slashdot, create_list_cf(slashdot_shape)[1:5], list_ensemble, Ktrain=0.1, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,regression_ensemble,inner_value_ensemble,boost_val_ens,tree_val_ens
prec@5,0.043135,0.018678,0.029176,0.043879,0.043621
1recal@5,0.164474,0.08667,0.127288,0.168335,0.167191
NDCG@5,0.047393,0.020103,0.03125,0.048117,0.048136
MAP@5,0.095921,0.045589,0.068486,0.097646,0.097892


In [49]:
# Slashdot ensemble run with Ktrain=0.2 (Kvalidation stays 0.1).
netflix_ensemble_experiment(slashdot, create_list_cf(slashdot_shape)[1:5], list_ensemble, Ktrain=0.2, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,regression_ensemble,inner_value_ensemble,boost_val_ens,tree_val_ens
prec@5,0.074685,0.036213,0.051916,0.075715,0.075801
1recal@5,0.257008,0.157752,0.203804,0.263158,0.265303
NDCG@5,0.080385,0.039745,0.055659,0.081924,0.082233
MAP@5,0.150617,0.087731,0.114039,0.155043,0.156295


In [50]:
# Four tree ensembles that differ only in the metric handed to the constructor
# (precision@K, 1-recall@K, MAP, NDCG); grouping [[3, 0],[1,2]] and num_weights=11 are shared.
# The class is referenced as `TreeValEns`, matching the earlier ensemble list in this
# notebook, which already uses the CamelCase names of rankensemble.
# NOTE(review): the saved outputs below still show `tree_val_ens` column labels, i.e. they
# were produced before the rename — re-run to refresh them.
list_ensemble = [
    rankensemble.TreeValEns(rankingmeasure.get_prec_K, [[3, 0],[1,2]], num_weights=11),
    rankensemble.TreeValEns(rankingmeasure.get_one_recal_K, [[3, 0],[1,2]], num_weights=11),
    rankensemble.TreeValEns(rankingmeasure.get_MAP, [[3, 0],[1,2]], num_weights=11),
    rankensemble.TreeValEns(rankingmeasure.get_NDCG, [[3, 0],[1,2]], num_weights=11)
]

In [51]:
# MovieLens-100k run with the list_ensemble of four tree ensembles (one per target metric):
# Ktrain=0.1, Kvalidation=0.1, verbose=1 prints running means per model.
netflix_ensemble_experiment(ml100k, create_list_cf(ml100k_shape)[1:5], list_ensemble, Ktrain=0.1, Kvalidation=0.1, verbose=1, maxiter=1)

0 0 [ 0.14094293  0.46153846  0.15209963  0.27955783]
0 1 [ 0.2560794   0.70347395  0.27412447  0.4495985 ]
0 2 [ 0.2866005   0.74069479  0.31632992  0.515681  ]
0 3 [ 0.14218362  0.43920596  0.1613946   0.29342604]
0 4 [ 0.30248139  0.77171216  0.32922651  0.52425903]
0 5 [ 0.30074442  0.77295285  0.32667672  0.52699545]
0 6 [ 0.29875931  0.7630273   0.32817649  0.52715571]
0 7 [ 0.30372208  0.76674938  0.33034877  0.52233423]


Unnamed: 0,best_result,tree_val_ens,tree_val_ens.1,tree_val_ens.2,tree_val_ens.3
prec@5,0.2866,0.302481,0.300744,0.298759,0.303722
1recal@5,0.740695,0.771712,0.772953,0.763027,0.766749
NDCG@5,0.31633,0.329227,0.326677,0.328176,0.330349
MAP@5,0.515681,0.524259,0.526995,0.527156,0.522334


In [52]:
# MovieLens-100k, tree ensembles per metric, Ktrain=0.2 (Kvalidation stays 0.1).
netflix_ensemble_experiment(ml100k, create_list_cf(ml100k_shape)[1:5], list_ensemble, Ktrain=0.2, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,tree_val_ens,tree_val_ens.1,tree_val_ens.2,tree_val_ens.3
prec@5,0.44665,0.452605,0.447643,0.44665,0.452605
1recal@5,0.867246,0.870968,0.866005,0.867246,0.870968
NDCG@5,0.472116,0.479477,0.474829,0.472116,0.479477
MAP@5,0.644829,0.653071,0.64831,0.644829,0.653071


In [53]:
# Epinions, tree ensembles per metric, Ktrain=0.1, Kvalidation=0.1.
netflix_ensemble_experiment(epinion, create_list_cf(epinion_shape)[1:5], list_ensemble, Ktrain=0.1, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,tree_val_ens,tree_val_ens.1,tree_val_ens.2,tree_val_ens.3
prec@5,0.076686,0.078638,0.075323,0.077957,0.078638
1recal@5,0.285358,0.29353,0.287174,0.294211,0.29353
NDCG@5,0.083629,0.085524,0.082551,0.08535,0.085524
MAP@5,0.166548,0.171056,0.168206,0.172582,0.171056


In [54]:
# Epinions, tree ensembles per metric, Ktrain=0.2 (Kvalidation stays 0.1).
netflix_ensemble_experiment(epinion, create_list_cf(epinion_shape)[1:5], list_ensemble, Ktrain=0.2, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,tree_val_ens,tree_val_ens.1,tree_val_ens.2,tree_val_ens.3
prec@5,0.126402,0.132168,0.132168,0.132168,0.132168
1recal@5,0.41521,0.433825,0.433825,0.433825,0.433825
NDCG@5,0.135165,0.140922,0.140922,0.140922,0.140922
MAP@5,0.246909,0.255909,0.255909,0.255909,0.255909


In [55]:
# MovieLens-1M, tree ensembles per metric, Ktrain=0.1, Kvalidation=0.1.
netflix_ensemble_experiment(ml1m, create_list_cf(ml1m_shape)[1:5], list_ensemble, Ktrain=0.1, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,tree_val_ens,tree_val_ens.1,tree_val_ens.2,tree_val_ens.3
prec@5,0.257091,0.261453,0.260407,0.261092,0.261092
1recal@5,0.648225,0.658137,0.658857,0.657055,0.657055
NDCG@5,0.276227,0.280478,0.279486,0.280263,0.280263
MAP@5,0.436236,0.442457,0.442252,0.442613,0.442613


In [56]:
# MovieLens-1M, tree ensembles per metric, Ktrain=0.2 (Kvalidation stays 0.1).
netflix_ensemble_experiment(ml1m, create_list_cf(ml1m_shape)[1:5], list_ensemble, Ktrain=0.2, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,tree_val_ens,tree_val_ens.1,tree_val_ens.2,tree_val_ens.3
prec@5,0.398162,0.402054,0.402054,0.401261,0.402054
1recal@5,0.798522,0.812218,0.812218,0.806271,0.812218
NDCG@5,0.417878,0.422001,0.422001,0.422102,0.422001
MAP@5,0.576011,0.58198,0.58198,0.583496,0.58198


In [57]:
# Slashdot, tree ensembles per metric, Ktrain=0.1, Kvalidation=0.1.
netflix_ensemble_experiment(slashdot, create_list_cf(slashdot_shape)[1:5], list_ensemble, Ktrain=0.1, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,tree_val_ens,tree_val_ens.1,tree_val_ens.2,tree_val_ens.3
prec@5,0.040818,0.042162,0.042162,0.042162,0.042162
1recal@5,0.157323,0.163759,0.163759,0.163759,0.163759
NDCG@5,0.045011,0.046524,0.046524,0.046524,0.046524
MAP@5,0.092162,0.095501,0.095501,0.095501,0.095501


In [58]:
# Slashdot, tree ensembles per metric, Ktrain=0.2 (Kvalidation stays 0.1).
netflix_ensemble_experiment(slashdot, create_list_cf(slashdot_shape)[1:5], list_ensemble, Ktrain=0.2, Kvalidation=0.1, verbose=0, maxiter=1)

Unnamed: 0,best_result,tree_val_ens,tree_val_ens.1,tree_val_ens.2,tree_val_ens.3
prec@5,0.073284,0.075715,0.075715,0.075715,0.075715
1recal@5,0.248427,0.258724,0.258724,0.258724,0.258724
NDCG@5,0.07906,0.082042,0.082042,0.082042,0.082042
MAP@5,0.147654,0.155032,0.155032,0.155032,0.155032
