In [3]:
import json
import numpy as np
import os
import paramiko
import sklearn
import subprocess

# Record the library versions this notebook itself was executed with.
print(np.__version__, sklearn.__version__, sep='\n')

1.18.5
0.24.1


## LinTS MovieLens Recommendations

This notebook explores how MovieLens recommendations differ across multiple NumPy environments. It uses the preprocessed data generated in the [LinTS MovieLens notebook](LinTS%20MovieLens.ipynb), with a twist: we add an `unknown` column that flags users whose occupation is unknown, computed as the logical OR of the `other` and `none` columns. All scenarios use LinTS to generate recommendations.

In [None]:
%%writefile example_lints.py

# Run this cell for every experiment
from datetime import datetime
import json
import pandas as pd
import numpy as np
import os
import platform
import pickle
from sklearn.preprocessing import StandardScaler
import sys

from mabwiser.mab import MAB
from mabwiser.linear import _RidgeRegression, _Linear

from utils import contains_repeated_eigenvalues, all_positive_definite

random_option = sys.argv[1]

class LinTSExample(_RidgeRegression):
    """`_RidgeRegression` subclass whose predictions use a sampled coefficient vector."""

    def predict(self, x):
        """Return the dot product of ``x`` with coefficients sampled around ``self.beta``.

        The sample is drawn with the module-level ``rng2`` from a multivariate
        normal with mean ``self.beta`` and covariance ``self.A_inv``. The context
        is first passed through ``_scale_predict_context`` when a scaler is set.
        """
        if self.scaler is not None:
            x = self._scale_predict_context(x)

        # Only the Cholesky scenario passes `method`; every other option relies
        # on the sampler's default behaviour.
        sampler_kwargs = {'method': 'cholesky'} if random_option == 'cholesky' else {}
        beta_sampled = rng2.multivariate_normal(self.beta, self.A_inv, **sampler_kwargs)
        return np.dot(x, beta_sampled)
    
class LinearExample(_Linear):
    """`_Linear` subclass that builds its per-arm models from `LinTSExample`."""

    factory = {"ts": LinTSExample}

    def __init__(self, rng, arms, n_jobs=1, backend=None, l2_lambda=1, alpha=1, regression='ts', arm_to_scaler=None):
        """Initialise the parent class, then replace ``arm_to_model`` with models from ``factory``.

        ``arm_to_scaler`` maps each arm to the scaler handed to its model; when
        omitted, every arm gets ``None``.
        """
        super().__init__(rng, arms, n_jobs, backend, l2_lambda, alpha, regression)

        self.l2_lambda = l2_lambda
        self.alpha = alpha
        self.regression = regression
        self.num_features = None

        if arm_to_scaler is None:
            arm_to_scaler = dict.fromkeys(arms)

        # Create one regression model per arm, looked up by the `regression` key.
        model_cls = LinearExample.factory.get(regression)
        self.arm_to_model = {
            arm: model_cls(rng, l2_lambda, alpha, arm_to_scaler[arm])
            for arm in arms
        }


# Data files are resolved relative to this script's own location.
base_path = os.path.dirname(__file__)
users_path = os.path.join(base_path, 'movielens_users.csv')
responses_path = os.path.join(base_path, 'movielens_responses.csv')

# Dataset 1
users = pd.read_csv(users_path)
# `unknown` is the logical OR of the `other` and `none` columns.
users['unknown'] = np.logical_or(users['other'], users['none'])
responses = pd.read_csv(responses_path)

# Split users by the `set` column; only the training users are joined to responses.
train = users.loc[users['set'] == 'train']
test = users.loc[users['set'] == 'test']
train = pd.merge(train, responses, how='left', on='user id')

# Every user column except the identifier and the split label is a context feature.
excluded_columns = ('user id', 'set')
context_features = [column for column in users.columns if column not in excluded_columns]
none_ind = context_features.index('none')
other_ind = context_features.index('other')

decisions = MAB._convert_array(train['item id'])
rewards = MAB._convert_array(train['rated'])
contexts = MAB._convert_matrix(train[context_features]).astype('float')
test_contexts = MAB._convert_matrix(test[context_features]).astype('float')

# Fit the scaler on the training contexts only, then reuse it for the test contexts.
scaler = StandardScaler()
contexts = scaler.fit_transform(contexts)
test_contexts = scaler.transform(test_contexts)

# Keep the NumPy scalar item ids: the recommendation step calls `.item()` on them.
item_ids = list(responses['item id'].unique())

# Fail fast on an unrecognised option: otherwise `rng`/`rng2` would be left
# undefined and the script would only die later with a NameError.
if random_option not in ('randomstate', 'svd', 'cholesky'):
    raise ValueError(
        f"Unknown random option {random_option!r}; "
        "expected 'randomstate', 'svd' or 'cholesky'"
    )

# `rng` (handed to the bandit) is a legacy RandomState for every option.
# `rng2` (used by LinTSExample.predict for sampling) is that same RandomState
# for 'randomstate', and a separate Generator for 'svd' and 'cholesky'.
rng = np.random.RandomState(seed=11)
rng2 = rng if random_option == 'randomstate' else np.random.default_rng(11)

mab = LinearExample(
    rng=rng,
    arms=item_ids,
    n_jobs=1,
    backend=None,
    l2_lambda=1,
    alpha=1,
    regression='ts',
)

np.random.seed(42)
mab.fit(decisions, rewards, contexts)

# Output contract: exactly three printed lines (two diagnostics, then the
# recommendation list), which the notebook's test_envs parses in this order.
print(contains_repeated_eigenvalues(mab))
print(all_positive_definite(mab))

exps = mab.predict_expectations(test_contexts)

# For each test user, recommend the arm with the highest expectation.
recs = []
for user_exps in exps:
    best_arm = max(user_exps, key=user_exps.get)
    recs.append(best_arm.item())
print(recs)

We try training a MAB model and getting a recommendation from the model across 4 different environments:
1. OpenBLAS-backed NumPy on MacOS Catalina
2. MKL-backed NumPy on MacOS Catalina
3. OpenBLAS-backed NumPy on Ubuntu 18.04
4. MKL-backed NumPy on Ubuntu 18.04

All environments contain NumPy version 1.18.5.

In [22]:
# SSH settings for the remote host. test_envs reads the 'host_name',
# 'username' and 'key_filename' entries from this mapping.
with open('ssh_config.json') as fp:
    conf = json.load(fp)

# A 'host_name' of 'localhost' makes test_envs run the script through
# subprocess instead of over SSH.
local_conf = {'host_name': 'localhost'}

# One entry per environment: (label, connection config, Python executable).
# Local paths are expanded here; remote paths keep '$HOME' verbatim and are
# inserted unmodified into the remote command line (presumably expanded by the
# remote shell -- confirm against the target host).
envs = [
    ('mac_openblas', local_conf, os.path.expanduser('~/Tools/miniconda3/envs/reprod/bin/python')),
    ('mac_mkl', local_conf, os.path.expanduser('~/Tools/miniconda3/envs/reprod2/bin/python')),
    ('linux_openblas', conf, '$HOME/Tools/miniconda3/envs/reprod/bin/python'),
    ('linux_mkl', conf, '$HOME/Tools/miniconda3/envs/reprod2/bin/python'),
]

def test_envs(env_lis, script, option):
    all_vals = []
    for env_name, conf, python_exec in env_lis:
        print(f'Running {env_name}...')
        if conf['host_name'] == 'localhost':
            res = subprocess.run([python_exec, script, option], capture_output=True).stdout.decode('utf-8').strip()
            contains_repeated_eigenvalues, all_positive_definite, res = res.split('\n')
            print(f"Contains repeated eigenvalues: {contains_repeated_eigenvalues}")
            print(f"All covariances positive definite: {all_positive_definite}")
            all_vals.append(eval(res))
        else:
            with paramiko.SSHClient() as ssh:
                ssh.load_system_host_keys()
                ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
                ssh.connect(hostname=conf['host_name'], username=conf['username'], key_filename=os.path.expanduser(conf['key_filename']))
                with ssh.open_sftp() as ftp:
                    ftp.put(script, f'/tmp/{script}')
                    ftp.put('utils.py', '/tmp/utils.py')
                    ftp.put('movielens_users.csv', '/tmp/movielens_users.csv')
                    ftp.put('movielens_responses.csv', '/tmp/movielens_responses.csv')

                _, stdout, stderr = ssh.exec_command(f'{python_exec} /tmp/{script} {option}', get_pty=True)

                # Log the stdout as it comes
                contains_repeated_eigenvalues, all_positive_definite, res = [line.strip() for line in stdout.readlines()]
                print(f"Contains repeated eigenvalues: {contains_repeated_eigenvalues}")
                print(f"All covariances positive definite: {all_positive_definite}")
                all_vals.append(eval(res))

    return np.unique(all_vals, axis=0)

### Scenario 1

We test MovieLens data with the legacy `RandomState` class across the 4 different environments.

In [23]:
test_envs(envs, 'example_lints.py', 'randomstate')

Running mac_openblas...
Contains repeated eigenvalues: True
All covariances positive definite: True
Running mac_mkl...
Contains repeated eigenvalues: True
All covariances positive definite: True
Running linux_openblas...
Contains repeated eigenvalues: True
All covariances positive definite: True
Running linux_mkl...
Contains repeated eigenvalues: True
All covariances positive definite: True


array([[ 504,  168,  227,  151,  213,  441,  105,  119, 1210, 1286,  394,
         822, 1435, 1471, 1615,  288, 1077, 1008,  257,  552,  762,  317,
         288,  918,  636,  136, 1156,  412, 1679,  160, 1478,  181,  275,
        1548,  728,  285,  262,  748,   11, 1657,  449,  249,  543,  655,
         337,  169, 1489,  655,  179,    8,    1,  158,  271,  321,  288,
        1101,  359, 1503,   14,   53,  568,  702,  712, 1224,  344, 1216,
         298, 1093,  825,  879,  449,  210,   31,  137,  288, 1095,   11,
         478,  269,  459,  488,  239,   17,  218, 1227,   53,  879, 1673,
         765, 1203, 1123,  525,  232,  884,  288,    7, 1276, 1012,  455,
         655, 1372, 1562,  177,  872,  732, 1014,  422,  422,  345,  294,
         619,  789,  978,  239,  403, 1490,  756,  823,  306,  992, 1473,
         302,   33,  990,  418,  642, 1302,  388,  228, 1430,  204,  121,
         975,  319, 1166,  325,  186,  624, 1563,  612,  100, 1063,  810,
        1092, 1388, 1047,    1, 1468, 

As we can see, the top-1 recommendation generated for each user can differ considerably from one environment to another. In fact, these 4 environments lead to 3 different sets of recommendations, clearly showing that setting the seed isn't enough for reproducibility here.

### Scenario 2
We test MovieLens data using the new `Generator` class with default arguments across 4 different environments.

In [24]:
test_envs(envs, 'example_lints.py', 'svd')

Running mac_openblas...
Contains repeated eigenvalues: True
All covariances positive definite: True
Running mac_mkl...
Contains repeated eigenvalues: True
All covariances positive definite: True
Running linux_openblas...
Contains repeated eigenvalues: True
All covariances positive definite: True
Running linux_mkl...
Contains repeated eigenvalues: True
All covariances positive definite: True


array([[  31,  399,  483, 1166,  517,  590,   32, 1272,  196,  537,  658,
        1262,  475, 1676,  987,  332,   71, 1659,  550,   27,  700,  126,
         288, 1123,  199,  434, 1181,  593, 1346,  993, 1209,  693,  604,
         126, 1033,  486,   62,  952,  759,  857,  315, 1400,  206,  391,
         332, 1402,  214,  536,  204,  917, 1053, 1117,    7, 1561,  288,
         221,  873,  937, 1250,   11, 1052,  615,  956,  155, 1429, 1473,
         222,  405,  469,    7,  183,  597,  511, 1675,   55,  582,  597,
         103,  540, 1270,  286,  547,  775, 1344,  256, 1596,  336,  402,
         846,  576,  335,  994,  272, 1225,  222,  226,  668,  893,  121,
        1158,  402,  258,  586, 1009,  435,  748,  879, 1652, 1148,  866,
         909,  172, 1137,  482,  186, 1640, 1674,  581,  353,   89,  162,
         218,  196,  690,  949,  174,  245,  127,   65,  816,  227,    7,
         547,  187, 1480,  549,  946,  764,  183, 1010,  562,  671,  211,
         839,   87,  285,   15,  968, 

We can see that even with the new `Generator` class, which uses singular value decomposition (SVD) by default, we still have a problem: the 4 environments give 3 different sets of recommendations.

### Scenario 3
We test MovieLens data using the new `Generator` class with Cholesky decomposition method, across 4 different environments.

In [25]:
test_envs(envs, 'example_lints.py', 'cholesky')

Running mac_openblas...
Contains repeated eigenvalues: True
All covariances positive definite: True
Running mac_mkl...
Contains repeated eigenvalues: True
All covariances positive definite: True
Running linux_openblas...
Contains repeated eigenvalues: True
All covariances positive definite: True
Running linux_mkl...
Contains repeated eigenvalues: True
All covariances positive definite: True


array([[ 655,  235,  137,  444,  134,  150,  121, 1541,  259, 1588, 1246,
         414, 1354,  302, 1330,  288, 1289,  577,  318,  554,  751,  298,
         249, 1019,  539,  510,   79,  996,   17,  311,  321,  288,  416,
         870, 1106, 1482,  929,  172,  808,  475,  196, 1384,   65, 1320,
         158,  396, 1114,  355, 1458,  271,  682,    4,  288, 1117,  288,
         519,  278,  981, 1515, 1077,  455,  595,   73,   66, 1629,  269,
         559,  410,  520, 1408,    1,  117,  879, 1619,  273,  627,  679,
         286,  149,  456, 1222, 1081,  742,  343,  341,  987,  817,  939,
         545,  435, 1618, 1412,  828,  150,  121,  150,   32, 1254,  567,
         856, 1629,  255,  651,   30,  791,  111, 1051, 1570,  225,  845,
         342,  268, 1131, 1349,  518,  257,  295,  839,  304,   35,   99,
         270,  147, 1544,  819,  423, 1532, 1085,   92,  692,  180,  411,
        1434,  201, 1057,  498,  231,    1,  708,  603,  357, 1664,  177,
          86,  631,   97,  118,  478, 

We can see that with Cholesky decomposition, users receive a single set of recommendations across all 4 environments, further showing that Cholesky decomposition is important for reproducibility.