In [1]:
# load the shared styling for the notebook
import os

# path : remember the current working directory, the next cell
# changes back to it (os.chdir(path)) once the style is loaded
path = os.getcwd()
# presumably the `formats` module lives in ../../notebook_format and
# is only importable (and able to find its files) from that directory
os.chdir( os.path.join('..', '..', 'notebook_format') )
from formats import load_style
load_style()

In [2]:
# change back to the working directory that was saved in `path`
# before the style-loading cell moved away from it
os.chdir(path)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8, 6 # change default figure size
plt.rcParams['font.size'] = 12 # and font size

# 1. watermark : magic to print the author, timestamp and package versions
# 2. autoreload : magic so that the notebook will reload external python
#    modules, i.e. edits to imported .py files are picked up without
#    restarting the kernel
%load_ext watermark
%load_ext autoreload 
%autoreload 2

from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,scikit-learn,scipy

Ethen 2016-12-24 14:51:19 

CPython 3.5.2
IPython 4.2.0

numpy 1.11.3
pandas 0.18.1
matplotlib 1.5.1
scikit-learn 0.18
scipy 0.18.1


In [3]:
# the likes are stored as a pipe separated file
file_path = os.path.join('data', 'model_likes_anon.psv')
df = pd.read_csv(file_path, sep = '|', quotechar = '\\')

# report how many exact duplicate rows there are, then drop them
# and keep only the model id (mid) and user id (uid) columns
print('Drop duplicated rows: ', df.duplicated().sum())
df = df.drop_duplicates().loc[:, ['mid', 'uid']]

print('dimension: ', df.shape)
df.head()

Drop duplicated rows:  155
dimension:  (632677, 2)


Unnamed: 0,mid,uid
0,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,7ac1b40648fff523d7220a5d07b04d9b
1,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,2b4ad286afe3369d39f1bb7aa2528bc7
2,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,1bf0993ebab175a896ac8003bed91b4b
3,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,6484211de8b9a023a7d9ab1641d22e7c
4,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,1109ee298494fbd192e27878432c718a


In [4]:
def print_data_info(data):
    """
    prints number of unique users, items and sparsity

    Note that the figure printed as "Sparsity" is the percentage of
    the user-item grid that is filled, i.e.
    100 * n_rows / (n_users * n_items), hence a larger number
    means a denser dataset. Data without any rows is reported
    as 0.000% instead of raising a ZeroDivisionError

    Parameters
    ----------
    data : DataFrame
        one row per interaction, must contain a 'uid' (user)
        and a 'mid' (model/item) column
    """
    n_users = len(data['uid'].unique())
    n_items = len(data['mid'].unique())

    # an empty frame has no users/items, guard the division
    n_cells = n_users * n_items
    sparsity = data.shape[0] / n_cells * 100 if n_cells else 0.0

    print('Number of users: {}'.format(n_users))
    print('Number of models: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))

In [5]:
def remove_sparsity(data, min_mid = 5, min_uid = 5):
    """
    iteratively prune users and items that have too few interactions

    Each round first drops every user ('uid') that has less than
    min_mid interactions and then every item ('mid') that has less
    than min_uid interactions. Dropping items can push a user back
    under its threshold (and the other way around), so the rounds
    are repeated until one of them leaves the number of rows unchanged.
    The user/item counts are printed with print_data_info before
    and after the pruning

    Parameters
    ----------
    data : DataFrame
        one row per interaction, with a 'uid' and a 'mid' column

    min_mid : int, default 5
        minimum number of interactions a user needs to be kept

    min_uid : int, default 5
        minimum number of interactions an item needs to be kept

    Returns
    -------
    data : DataFrame
        the remaining rows, re-indexed from 0 and with both
        columns converted to the category dtype
    """
    print_data_info(data)
    while True:
        n_before = data.shape[0]

        # users that interacted with enough items stay
        per_user = data.groupby('uid').count()
        kept_users = per_user.loc[ per_user['mid'] >= min_mid ].index
        data = data[ data['uid'].isin(kept_users) ]

        # items that still have enough of the remaining users stay
        per_item = data.groupby('mid').count()
        kept_items = per_item.loc[ per_item['uid'] >= min_uid ].index
        data = data[ data['mid'].isin(kept_items) ]

        # nothing was dropped during this round, the data is stable
        if data.shape[0] == n_before:
            break

    data = data.reset_index(drop = True)
    for col in ('mid', 'uid'):
        data[col] = data[col].astype('category')

    print_data_info(data)
    return data

In [6]:
# users need at least min_mid interactions and
# models need at least min_uid interactions to stay
min_mid = min_uid = 5
df = remove_sparsity(df, min_mid = min_mid, min_uid = min_uid)
df.head()

Number of users: 62583
Number of models: 28806
Sparsity: 0.035%
Number of users: 15274
Number of models: 25655
Sparsity: 0.140%


Unnamed: 0,mid,uid
0,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,2b4ad286afe3369d39f1bb7aa2528bc7
1,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,1bf0993ebab175a896ac8003bed91b4b
2,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,1109ee298494fbd192e27878432c718a
3,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,8626c70d4b85af57804a8fc1173cbbe0
4,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,e1527fdfa8782e70d499e177efc28605


In [7]:
# one row per user and one column per model, the category
# codes serve as the indices and every like is stored as a 1
ratings = csr_matrix((
    np.ones(len(df)),
    (df['uid'].cat.codes, df['mid'].cat.codes)
))
ratings

<15274x25655 sparse matrix of type '<class 'numpy.float64'>'
	with 547477 stored elements in Compressed Sparse Row format>

In [8]:
def create_train_test(ratings, n_split, n_frac, seed = 1234):
    """
    split ratings into train and test sets,
    it will only consider users who have at least 2 * n_split
    interactions and from those users, randomly choose
    int(n_frac * n_users) of them and move n_split randomly
    selected interactions of each chosen user into the test set
    
    Parameters
    ----------
    ratings : scipy sparse csr_matrix
        user-item-interactions, one row per user; the
        matrix is copied and not modified
    
    n_split : int
        number of user-item-interactions per user to move
        from training to test set
    
    n_frac : float
        fraction of users to split off some of their
        interactions into test set
    
    seed : int, default 1234
        passed to np.random.seed so that the split is reproducible,
        note that this reseeds numpy's global random state
    
    Returns
    -------
    train : scipy sparse csr_matrix
        copy of ratings without the held-out interactions
    
    test : scipy sparse csr_matrix
        same shape as ratings, holds only the held-out interactions
    
    user_index : 1d ndarray
        row index of the users that had interactions moved to test
    
    Raises
    ------
    ValueError
        when fewer than int(n_frac * n_users) users have
        at least 2 * n_split interactions
    """
    np.random.seed(seed)
    train = ratings.copy()
    threshold = 2 * n_split
    n_samples = int( n_frac * train.shape[0] )
    
    # only users with enough interactions are eligible for the split
    nonzero_row = train.nonzero()[0]
    user = np.where( np.bincount(nonzero_row) >= threshold )[0]
    if n_samples > user.shape[0]:
        message = ( 'Not enough users with >= {} interactions for fraction of {}'
                    .format(threshold, n_frac) )
        raise ValueError(message)
    
    user_index = np.random.choice(user, size = n_samples, replace = False)
    
    # for all the chosen users, randomly pick n_split interactions,
    # remember them for the test set and set them to zero on the training;
    # the test entries are gathered in flat arrays so the test matrix can be
    # built in one go instead of inserting into a csr_matrix row by row
    n_held_out = n_samples * n_split
    test_row = np.repeat(user_index, n_split)
    test_col = np.empty(n_held_out, dtype = train.indices.dtype)
    test_val = np.empty(n_held_out, dtype = np.float64)
    for i, u in enumerate(user_index):
        split_index = train[u].indices
        test_index = np.random.choice(split_index, size = n_split, replace = False)
        chunk = slice(i * n_split, (i + 1) * n_split)
        test_col[chunk] = test_index
        test_val[chunk] = train[u, test_index].toarray().ravel()
        train[u, test_index] = 0
    
    # every chosen user is visited once and only its own row is zeroed,
    # so dropping the zeroed entries once after the loop is sufficient
    train.eliminate_zeros()
    test = csr_matrix( (test_val, (test_row, test_col)), shape = train.shape )
    return train, test, user_index

In [9]:
# move 5 interactions of 20% of the users into the test set
n_split, n_frac = 5, 0.2
train, test, user_index = create_train_test(
    ratings, n_split = n_split, n_frac = n_frac)
train

<15274x25655 sparse matrix of type '<class 'numpy.float64'>'
	with 532207 stored elements in Compressed Sparse Row format>

In [19]:
from recsys import ALS

# verbose = True (which is also the default) will generate
# a progress bar over the training iterations, it might be
# annoying when doing grid search
als = ALS(verbose = True)
als.fit(train)

ALS: 100%|██████████| 15/15 [00:01<00:00,  9.04it/s]


ALS(alpha=15, n_factors=20, n_iters=15, n_jobs=8, reg=0.01, seed=1234,
  verbose=True)

In [11]:
def compute_neg_mse(model, ratings, user_index):
    """
    negative mean squared error for the ALS model,
    the negative is used for grid search, so there's
    a consistency that the higher the score the better
    
    Parameters
    ----------
    model : fitted model
        assumes model.predict() returns a dense
        [n_users, n_items] array of scores whose rows line
        up with the rows of ratings -- TODO confirm against recsys.ALS
    
    ratings : scipy sparse csr_matrix
        user-item-interactions
    
    user_index : 1d array-like of int
        row index of the users to evaluate
    
    Returns
    -------
    neg_mse : float
        negative mean squared error over the non-zero
        interactions of the selected users
    """
    user_index = np.asarray(user_index)
    subset = ratings[user_index].tocoo()
    
    # same entries that .nonzero() reports, explicitly stored
    # zeros are skipped so y_true and y_pred always line up
    observed = subset.data != 0
    y_true = subset.data[observed]
    
    # the rows of subset are numbered 0 .. len(user_index) - 1, map them
    # back to the original user rows before indexing the predictions
    rows = user_index[ subset.row[observed] ]
    cols = subset.col[observed]
    y_pred = model.predict()[rows, cols]
    
    neg_mse = -mean_squared_error(y_true, y_pred)
    return neg_mse

In [20]:
from search import GridSearch

# candidate values for each ALS hyperparameter
als_params_opt = dict(n_factors = [10, 15], reg = [0.1], alpha = [50])

# the scorer is a function called as scorer(model, ratings, user_index)
# where higher is better, hence the negative mse
als = ALS(verbose = True)
gs_als = GridSearch(base_model = als, params_opt = als_params_opt, scorer = compute_neg_mse)
gs_als.fit(train, user_index)

ALS: 100%|██████████| 15/15 [00:00<00:00, 16.69it/s]
ALS: 100%|██████████| 15/15 [00:01<00:00, 11.97it/s]


<search.GridSearch at 0x11ebffdd8>

In [13]:
# NOTE(review): `haha` is not defined in any visible cell, so this cell
# raises NameError; presumably a scratch cell left in to stop "Run All"
# before the cells below -- confirm and delete it
haha

NameError: name 'haha' is not defined

In [None]:
# NOTE(review): `current_model` is never assigned in the visible cells, so
# this raises NameError on a fresh kernel; presumably it should be a
# fitted model (e.g. one taken from gs_als) -- confirm which one
neg_mse = compute_neg_mse(current_model, train, user_index)
neg_mse

In [None]:
def compute_mapk(model, ratings, user_index, k):
    """
    mean average precision at k for the ALS model
    
    Parameters
    ----------
    model : fitted model
        assumes model.predict() returns a [n_users, n_items]
        array of scores and gives the same result on every
        call -- TODO confirm against recsys.ALS
    
    ratings : scipy sparse csr_matrix
        user-item-interactions, the stored column indices
        of a user's row are treated as its relevant items
    
    user_index : 1d array-like of int
        row index of the users to evaluate
    
    k : int
        number of top scored items to evaluate per user
    
    Returns
    -------
    mapk : float
        average of compute_apk over the given users,
        0.0 when user_index is empty
    """
    n_users = len(user_index)
    if n_users == 0:
        return 0.0
    
    # the predictions do not depend on the user that is being
    # evaluated, compute them once instead of once per user
    pred = model.predict()
    
    # compare the top k predictions' index to the actual index
    mapk = 0
    for u in user_index:
        y_true = ratings[u].indices
        y_pred = np.argsort(pred[u])[::-1][:k]
        mapk += compute_apk(y_true, y_pred, k)
    
    return mapk / n_users

In [None]:
def compute_apk(y_true, y_pred, k):
    """
    average precision at k
    
    Parameters
    ----------
    y_true : 1d array-like
        the relevant items
    
    y_pred : 1d array-like
        the recommended items ordered from best to worst,
        only the first k are evaluated and an item that is
        recommended more than once is only counted the first time
    
    k : int
        number of recommendations to evaluate
    
    Returns
    -------
    score : float
        average precision at k, 0.0 when y_true is empty or k <= 0
    
    Example
    -------
    k = 2
    y_true = np.array([1, 2, 3, 4, 5])
    y_pred = np.array([6, 4, 7, 1, 2])[:k]
    compute_apk(y_true, y_pred, k) # 0.25
    """
    # convert to set since
    # membership testing in
    # a set is vastly faster
    actual = set(y_true)
    
    # nothing relevant (or nothing to evaluate), the
    # normalization below would divide by zero
    if not actual or k <= 0:
        return 0.0
    
    # precision at i is a percentage of correct 
    # items among first i recommendations; the
    # correct items will be counted by n_hit,
    # while score will accumulate the percentage
    n_hit, score = 0, 0.0
    seen = set()
    for i, p in enumerate(y_pred[:k], 1):
        # a repeated recommendation must not be counted as a new hit
        if p in actual and p not in seen:
            n_hit += 1
            score += n_hit / i
        
        seen.add(p)
    
    # normalize by the maximum number of hits that is possible
    score /= min(len(actual), k)
    return score

In [None]:
# evaluate the top 5 recommendations of every user in user_index
# NOTE(review): `current_model` is never assigned in the visible cells, so
# this raises NameError on a fresh kernel; presumably it should be a
# fitted model (e.g. one taken from gs_als) -- confirm which one
k = 5
mapk = compute_mapk(current_model, train, user_index, k)
mapk