### Import Required Packages and Set Options

In [179]:
import os
import sys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix

from sklearn.base import BaseEstimator, TransformerMixin

In [152]:
# Repository root used to locate the example data. Set the RANKFM_REPO_ROOT
# environment variable to point at a different checkout; the original
# location is kept as the fallback so existing setups keep working.
REPO_ROOT = os.environ.get("RANKFM_REPO_ROOT", "/Users/ericlundquist/Repos/rankfm")
# Directory holding users.csv / items.csv / ratings.csv read by the load cell.
DATA_ROOT = os.path.join(REPO_ROOT, "data/ml-100k")

### Load Example Data

In [153]:
# Read the three CSV tables stored under DATA_ROOT, then report their shapes.
users, items, ratings = (
    pd.read_csv(os.path.join(DATA_ROOT, file_name))
    for file_name in ("users.csv", "items.csv", "ratings.csv")
)

print("users: {} items: {} ratings: {}".format(users.shape, items.shape, ratings.shape))

users: (943, 5) items: (1682, 21) ratings: (100000, 4)


#### Prepare Users Data

In [154]:
# Bucket age into the left-closed bands [0, 30), [30, 45), [45, 100) (integer
# codes 0/1/2), drop the raw age and zip code columns, then one-hot encode the
# categorical attributes using "__" between the column prefix and the level.
users = (
    users
    .assign(agegroup=pd.cut(users['age'], [0, 30, 45, 100], right=False, labels=False))
    .drop(columns=['age', 'zip_code'])
    .pipe(pd.get_dummies, prefix_sep='__', columns=['agegroup', 'gender', 'occupation'])
)
users.mean()

user_id                      472.000000
agegroup__0                    0.433722
agegroup__1                    0.348887
agegroup__2                    0.217391
gender__F                      0.289502
gender__M                      0.710498
occupation__administrator      0.083775
occupation__artist             0.029692
occupation__doctor             0.007423
occupation__educator           0.100742
occupation__engineer           0.071050
occupation__entertainment      0.019088
occupation__executive          0.033934
occupation__healthcare         0.016967
occupation__homemaker          0.007423
occupation__lawyer             0.012725
occupation__librarian          0.054083
occupation__marketing          0.027572
occupation__none               0.009544
occupation__other              0.111347
occupation__programmer         0.069989
occupation__retired            0.014846
occupation__salesman           0.012725
occupation__scientist          0.032874
occupation__student            0.207847


In [155]:
# Summary statistics of the user id column (count / min / max show the id range).
users['user_id'].describe()

count    943.000000
mean     472.000000
std      272.364951
min        1.000000
25%      236.500000
50%      472.000000
75%      707.500000
max      943.000000
Name: user_id, dtype: float64

#### Prepare Items Data

In [156]:
# Keep an id -> title lookup before the name column is dropped from `items`.
item_names = items.loc[:, ['item_id', 'item_name']]
item_names.head()

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [157]:
# Drop the descriptive columns, then label the first remaining column as the
# item id and prefix every other column name with "genre__".
items = items.drop(columns=['item_name', 'release_date'])
items.columns = ['item_id', *map("genre__{}".format, items.columns[1:])]
items.mean()

item_id               841.500000
genre__action           0.149227
genre__adventure        0.080262
genre__animation        0.024970
genre__childrens        0.072533
genre__comedy           0.300238
genre__crime            0.064804
genre__documentary      0.029727
genre__drama            0.431034
genre__fantasy          0.013080
genre__film_noir        0.014269
genre__horror           0.054697
genre__musical          0.033294
genre__mystery          0.036266
genre__romance          0.146849
genre__scifi            0.060048
genre__thriller         0.149227
genre__war              0.042212
genre__western          0.016052
dtype: float64

#### Prepare Ratings Data

In [158]:
# Convert the epoch-seconds column into a datetime `timestamp` column and
# discard the raw integer column.
ratings = (
    ratings
    .assign(timestamp=pd.to_datetime(ratings['unix_timestamp'], origin='unix', unit='s'))
    .drop(columns='unix_timestamp')
)
ratings.mean()

user_id    462.48475
item_id    425.53013
rating       3.52986
dtype: float64

### Build the Sparse Model Matrix

In [462]:
def _feature_names(features):
    """Return the sorted feature column names (every column after the first), or an empty array for None."""
    if features is None:
        return np.array([], dtype=object)
    return np.sort(features.columns[1:])


def _feature_tuples(interactions_, features, id_col, feature_columns):
    """Build the (mm_row, mm_col, mm_val) records for one feature namespace; None when there are no features."""
    if features is None or features.shape[1] < 2:
        return None

    # reshape long and drop zero-valued / missing features to save space
    # (negative values are real signal and must be kept, so test `!= 0` rather than `> 0`)
    features_long = pd.melt(features, id_vars=id_col, var_name='feature_name', value_name='mm_val')
    features_long = features_long[features_long['mm_val'].notna() & (features_long['mm_val'] != 0)]

    # attach every non-zero feature to each interaction row of the matching user/item
    features_long = pd.merge(interactions_, features_long, on=id_col, how='inner')

    # translate feature names into model matrix column indexes
    features_long = pd.merge(features_long, feature_columns, left_on='feature_name', right_on='col_name', how='inner')
    return features_long[['mm_row', 'mm_col', 'mm_val']]


def fit(interactions=None, user_features=None, item_features=None):
    """build the sparse model matrix from the component feature dataframes

    Each interaction becomes one matrix row. The columns are laid out by namespace in this order:
    one indicator column per user id, one per item id, the user feature columns, the item feature
    columns, and finally a single column holding the rating.

    :param interactions: dataframe of [user_id, item_id, rating] records (required; the columns
        must be named `user_id`, `item_id`, `rating`, with user and item ids in the first two positions)
    :param user_features: optional dataframe of [user_id, user_feature_1, ... , user_feature_n] records
    :param item_features: optional dataframe of [item_id, item_feature_1, ... , item_feature_n] records
    :return: tuple (all_cols, model_matrix) where all_cols is a dataframe of [type, col_name, mm_col]
        describing every matrix column and model_matrix is a scipy csr_matrix of shape
        (len(interactions), len(all_cols))
    :raises ValueError: if interactions is None
    """

    if interactions is None:
        raise ValueError("interactions must be a dataframe of [user_id, item_id, rating] records")

    # copy the interactions, number the rows, and store all user/item/feature values
    interactions_ = interactions.assign(mm_row=np.arange(len(interactions)))
    users_ = np.sort(interactions_.iloc[:, 0].unique())
    items_ = np.sort(interactions_.iloc[:, 1].unique())

    # combine the column names from all namespaces and map to model matrix column indexes
    # (empty namespaces are skipped so they cannot disturb the dtypes of the combined frame)
    namespaces = [
        ('users', users_),
        ('items', items_),
        ('user_features', _feature_names(user_features)),
        ('item_features', _feature_names(item_features)),
        ('rating', ['rating']),
    ]
    all_cols = pd.concat(
        [pd.DataFrame({'type': kind, 'col_name': names}) for kind, names in namespaces if len(names) > 0],
        axis=0,
        ignore_index=True,
    )
    all_cols['mm_col'] = np.arange(len(all_cols))

    # separate the column names / indexes by namespace
    user_columns = all_cols[all_cols['type'] == 'users']
    item_columns = all_cols[all_cols['type'] == 'items']
    user_feature_columns = all_cols[all_cols['type'] == 'user_features']
    item_feature_columns = all_cols[all_cols['type'] == 'item_features']
    rating_column = all_cols.loc[all_cols['type'] == 'rating', 'mm_col'].squeeze()

    # create the component (row, col, val) tuples necessary to build the final sparse model matrix
    user_tuples = pd.merge(interactions_, user_columns, left_on='user_id', right_on='col_name', how='inner')[['mm_row', 'mm_col']].assign(mm_val=1)
    item_tuples = pd.merge(interactions_, item_columns, left_on='item_id', right_on='col_name', how='inner')[['mm_row', 'mm_col']].assign(mm_val=1)
    user_features_tuples = _feature_tuples(interactions_, user_features, 'user_id', user_feature_columns)
    item_features_tuples = _feature_tuples(interactions_, item_features, 'item_id', item_feature_columns)
    rating_tuples = interactions_[['mm_row', 'rating']].assign(mm_col=rating_column).rename({'rating': 'mm_val'}, axis=1)[['mm_row', 'mm_col', 'mm_val']]

    # the rating tuples are always kept; missing or empty components are left out of the concat
    components = [user_tuples, item_tuples, user_features_tuples, item_features_tuples]
    components = [part for part in components if part is not None and len(part) > 0]
    all_tuples = pd.concat(components + [rating_tuples], axis=0, ignore_index=True)

    # mixing e.g. boolean dummy columns with integer indicators can leave an object-dtype
    # value column, which scipy sparse matrices do not support, so fall back to float64
    mm_val = all_tuples['mm_val'].to_numpy()
    if mm_val.dtype == object:
        mm_val = mm_val.astype(np.float64)

    # create the final sparse model matrix
    mm_input = (mm_val, (all_tuples['mm_row'].to_numpy(), all_tuples['mm_col'].to_numpy()))
    mm_shape = (len(interactions_), len(all_cols))
    model_matrix = csr_matrix(mm_input, shape=mm_shape)
    return all_cols, model_matrix
    

#### Build the Model Matrix

In [463]:
# Keep only the three columns `fit` consumes; the timestamp is not used.
interactions = ratings.loc[:, ['user_id', 'item_id', 'rating']]

In [464]:
%%timeit

all_cols, model_matrix = fit(interactions, users, items)

315 ms ± 7.14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Do Some Diagnostic Checks

In [465]:
# Number of model matrix columns contributed by each namespace.
all_cols['type'].value_counts()

items            1682
users             943
user_features      26
item_features      18
rating              1
Name: type, dtype: int64

In [453]:
# Matrix class, shape, and size of the stored values array in MB
# (the values array only; the sparse index arrays are not included).
(
    type(model_matrix),
    model_matrix.shape,
    model_matrix.data.nbytes / 1e6,
)

(scipy.sparse.csr.csr_matrix, (100000, 2670), 6.50068)

In [456]:
# Expected number of model matrix columns: one per user row, one per item row,
# every feature column (excluding each id column), plus the rating column.
sum([len(users), len(items), users.shape[1] - 1, items.shape[1] - 1, 1])

2670

In [457]:
# Densify only the first five rows and label the columns for inspection.
mm_sample = pd.DataFrame(model_matrix[:5].todense(), columns=all_cols['col_name'])
mm_sample.head()

col_name,1,2,3,4,5,6,7,8,9,10,...,genre__film_noir,genre__horror,genre__musical,genre__mystery,genre__romance,genre__scifi,genre__thriller,genre__war,genre__western,rating
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,3
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [458]:
# The first five raw interactions, to compare against the matrix rows above.
interactions.head(5)

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [459]:
# For each sampled row, list the labels of the columns holding a positive value.
cols = mm_sample.columns
nonzero = (mm_sample > 0).apply(lambda row: list(cols[row.values]), axis=1)

for i, interaction in enumerate(nonzero):
    print("\ninteraction: {}".format(i), interaction, sep="\n")


interaction: 0
[196, 242, 'agegroup__2', 'gender__M', 'occupation__writer', 'genre__comedy', 'rating']

interaction: 1
[186, 302, 'agegroup__1', 'gender__F', 'occupation__executive', 'genre__crime', 'genre__film_noir', 'genre__mystery', 'genre__thriller', 'rating']

interaction: 2
[22, 377, 'agegroup__0', 'gender__M', 'occupation__writer', 'genre__childrens', 'genre__comedy', 'rating']

interaction: 3
[244, 51, 'agegroup__0', 'gender__M', 'occupation__technician', 'genre__drama', 'genre__romance', 'genre__war', 'genre__western', 'rating']

interaction: 4
[166, 346, 'agegroup__2', 'gender__M', 'occupation__educator', 'genre__crime', 'genre__drama', 'rating']


In [460]:
# Cross-check the user features recorded for user 22 (interaction 2 above).
users.loc[users['user_id'] == 22]

Unnamed: 0,user_id,agegroup__0,agegroup__1,agegroup__2,gender__F,gender__M,occupation__administrator,occupation__artist,occupation__doctor,occupation__educator,...,occupation__marketing,occupation__none,occupation__other,occupation__programmer,occupation__retired,occupation__salesman,occupation__scientist,occupation__student,occupation__technician,occupation__writer
21,22,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [461]:
# Cross-check the genre flags recorded for item 377 (interaction 2 above).
items.loc[items['item_id'] == 377]

Unnamed: 0,item_id,genre__action,genre__adventure,genre__animation,genre__childrens,genre__comedy,genre__crime,genre__documentary,genre__drama,genre__fantasy,genre__film_noir,genre__horror,genre__musical,genre__mystery,genre__romance,genre__scifi,genre__thriller,genre__war,genre__western
376,377,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
