In [1]:
import os
from collections import OrderedDict
from typing import Dict, List, Union, Tuple, NamedTuple

import numpy as np
import pandas as pd
import torch
import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch import nn
from torch.utils.data import DataLoader

In [2]:
# Root folder of the MovieLens-1M dump. Set the ML1M_DIR environment variable to
# point somewhere else instead of editing the notebook.
DATA_DIR = os.environ.get("ML1M_DIR", "/Volumes/ExFAT/dataset/ml-1m")
movie_data_path = os.path.join(DATA_DIR, "movies.dat")
user_data_path = os.path.join(DATA_DIR, "users.dat")
ratings_data_path = os.path.join(DATA_DIR, "ratings.dat")

# Dataset processing

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin


In [4]:
class LabelEncoder(BaseEstimator, TransformerMixin):
    """Encode single-valued categorical columns as contiguous integer ids.

    Every value is compared by its string form. After ``fit``,
    ``encoding_dict_[col]`` maps each string seen in ``col`` to an id in
    ``[0, n_unique)`` and additionally maps the sentinel key ``self.unseen``
    to ``n_unique``, the id handed to values that were not present at fit time.
    """

    def __init__(self, columns_to_encode: List[str]):
        self.columns_to_encode = columns_to_encode
        # Sentinel dictionary key under which the "unknown value" id is stored.
        self.unseen = -1

    def fit(self, df: pd.DataFrame, y=None):
        """Learn the value -> id mapping of every column in ``columns_to_encode``.

        Args:
            df: frame containing all columns listed in ``columns_to_encode``.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        df_ = df[self.columns_to_encode].astype('str')

        self.encoding_dict_ = OrderedDict()
        for col in self.columns_to_encode:
            # Ids follow the order of first appearance in the column.
            encoding_map = {val: idx for idx, val in enumerate(df_[col].unique())}
            encoding_map[self.unseen] = len(encoding_map)
            self.encoding_dict_[col] = encoding_map

        return self

    def transform(self, df: pd.DataFrame):
        """Return a copy of ``df`` with the fitted columns replaced by integer ids.

        Fitted columns that are absent from ``df`` are skipped; values that were
        not seen during ``fit`` receive the column's reserved "unseen" id.

        Raises:
            NotFittedError: if ``fit`` has not been called.
        """
        if not hasattr(self, 'encoding_dict_'):
            raise NotFittedError(
                "This LabelEncoder instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this LabelEncoder."
            )
        df_ = df.copy()

        for col, encoding_map in self.encoding_dict_.items():
            if col not in df_.columns:
                continue
            unseen_idx = encoding_map[self.unseen]
            # Look up string keys only; anything missing maps to NaN and is then
            # filled with the reserved id.
            known = {val: idx for val, idx in encoding_map.items() if val != self.unseen}
            df_[col] = df_[col].astype('str').map(known).fillna(unseen_idx).astype('int64')
        return df_

    
class MultiLabelEncoder(BaseEstimator, TransformerMixin):
    """Encode multi-valued categorical columns as lists of integer ids.

    A cell holds several labels joined by ``label_splitter`` (for example
    ``"Action|Comedy"``). After ``fit``, ``encoding_dict_[col]`` maps every
    individual label to an id and maps the sentinel key ``self.unseen`` to the
    id used for labels that were not present at fit time.
    """

    def __init__(self, columns_to_encode: List[str], label_splitter='|'):
        self.columns_to_encode = columns_to_encode
        self.label_splitter = label_splitter
        # Sentinel dictionary key under which the "unknown label" id is stored.
        self.unseen = -1

    def fit(self, df: pd.DataFrame, y=None):
        """Learn the label -> id mapping of every column in ``columns_to_encode``.

        Args:
            df: frame containing all columns listed in ``columns_to_encode``.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        df_ = df[self.columns_to_encode].astype('str')

        self.encoding_dict_ = OrderedDict()
        for col in self.columns_to_encode:
            label_2_idx = self._fetch_label2idx_from_column(df_, col)
            label_2_idx[self.unseen] = len(label_2_idx)
            self.encoding_dict_[col] = label_2_idx
        return self

    def _fetch_label2idx_from_column(self, df: pd.DataFrame, col_name: str) -> Dict[str, int]:
        """Collect the distinct labels of ``col_name`` and number them."""
        unique_label = set()
        for combination in df[col_name].tolist():
            unique_label.update(combination.split(self.label_splitter))

        # Sort before numbering: set iteration order of strings changes between
        # interpreter runs, which would make the ids irreproducible.
        return {label: idx for idx, label in enumerate(sorted(unique_label))}

    def transform(self, df: pd.DataFrame):
        """Return a copy of ``df`` whose fitted columns hold lists of label ids.

        Fitted columns that are absent from ``df`` are skipped, mirroring
        ``LabelEncoder.transform``.

        Raises:
            NotFittedError: if ``fit`` has not been called.
        """
        if not hasattr(self, 'encoding_dict_'):
            raise NotFittedError(
                "This MultiLabelEncoder instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this MultiLabelEncoder."
            )
        df_ = df.copy()

        for col, encoding_map in self.encoding_dict_.items():
            if col not in df_.columns:
                continue
            df_[col] = df_[col].astype(str).apply(
                lambda cell, mapping=encoding_map: self._encoding_fn(mapping, cell))

        return df_

    def _encoding_fn(self, encoding_map: Dict[str, int], multi_label: str) -> List[int]:
        """Split one cell into labels and translate each label to its id."""
        unseen_idx = encoding_map[self.unseen]
        return [encoding_map.get(label, unseen_idx)
                for label in multi_label.split(self.label_splitter)]
        

In [5]:
class IDFeaturesGenerator(BaseEstimator, TransformerMixin):
    """Encode id-like columns with ``LabelEncoder`` and ``MultiLabelEncoder``.

    After ``fit``, ``embed_cols_unique_labels_`` maps every encoded column to
    the size of its id space (distinct values plus the reserved "unseen" id).
    """

    def __init__(self, label_cols: List[str], multi_label_cols: Union[List[str], None] = None):
        self.label_cols = label_cols
        # A None default avoids sharing one mutable list between all instances.
        self.multi_label_cols = multi_label_cols if multi_label_cols is not None else []
        self.all_columns = self.label_cols + self.multi_label_cols

    def fit(self, df: pd.DataFrame, y=None):
        """Fit both encoders on the configured columns of ``df``.

        Args:
            df: frame containing every column in ``all_columns``.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        df_id = df[self.all_columns].copy()
        self.label_encoder_ = LabelEncoder(self.label_cols).fit(df_id)
        self.multi_label_encoder_ = MultiLabelEncoder(self.multi_label_cols).fit(df_id)

        self.embed_cols_unique_labels_ = OrderedDict()
        for encoder in (self.label_encoder_, self.multi_label_encoder_):
            self.embed_cols_unique_labels_.update(
                {col: len(mapping) for col, mapping in encoder.encoding_dict_.items()})

        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode the configured columns that are present in ``df``.

        Returns:
            A new frame holding only those columns, in ``all_columns`` order.
        """
        filtered_columns = [col for col in self.all_columns if col in df.columns]
        df_id = df[filtered_columns].copy()
        df_id = self.label_encoder_.transform(df_id)
        df_id = self.multi_label_encoder_.transform(df_id)
        return df_id
        


In [6]:
class MovieLensDataset(torch.utils.data.Dataset):
    """MovieLens ratings as (encoded id features, binary label) samples.

    Each sample is one rating row: ``X`` holds the label-encoded values of
    ``all_fields`` (user fields followed by item fields) and ``Y`` is 1 when the
    rating is strictly above ``rating_threshold`` and 0 otherwise.
    """

    def __init__(self, user_path: str, movie_path: str, rate_path: str):
        self.target_col = 'rating'
        self.rating_threshold = 3

        self.user_fields = ['userId', 'gender', 'age', 'occupation', 'zipCode']
        self.item_fields = ['movieId']

        self.all_fields = self.user_fields + self.item_fields
        # Column position of every field inside X.
        self.fields_index = {col: idx for idx, col in enumerate(self.all_fields)}

        X_df, Y_df = self._get_X_Y(user_path, movie_path, rate_path)
        self.X_df = self._X_processing(X_df)
        self.Y_df = self._Y_processing(Y_df)
        self.X = self.X_df.values  # keep in order
        self.Y = self.Y_df.values

        self.fields_dims = self._gen_field_dims()

    def __len__(self):
        return self.Y.shape[0]

    def __getitem__(self, index):
        return self.X[index], self.Y[index]

    def _get_X_Y(self, user_path: str, movie_path: str, rate_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Read the three ``::``-separated files and join them on the rating rows.

        Returns:
            ``(features, target)``: the joined frame without the rating column,
            and a one-column frame holding the raw rating.
        """
        user_columns = ["gender", "age", "occupation", "zipCode"]
        movie_columns = ['title', 'genres']
        # NOTE(review): the user and movie files presumably start with an id
        # field that is not named here, so pandas uses it as the index; the
        # merges below depend on that through right_index=True. Confirm against
        # the data files.
        user_df = pd.read_csv(user_path, sep="::", header=None, engine="python", names=user_columns)
        item_df = pd.read_csv(movie_path, sep="::", header=None, engine="python",
                              names=movie_columns).drop('title', axis=1)
        rate_df = pd.read_csv(rate_path, sep="::", engine="python", header=None,
                              names=['userId', 'movieId', 'rating', 'timestamp']).drop('timestamp', axis=1)

        rate_df = rate_df.merge(user_df, left_on=['userId'], right_index=True, how='inner')
        rate_df = rate_df.merge(item_df, left_on=['movieId'], right_index=True, how='inner').reset_index(drop=True)
        return rate_df.drop([self.target_col], axis=1), rate_df[[self.target_col]]

    def _X_processing(self, input_df) -> pd.DataFrame:
        """Fit the id encoder on ``input_df`` and return the encoded ``all_fields``."""
        self.id_feature_encoder_ = IDFeaturesGenerator(label_cols=self.all_fields).fit(input_df)
        df_label = self.id_feature_encoder_.transform(input_df)

        return df_label[self.all_fields]

    def _Y_processing(self, input_df) -> pd.DataFrame:
        """Binarise the rating column: 1 if rating > ``rating_threshold`` else 0.

        A new frame is returned; ``input_df`` itself is left untouched.
        """
        labels = (input_df[self.target_col].values > self.rating_threshold).astype(int)
        return input_df.assign(**{self.target_col: labels})

    def _gen_field_dims(self) -> Dict[str, int]:
        """Return the id-space size of every encoded field, in encoder order."""
        return OrderedDict(self.id_feature_encoder_.embed_cols_unique_labels_)
        

# FM Model

In [7]:
class FactorizationMachine(torch.nn.Module):
    """Pairwise-interaction term of a factorization machine.

    For an input of stacked field vectors ``x`` (fields along dim 1) it
    evaluates ``0.5 * ((sum_f x_f)^2 - sum_f x_f^2)`` element-wise. With
    ``reduce_sum=True`` the result is additionally summed over dim 1 of the
    reduced tensor (keeping that dim).
    """

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        field_sum = x.sum(dim=1)
        interaction = torch.square(field_sum) - torch.square(x).sum(dim=1)
        if not self.reduce_sum:
            return 0.5 * interaction
        # [b_size, 1]
        return 0.5 * interaction.sum(dim=1, keepdim=True)
    
class FactorizationMachineModel(torch.nn.Module):
    """Factorization machine over label-encoded categorical fields.

    All fields share one embedding table: a per-field offset is added to each
    input id so that every (field, label) pair addresses its own row. The
    output is a raw logit ``linear(X) + pairwise(X)``; use ``predict_probs``
    for probabilities.

    Args:
        fields_index: field name -> column position in the input tensor.
        fields_dims: field name -> number of distinct ids of that field.
        embed_dim: width of every field vector.
    """

    def __init__(self, fields_index: Dict[str, int], fields_dims: Dict[str, int], embed_dim):
        super(FactorizationMachineModel, self).__init__()
        self.fields_index = fields_index
        # Field names ordered by their column position.
        self.all_fields = [k for k, v in sorted(self.fields_index.items(), key=lambda kv: kv[1])]
        self.embed_dim = embed_dim

        self.fields_dims_dict = fields_dims
        self.fields_dims = np.array([self.fields_dims_dict[f] for f in self.all_fields])

        # Start row of every field in the shared tables, shape [1, num_fields].
        # Example: fields_dims = [10, 20, 5, 7, 9] -> offsets [0, 10, 30, 35, 42].
        # Registered as a non-persistent buffer so that .to(device) moves it with
        # the module without adding a key to the state_dict.
        offsets = torch.tensor((0, *np.cumsum(self.fields_dims)[:-1]), dtype=torch.long).unsqueeze(0)
        self.register_buffer('fields_offset', offsets, persistent=False)

        self.fields_range = self._gen_fields_range(self.fields_dims)
        total_labels = int(self.fields_dims.sum())
        self.embedding = torch.nn.Embedding(total_labels, embed_dim)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

        # First-order weights: one scalar per (field, label).
        self.fc = torch.nn.Embedding(total_labels, 1)
        self.bias = torch.nn.Parameter(torch.zeros((1, )))

        self.fm = FactorizationMachine(reduce_sum=True)

    def _forward_embedding(self, X: torch.Tensor):
        return self.embedding(X)  # [b_size, field_num, embed_dim]

    def _forward_linear(self, X: torch.Tensor):
        return torch.sum(self.fc(X), dim=1) + self.bias  # [b_size, 1]

    def get_field_vector(self, field_name: str, label: int) -> torch.Tensor:
        """Return the embedding row of encoded ``label`` within ``field_name``."""
        assert field_name in self.fields_index
        assert label < self.fields_dims_dict[field_name]

        start, end = self.fields_range[self.fields_index[field_name]]
        return self.embedding.weight.data[start:end][label]

    def _gen_fields_range(self, fields_dims: np.ndarray) -> List[Tuple[int, int]]:
        """Return the half-open ``(start, end)`` row range of every field."""
        fields_offset = [0, *np.cumsum(fields_dims)]
        return [(s, e) for s, e in zip(fields_offset, fields_offset[1:])]

    def forward(self, X: torch.Tensor):
        """Compute logits for a batch of encoded ids.

        Args:
            X: long tensor of per-field ids, shape [b_size, num_fields].

        Returns:
            Tensor of shape [b_size] holding raw (pre-sigmoid) scores.
        """
        # .to() is a no-op when the buffer already lives on X's device.
        X = X + self.fields_offset.to(X.device)
        X = self._forward_linear(X) + self.fm(self._forward_embedding(X))  # [b_size, 1]
        return X.squeeze(1)  # [b_size]

    def predict_probs(self, X: torch.Tensor):
        """Return sigmoid(forward(X)) without tracking gradients."""
        with torch.no_grad():
            out = self.forward(X)
            return torch.sigmoid(out)

# Training

##  evaluation

In [15]:
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics

@torch.no_grad()
def cal_metrics(model, X, Y):
    """Score ``model`` on one batch and return the metrics dict of ``_cal_metrics``.

    Args:
        model: module exposing ``predict_probs``.
        X: model input. A tuple or list is unpacked into positional arguments;
            anything else (for example a single tensor) is passed as one argument.
        Y: ground-truth labels for the batch.

    The model is put into eval mode for the prediction and is always switched
    back to train mode afterwards, even if the prediction raises.
    """
    # Unpacking a tensor with * would split it row by row, so only real
    # argument containers are unpacked.
    inputs = X if isinstance(X, (tuple, list)) else (X,)
    model.eval()
    try:
        y_prob = model.predict_probs(*inputs)
        return _cal_metrics(Y, y_prob)
    finally:
        model.train()


def _cal_metrics(Y: torch.Tensor, Y_prob: torch.Tensor):
    """Compute precision, recall, accuracy and ROC-AUC for binary predictions.

    Args:
        Y: ground-truth 0/1 labels (any shape; flattened internally).
        Y_prob: predicted probabilities with the same number of elements.

    Returns:
        Dict with keys ``'prec'``, ``'recall'``, ``'acc'`` and ``'auc'``. A
        metric that is undefined for the given data (empty denominator, or AUC
        with a single class in ``Y``) is reported as NaN.
    """
    y_true = Y.reshape(-1).cpu().detach().numpy()
    y_prob = Y_prob.reshape(-1).cpu().detach().numpy()
    y_pred = (y_prob > 0.5).astype(int)

    # Fixing the label set keeps the matrix 2x2 even when the batch contains a
    # single class, so the four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    nan = float('nan')
    total = tn + fp + fn + tp
    precision = tp / (tp + fp) if (tp + fp) > 0 else nan
    recall = tp / (tp + fn) if (tp + fn) > 0 else nan
    acc = (tp + tn) / total if total > 0 else nan

    # roc_auc_score raises when only one class is present in y_true.
    auc = metrics.roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else nan

    return {'prec': precision, 'recall': recall, 'acc': acc, 'auc': auc}
    

@torch.no_grad()
def cal_metrics_for_data_loader(model: nn.Module, dataloader, device=None):
    """Score ``model`` on every batch of ``dataloader`` and aggregate the metrics.

    Args:
        model: module exposing ``predict_probs``.
        dataloader: iterable yielding ``(X, Y)`` batches.
        device: optional device the batches are moved to before prediction.
            ``None`` (the default) leaves the batches where they are.

    Returns:
        The metrics dict of ``_cal_metrics`` computed over all batches at once.

    The model is put into eval mode and always switched back to train mode.
    """
    model.eval()
    total_Y = []
    total_Y_prob = []
    try:
        for X, Y in dataloader:
            if device is not None:
                X, Y = X.to(device), Y.to(device)
            total_Y_prob.append(model.predict_probs(X))
            total_Y.append(Y)

        total_Y = torch.cat(total_Y, dim=0)
        total_Y_prob = torch.cat(total_Y_prob, dim=0)
        return _cal_metrics(total_Y, total_Y_prob)
    finally:
        model.train()
    

def validation_step(model: nn.Module, X, Y, loss_fn):
    """Evaluate ``model`` on a single batch.

    Args:
        model: module whose forward pass returns one logit per sample.
        X: input batch.
        Y: ground-truth 0/1 labels for the batch.
        loss_fn: criterion taking ``(logits, targets)``.

    Returns:
        ``(loss, metrics)``: the loss as a Python float and the metrics dict of
        ``_cal_metrics``.

    The model is put into eval mode and always switched back to train mode.
    """
    model.eval()
    try:
        with torch.no_grad():
            y_pre = model(X)
            # Flatten the target as well so its shape matches the logits.
            loss = loss_fn(y_pre.view(-1).float(), Y.float().view(-1))  # mean loss
            # Reuse the logits instead of running a second forward pass.
            score = _cal_metrics(Y, torch.sigmoid(y_pre))
    finally:
        model.train()
    return loss.item(), score

def validation_step_for_data_loader(model: nn.Module, dataloader, loss_fn, device=torch.device('cpu')):
    """Evaluate ``model`` on a whole data loader.

    Every batch is moved to ``device`` and pushed through the model; the logits
    and targets of all batches are concatenated so that the loss and the
    metrics are computed once over the full set.

    Returns:
        ``(loss, metrics)``: the loss as a Python float and the metrics dict of
        ``_cal_metrics``. The model is left in train mode.
    """
    model.eval()
    with torch.no_grad():
        outputs = [(model(X.to(device)), Y.to(device)) for X, Y in dataloader]

        logits = torch.cat([batch_logits for batch_logits, _ in outputs], dim=0)
        targets = torch.cat([batch_targets for _, batch_targets in outputs], dim=0)

        loss = loss_fn(logits, targets.float().view(-1))
        scores = _cal_metrics(targets, torch.sigmoid(logits))
    model.train()
    return loss.item(), scores

## Training stage

In [8]:
%%time

SPLIT_SEED = 42  # fixes the train/validation partition across kernel restarts

dataset = MovieLensDataset(user_data_path, movie_data_path, ratings_data_path)
# 80/20 split; the validation part takes the remainder so the sizes always add up.
train_set_size = int(len(dataset) * 0.8)
valid_set_size = len(dataset) - train_set_size
train_set, valid_set = torch.utils.data.random_split(
    dataset,
    [train_set_size, valid_set_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


CPU times: user 15.6 s, sys: 1.35 s, total: 16.9 s
Wall time: 20.2 s


In [9]:
dataset.fields_index, dataset.fields_dims

({'age': 2,
  'gender': 1,
  'movieId': 5,
  'occupation': 3,
  'userId': 0,
  'zipCode': 4},
 OrderedDict([('userId', 6041),
              ('gender', 3),
              ('age', 8),
              ('occupation', 22),
              ('zipCode', 3440),
              ('movieId', 3707)]))

In [55]:
log_interval = 500  # validate and refresh the progress bar every `log_interval` training steps
epoch = 5  # number of passes over the training loader
# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [56]:
dims = 50  # width of every field vector (passed as embed_dim)
fields_index = dataset.fields_index
fields_dims = dataset.fields_dims
fm_model = FactorizationMachineModel(fields_index, fields_dims, dims)
# The model's forward pass returns raw logits, so the loss applies the sigmoid itself.
criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
optimizer = torch.optim.AdamW(params=fm_model.parameters(), lr=0.001, weight_decay=1e-6)

In [57]:

fm_model = fm_model.to(device)
train_data_loader = torch.utils.data.DataLoader(train_set, batch_size=300, shuffle=True)
valid_data_loader = torch.utils.data.DataLoader(valid_set, batch_size=300, shuffle=False)

# `count` and `total_loss` describe the same window: the steps since the last
# validation log. Both are initialised once and reset together, so a window
# that straddles an epoch boundary still averages over exactly `log_interval`
# steps.
count = 0
total_loss = 0
for epoch_i in range(epoch):
    tk0 = tqdm.tqdm(train_data_loader, smoothing=0, mininterval=1.0, position=0, leave=True)
    for i, (X, Y) in enumerate(tk0):
        X, Y = X.to(device), Y.to(device)
        # The validation helper below flips the mode, so make it explicit here.
        fm_model.train()
        y_pre = fm_model(X)
        loss = criterion(y_pre.view(-1), Y.float().view(-1))

        fm_model.zero_grad()
        loss.backward()

        optimizer.step()

        total_loss += loss.item()
        count += 1
        if count % log_interval == 0:
            valid_loss, score = validation_step_for_data_loader(fm_model, valid_data_loader, criterion, device)
            tk0.set_postfix(train_loss=total_loss/log_interval, valid_loss=valid_loss, metrics={k: np.round(v, 4) for k, v in score.items()})
            total_loss = 0
            count = 0

100%|██████████| 2668/2668 [01:15<00:00, 35.29it/s, metrics={'prec': 0.7313, 'recall': 0.7909, 'acc': 0.7125, 'auc': 0.7744}, train_loss=0.568, valid_loss=0.564]
100%|██████████| 2668/2668 [01:26<00:00, 30.98it/s, metrics={'prec': 0.7367, 'recall': 0.7872, 'acc': 0.7157, 'auc': 0.7792}, train_loss=0.556, valid_loss=0.559]
100%|██████████| 2668/2668 [01:24<00:00, 31.46it/s, metrics={'prec': 0.7368, 'recall': 0.8012, 'acc': 0.721, 'auc': 0.7857}, train_loss=0.546, valid_loss=0.552]
100%|██████████| 2668/2668 [01:11<00:00, 37.38it/s, metrics={'prec': 0.7455, 'recall': 0.7913, 'acc': 0.7245, 'auc': 0.7899}, train_loss=0.538, valid_loss=0.546]
100%|██████████| 2668/2668 [01:32<00:00, 28.93it/s, metrics={'prec': 0.7458, 'recall': 0.8, 'acc': 0.7281, 'auc': 0.7945}, train_loss=0.522, valid_loss=0.542]


In [58]:
# Pass the training device explicitly: the helper defaults to the CPU, which
# would leave the batches on a different device than a GPU-resident model.
validation_step_for_data_loader(fm_model, valid_data_loader, criterion, device)

(0.5396221280097961,
 {'acc': 0.7284720208756161,
  'auc': 0.7961434666838393,
  'prec': 0.7469499557022214,
  'recall': 0.7985436471385621})

# Item Embedding & User Embedding

In [19]:
from collections import namedtuple
ID2Vector = namedtuple('id2vector', 'id vector')


class FMEmbedding:
    """Export user-side and item-side vectors from a trained FM model.

    The vectors are laid out so that one dot product between a user vector and
    an item vector gives the user-side score, the item-side score and the
    user-item interaction in a single number:

        user = [1, s_user, sum of user field vectors]
        item = [s_item, 1, sum of item field vectors]

    where ``s_*`` is the sum of first-order weights plus the pairwise
    interactions among the fields of that side.

    NOTE(review): the model's global bias is stored in ``self.bias`` but is not
    part of either vector, so ``sigmoid(user . item)`` differs from the model's
    probability by that bias; the ordering of items is not affected.

    Args:
        id_encoder: fitted encoder whose ``transform`` turns raw columns into ids.
        user_fields: names of the user-side fields.
        item_fields: names of the item-side fields.
        fm_model: trained model exposing ``fields_dims_dict``, ``fields_index``,
            ``fields_dims``, ``embedding``, ``fc`` and ``bias``.
    """

    def __init__(self, id_encoder, user_fields, item_fields, fm_model):
        self.id_encoder = id_encoder
        self.user_fields = user_fields
        self.item_fields = item_fields
        self.fm_model = fm_model

        self.fields_dims_dict = fm_model.fields_dims_dict
        self.fields_index = fm_model.fields_index
        self.fields_dims = fm_model.fields_dims
        self.fields_range = self._gen_fields_range(self.fields_dims)

        self.fields_vectors = self._to_numpy(self.fm_model.embedding.weight)  # hidden vectors
        self.fields_weights = self._to_numpy(self.fm_model.fc.weight)
        self.bias = self._to_numpy(self.fm_model.bias)

    @staticmethod
    def _to_numpy(tensor) -> np.ndarray:
        """Convert a parameter to numpy, whatever device it lives on."""
        return tensor.detach().cpu().numpy()

    def _side_components(self, primitive_features: pd.DataFrame, fields: List[str]) -> Tuple[np.ndarray, np.ndarray]:
        """Compute the scalar score and the summed field vector of one side.

        Returns:
            ``(scalar, vectors)`` with shapes [b_size] and [b_size, vector_dims].
        """
        # Re-select the columns in `fields` order so that they line up with the
        # offsets below regardless of the order the encoder returns them in.
        encoded = self.id_encoder.transform(primitive_features)[list(fields)].to_numpy(dtype=np.int64)
        fields_index = [self.fields_index[col] for col in fields]

        # Start row of each selected field inside the shared tables.
        fields_offset = np.array([0, *np.cumsum(self.fields_dims)[:-1]])[fields_index].reshape((1, -1))
        global_ids = encoded + fields_offset

        vectors = np.take(self.fields_vectors, global_ids, axis=0)  # [b_size, field_size, vector_dims]
        cross = self._cross_vector(vectors)  # [b_size]
        weights = np.take(self.fields_weights, global_ids, axis=0).squeeze(2).sum(axis=1)  # [b_size]

        return cross + weights, vectors.sum(axis=1)

    def get_user_embedding(self, primitive_features: pd.DataFrame, userId_column='userId') -> List[ID2Vector]:
        """Build ``[1, s_user, summed user vectors]`` for every row.

        Args:
            primitive_features: raw frame whose columns are exactly ``user_fields``.
            userId_column: column whose raw values become the ``id`` of each result.
        """
        assert set(primitive_features.columns) == set(self.user_fields)
        scalar, vectors = self._side_components(primitive_features, self.user_fields)

        ones = np.ones((len(scalar), 1))  # [b_size, 1]
        embedding = np.hstack([ones, np.expand_dims(scalar, 1), vectors])

        ids = primitive_features[userId_column].tolist()
        return [ID2Vector(id_, vector) for id_, vector in zip(ids, embedding)]

    def get_item_embedding(self, primitive_features: pd.DataFrame, itemId_col='itemId') -> List[ID2Vector]:
        """Build ``[s_item, 1, summed item vectors]`` for every row.

        Args:
            primitive_features: raw frame whose columns are exactly
                ``item_fields``; it may contain item ids unseen during training.
            itemId_col: column whose raw values become the ``id`` of each result.
        """
        assert set(primitive_features.columns) == set(self.item_fields)
        scalar, vectors = self._side_components(primitive_features, self.item_fields)

        ones = np.ones((len(scalar), 1))  # [b_size, 1]
        embedding = np.hstack([np.expand_dims(scalar, 1), ones, vectors])

        ids = primitive_features[itemId_col].tolist()
        return [ID2Vector(id_, vector) for id_, vector in zip(ids, embedding)]

    def _l2_norm(self, vectors: np.ndarray) -> np.ndarray:
        """Scale every row to unit L2 norm; all-zero rows are returned unchanged."""
        l2norm = np.linalg.norm(vectors, 2, axis=1, keepdims=True)  # [b_size, 1]
        return vectors / np.where(l2norm == 0, 1.0, l2norm)

    def _cross_vector(self, vectors: np.ndarray) -> np.ndarray:
        """Sum of pairwise dot products among the field vectors of each row.

        Args:
            vectors: array of shape [b_size, field_size, vector_dims].

        Returns:
            Array of shape [b_size].
        """
        square_of_sum = np.sum(vectors, axis=1) ** 2
        sum_of_square = np.sum(vectors ** 2, axis=1)
        reduce_sum = np.sum(square_of_sum - sum_of_square, axis=1)  # [b_size]
        return 0.5 * reduce_sum

    def _get_field_vector(self, field_name: str, label: int) -> np.ndarray:
        """Return the hidden vector of encoded ``label`` within ``field_name``."""
        assert field_name in self.fields_index
        assert label < self.fields_dims_dict[field_name]

        start, end = self.fields_range[self.fields_index[field_name]]
        return self.fields_vectors[start:end][label]

    def _gen_fields_range(self, fields_dims: np.ndarray) -> List[Tuple[int, int]]:
        """Return the half-open ``(start, end)`` row range of every field."""
        fields_offset = [0, *np.cumsum(fields_dims)]
        return [(s, e) for s, e in zip(fields_offset, fields_offset[1:])]
    


In [20]:
# Re-read the raw user and movie tables so that every user/movie gets a vector.
user_columns = ["gender", "age", "occupation", "zipCode"] 
movie_columns = ['title', 'genres']
# NOTE(review): the leading id field of each file is presumably picked up as the
# index because it is not named here; the reset_index calls below turn it into
# the 'movieId' / 'userId' columns. Confirm against the data files.
user_df = pd.read_csv(user_data_path, sep="::", header=None, engine="python", names=user_columns)
item_df = pd.read_csv(movie_data_path, sep="::", header=None, engine="python", names=movie_columns).drop('title', axis=1)
# Only the id is kept on the item side.
item_df = item_df.reset_index().rename({'index': 'movieId'}, axis=1)[['movieId']]

user_df = user_df.reset_index().rename({'index': 'userId'}, axis=1)

In [21]:
# Reuse the encoder fitted by the dataset so raw ids map to the rows the model was trained on.
id_encoder = dataset.id_feature_encoder_
user_field = dataset.user_fields
item_field = dataset.item_fields

fm_embedding = FMEmbedding(id_encoder, user_field, item_field, fm_model)

# Each result is a list of (raw id, vector) pairs.
item_id_2_vector = fm_embedding.get_item_embedding(item_df, 'movieId')
user_id_2_vector = fm_embedding.get_user_embedding(user_df, 'userId')

len(item_id_2_vector), len(user_id_2_vector)

(3883, 6040)

In [72]:
user_id_2_vector[0]

id2vector(id=1, vector=array([ 1.        ,  1.57251287,  0.31802261, -0.05311833, -0.51956671,
       -0.41219246, -0.24985576, -0.65933889,  0.59215838,  0.41595197,
        0.11497089, -0.69019455,  0.24329846,  0.45048714, -0.05286196,
        0.2054476 ,  0.30616254,  0.30125472,  0.40432882,  0.26946735,
       -0.28497586,  0.23863903,  0.37228948, -0.47429329, -0.21513283,
       -0.37172595, -0.29311907,  0.32599026, -0.37352464, -0.65939361,
       -0.13792202, -0.29631072, -0.70424175, -0.03108542, -0.47497728,
        0.65669644, -0.0536177 , -0.46591866,  0.04575365, -0.47661048,
       -0.23746635, -0.48891699, -0.39626658,  0.34011322,  0.42570654,
       -0.53802925, -0.32868454,  0.31107759,  0.65648586,  0.36503255,
        0.22789076,  0.56083   ]))

In [71]:
item_id_2_vector[0]

id2vector(id=1, vector=array([-1.6513232 ,  1.        , -0.15136771, -0.19989893, -0.17711505,
       -0.27605495, -0.17580783, -0.20358004,  0.19178246,  0.28852561,
        0.19627182, -0.21493186,  0.22243072,  0.24396233, -0.17809424,
        0.19624405,  0.21878579,  0.20799968,  0.12430456,  0.18400647,
       -0.18915175,  0.17806746,  0.21123466, -0.15553948, -0.20221886,
       -0.23133394, -0.19063245,  0.25346702, -0.19234048, -0.20784694,
       -0.12557872, -0.25455537, -0.15617739, -0.22168253, -0.14932276,
        0.25438485,  0.19298089, -0.23894864, -0.26424453, -0.18523659,
        0.20755866, -0.17146595, -0.1505574 ,  0.26266149,  0.12615746,
       -0.16710922, -0.19842891,  0.20556726,  0.16274993,  0.14940131,
        0.16785385,  0.24925581]))


# Task
* I2I match
* U2I match
* coarse ranking

In [22]:
from scipy import spatial
import faiss

In [23]:
class VectorProvider:
    """In-memory lookup from an id to its embedding vector.

    Args:
        id2vector: sequence of ``(id, vector)`` pairs, e.g. the ``ID2Vector``
            tuples returned by ``FMEmbedding``. When an id occurs more than
            once, the last pair wins.
    """

    def __init__(self, id2vector: List[Tuple[int, np.ndarray]]):
        self.ID2Vector = {id_: vec for id_, vec in id2vector}

    def __getitem__(self, id_):
        """Return the vector stored for ``id_``; raises KeyError if unknown."""
        return self.ID2Vector[id_]

    def __len__(self):
        return len(self.ID2Vector)

## I2I  match
We can retrieve similar items for a given item by searching the item embeddings.

offline:
    push embedding into faiss

In [24]:
# Split the (id, vector) pairs into parallel arrays of ids and embeddings.
all_item_id, item_embedding = zip(*item_id_2_vector)
all_item_id = np.array(all_item_id)
item_embedding = np.array(item_embedding)
# Inner-product index sized to the embedding width.
movie_embedding_faiss_index = faiss.IndexFlatIP(item_embedding.shape[1])
# The id-map wrapper lets searches return the movie ids supplied below.
movie_id_faiss_index = faiss.IndexIDMap(movie_embedding_faiss_index)
# The vectors are cast to float32 before being added.
movie_id_faiss_index.add_with_ids(item_embedding.astype('float32'), np.array(all_item_id))

### online match
match I2I online with faiss

In [25]:
item_vector_provider = VectorProvider(item_id_2_vector)

In [44]:
'''
online
    fetch movie ids and their corresponding vectors which user have interacted
'''
# Example interaction history; each id must exist in item_vector_provider.
user_seen_movie_id = [10 , 20, 50, 70]
# Stack the looked-up item vectors into one 2-D query array.
vectors = np.array([item_vector_provider[id_] for id_ in user_seen_movie_id])
vectors.shape

(4, 52)

In [27]:
each_match_num = 20  # neighbours requested per query vector
# One row of scores and one row of movie ids per query vector.
sim_score, movie_ids = movie_id_faiss_index.search(vectors.astype('float32'), each_match_num)
sim_score.shape, movie_ids.shape

((4, 20), (4, 20))

In [28]:
movie_ids.flatten()

array([ 810, 2818, 3667, 1322, 3564, 2449, 1826, 2298, 1520, 2368, 2383,
       1324, 1853, 3042, 1328, 3041, 2534,  470,  460, 3463, 2818, 3458,
       3463, 3778, 1764, 3772, 3945, 2982,  240, 1538, 1714, 2974, 1011,
        393, 1983,  853, 1760, 3534, 3440, 1556, 2562, 3359, 1262, 2819,
        475, 1910, 1206, 2503, 3281, 2731,  265,  551,  527,  722,  326,
        853, 1268,  746, 1250, 1188, 3778, 2562,  853,  240, 1983, 1910,
        551, 3534, 1268, 3458, 1058, 2299, 3359,  475, 1206, 1730, 2819,
       1764,  722, 1011])

### offline match
Alternatively, we can compute every item's neighbours offline and store the I2I table in Redis or another low-latency database.


In [29]:
each_match_num = 30  # neighbours requested per item
# Query the index with every item embedding at once (rows follow all_item_id).
sim_score, match_movie_ids = movie_id_faiss_index.search(item_embedding.astype('float32'), each_match_num)
sim_score.shape, match_movie_ids.shape

((3883, 30), (3883, 30))

In [30]:
# One record per trigger item: {'trigger_key': id, 'pairs': [{'key', 'score'}, ...]}.
key_2_I = []
for trigger_id, match_ids, match_scores in zip(all_item_id, match_movie_ids, sim_score):
    # Drop the trigger itself from its own neighbour list. The comparison must
    # use the neighbour id of the current pair, not a variable left over from
    # an earlier cell.
    pairs = [
        {'key': id_, 'score': s}
        for id_, s in zip(match_ids, match_scores)
        if id_ != trigger_id
    ]
    key_2_I.append({'trigger_key': trigger_id, 'pairs': pairs})
    

In [54]:
'''
    now, we have each items' match. we could push them into low latency/RT DB such as Radis
'''
print(len(key_2_I))
key_2_I[0]

3883


{'pairs': [{'key': 2562, 'score': 8.487403},
  {'key': 3359, 'score': 7.370528},
  {'key': 853, 'score': 7.3140664},
  {'key': 1910, 'score': 7.243814},
  {'key': 3778, 'score': 7.064466},
  {'key': 2819, 'score': 6.9514284},
  {'key': 475, 'score': 6.9262433},
  {'key': 551, 'score': 6.886018},
  {'key': 1262, 'score': 6.7728105},
  {'key': 1206, 'score': 6.7252},
  {'key': 1268, 'score': 6.680213},
  {'key': 265, 'score': 6.674436},
  {'key': 722, 'score': 6.644379},
  {'key': 3281, 'score': 6.6103525},
  {'key': 2299, 'score': 6.536831},
  {'key': 1983, 'score': 6.5208454},
  {'key': 1188, 'score': 6.499143},
  {'key': 1730, 'score': 6.414803},
  {'key': 240, 'score': 6.3780065},
  {'key': 2757, 'score': 6.29891},
  {'key': 2731, 'score': 6.23072},
  {'key': 3534, 'score': 6.217514},
  {'key': 746, 'score': 6.1960998},
  {'key': 326, 'score': 6.194806},
  {'key': 3296, 'score': 6.143187},
  {'key': 1058, 'score': 6.070843},
  {'key': 2503, 'score': 6.015757},
  {'key': 1187, 'score'

## U2I match
We can retrieve items for a user by searching the item index with the user's embedding.

In [32]:
'''
    Offline
      push item embedding into faiss
'''
# Same construction as the item index in the I2I section, repeated so this section runs on its own.
all_item_id, item_embedding = zip(*item_id_2_vector)
all_item_id = np.array(all_item_id)
item_embedding = np.array(item_embedding)
# Inner-product index sized to the embedding width.
movie_embedding_faiss_index = faiss.IndexFlatIP(item_embedding.shape[1])
# The id-map wrapper lets searches return the movie ids supplied below.
movie_id_faiss_index = faiss.IndexIDMap(movie_embedding_faiss_index)
movie_id_faiss_index.add_with_ids(item_embedding.astype('float32'), np.array(all_item_id))

In [33]:
'''
    Offline
        push user embedding into faiss
'''
# Split the (id, vector) pairs into parallel arrays of user ids and embeddings.
all_user_id, user_embedding = zip(*user_id_2_vector)
all_user_id = np.array(all_user_id)
user_embedding = np.array(user_embedding)
# Inner-product index over the user vectors, addressed by user id.
user_embedding_faiss_index = faiss.IndexFlatIP(user_embedding.shape[1])
user_id_faiss_index = faiss.IndexIDMap(user_embedding_faiss_index)
user_id_faiss_index.add_with_ids(user_embedding.astype('float32'), np.array(all_user_id))


In [34]:
'''
    Online
        fetch user embedding first
'''
user_vector_provider = VectorProvider(user_id_2_vector)
query_user_id = 500
u_embedding = user_vector_provider[query_user_id]
u_embedding

array([ 1.        ,  0.39185381,  0.19535223, -0.20054442, -0.20946348,
       -0.36996123, -0.08126716, -0.44307205,  0.22863899, -0.03199186,
        0.17294753, -0.19460428,  0.16487858,  0.34271821, -0.35793361,
       -0.02658781, -0.00436664,  0.20797843,  0.02937129, -0.06523762,
       -0.1621742 , -0.12532774,  0.2529372 , -0.21797934, -0.20383394,
       -0.20071481, -0.32262015,  0.16958363, -0.02863088, -0.04059249,
       -0.18969983, -0.3594957 , -0.54900992, -0.18492299, -0.0788543 ,
       -0.09508141, -0.07884572,  0.12967639, -0.30631033, -0.3674185 ,
       -0.1241807 , -0.18993324,  0.07831126,  0.23598528,  0.12474939,
       -0.23666811, -0.11751484,  0.0548125 ,  0.21935403,  0.07483938,
        0.02026522, -0.4154602 ])

In [38]:
'''
    excute u2i
'''
topk = 20  # number of items to retrieve for the user
# The search call receives a 2-D batch, so the single user vector gets a leading axis.
sim_scores, match_movie_ids = movie_id_faiss_index.search(np.expand_dims(u_embedding.astype('float32'), 0), topk)
''' now we have socre and match item ids'''
sim_scores, match_movie_ids 

(array([[4.173138 , 3.2012074, 3.1826928, 3.0340588, 2.753229 , 2.7324831,
         2.664315 , 2.6517272, 2.6254842, 2.6195984, 2.5988796, 2.5158465,
         2.511135 , 2.4963562, 2.4919398, 2.4910007, 2.4246888, 2.409114 ,
         2.3496847, 2.3489919]], dtype=float32),
 array([[3460,  545, 1249, 2839, 1086, 3888,  758, 2243, 2584, 1235, 3542,
         3547,   73, 3647, 2309,  868, 1934, 1471, 3140, 1002]]))

## Rank task
If several match methods (C2I, content I2I, Swing I2I, ...) each return a large set of candidate items, we may need a coarse ranking step before sending this large pool of items to the fine ranker.

In [39]:
def sigmoid(input_: np.ndarray):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Only ``exp`` of non-positive numbers is evaluated, so large-magnitude
    inputs of either sign cannot overflow.

    Args:
        input_: array (or scalar) of real-valued scores.

    Returns:
        Array of the same shape with values in [0, 1].
    """
    x = np.asarray(input_)
    z = np.exp(-np.abs(x))  # always in (0, 1]
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function.
    return np.where(x >= 0, 1 / (1 + z), z / (1 + z))

In [40]:
'''
    online vector provider
'''
item_vector_provider = VectorProvider(item_id_2_vector)
user_vector_provider = VectorProvider(user_id_2_vector)

In [46]:
'''
    suppost we have the following match items from different methods
'''
uid = 1000
MATCH_SEED = 0  # makes the simulated candidate sets reproducible
match_rng = np.random.RandomState(MATCH_SEED)
# Build the pool of known movie ids once; each "method" samples 30 ids from it
# with replacement.
candidate_movie_ids = np.array(list(item_vector_provider.ID2Vector.keys()))
match_from_A = match_rng.choice(candidate_movie_ids, size=30)
match_from_B = match_rng.choice(candidate_movie_ids, size=30)
match_from_C = match_rng.choice(candidate_movie_ids, size=30)
all_match = np.hstack([match_from_A, match_from_B, match_from_C])
len(all_match), all_match

(90, array([2050, 2211, 3185, 2229, 2209, 1596,  702, 2365, 1055, 1648, 3004,
         115, 2738, 3064, 1339,  975,  969, 3935, 1892, 2269,  759, 3348,
        2523, 2908, 1233, 1819, 3604, 1731, 2546,  111, 3287, 1855, 2507,
        1394, 2937,  618, 1032, 1366, 1207,  246, 1180, 1776, 2745,  717,
        2421, 3203,  356,  870, 3888, 1930,   93, 3900, 1340, 3431, 2167,
        1441, 3261, 2157, 3465, 3094, 1608,  562, 2401,  790, 3226,  338,
        2272, 2980, 3610,  656, 2031, 2719,  950, 1490, 3570, 1980,  270,
        3082, 2814, 3645,  621, 3147,   20,  835, 1997,  621, 3490, 2082,
        1244,  518]))

In [47]:
'''
    we need their vector to compute similairty score
'''
u_vector = user_vector_provider[uid]
i_vectors = np.array([item_vector_provider[movie_id] for movie_id in all_match])
u_vector.shape, i_vectors.shape


((52,), (90, 52))

In [48]:
'''
    compute the probability of the userId liking these movies
'''
# One dot product per candidate item against the user's vector.
scores = np.dot(i_vectors, u_vector)
# NOTE(review): the exported vectors appear to leave out the model's global bias
# (its use in FMEmbedding is commented out), so these values may differ from
# fm_model.predict_probs by that bias term -- confirm if calibrated probabilities matter.
predicted_prob = sigmoid(scores)
predicted_prob

array([0.33590833, 0.86267896, 0.54783987, 0.39107268, 0.60883679,
       0.75336175, 0.79866159, 0.33833122, 0.34040079, 0.85555893,
       0.26738468, 0.39107268, 0.75699356, 0.28491491, 0.602693  ,
       0.3113216 , 0.95712211, 0.33731565, 0.69468696, 0.27277618,
       0.91938752, 0.39107268, 0.18719235, 0.91121389, 0.96071996,
       0.39107268, 0.7464813 , 0.08801003, 0.35710347, 0.93656007,
       0.3688742 , 0.28884342, 0.61531689, 0.88622646, 0.90222217,
       0.15696627, 0.77505876, 0.66975122, 0.98230138, 0.96528168,
       0.92719865, 0.39107268, 0.84378158, 0.79882062, 0.20848772,
       0.86573958, 0.89256964, 0.18088923, 0.98779076, 0.95272191,
       0.24272658, 0.73399055, 0.83591797, 0.37842401, 0.58474354,
       0.77333044, 0.7288076 , 0.69604849, 0.27603054, 0.80239154,
       0.72072302, 0.86574538, 0.76624671, 0.09432834, 0.39107268,
       0.38262288, 0.76975123, 0.39107268, 0.12335406, 0.32328629,
       0.32221952, 0.159449  , 0.96235551, 0.05106423, 0.64326

In [53]:
'''
    If we only need the order of movies, 
    we don't need to compute sigmoid which is time consuming
'''

(all_match[np.argsort(predicted_prob)[::-1]] == all_match[np.argsort(scores)[::-1]]).all()

True

#### debug

In [63]:
cross_i = i_vectors[0][2:]

In [65]:
cross_i

array([ 0.37054822,  0.13287237,  0.06564476,  0.07536279,  0.17927961,
        0.10481395, -0.02779581, -0.08033053, -0.1240812 ,  0.08976817,
       -0.16599716, -0.13023286,  0.10579411, -0.1032544 , -0.14951137,
       -0.18511195, -0.17597082, -0.20446178,  0.1293332 , -0.22905825,
       -0.08866128,  0.12667163,  0.11225216,  0.05634275,  0.16195573,
       -0.14689794,  0.16846056,  0.10221358,  0.18979949,  0.10170981,
        0.05192529,  0.13978741,  0.25230175, -0.08643388, -0.09486112,
        0.13592705,  0.1512354 ,  0.11873481, -0.15848108,  0.11546322,
        0.12418077, -0.10911771, -0.13562456,  0.11305237,  0.11150885,
       -0.12580965, -0.14510785, -0.19855882, -0.12131623, -0.18957341])