In [21]:
!pip install lightfm

Processing ./.cache/pip/wheels/f0/cd/a5/b07914aa223c05ed61880d4c59f64a7febf117dbd2c2cbcf49/lightfm-1.15-cp37-cp37m-linux_x86_64.whl
Installing collected packages: lightfm
Successfully installed lightfm-1.15


In [28]:
import sys
import pandas as pd
import numpy as np

from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import auc_score

import scipy.sparse as sps

In [438]:
# NOTE(review): `generate_int_id` is not defined in the visible part of this
# notebook, and df_clientes / df_produtos are only loaded from the database in
# a cell further down, so this cell fails on a fresh kernel run from top to
# bottom. Presumably it adds an integer id column with the given name to each
# frame -- confirm, and define/import the helper or remove the cell.
df_clientes = generate_int_id(df_clientes, 'cliente_id_num')
df_produtos = generate_int_id(df_produtos, 'produto_id_num')

In [11]:
def generate_feature_list(dataframe, features_name):
    """
    Flatten the selected feature columns into one Series of feature tokens.

    Every value of the selected columns is converted to ``str``; the values
    of a row are joined with ',' and split on ',' again, so a value that
    itself contains commas contributes several tokens. Duplicates are kept.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the feature columns.
    features_name : list
        Names of the feature columns available in ``dataframe``.

    Returns
    -------
    pandas.Series
        All feature tokens in row-major order with a fresh 0..n-1 index.
        Empty when ``dataframe`` has no rows.
    """
    if len(dataframe) == 0:
        # Nothing to tokenise: return an empty Series instead of running the
        # row-wise apply on an empty frame.
        return pd.Series([], dtype=object)

    joined_rows = dataframe[features_name].apply(
        lambda row: ','.join(row.map(str)), axis=1)
    # Flatten with a plain comprehension instead of building one pd.Series
    # per row and stacking the resulting frame; the tokens and their order
    # are the same, without depending on stack() dropping NaN padding.
    tokens = [token
              for row_tokens in joined_rows.str.split(',')
              for token in row_tokens]
    return pd.Series(tokens, dtype=object)

def create_features(dataframe, features_name, id_col_name):
    """
    Pair every row's id with its list of feature tokens for LightFM.

    The values of the selected columns are converted to ``str``, joined
    with ',' per row and split on ',' again to obtain the token list.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing both the id column and the feature columns.
    features_name : list
        Names of the feature columns available in ``dataframe``.
    id_col_name : str
        Name of the column holding the id each feature list belongs to.

    Returns
    -------
    list of tuple
        One ``(id, [token, ...])`` pair per row, in row order.
        Ex. -> (1, ['military', 'army', '5'])
    """
    joined_rows = dataframe[features_name].apply(
        lambda row: ','.join(row.map(str)), axis=1)
    token_lists = joined_rows.str.split(',')
    return [
        (row_id, tokens)
        for row_id, tokens in zip(dataframe[id_col_name], token_lists)
    ]

In [6]:
from sqlalchemy import create_engine
import pymysql

import os

# Read the connection URL from the environment so credentials do not have to
# live in the notebook. The fallback is the original development URL, kept
# only for backward compatibility -- set INVEST_DB_URL and remove the default
# before sharing this notebook.
db_connection_str = os.environ.get(
    'INVEST_DB_URL', 'mysql+pymysql://root:123@db/invest')
db_connection = create_engine(db_connection_str)

# Load the three tables in full; each frame keeps the default index produced
# by read_sql.
df_clientes = pd.read_sql('SELECT * FROM cli_Clientes', con=db_connection)
df_cliente_produto = pd.read_sql('SELECT * FROM cli_Produtos', con=db_connection)
df_produtos = pd.read_sql('SELECT * FROM Produtos', con=db_connection)

In [13]:
# Collect every feature token that occurs for clients and for products; these
# lists are later handed to Dataset.fit as the known feature vocabularies.
cliente_feature_list = generate_feature_list(
    dataframe=df_clientes,
    features_name=['perfilinvestidor', 'idade', 'genero', 'rendamensal'],
)
produto_feature_list = generate_feature_list(
    dataframe=df_produtos,
    features_name=['RiscoAtivo'],
)

In [55]:
# Binary client x product interaction matrix: one row per client, one column
# per product, 1 where the client holds the product.
user_item_matrix = sps.lil_matrix((len(df_clientes), len(df_produtos)), dtype=np.int8)

# NOTE(review): ids are used as 1-based positions (id - 1 -> row/column).
# This assumes userid / produtoid are contiguous 1..N and that the resulting
# positions line up with the internal indices of the LightFM Dataset mapping
# used for the feature matrices -- confirm, or build the interactions with
# Dataset.build_interactions instead.
#
# Positional arrays (to_numpy) avoid the label-based df[col][i] lookup, which
# depends on the frame having a default 0..n-1 index, and a single fancy-index
# assignment replaces the per-row Python loop and its stdout progress writes.
interaction_rows = df_cliente_produto['userid'].to_numpy(dtype=np.int64) - 1
interaction_cols = df_cliente_produto['produtoid'].to_numpy(dtype=np.int64) - 1
user_item_matrix[interaction_rows, interaction_cols] = 1

Processing row 56035/ 56036

<function print>

In [80]:
# Register the client ids, product ids and both feature vocabularies with a
# LightFM Dataset so it can build its id/feature mappings.
dataset = Dataset()
dataset.fit(
    users=set(df_clientes['userid']),
    items=set(df_produtos['dataId']),
    item_features=produto_feature_list,
    user_features=cliente_feature_list,
)

In [81]:
# Store the (id, [feature tokens]) pairs as a new column on each frame.
# Note that this adds a column to df_clientes / df_produtos in place.
df_clientes['cli_features'] = create_features(
    dataframe=df_clientes,
    features_name=['perfilinvestidor', 'idade', 'genero', 'rendamensal'],
    id_col_name='userid',
)

df_produtos['prod_features'] = create_features(
    dataframe=df_produtos,
    features_name=['RiscoAtivo'],
    id_col_name='dataId',
)

In [90]:
# Turn the (id, [feature tokens]) pairs into the feature matrices that are
# passed to the model for clients and products.
cli_features = dataset.build_user_features(data=df_clientes['cli_features'])
prod_features = dataset.build_item_features(data=df_produtos['prod_features'])

In [104]:
# Hybrid LightFM model trained with the WARP loss; the seed is fixed so the
# run is repeatable.
model = LightFM(
    loss='warp',
    no_components=10,
    learning_rate=0.05,
    random_state=2019,
)

# Train on the full interaction matrix together with both feature matrices.
model.fit(
    interactions=user_item_matrix,
    user_features=cli_features,
    item_features=prod_features,
    epochs=5,
    num_threads=4,
    verbose=True,
)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


<lightfm.lightfm.LightFM at 0x7f487a4a5650>

In [109]:
# Score every product position 0..len(df_produtos)-1 for the user passed as
# 1024.
# NOTE(review): 1024 is presumably an internal row index of the interaction
# matrix rather than a raw userid -- confirm.
scores = model.predict(
    1024,
    np.arange(len(df_produtos), dtype=np.int32),
    item_features=prod_features,
    user_features=cli_features,
)

# The scores are written onto df_produtos itself (a 'scores' column is added
# in place); the eight best-scoring rows are then kept for display.
df_produtos['scores'] = scores
df_use_for_prediction = df_produtos.sort_values(by='scores', ascending=False).head(8)

print('Recommended Products: ')
display(df_use_for_prediction[['dataId', 'RiscoAtivo']])

Recommended Products: 


Unnamed: 0,dataId,RiscoAtivo
5971,5971,0.0
6084,6084,2.0
6081,6081,4.0
5385,5385,1.0
2964,2964,4.0
6137,6137,0.0
2963,2963,5.0
2959,2959,0.0


In [110]:
def calculate_auc_score(lightfm_model, interactions_matrix,
                        question_features, professional_features,
                        num_threads=4):
    """
    Measure the mean ROC AUC metric for a model.
    A perfect score is 1.0.

    Parameters
    ----------
    lightfm_model : LightFM model
        A fitted lightfm model.
    interactions_matrix :
        A lightfm interactions matrix to evaluate against.
    question_features :
        Lightfm features forwarded to ``auc_score`` as ``item_features``.
    professional_features :
        Lightfm features forwarded to ``auc_score`` as ``user_features``.
    num_threads : int, optional
        Number of threads forwarded to ``auc_score``. Defaults to 4, the
        value that was previously hard-coded.

    Returns
    -------
    The ``.mean()`` of the scores returned by ``auc_score`` (a single
    number, not a string).
    """
    per_user_auc = auc_score(
        lightfm_model,
        interactions_matrix,
        user_features=professional_features,
        item_features=question_features,
        num_threads=num_threads,
    )
    return per_user_auc.mean()


In [112]:
# NOTE(review): the score is computed on user_item_matrix, the same matrix the
# model was fitted on, so it measures fit on the training data rather than
# generalisation -- evaluate on a held-out split for a meaningful number.
calculate_auc_score(
    lightfm_model=model,
    interactions_matrix=user_item_matrix,
    question_features=prod_features,
    professional_features=cli_features,
)

0.99621606