### Imports

In [80]:
import random
import string
import pandas as pd
import numpy as np
import pickle
import json
import time
import math
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k
import scipy
from scipy.sparse import coo_matrix, eye, hstack, lil_matrix
from skopt import forest_minimize

### Creating fake data
### Run it only one time and then read the created df from picke

In [50]:
machines = list(string.ascii_uppercase)

In [51]:
risks = ['banana', 'apple', 'orange', 'lemon', 'lime', 'strawberry', 'kiwi', 'peach', 'grape', 'pineapple', 'mango']

In [52]:
num_risks = len(risks)

In [99]:
df = pd.DataFrame(index=range(1, 101))

In [100]:
df['operator_id'] = 0

In [101]:
df.shape

(100, 1)

In [102]:
def get_selected_risks_per_machine():
    operator_id = random.choice(range(1, 3001))
    num_of_usages = random.choice(range(1, 11))
    used_machine = random.choice(machines)
    selected_num_of_risks = random.choice(range(1, math.ceil(num_risks/3)))
    selected_risks = random.sample(risks, selected_num_of_risks)
    return operator_id, used_machine, selected_risks

In [103]:
df['operator_id'], df['machine'], df['risks'] = zip(*df.apply(lambda row: get_selected_risks_per_machine(), axis=1))

In [104]:
df

Unnamed: 0,operator_id,machine,risks
1,1705,T,"[lemon, grape]"
2,1306,O,"[peach, mango, orange]"
3,48,L,"[kiwi, lemon]"
4,2853,B,[kiwi]
5,503,Z,"[orange, kiwi, grape]"
...,...,...,...
96,2157,B,[grape]
97,2374,Z,"[pineapple, lemon, mango]"
98,2163,X,[apple]
99,2142,C,"[pineapple, grape]"


In [105]:
df.to_csv('risks_machine_data.csv', index=False)

### Let's suppose that by reading the dataframe from CSV we are reading it from a database

In [106]:
df = pd.read_csv('risks_machine_data.csv')

In [107]:
df.shape

(100, 3)

In [108]:
df.columns

Index(['operator_id', 'machine', 'risks'], dtype='object')

#### Here machines represent our users, since it is for the machines that we want to find the recommended risks

In [109]:
new_df = df.drop(columns=['operator_id'])

In [110]:
new_df.iloc[0].risks

"['lemon', 'grape']"

In [111]:
new_df['risks'] = new_df['risks'].apply(lambda row: json.loads(row.replace("'", "\"")))

In [112]:
new_df['count'] = 1

In [113]:
new_df = new_df.explode('risks')

In [114]:
new_df_columns = list(new_df.columns)

In [115]:
new_df_columns.remove('count')

In [116]:
new_dff = new_df.groupby(new_df_columns)['count'].sum().reset_index()

In [117]:
new_dff

Unnamed: 0,machine,risks,count
0,A,apple,1
1,A,kiwi,2
2,A,lemon,1
3,A,lime,1
4,A,mango,1
...,...,...,...
136,Z,lime,1
137,Z,mango,1
138,Z,orange,1
139,Z,peach,3


In [118]:
new_dff[new_dff.machine == 'A']

Unnamed: 0,machine,risks,count
0,A,apple,1
1,A,kiwi,2
2,A,lemon,1
3,A,lime,1
4,A,mango,1
5,A,orange,1
6,A,strawberry,2


In [119]:
machines = list(new_dff['machine'].unique())

In [120]:
risks = list(new_dff['risks'].unique())

### Map user (machine) to index, index to user, item (risk) to index and index to item

In [121]:
def id_mappings(customer_list, product_list):
    customer_to_index_mapping = {}
    index_to_customer_mapping = {}
    for user_index, user_id in enumerate(customer_list):
        customer_to_index_mapping[user_id] = user_index
        index_to_customer_mapping[user_index] = user_id
        
    product_to_index_mapping = {}
    index_to_product_mapping = {}
    for item_index, item_id in enumerate(product_list):
        product_to_index_mapping[item_id] = item_index
        index_to_product_mapping[item_index] = item_id      
        
    return customer_to_index_mapping, index_to_customer_mapping, \
            product_to_index_mapping, index_to_product_mapping

In [122]:
machine_to_index_mapping, index_to_machine_mapping, \
            risk_to_index_mapping, index_to_risk_mapping = id_mappings(machines, risks)

### Get interaction and feature matrix

In [123]:
def get_interaction_matrix(df):
    
    row = df['machine'].apply(lambda x: machine_to_index_mapping[x]).values
    col = df['risks'].apply(lambda x: risk_to_index_mapping[x]).values
    value = df['count'].values
    
    return coo_matrix((value, (row, col)), shape = (len(machine_to_index_mapping), len(risk_to_index_mapping)))

In [124]:
interaction_matrix = get_interaction_matrix(new_dff)

In [125]:
interaction_matrix.shape

(26, 11)

### Split dataset into train and validation datasets

In [126]:
def train_test_split(ratings, split_count, fraction):
    ratings = ratings.tocsr()
    train = ratings.copy().tocoo()
    test = lil_matrix(train.shape)
    
    if fraction:
        try:
            user_index = np.random.choice(
                np.where(np.bincount(train.row) >= split_count * 2)[0], 
                replace=False,
                size=np.int32(np.floor(fraction * train.shape[0]))
            ).tolist()
        except:
            print((f'Not enough users with > {split_count * 2} '+
                    f'interactions for fraction of {fraction}'))
            raise
    else:
        user_index = range(train.shape[0])
        
    train = train.tolil()

    for user in user_index:
        test_ratings = np.random.choice(ratings.getrow(user).indices, 
                                        size=split_count, 
                                        replace=False)
        
        train[user, test_ratings] = 0.

        test[user, test_ratings] = ratings[user, test_ratings]
   
    assert(train.multiply(test).nnz == 0)
    return train.tocsr(), test.tocsr()

In [127]:
rows_len = [len(row) for row in interaction_matrix.tolil().rows]

In [128]:
rows_mean = np.mean(np.array(rows_len))
print(f'{rows_mean} risks per machine')

5.423076923076923 risks per machine


In [129]:
train, test = train_test_split(interaction_matrix, 1, fraction=0.2)

### Optimizing Hyperparameters

In [130]:
def objective_wsideinfo(params):
    # unpack
    epochs, learning_rate,\
    no_components, item_alpha,\
    scale = params
    
    user_alpha = item_alpha * scale
    model = LightFM(loss='warp',
                    random_state=2020,
                    learning_rate=learning_rate,
                    no_components=no_components,
                    user_alpha=user_alpha,
                    item_alpha=item_alpha)
    model.fit(train, epochs=epochs,
              num_threads=4, verbose=False)
    
    patks = precision_at_k(model, test,
                          train_interactions=None,
                          k=5, num_threads=3)
    mapatk = np.mean(patks)
    # Make negative because we want to _minimize_ objective
    out = -mapatk
    # Weird shit going on
    if np.abs(out + 1) < 0.01 or out < -1.0:
        return 0.0
    else:
        return out

In [131]:
space = [(1, 200), # epochs
         (10**-4, 1.0, 'log-uniform'), # learning_rate
         (2, 100), # no_components
         (10**-6, 10**-3, 'log-uniform'), # alpha
         (0.001, 1., 'log-uniform') # user_scaling
        ]

res_fm = forest_minimize(objective_wsideinfo, space, n_calls=100,
                         random_state=0,
                         verbose=False)

In [132]:
print('Maximum p@k found: {:6.5f}'.format(-res_fm.fun))
print('Optimal parameters:')
params = ['epochs', 'learning_rate', 'no_components', 'alpha', 'scaling']
for (p, x_) in zip(params, res_fm.x):
    if p == 'epochs':
        optimal_epochs = x_
    print('{}: {}'.format(p, x_))
    
# 0.11972

Maximum p@k found: 0.16000
Optimal parameters:
epochs: 164
learning_rate: 0.0005563260205698556
no_components: 57
alpha: 4.463248454508345e-06
scaling: 0.0025448050634013252


### Create model with optimal hyperparameters for the whole dataset

In [133]:
epochs, learning_rate,\
no_components, item_alpha,\
scale = res_fm.x

user_alpha = item_alpha * scale

model = LightFM(loss='warp',
                random_state=2020,
                learning_rate=learning_rate,
                no_components=no_components,
                user_alpha=user_alpha,
                item_alpha=item_alpha)

In [134]:
start = time.time()

model.fit(interaction_matrix,
          epochs=epochs,
          num_threads=4)

end = time.time()
print("time taken for retraining = {0:.{1}f} seconds".format(end - start, 2))

time taken for retraining = 0.08 seconds


### Get Recommendations

In [214]:
def recommendations_by_machine(machine, k=3, print_recommendations=False):
        
    machine_idx = machine_to_index_mapping.get(machine, None)

    if machine_idx is None:
        return None
    
    risks = np.array(list(risk_to_index_mapping.keys()))
    
    known_positive_indices = interaction_matrix.tocsr()[machine_idx].indices
    
    if len(known_positive_indices) == len(risks):
        print('Machine has already been classified with all possible risks so there is none to be recommended.')
        return []

    known_positives = risks[known_positive_indices]

    scores = model.predict(user_ids = [machine_idx], item_ids = np.arange(interaction_matrix.shape[1]))

    assert(interaction_matrix.shape[1] == len(scores) == len(risks))

    scores[known_positive_indices] = np.nan
    scores_indices = np.argsort(-scores)

    top_risks = risks[scores_indices]

    if len(set(known_positives).intersection(set(top_risks[:k]))) > 0:
        print(known_positives)
        print(top_risks[:k])
        print('Recommending items that customer already have. That is a problem.')

    if print_recommendations:

        print('\n--------------\n')
        print(f'Machine {machine}')
        print('Known positives:')

        for x in known_positives:
            print(f'\t{x}')


        print('Recommended:')

        for x in top_risks[:k]:
            print(f'\t{x}')

    recommendations = []

    for rank, recommendation in enumerate(top_risks[:k]):
        recommendation_dict = {
        'machine': machine,
        'recommended_risk_to_consider': recommendation,
        'rank': rank + 1
        }
        recommendations.append(recommendation_dict)

    return recommendations

In [215]:
machine_to_recommend = random.choice(machines)
machine_to_recommend

'R'

In [216]:
recommendations_by_machine(machine_to_recommend)

[{'machine': 'R', 'recommended_risk_to_consider': 'mango', 'rank': 1},
 {'machine': 'R', 'recommended_risk_to_consider': 'lime', 'rank': 2},
 {'machine': 'R', 'recommended_risk_to_consider': 'apple', 'rank': 3}]

In [217]:
all_recommendations = [recommendations_by_machine(machine, k=3, print_recommendations=True) for machine in machines]


--------------

Machine A
Known positives:
	apple
	kiwi
	lemon
	lime
	mango
	orange
	strawberry
Recommended:
	pineapple
	banana
	peach

--------------

Machine B
Known positives:
	kiwi
	lime
	banana
	grape
	pineapple
Recommended:
	mango
	orange
	strawberry

--------------

Machine C
Known positives:
	lime
	mango
	orange
	banana
	grape
	pineapple
	peach
Recommended:
	kiwi
	strawberry
	apple

--------------

Machine D
Known positives:
	apple
	kiwi
	lime
	mango
	orange
	strawberry
	banana
	pineapple
Recommended:
	grape
	peach
	lemon

--------------

Machine E
Known positives:
	lime
	mango
	peach
Recommended:
	kiwi
	orange
	pineapple

--------------

Machine F
Known positives:
	apple
	kiwi
	mango
	strawberry
Recommended:
	orange
	lime
	pineapple

--------------

Machine G
Known positives:
	apple
	kiwi
	lemon
	orange
	strawberry
	peach
Recommended:
	mango
	lime
	pineapple

--------------

Machine H
Known positives:
	lime
	strawberry
	pineapple
	peach
Recommended:
	kiwi
	mango
	orange

----