In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, Binarizer
import bayessets
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os

In [2]:
# Directory holding the CNAE-9 files. Defaults to the original location under
# the user's home directory; set the CNAE9_DATA_PATH environment variable to
# run the notebook against a copy stored elsewhere without editing this cell.
DATA_PATH = os.path.expanduser(os.environ.get(
    'CNAE9_DATA_PATH',
    '~/Documents/code/trabalho-disciplina/set-expansion-structured-data/data'))

# Query definitions; the evaluation cell reads its 'query' and 'target' columns.
dfquery = pd.read_csv(os.path.join(DATA_PATH, 'CNAE-9_query.csv'))
# The dataset itself. header=None: the first line is read as data, not as
# column names, so columns are labelled 0..k-1.
df = pd.read_csv(os.path.join(DATA_PATH, 'CNAE-9.data'), header=None)

In [18]:
# Build the binary feature matrix and fit the Bayesian Sets model on it.
#
# Column 0 is skipped (presumably the class label -- TODO confirm against the
# dataset description); the remaining columns are used as features.
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` returns the same
# ndarray and is available on both old and current pandas releases.
data_matrix = df.iloc[:, 1:].values.astype(np.float64)

# Rescale every feature to [0, 1], then map values above 0.5 to 1 and the rest
# to 0 so that the Bernoulli model receives strictly binary input.
scaler = MinMaxScaler()
binarizer = Binarizer(threshold=0.5)
data = binarizer.fit_transform(scaler.fit_transform(data_matrix))
model = bayessets.BernoulliBayesianSet(data)

In [24]:
%%time
results = []
for n in [10, 20, 30, 50, 100, 150, 200]:
    precisions = []
    recalls = []
    
    for query in dfquery.iterrows():
        index = query[0]
        query_items = np.array([int(x) for x in query[1]['query'].split(',')])
        query_targets = np.array([int(x) for x in query[1]['target'].split(',')])
        
        scores = model.query(query_items)
        most_likely = np.argsort(scores)[::-1]

        # remove the query itself from the results
        valid_guesses = np.where(~np.isin(most_likely, query_items))

        expansion = most_likely[valid_guesses][:n]

        hits = np.sum(np.isin(expansion, query_targets))
        precision = hits / n
        recall = hits / len(query_targets)
        precisions.append(precision)
        recalls.append(recall)
    
    results.append({'n': n, 'algorithm': 'bayessets',
                    'precision': np.mean(precisions), 'recall': np.mean(recalls)})

print(results)

[{'n': 10, 'algorithm': 'bayessets', 'precision': 0.93022222222222228, 'recall': 0.086131687242798335}, {'n': 20, 'algorithm': 'bayessets', 'precision': 0.90722222222222226, 'recall': 0.16800411522633743}, {'n': 30, 'algorithm': 'bayessets', 'precision': 0.87237037037037046, 'recall': 0.24232510288065842}, {'n': 50, 'algorithm': 'bayessets', 'precision': 0.76764444444444435, 'recall': 0.3553909465020576}, {'n': 100, 'algorithm': 'bayessets', 'precision': 0.5391111111111111, 'recall': 0.49917695473251028}, {'n': 150, 'algorithm': 'bayessets', 'precision': 0.39577777777777784, 'recall': 0.54969135802469138}, {'n': 200, 'algorithm': 'bayessets', 'precision': 0.33533333333333337, 'recall': 0.62098765432098768}]
CPU times: user 2.52 s, sys: 0 ns, total: 2.52 s
Wall time: 2.52 s
