In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [4]:
DATA_PATH = os.path.expanduser('~/Documents/code/trabalho-disciplina/set-expansion-structured-data/data')
dfquery = pd.read_csv(os.path.join(DATA_PATH, 'CNAE-9_query.csv'))
df = pd.read_csv(os.path.join(DATA_PATH, 'CNAE-9.data'), header=None)

data_matrix = df.iloc[:, 1:].as_matrix().astype(np.float64)

In [17]:
%%time
results = []

for n in [10, 20, 30, 50, 100, 150, 200]:
    centroidprecisions = []
    centroidrecalls = []

    nbrs = NearestNeighbors(n_neighbors=n * 2).fit(data_matrix)

    for row in dfquery.iterrows():
        query = np.array([int(x) for x in row[1].query.split(',')])
        target = np.array([int(x) for x in row[1].target.split(',')])

        examples = df.iloc[query, 1:]
        centroid = examples.mean()
        distances, indices = nbrs.kneighbors([centroid])

        expansion = set(indices[0][~np.isin(indices[0], query, assume_unique=True)][:n])
        valid_guesses = set(target)

        hits = len(expansion.intersection(valid_guesses))
        size = len(expansion)

        precision = hits / size
        recall = hits / len(target)
        centroidprecisions.append(precision)
        centroidrecalls.append(recall)
    
    results.append({'n': n, 'algorithm': 'centroidknn',
                    'precision': np.mean(centroidprecisions), 'recall': np.mean(centroidrecalls)})
    
print(results)

[{'n': 10, 'algorithm': 'centroidknn', 'precision': 0.79311111111111121, 'recall': 0.073436213991769528}, {'n': 20, 'algorithm': 'centroidknn', 'precision': 0.72077777777777785, 'recall': 0.13347736625514403}, {'n': 30, 'algorithm': 'centroidknn', 'precision': 0.66859259259259263, 'recall': 0.1857201646090535}, {'n': 50, 'algorithm': 'centroidknn', 'precision': 0.57320000000000004, 'recall': 0.26537037037037037}, {'n': 100, 'algorithm': 'centroidknn', 'precision': 0.39942222222222223, 'recall': 0.36983539094650203}, {'n': 150, 'algorithm': 'centroidknn', 'precision': 0.32354074074074074, 'recall': 0.44936213991769552}, {'n': 200, 'algorithm': 'centroidknn', 'precision': 0.27305555555555555, 'recall': 0.50565843621399176}]
CPU times: user 9.39 s, sys: 4 ms, total: 9.39 s
Wall time: 9.39 s


In [18]:
%%time
def get_closest(distances, indices, remove_duplicate=True):
    distances = distances.flatten()
    indices = indices.flatten()
    inds = distances.argsort()
    closest = indices[inds]
    
    if remove_duplicate:
        _, idx = np.unique(closest, return_index=True)
        return closest[np.sort(idx)]
    return closest


results = []

for n in [10, 20, 30, 50, 100, 150, 200]:
    anyprecisions = []
    anyrecalls = []
    nbrs = NearestNeighbors(n_neighbors=n * 2).fit(data_matrix)
    
    for row in dfquery.iterrows():
        query = np.array([int(x) for x in row[1].query.split(',')])
        target = np.array([int(x) for x in row[1].target.split(',')])
        
        examples = df.iloc[query, 1:]
        distances, indices = nbrs.kneighbors(examples)
        ordered = get_closest(distances, indices)
        
        expansion = set(indices[0][~np.isin(indices[0], query, assume_unique=True)][:n])
        valid_guesses = set(target)

        hits = len(expansion.intersection(valid_guesses))
        size = len(expansion)

        precision = hits / size
        recall = hits / len(target)
        anyprecisions.append(precision)
        anyrecalls.append(recall)
    
    results.append({'n': n, 'algorithm': 'anyknn',
                    'precision': np.mean(anyprecisions), 'recall': np.mean(anyrecalls)})
    
print(results)

[{'n': 10, 'algorithm': 'anyknn', 'precision': 0.69466666666666677, 'recall': 0.064320987654320982}, {'n': 20, 'algorithm': 'anyknn', 'precision': 0.60644444444444434, 'recall': 0.11230452674897119}, {'n': 30, 'algorithm': 'anyknn', 'precision': 0.5523703703703704, 'recall': 0.15343621399176954}, {'n': 50, 'algorithm': 'anyknn', 'precision': 0.47053333333333336, 'recall': 0.21783950617283948}, {'n': 100, 'algorithm': 'anyknn', 'precision': 0.34562222222222222, 'recall': 0.32002057613168727}, {'n': 150, 'algorithm': 'anyknn', 'precision': 0.27976296296296299, 'recall': 0.38855967078189302}, {'n': 200, 'algorithm': 'anyknn', 'precision': 0.23903333333333332, 'recall': 0.44265432098765434}]
CPU times: user 1min 5s, sys: 0 ns, total: 1min 5s
Wall time: 1min 5s
