In [1]:
%matplotlib inline
import os
import re
import copy
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from lxml import etree
import json
import time
import numpy as np

In [2]:
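# Ground-truth file format: one community per line, "<fos>\t<author> <author> <author> ..."
#   (field-of-study name, a tab, then space-separated authors).
# Result file format: one community per line, "<author>\t<author>\t<author> ..."
#   (tab-separated authors).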

In [20]:
# Experiment configuration: which run to evaluate and where its files live.
# wkdir is two directory levels above the current working directory.
wkdir = os.path.normpath(os.path.join(os.path.abspath(''), '..', '..'))
model = 'bigclam'
version = 'bigclam_L1_072416'
link_mode = 'cite'
conference = 'AAAI'
pos_tag = 'cmty'

# Results and measures share the same tail: model/link_mode/conference[/version].
# The version component is only appended when a version is set.
_run_parts = [model, link_mode, conference]
if version:
    _run_parts.append(version)
resdir = os.path.join(wkdir, 'res', *_run_parts)
measure_result_dir = os.path.join(wkdir, 'measure', *_run_parts)
truth_dir = os.path.join(wkdir, 'data', link_mode, conference)

# exist_ok=True replaces the separate exists() check, so there is no window
# between checking for the directory and creating it.
os.makedirs(measure_result_dir, exist_ok=True)

In [25]:
def load_r_comm(step, min_score=0.1):
    """Load the detected communities stored for one step of the run.

    Reads ``<resdir>/<step>.<pos_tag>.txt``. Every line of the file becomes
    one community (a set of author strings); the returned list keeps file
    order, so a line that yields no authors still occupies a slot as an
    empty set.

    The line layout depends on the module-level ``model``:

    * ``'cdot'``: tab-separated ``author,score`` pairs. An author is kept
      only when its score is strictly greater than ``min_score``.
    * ``'bigclam'``: tab-separated authors.

    In both layouts the last tab-separated field of each line is discarded
    (presumably each line ends with a tab before the newline -- confirm
    against the writer of these files).

    Args:
        step: Step label used as the file name prefix (e.g. ``'final'``).
        min_score: Score cutoff for the ``'cdot'`` layout; ignored otherwise.

    Returns:
        list of set: one set of authors per line of the file.

    Raises:
        ValueError: if ``model`` is neither ``'cdot'`` nor ``'bigclam'``.
    """
    # Fail loudly on an unknown model instead of returning an empty list,
    # which would only surface later as a division by zero.
    if model not in ('cdot', 'bigclam'):
        raise ValueError('unsupported model: %r' % (model,))

    filename = os.path.join(resdir, '%s.%s.txt' % (step, pos_tag))
    comm = []
    # The context manager closes the file even if a line fails to parse.
    with open(filename, 'r') as f:
        for line in f:
            # A line without any tab splits into one field, which the slice
            # drops, leaving an empty community.
            fields = line.split('\t')[:-1]
            if model == 'cdot':
                members = set()
                for tpl in fields:
                    parts = tpl.split(',')
                    if float(parts[1]) > min_score:
                        members.add(parts[0])
            else:
                members = set(fields)
            comm.append(members)
    return comm

In [26]:
# Communities stored for the 'final' step of the run.
res_comm = load_r_comm('final')

In [27]:
def load_t_comm(scope):
    """Load the ground-truth communities of one scope level.

    Reads ``<truth_dir>/c_fos_L<scope>.txt``. Each non-blank line has the
    form ``<fos>\\t<author> <author> ...``: a name, a tab, then
    space-separated authors.

    Args:
        scope: Integer level used in the file name.

    Returns:
        tuple: ``(comm, fos_name)`` where ``comm`` is a list of author sets
        and ``fos_name`` is the list of names, index-aligned with ``comm``.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    filename = os.path.join(truth_dir, 'c_fos_L%d.txt' % scope)
    comm = []
    fos_name = []
    # The context manager closes the file even if a line is malformed.
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip('\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    '%s:%d: expected "<fos>\\t<authors>"' % (filename, lineno))
            # Collect every name; assigning here instead of appending would
            # keep only the last line's name.
            fos_name.append(fields[0])
            # Drop empty tokens produced by doubled or trailing spaces so ''
            # is never counted as an author.
            comm.append({au for au in fields[1].split(' ') if au})
    return comm, fos_name
        

In [28]:
# Load the ground truth of scope levels 0 through 3. Both lists are
# index-aligned: position k holds what load_t_comm(k) returned.
_truth_by_scope = [load_t_comm(scope) for scope in range(4)]
truth_comm_scopes = [comm for comm, _ in _truth_by_scope]
truth_comm_scopes_name = [fos_name for _, fos_name in _truth_by_scope]

In [29]:
def precision(r, t):
    """Precision of the predicted author set ``r`` against the truth set ``t``.

    With ``r & t`` as true positives, ``r - t`` as false positives and
    ``t - r`` as false negatives, precision is ``TP / (TP + FP)``, i.e.
    ``|r & t| / |r|``. Returns 0 when ``r`` is empty.
    """
    if not r:
        return 0
    true_positives = r & t
    return len(true_positives) / len(r)

In [30]:
def recall(r, t):
    """Recall of the predicted author set ``r`` against the truth set ``t``.

    Computed as ``TP / (TP + FN)``, i.e. ``|r & t| / |t|``.
    Returns 0 when ``t`` is empty.
    """
    if not t:
        return 0
    true_positives = r & t
    return len(true_positives) / len(t)

In [31]:
def f1(r, t):
    """F1 score (harmonic mean of precision and recall) of ``r`` against ``t``.

    Returns 0 when precision + recall is below 1e-8, which avoids dividing
    by a (near-)zero sum.
    """
    p = precision(r, t)
    q = recall(r, t)
    total = p + q
    if total < 1e-8:
        return 0
    return 2 * (p * q) / total

In [32]:
def find_bestmatch_f1(xset, ycomm):
    """Return the highest F1 score between ``xset`` and any set in ``ycomm``.

    Returns -1 when ``ycomm`` contains no sets at all.
    """
    return max((f1(xset, candidate) for candidate in ycomm), default=-1)

In [33]:
def measure(rcomm, tcomm):
    """Symmetric average best-match F1 between two lists of communities.

    For every set in ``rcomm`` take its best F1 against ``tcomm`` and
    average; do the same from ``tcomm`` towards ``rcomm``; return the mean
    of the two averages. Raises ZeroDivisionError if either list is empty.
    """
    def _mean_best(source, target):
        # Plain running total keeps the float additions in list order.
        total = 0
        for members in source:
            total += find_bestmatch_f1(members, target)
        return total / len(source)

    return 0.5 * (_mean_best(rcomm, tcomm) + _mean_best(tcomm, rcomm))

In [34]:
# Score the detected communities against each of the four ground-truth scopes.
measure_res = [measure(res_comm, truth_comm_scopes[scope]) for scope in range(4)]

In [14]:
# One score per ground-truth scope (0 through 3), in that order.
print (measure_res)

[0.12429569652827531, 0.10821168480452202, 0.11447276286084976, 0.13187742987558682]


In [15]:
# Persist the scores, one per line with five decimals, in measure_res order.
measure_filename = os.path.join(measure_result_dir, 'f1score-0.1.txt')
# The context manager flushes and closes the file even if a write fails.
with open(measure_filename, 'w') as mf:
    for score in measure_res:
        mf.write('%.5f\n' % score)