# Predicting

In [1]:
import sys
sys.path.append("../")
from MapSci.papers import papers
from MapSci.research_space import research_space
from MapSci.entities import entities
import MapSci.agg as agg
import MapSci.predict as pr

TEXT

In [2]:
# Build the papers object for the "lattes" dataset key and a research_space on top of it.
pp = papers("lattes")
maps = research_space("lattes", pp)
# The (2000, 2014) arguments match the numbers embedded in the maps.phi keys used
# by later cells, e.g. "guevara(2000, 2014, 0.1)" — presumably the year window
# over which the maps are computed; confirm against MapSci.research_space.
maps.compute(2000, 2014)

guevara(2000, 2014, 0.1)
chinazzi(2000, 2014, 0.1, 200)


In [3]:
# Presence data for the same (2000, 2014) arguments that were passed to maps.compute.
presence = pp.presence(2000, 2014)
# Element [1] of the guevara phi entry is handed to entities() as `indices`;
# element [0] of the same entry is what a later cell binds to `guev`.
indices = maps.phi["guevara(2000, 2014, 0.1)"][1]
# Scientist-level entities object; its .predict() is queried further down.
scientists = entities(presence, indices)

In [5]:
# get_insts returns two objects that are used below to aggregate presence[1]:
# `inst` for the institution level and `st` for the state level — presumably
# scientist -> group mappings; verify against MapSci.agg.
# NOTE(review): `st` is a very short global name that is easy to confuse with a
# statistics-module alias; consider a more descriptive name.
inst, st = agg.get_insts(scientists.set, "../dataset/lattes/pesquisadores.csv", sep=";sep;")

# Institution level: aggregate, recompute presence with x=<aggregate>, wrap in entities.
xi = agg.aggregate(presence[1], inst)
presence_inst = pp.presence(2000, 2014, x=xi)
institutions = entities(presence_inst, indices)

# State level: same three steps, using the `st` object instead of `inst`.
xs = agg.aggregate(presence[1], st)
presence_st = pp.presence(2000, 2014, x=xs)
states = entities(presence_st, indices)

### Prediction

In [9]:
# Element [0] of each phi entry is what the predict() calls below receive as
# their second argument (presumably the proximity matrix — TODO confirm).
guev = maps.phi["guevara(2000, 2014, 0.1)"][0]
chin = maps.phi["chinazzi(2000, 2014, 0.1, 200)"][0]

*evaluating the transition from an inactive to an active state*

In [22]:
# Laender
# First 10 entries of the 'inactive-active' prediction for this scientist id, using `guev`.
scientists.predict(9089204821424223, guev, 'inactive-active')[:10]

[(0.32805, 'museology'),
 (0.27583, 'signal processing'),
 (0.27155, 'theoretical computer science'),
 (0.26721, 'media technology'),
 (0.24403, 'logic'),
 (0.22847, 'computer graphics and computer-aided design'),
 (0.22656, 'human-computer interaction'),
 (0.17321, 'information systems and management'),
 (0.16858, 'control and systems engineering'),
 (0.15695, 'management information systems')]

In [32]:
# Ana
# First 10 entries of the 'inactive-active' prediction for this scientist id, using `chin`.
scientists.predict(2408991231058279, chin, 'inactive-active')[:10]

[(0.07867, 'information systems'),
 (0.06749, 'computer vision and pattern recognition'),
 (0.06358, 'computer science (miscellaneous)'),
 (0.06116, 'artificial intelligence'),
 (0.0604, 'theoretical computer science'),
 (0.05736, 'signal processing'),
 (0.05637, 'computer graphics and computer-aided design'),
 (0.05598, 'information systems and management'),
 (0.05582, 'computer science applications'),
 (0.05411, 'logic')]

*evaluating the transition from a nascent to a developed state*

In [18]:
# Universidade Federal de Minas Gerais
# First 10 entries of the 'nascent-developed' prediction for this institution key, using `guev`.
institutions.predict('universidade federal de minas gerais', guev, 'nascent-developed')[:10]

[(0.72025, 'family practice'),
 (0.71126, 'urology'),
 (0.61904, 'advanced and specialized nursing'),
 (0.60379, 'anatomy'),
 (0.5844, 'human factors and ergonomics'),
 (0.57649, 'logic'),
 (0.5606, 'critical care and intensive care medicine'),
 (0.55542, 'electrochemistry'),
 (0.53811, 'medical and surgical nursing'),
 (0.52879, 'radiological and ultrasound technology')]

*evaluating the transition from an intermediate to a developed state*

In [31]:
# Minas Gerais
# First 10 entries of the 'intermediate-developed' prediction for state key '31', using `chin`.
states.predict('31', chin, 'intermediate-developed')[:10]

[(0.53724, 'nature and landscape conservation'),
 (0.53666, 'electrochemistry'),
 (0.52147, 'process chemistry and technology'),
 (0.51276, 'aquatic science'),
 (0.503, 'polymers and plastics'),
 (0.49625, 'surfaces and interfaces'),
 (0.4933, 'colloid and surface chemistry'),
 (0.47303, 'renewable energy, sustainability and the environment'),
 (0.46187, 'community and home care'),
 (0.46187, 'atmospheric science')]

### Accuracy

In [41]:
def accuracy(level, transition):
    """Box-plot the per-entity ROC AUC of ``rs_2011`` predictions against 2020 data.

    For every entity of the requested ``level`` the scores returned by
    ``rs_2011.predict`` are compared with 0/1 labels derived from
    ``rs_2020._U``. Entities whose labels are all 0 or all 1 are skipped,
    since ROC AUC needs both classes.

    Relies on the notebook globals ``rs_2011``, ``rs_2020``, ``roc_auc_score``
    and ``plt``.

    NOTE(review): a later cell of this notebook defines another function that
    is also called ``accuracy``; once that cell runs it replaces this one.

    Args:
        level: 'scientist', 'institution' or 'state'.
        transition: 'inactive-active' selects index 0 of ``rs_2020._U[lev]``;
            any other value selects index 1.

    Returns:
        The fraction of evaluated entities whose AUC is below 0.5, or ``nan``
        when no entity could be evaluated.

    Raises:
        ValueError: if ``level`` is not one of the three supported names.
    """
    # Validate first so a typo fails with a clear message instead of an
    # unbound-local error further down.
    level_index = {'scientist': 0, 'institution': 1, 'state': 2}
    if level not in level_index:
        raise ValueError(
            "unknown level {!r}; expected 'scientist', 'institution' or 'state'".format(level)
        )
    lev = level_index[level]
    u = 0 if transition == 'inactive-active' else 1

    # Maps each 2011 field to its position in the 2020 field list.
    dic = {x: rs_2020.fields.index(x) for x in rs_2011.fields}

    # Only build the entity collection that is actually needed.
    if lev == 0:
        candidates = rs_2011.scientists
    else:
        candidates = set(x[0] for x in rs_2011.rca[lev])

    auc = list()
    for s in candidates:
        pred = rs_2011.predict(s, level, transition)
        true = [1 if dic[x[1]] in rs_2020._U[lev][u][s] else 0 for x in pred]
        prob = [x[0] for x in pred]

        # ROC AUC is undefined when only one class is present (this also
        # covers an empty prediction list).
        if sum(true) == 0 or sum(true) == len(true):
            continue

        auc.append(roc_auc_score(true, prob))

    fp = dict(marker='o', markersize=5, linestyle='none', alpha=0.01)
    plt.boxplot(auc, flierprops=fp)
    plt.title("{}: {}".format(level, transition))
    plt.show()

    # Nothing evaluable: report "no value" rather than dividing by zero.
    if not auc:
        return float('nan')
    return sum(1 for x in auc if x < 0.5) / len(auc)

In [None]:
from collections import defaultdict

# Running totals of the rs_2011.x values, keyed by element 0 of each key (sumx),
# by its rs_2011.inst entry (sumi) and by its rs_2011.est entry (sume).
# defaultdict(int) makes unseen keys start at 0.
sumx = defaultdict(int)
sumi = defaultdict(int)
sume = defaultdict(int)

# Set copy of rs_2011.scientists for the membership test inside the loop.
s = set(rs_2011.scientists)

# Each key `sf` of rs_2011.x is indexable; sf[0] is the id checked against `s`
# (presumably keys are (scientist, field) pairs — TODO confirm).
for sf in rs_2011.x:
    # Ignore entries whose first element is not in rs_2011.scientists.
    if sf[0] not in s:
        continue

    ins = rs_2011.inst[sf[0]]
    est = rs_2011.est[sf[0]]

    sumi[ins] += rs_2011.x[sf]
    sume[est] += rs_2011.x[sf]
    sumx[sf[0]] += rs_2011.x[sf]

# Order matters: accuracy() indexes this list with its level index
# (0 = scientist, 1 = institution, 2 = state).
sums = [sumx, sumi, sume]

In [None]:
def accuracy(level, transition):
    """Compare the ROC AUC of ``rs_2011`` and ``ss_2011`` predictions against 2020 data.

    For every entity of the requested ``level`` both predictors are queried,
    each prediction list is padded with ``(0.0, field)`` for fields that only
    the other predictor returned, and both are scored with ``roc_auc_score``
    against 0/1 labels derived from ``rs_2020._U``. Entities for which either
    label list is all 0 or all 1 are skipped.

    The function draws a 3x3 grid of plots (AUC violins labelled 'RS'/'SS', a
    density of the per-entity rank correlation between the two prediction
    orders, and several panels produced by the notebook helper ``gaussian``)
    and prints a textual summary. It returns nothing.

    Relies on the notebook globals ``rs_2011``, ``ss_2011``, ``rs_2020``,
    ``sums``, ``roc_auc_score``, ``gaussian``, ``plt``, ``pd`` and ``np``.

    Args:
        level: 'scientist', 'institution' or 'state'.
        transition: 'inactive-active' selects index 0 of ``rs_2020._U[lev]``;
            any other value selects index 1.

    Raises:
        ValueError: if ``level`` is not one of the three supported names.
    """
    # Validate first so a typo fails with a clear message instead of an
    # unbound-local error further down.
    level_index = {'scientist': 0, 'institution': 1, 'state': 2}
    if level not in level_index:
        raise ValueError(
            "unknown level {!r}; expected 'scientist', 'institution' or 'state'".format(level)
        )
    lev = level_index[level]

    # Imported locally under a private alias: an earlier cell of this notebook
    # binds the global name `st` to a value returned by agg.get_insts, so a
    # global stats alias of that name cannot be relied on.
    from scipy import stats as _stats

    # Translation: field name -> position in the 2020 / 2011 field lists.
    dic = {x: rs_2020.fields.index(x) for x in rs_2011.fields}
    dic11 = {x: rs_2011.fields.index(x) for x in rs_2011.fields}
    u = 0 if transition == 'inactive-active' else 1

    # Only build the entity collection that is actually needed.
    if lev == 0:
        candidates = rs_2011.scientists
    else:
        candidates = set(x[0] for x in rs_2011.rca[lev])

    auc = [list(), list()]      # per-entity AUC: [RS, SS]
    wins = [0, 0, 0]            # RS better / SS better / tie
    wins_over = [0, 0, 0]       # same split, counted only when the other AUC is > 0.5
    spearman = list()
    areas = [list(), list()]    # per-entity count of fields found in rs_2011._U[lev][0] / [1]
    X = list()                  # per-entity total taken from the global `sums`

    for s in candidates:
        # Prediction
        # NOTE(review): the lists returned by predict() are extended in place
        # below — confirm predict() does not hand out cached lists.
        pred = [
            rs_2011.predict(s, level, transition),
            ss_2011.predict(s, level, transition),
        ]
        seen = [set(x[1] for x in p) for p in pred]

        # Add missing classes so both lists cover the same fields.
        for i in (0, 1):
            for j in [x for x in seen[1 - i] if x not in seen[i]]:
                pred[i].append((0.0, j))

        # Ground truth
        true = [[1 if dic[x[1]] in rs_2020._U[lev][u][s] else 0 for x in p] for p in pred]
        # ROC AUC is undefined when only one class is present.
        if any(sum(t) == 0 or sum(t) == len(t) for t in true):
            continue

        prob = [[x[0] for x in p] for p in pred]
        rank = [[x[1] for x in p] for p in pred]

        auc_rs = roc_auc_score(true[0], prob[0])
        auc_ss = roc_auc_score(true[1], prob[1])
        auc[0].append(auc_rs)
        auc[1].append(auc_ss)

        # Summary
        if auc_rs > auc_ss:
            wins[0] += 1
            if auc_ss > 0.5:
                wins_over[0] += 1
        elif auc_rs < auc_ss:
            wins[1] += 1
            if auc_rs > 0.5:
                wins_over[1] += 1
        else:
            wins[2] += 1
            if auc_ss > 0.5:
                wins_over[2] += 1

        # Plot data
        n = len(true[0])
        # Position of each field in the SS ordering (first occurrence, like
        # list.index) so the rank comparison is linear instead of quadratic.
        pos_in_ss = {}
        for idx, field in enumerate(rank[1]):
            pos_in_ss.setdefault(field, idx)
        d2 = sum((x - pos_in_ss[rank[0][x]]) ** 2 for x in range(n))
        # n >= 2 here: a single-label list is always skipped above.
        spearman.append(1 - (6 * d2) / (n * (n ** 2 - 1)))
        areas[0].append(sum([1 if dic11[x] in rs_2011._U[lev][0][s] else 0 for x in rs_2011.fields]))
        areas[1].append(sum([1 if dic11[x] in rs_2011._U[lev][1][s] else 0 for x in rs_2011.fields]))
        # NOTE(review): `sums[lev]` is a defaultdict, so an unknown entity
        # yields 0 and np.log below would give -inf — confirm this cannot happen.
        X.append(sums[lev][s])

    # Nothing evaluable: the plots and min()/ratios below need at least one sample.
    if not auc[0]:
        print("Número de amostras:", 0)
        return

    # Plots
    # This changes matplotlib's global default figure size, so later figures
    # in the notebook are affected too.
    plt.rcParams["figure.figsize"] = (18,18)
    plt.subplot(3,3,1)
    plt.violinplot(auc, points=60, widths=0.7, showextrema=True, showmedians=True, bw_method=0.5)
    plt.title("{}: {}".format(level, transition))
    plt.ylabel('AUC ROC')
    plt.xticks([1, 2], ['RS', 'SS'])

    ax = plt.subplot(3,3,2)
    pd.DataFrame(spearman).plot(kind='density', ax=ax)
    plt.title("Spearman Correlation Distribution")
    plt.ylabel('Densidade')

    ax = plt.subplot(3,3,3)
    gaussian(ax, auc[0], auc[1], "RS", "SS", "Comparação entre previsões", comp=True)

    ax = plt.subplot(3,3,4)
    gaussian(ax, areas[0], auc[0], "Número de áreas ativas", "AUC ROC", "Áreas ativas X predição (RS)", 'PuBu')
    ax = plt.subplot(3,3,7)
    gaussian(ax, areas[0], auc[1], "Número de áreas ativas", "AUC ROC", "Áreas ativas X predição (SS)", 'PuBu')

    ax = plt.subplot(3,3,5)
    gaussian(ax, areas[1], auc[0], "Número de áreas desenvolvidas", "AUC ROC", "Áreas desenvolvidas X predição (RS)", 'Oranges')
    ax = plt.subplot(3,3,8)
    gaussian(ax, areas[1], auc[1], "Número de áreas desenvolvidas", "AUC ROC", "Áreas desenvolvidas X predição (SS)", 'Oranges')

    ax = plt.subplot(3,3,6)
    gaussian(ax, np.log(X), auc[0], "LogNúmero de publicações", "AUC ROC", "LogPublicações e predição (RS)", 'Greens')
    ax = plt.subplot(3,3,9)
    gaussian(ax, np.log(X), auc[1], "LogNúmero de publicações", "AUC ROC", "LogPublicações e predição (SS)", 'Greens')

    plt.show()

    # print summary
    n_compared = sum(wins)
    n_both_above = sum(wins_over)
    print("Valor mínimo", [min(x) for x in auc])
    print("Fração menor que 0.5", [sum(1 for x in a if x < 0.5) / len(a) for a in auc])
    print("Fração que foi melhor", [x / n_compared for x in wins[:2]])
    # No entity may have both AUCs above 0.5; report nan instead of dividing by zero.
    print("Fração que foi melhor, acima de 0.5",
          [x / n_both_above if n_both_above else float('nan') for x in wins_over[:2]])
    print("Médias", [np.mean(x) for x in auc])
    print("Anova:", _stats.f_oneway(*auc))
    print("Número de amostras:", len(auc[0]))