In [1]:
# Extensions
# Load IPython's autoreload extension so that edits to the imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [88]:
# Imports
import unidecode
import random
import numpy as np
import scipy.stats
import itertools
import copy
from IPython.display import HTML, display

from research.data.data          import *
from research.data.utils         import *
from research.score.scores       import *
from research.score.filters      import *
from core.utils.entity_type      import Entity_type
from core.search.query_paper_mag import paper_mag_multiquery
from core.search.query_info      import paper_info_mag_check_multiquery

In [33]:
# Params
# YEAR_DIFF is passed as the `diff` argument of filter_pub_year_diff when the
# Q scores are computed. Presumably a window measured in years - TODO confirm
# against the filter's definition.
YEAR_DIFF = 10

In [70]:
# Load the reference data frame, keep a copy of the original author names in
# 'AuthorDName', and overwrite 'AuthorName' with the result of name_normalise.
DATA_PATH = '~/influencemap/research/data/ccnr-science-careers-basic-stats.csv'
data_df = get_q_data(DATA_PATH)

raw_author_names = data_df['AuthorName']
data_df['AuthorDName'] = raw_author_names
data_df['AuthorName'] = raw_author_names.apply(name_normalise)

In [4]:
# Build the list of author names used for the author-id lookup.
#
# Starting from the original (un-normalised) names in 'AuthorDName':
#   1. lower-case,
#   2. drop every '.',
#   3. backslash-escape every single quote,
#   4. transliterate to ASCII with unidecode,
# then shuffle the list in place.
author_names = (
    data_df['AuthorDName']
    .str.lower()
    # regex=False: '.' and "'" must be treated as literal text. The default of
    # `regex` differs between pandas versions, and as a regex '.' would match
    # every character, so the intent is spelled out explicitly.
    .str.replace('.', '', regex=False)
    .str.replace('\'', '\\\'', regex=False)
)
author_names = list(map(unidecode.unidecode, list(author_names)))

# NOTE(review): the shuffle is unseeded, so the order (and therefore any
# prefix slice taken from this list) differs between kernel runs.
random.shuffle(author_names)

print("Number of authors:", len(author_names))

Number of authors: 2887


In [5]:
# Look up author ids for the first 100 names of the shuffled list only; the
# subset keeps test runs short.
sample_author_names = author_names[:100]  # For testing
name_maps = get_author_ids(sample_author_names)

print("Number of authors:", len(name_maps.keys()))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs
Number of authors: 98


In [6]:
# Build, for every author name, a record holding the author ids found for that
# name ('AuthorIds') and the result of the paper query for those ids
# ('PaperIds').
paper_map = {
    name: {
        'AuthorIds': author_ids,
        'PaperIds': paper_mag_multiquery(Entity_type.AUTH, author_ids),
    }
    for name, author_ids in name_maps.items()
}

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.44 µs


In [7]:
# Summary statistics of the paper mapping: totals and per-name averages of the
# number of paper ids and author ids.
def paper_counter(x):
    """Return how many paper ids one paper_map entry holds."""
    return len(x['PaperIds'])


def author_counter(x):
    """Return how many author ids one paper_map entry holds."""
    return len(x['AuthorIds'])


paper_count_per_name  = [paper_counter(entry) for entry in paper_map.values()]
author_count_per_name = [author_counter(entry) for entry in paper_map.values()]

summary_rows = (
    ("Total number of papers", np.sum(paper_count_per_name)),
    ("Total number of author ids", np.sum(author_count_per_name)),
    ("Average number of papers per name", np.average(paper_count_per_name)),
    ("Average number of author ids per name", np.average(author_count_per_name)),
)

for label, value in summary_rows:
    print(label, value)

Total number of papers 16512
Total number of author ids 3935
Average number of papers per name 168.48979591836735
Average number of author ids per name 40.1530612244898


In [None]:
# Generate the paper information per author.
#
# For each author name the paper ids are cut into batches of BATCH_SIZE, the
# batches are sent to paper_info_mag_check_multiquery through a process pool,
# and the per-batch results are flattened into one list stored in
# paper_info_map[name].
THREADS    = 8   # number of worker processes in the pool
BATCH_SIZE = 20  # number of paper ids handed to one query call

import multiprocess
# `reduce` is no longer used in this cell; the import is kept in case other
# cells rely on the name being defined.
from functools import reduce

paper_info_map = dict()

# Use the pool as a context manager so the worker processes are released even
# when one of the queries raises.
with multiprocess.Pool(THREADS) as p:
    for name, val_dict in paper_map.items():
        papers = val_dict['PaperIds']

        # Number of papers to process
        print(name, len(papers))

        # Set up the batches for the workers
        batches = [papers[start:start + BATCH_SIZE]
                   for start in range(0, len(papers), BATCH_SIZE)]
        batch_res = p.map(paper_info_mag_check_multiquery, batches)

        # Flatten the per-batch results. Unlike reduce() without an initial
        # value this also works for an author with no papers (empty result),
        # and it avoids re-copying the accumulated list for every batch.
        paper_info_map[name] = list(itertools.chain.from_iterable(batch_res))

philip w anderson 388
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
hyatt m gibbs 20
gary t horowitz 220
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
k m leung 177
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
lyle patrick 54
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
myron strongin 270
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call

In [26]:
# Write the (shuffled) author names to the file 'selectedauthors', one name
# per line.
with open('selectedauthors', 'w') as f:
    f.writelines(author + '\n' for author in author_names)

In [39]:
# Prepare one score record per author name; every record starts with just the
# 'AuthorName' field and is filled in by the scoring cells.
author_scores = {name: {'AuthorName': name} for name in paper_map.keys()}

In [40]:
# Total Q score per author: filter the author's paper information with
# filter_pub_year_diff (using YEAR_DIFF) and store the Q score of what remains.
for name, paper_info_list in paper_info_map.items():
    papers_in_window = filter_pub_year_diff(paper_info_list, diff = YEAR_DIFF)
    author_scores[name]['QScore'] = q_score_paper_info_list(papers_in_window)

In [121]:
# Generate n-hot topic arrays per author from the venue fields of their papers.
#
# topic_vector is the sorted list of every distinct 'JournalName' /
# 'ConferenceName' value found in paper_info_map. For each author,
# name_topic_vector_map[name] is a 0/1 list of the same length with a 1 at
# every position whose topic occurs in at least one of the author's papers.

topic_index = ['JournalName', 'ConferenceName']

# Generate the total topics
topics = set()
for name, paper_info_list in paper_info_map.items():
    for paper_info in paper_info_list:
        for topic in topic_index:
            if topic in paper_info:
                topics.add(paper_info[topic])

topic_vector = sorted(topics)

# Position of every topic in topic_vector, built once so that the loop below
# does a constant-time lookup instead of a linear list.index() scan per paper.
topic_position = {topic_name: pos for pos, topic_name in enumerate(topic_vector)}

name_topic_vector_map = dict()
for name, paper_info_list in paper_info_map.items():
    # Init topic vector
    name_topic_vec = [0] * len(topic_vector)
    for paper_info in paper_info_list:
        for topic in topic_index:
            if topic in paper_info:
                name_topic_vec[topic_position[paper_info[topic]]] = 1

    name_topic_vector_map[name] = name_topic_vec

print("Total number of topics", len(topic_vector))
print("Average number of topics per author",
      np.average(list(map(np.sum, name_topic_vector_map.values()))))

Total number of topics 2014
Average number of topics per author 34.214285714285715


In [159]:
# Regression on the topic vectors above: fit a linear model that predicts each
# author's 'QScore' from their n-hot topic vector, holding out the last 20% of
# the shuffled authors for evaluation.
x = list()
y = list()
train_authors = list(name_topic_vector_map.keys())
random.shuffle(train_authors)
for name in train_authors:
    x.append(name_topic_vector_map[name])
    y.append(author_scores[name]['QScore'])

data_split = int(len(x) * 0.2)

# Slice at an explicit index rather than with a negative offset: when there
# are fewer than 5 samples data_split is 0, and x[:-0] would be an empty
# training set while x[-0:] would put every sample into the test set.
split_at = len(x) - data_split

train_X = x[:split_at]
train_Y = y[:split_at]
test_X  = x[split_at:]
test_Y  = y[split_at:]

print(len(train_X), len(test_X), len(x))

from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(train_X, train_Y)
print(regr)

# The score can only be evaluated when something was actually held out.
if test_X:
    print('Variance score:', regr.score(test_X, test_Y))
else:
    print('Variance score: n/a (no held-out samples)')

79 19 98
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
Variance score: -0.20725191529773324


In [160]:
# Generate n-hot field-of-study arrays per author.
#
# topic_vector is the sorted list of every distinct 'FieldOfStudyName' found
# under the 'FieldOfStudy' entries of the papers in paper_info_map. For each
# author, name_topic_vector_map[name] is a 0/1 list of the same length with a
# 1 at every position whose field of study occurs in at least one of the
# author's papers. All fields of study of a paper are used, not just the first.

topic_index = 'FieldOfStudy'

# Generate the total topics
topics = set()
for name, paper_info_list in paper_info_map.items():
    for paper_info in paper_info_list:
        if topic_index in paper_info:
            for fos in paper_info[topic_index]:
                topics.add(fos['FieldOfStudyName'])

topic_vector = sorted(topics)

# Position of every topic in topic_vector, built once so that the loop below
# does a constant-time lookup instead of a linear list.index() scan for every
# field of study of every paper.
topic_position = {topic_name: pos for pos, topic_name in enumerate(topic_vector)}

name_topic_vector_map = dict()
for name, paper_info_list in paper_info_map.items():
    # Init topic vector
    name_topic_vec = [0] * len(topic_vector)
    for paper_info in paper_info_list:
        if topic_index in paper_info:
            for fos in paper_info[topic_index]:
                name_topic_vec[topic_position[fos['FieldOfStudyName']]] = 1

    name_topic_vector_map[name] = name_topic_vec

len(topic_vector)

12896

In [91]:
# Collect the per-author score records into a data frame, one row per author.
score_records = [record for record in author_scores.values()]
score_df = pd.DataFrame(score_records)
#score_df

In [90]:
# Compare the q score rankings.
#
# The computed scores (only authors with QScore > 0) are left-joined with the
# reference 'QScore' and 'CC10' columns of data_df on 'AuthorName'. After the
# merge 'QScore_x' is the computed score and 'QScore_y' the reference score.
# Each of the three columns is ranked, and Kendall's tau and Spearman's r are
# computed for every pair of rankings and shown as two HTML tables.


def _labelled_matrix(labels):
    """Return a square grid of None, one larger than labels, with the labels
    written into the first row and the first column (corner left as None)."""
    size = len(labels) + 1
    grid = [[None] * size for _ in range(size)]
    for pos, label in enumerate(labels, start=1):
        grid[0][pos] = label
        grid[pos][0] = label
    return grid


def _matrix_to_html(matrix):
    """Render a list of rows as an HTML table; every cell goes through str()."""
    html_rows = (
        '<td>{}</td>'.format('</td><td>'.join(str(cell) for cell in row))
        for row in matrix
    )
    return '<table><tr>{}</tr></table>'.format('</tr><tr>'.join(html_rows))


datascore_df = data_df[['AuthorName', 'QScore', 'CC10']]
compare_df = pd.merge(score_df[score_df['QScore'] > 0],
                      datascore_df, how='left', on='AuthorName')

# Calculate ranks
compare_df['QRank_x']  = compare_df['QScore_x'].rank()
compare_df['QRank_y']  = compare_df['QScore_y'].rank()
compare_df['CC10Rank'] = compare_df['CC10'].rank()

ranks = [ 'QRank_x', 'QRank_y', 'CC10Rank' ]

kendalltau_matrix = _labelled_matrix(ranks)
spearmanr_matrix  = _labelled_matrix(ranks)

# Fill the upper triangle only. The loop variables are deliberately not called
# x / y, so the regression cell's feature and target lists are not overwritten.
for (row, rank_a), (col, rank_b) in itertools.combinations(
        enumerate(ranks, start=1), 2):
    kt = scipy.stats.kendalltau(compare_df[rank_a], compare_df[rank_b])
    sr = scipy.stats.spearmanr(compare_df[rank_a], compare_df[rank_b])
    # Element 0 of each result is the correlation statistic.
    kendalltau_matrix[row][col] = kt[0]
    spearmanr_matrix[row][col]  = sr[0]

display(HTML(_matrix_to_html(kendalltau_matrix)))
display(HTML(_matrix_to_html(spearmanr_matrix)))

0,1,2,3
,QRank_x,QRank_y,CC10Rank
QRank_x,,0.1845515426284187,0.18939399029273618
QRank_y,,,0.8197162671866048
CC10Rank,,,


0,1,2,3
,QRank_x,QRank_y,CC10Rank
QRank_x,,0.2303465631543811,0.2550626309685696
QRank_y,,,0.9462583517457396
CC10Rank,,,
