In [1]:
# Extensions
# Load IPython's autoreload extension and switch it to mode 2 so that edits
# to the imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Imports
import unidecode
import random
import numpy as np

from research.data.data          import *
from core.utils.entity_type      import Entity_type
from core.search.query_paper_mag import paper_mag_multiquery
from core.search.query_info      import paper_info_mag_check_multiquery

In [3]:
# Load the careers CSV through the project's get_q_data helper
DATA_PATH = (
    '~/influencemap/research/data/'
    'ccnr-science-careers-basic-stats.csv'
)
data_df = get_q_data(DATA_PATH)

In [4]:
# Get author names and normalise
# Fixed seed: the shuffled order (and therefore any slice taken from it for
# testing) is the same on every Restart & Run All.
SHUFFLE_SEED = 0

author_names = data_df['AuthorName'].str.lower()
# regex=False: '.' must be a literal full stop, not the regex wildcard
author_names = author_names.str.replace('.', '', regex=False)
# Backslash-escape apostrophes
# NOTE(review): presumably needed by the downstream query strings -- confirm
author_names = author_names.str.replace('\'', '\\\'', regex=False)
# Transliterate each name with unidecode and materialise as a plain list
author_names = [unidecode.unidecode(raw_name) for raw_name in author_names]
# Dedicated Random instance: reproducible shuffle without touching the
# state of the global random module
random.Random(SHUFFLE_SEED).shuffle(author_names)

print("Number of authors:", len(author_names))

Number of authors: 2887


In [5]:
# Look up author ids -- only the first 100 shuffled names are used for testing
test_names = author_names[:100]
name_maps = get_author_ids(test_names)

print("Number of authors:", len(name_maps.keys()))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs
Number of authors: 98


In [6]:
# Build the paper mapping: for every name keep its author ids together with
# the result of paper_mag_multiquery for those ids
paper_map = {
    name: {
        'AuthorIds': author_ids,
        'PaperIds': paper_mag_multiquery(Entity_type.AUTH, author_ids),
    }
    for name, author_ids in name_maps.items()
}

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.44 µs


In [7]:
# Summary statistics for the paper mapping
def paper_counter(x):
    """Return the number of entries under 'PaperIds' in one paper_map value."""
    return len(x['PaperIds'])


def author_counter(x):
    """Return the number of entries under 'AuthorIds' in one paper_map value."""
    return len(x['AuthorIds'])


# One count per name, in paper_map iteration order
paper_count_per_name  = [paper_counter(entry) for entry in paper_map.values()]
author_count_per_name = [author_counter(entry) for entry in paper_map.values()]

# Totals across all names
total_papers     = np.sum(paper_count_per_name)
total_author_ids = np.sum(author_count_per_name)
print("Total number of papers", total_papers)
print("Total number of author ids", total_author_ids)

# Per-name averages
mean_papers     = np.average(paper_count_per_name)
mean_author_ids = np.average(author_count_per_name)
print("Average number of papers per name", mean_papers)
print("Average number of author ids per name", mean_author_ids)

Total number of papers 16512
Total number of author ids 3935
Average number of papers per name 168.48979591836735
Average number of author ids per name 40.1530612244898


In [None]:
# Generate the paper information per author
THREADS    = 8   # worker count handed to multiprocess.Pool
BATCH_SIZE = 20  # paper ids per paper_info_mag_check_multiquery call

import multiprocess
from functools import reduce

paper_info_map = dict()

# The with-block shuts the pool down when the loop finishes or raises, so
# re-running this cell does not pile up worker processes.
with multiprocess.Pool(THREADS) as p:
    for name, val_dict in paper_map.items():
        papers = val_dict['PaperIds']

        # Number of papers to process
        print(name, len(papers))

        # Slice the paper ids into BATCH_SIZE chunks and spread them over
        # the pool; map() blocks until every batch has returned.
        batches = [papers[i:i+BATCH_SIZE]
                   for i in range(0, len(papers), BATCH_SIZE)]
        batch_res = p.map(paper_info_mag_check_multiquery, batches)

        # Concatenate the per-batch results. The initial [] makes a name
        # with no papers yield an empty list instead of raising
        # "reduce() of empty sequence with no initial value".
        paper_info_map[name] = reduce(
            lambda merged, batch: merged + list(batch), batch_res, [])

philip w anderson 388
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
hyatt m gibbs 20
gary t horowitz 220
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
k m leung 177
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
lyle patrick 54
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
myron strongin 270
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call API: 0
To call

In [None]:
# Scoring set-up: one empty score record for every name in paper_map
author_scores = {name: dict() for name in paper_map.keys()}

In [None]:
# Score authors - Total Q Score
# Store the q_score_info_list result for each name under the 'QScore' key
for name, paper_info_list in paper_info_map.items():
    author_scores[name]['QScore'] = q_score_info_list(paper_info_list)