In [1]:
import csv
import json
import os
import requests
import holoviews as hv
import pandas as pd
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
from holoviews import opts, dim
from IPython.display import display, Markdown
# Select bokeh as the holoviews plotting backend; must run before any
# holoviews object (the chord diagram at the end of the notebook) is displayed.
hv.extension('bokeh')
# Enlarge rendered holoviews output (size is a percentage of the default).
hv.output(size=200)

In [2]:
# this taken from the py-grant-outputs.ipynb and expressed as a function
def fetch_data(funder_id="https://doi.org/10.13039/501100000780",
               award_number="777523",
               max_works=200):
    """Fetch the outputs of a grant award from the DataCite GraphQL API.

    With the default arguments this retrieves all outputs of the FREYA grant
    award (https://cordis.europa.eu/project/id/777523) from the funder
    identified by ``https://doi.org/10.13039/501100000780``.

    Parameters
    ----------
    funder_id : str
        Identifier of the funder, passed as the ``$funderId`` query variable.
    award_number : str or int
        Award number; it is embedded in the works query as
        ``fundingReferences.awardNumber:<award_number>``.
    max_works : int
        Maximum number of works requested (``first:`` argument of ``works``).

    Returns
    -------
    dict
        The result returned by ``Client.execute``; the requested works are
        under ``['funder']['works']['nodes']``.

    Notes
    -----
    This performs a network request against https://api.datacite.org/graphql.
    """
    _transport = RequestsHTTPTransport(
        url='https://api.datacite.org/graphql',
        use_json=True,
    )

    client = Client(
        transport=_transport,
        fetch_schema_from_transport=True,
    )

    # Variables for the GraphQL query below.
    query_params = {
        "funderId": funder_id,
        "funderAwardQuery": "fundingReferences.awardNumber:{}".format(award_number),
        "maxWorks": max_works,
    }

    query = gql("""query getGrantOutputsForFunderAndAward($funderId: ID!, $funderAwardQuery: String!, $maxWorks: Int!)
    {
    funder(id: $funderId) {
      name
      works(query: $funderAwardQuery, first: $maxWorks) {
          totalCount
          nodes {
            id
            formattedCitation(style: "vancouver")
            titles {
              title
            }
            descriptions {
              description
            }
            types {
              resourceType
            }
            dates {
              date
              dateType
            }
            versionOfCount
            rights {
              rights
              rightsIdentifier
              rightsUri
            }
            creators {
              id
              name
            }
            fundingReferences {
              funderIdentifier
              funderName
              awardNumber
              awardTitle
            }
            citationCount
            viewCount
            downloadCount
          }
        }
      }
    }
    """)

    # Pass the variables as a mapping (not a pre-serialised JSON string): the
    # transport serialises the request payload itself, so a string here would
    # be sent as a JSON string instead of a JSON object.
    return client.execute(query, variable_values=query_params)

In [3]:
# this taken from the py-grant-outputs.ipynb and expressed as a function
def build_matrix(data):
    """Build a co-authorship count matrix from a grant-outputs query result.

    Parameters
    ----------
    data : dict
        Query result; the works are read from
        ``data['funder']['works']['nodes']``. For each node the
        ``versionOfCount`` value and the ``name`` of every entry in
        ``creators`` are used.

    Returns
    -------
    tuple
        ``(coauthorship_matrix, all_creator_names)`` where
        ``all_creator_names`` is the sorted list of distinct (shortened)
        creator names and ``coauthorship_matrix`` is a square list of lists;
        ``coauthorship_matrix[i][j]`` counts how often creators ``i`` and
        ``j`` appear together on a retained work. The diagonal stays 0.
    """
    creator_names_by_node = []
    for node in data['funder']['works']['nodes']:
        # If the current output is a version of another one, exclude it.
        # A missing/null count is treated as 0 rather than raising TypeError.
        if (node.get('versionOfCount') or 0) > 0:
            continue

        # To minimise cropping of names in the diagram, retain just the first
        # letter of the first name if the author name is well formatted
        # ("Surname, Firstname" -> "Surname, F").
        creator_names = []
        for creator in node.get('creators') or []:
            name = creator['name']
            if not name:
                continue
            comma_pos = name.find(",")
            if comma_pos > 0:
                # Keep the surname, the comma, and the two characters after it.
                creator_names.append(name[:comma_pos + 3])
            elif comma_pos == 0:
                # Name starts with a comma: drop it and surrounding whitespace.
                creator_names.append(name[1:].strip())
            else:
                creator_names.append(name)
        creator_names_by_node.append(creator_names)

    # Assemble data structures for the co-authorship chord diagram.
    all_creator_names = sorted(
        {name for names in creator_names_by_node for name in names}
    )
    # Name -> row/column index, so lookups below are O(1) instead of list.index.
    position_of = {name: idx for idx, name in enumerate(all_creator_names)}

    # Initialise the chord data matrix with independent zero-filled rows.
    size = len(all_creator_names)
    coauthorship_matrix = [[0] * size for _ in range(size)]

    # Populate the chord data matrix: every ordered pair of distinct creators
    # on the same work adds 1, which keeps the matrix symmetric.
    for names in creator_names_by_node:
        positions = [position_of[name] for name in names]
        for c_pos in positions:
            for co_pos in positions:
                if c_pos != co_pos:
                    coauthorship_matrix[c_pos][co_pos] += 1

    return coauthorship_matrix, all_creator_names

In [4]:
def restructure_coauthorship_matrix(coauthorship_matrix, all_creator_names):
    """Convert a co-authorship matrix into source/target/value rows.

    Restructures the square co-authorship matrix (that chord uses) into the
    unique set of source/target/value triplets (that holoviews needs) by
    taking the half of the matrix below the diagonal and dropping zero cells.

    Parameters
    ----------
    coauthorship_matrix : list of list of int
        Square matrix; ``coauthorship_matrix[r][c]`` is the number of works
        shared by creators ``r`` and ``c``.
    all_creator_names : list of str
        Creator names, in the same order as the matrix rows/columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target``, ``value``; one row per pair with a
        non-zero count, ordered by matrix row then column.
    """
    # Use the function's own arguments (not notebook globals), and walk only
    # the cells strictly below the diagonal so each pair appears once.
    pairs = [
        [all_creator_names[c_idx], all_creator_names[r_idx], coauthorship_matrix[r_idx][c_idx]]
        for r_idx in range(len(all_creator_names))
        for c_idx in range(r_idx)
        if coauthorship_matrix[r_idx][c_idx] > 0  # exclude 0 value cases
    ]

    # Build the DataFrame directly instead of writing to and re-reading a CSV
    # at a machine-specific path; this also keeps names such as "NA" as text.
    return pd.DataFrame(pairs, columns=['source', 'target', 'value'])

In [5]:
# Run the GraphQL query against the DataCite API (network call) and keep the raw result.
gql_data = fetch_data()

In [6]:
# Square co-authorship count matrix plus the sorted creator names that index its rows/columns.
co_mtx, c_names = build_matrix(gql_data)

In [7]:
# Flatten the matrix into a DataFrame of source/target/value rows for holoviews.
pairs = restructure_coauthorship_matrix(co_mtx, c_names)

In [8]:
# Preview the first rows of the co-author pairs table.
pairs.head()

Unnamed: 0,source,target,value
0,"Baars, C","Braukmann, R",2
1,"Bernal-Llinares, M","Braukmann, R",1
2,"Braukmann, R","Brown, C",1
3,"Baars, C","Bunakov, V",6
4,"Bernal Llinares, M","Bunakov, V",2


In [28]:
# use holoviews to create a chord diagram
# TODO need to tweak options e.g. add colour to differentiate between different authors

In [9]:
# Render the co-authorship chord diagram from the source/target/value pairs.
hv.Chord(pairs)