In [1]:
from pyalex import (
    Works, Authors, Sources,
    Institutions, Concepts, Publishers, Funders
)
import pyalex
import pandas as pd
import numpy as np
pyalex.config.email = "david@rs21.io"

from flair.embeddings import DocumentPoolEmbeddings
from flair.data import Sentence
from flair.embeddings import SentenceTransformerDocumentEmbeddings

EMBEDDING_MODEL_1 = "all-mpnet-base-v2" 

# this one is also good: all-MiniLM-L6-v2
EMBEDDING_MODEL_2 = "all-MiniLM-L6-v2"
SENT_EMBEDDINGS_1 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_1)
SENT_EMBEDDINGS_2 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_2)
DOC_EMBEDDINGS= DocumentPoolEmbeddings([SENT_EMBEDDINGS_2])

import torch
from tqdm import tqdm
import yake
import umap.umap_ as umap
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture as GMM
import altair as alt
import math
import plotly.express as px
import textwrap

2023-08-30 12:53:30.264801: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# NOTE(review): the '|' separators are presumably treated by the OpenAlex API as
# alternatives (OR) -- confirm against the API filter documentation.
search_term = 'quantum computation|quantum computers|quantum sensors|quantum cryptography'

# Ask the Concepts endpoint for entries whose display_name matches the search term.
quantum_concepts = (
    Concepts()
    .search_filter(display_name=search_term)
    .get()
)

In [3]:
# (OpenAlex concept id, human-readable name) for every concept returned above.
concepts = [
    (concept['id'], concept['display_name'])
    for concept in quantum_concepts
]
concepts

[('https://openalex.org/C58053490', 'Quantum computer'),
 ('https://openalex.org/C144901912', 'Quantum cryptography'),
 ('https://openalex.org/C89143813', 'Quantum sensor'),
 ('https://openalex.org/C187483380', 'Topological quantum computer'),
 ('https://openalex.org/C192353077', 'Adiabatic quantum computation'),
 ('https://openalex.org/C183968085', 'Trapped ion quantum computer'),
 ('https://openalex.org/C108277079', 'Post-quantum cryptography'),
 ('https://openalex.org/C19271316', 'One-way quantum computer')]

In [4]:
def process_works_list(worklist:list):
    """
    Transform one page of works records into a dataframe.

    Each record is read with ``h["id"]`` and ``h["abstract"]``; the
    ``abstract_inverted_index`` column is dropped from the frame and an
    ``abstract`` column is (re)built by mapping each row's ``id`` to the
    abstract read from the record.

    Parameters
    ----------
    worklist : list
        Mapping-like records, each exposing at least ``"id"`` and
        ``"abstract"``.

    Returns
    -------
    pd.DataFrame
        One row per record. An empty ``worklist`` yields an empty frame
        instead of raising, so a trailing empty page cannot abort a
        pagination loop.
    """
    if len(worklist) == 0:
        return pd.DataFrame()
    # Read the abstracts from the records themselves before building the frame.
    abstracts_dict = {h["id"]: h["abstract"] for h in worklist}
    df = pd.DataFrame.from_records(worklist)
    # errors='ignore': tolerate records that carry no inverted index at all.
    df = df.drop(columns=['abstract_inverted_index'], errors='ignore')
    df['abstract'] = df['id'].map(abstracts_dict)
    return df

In [5]:
# Show how many works OpenAlex attributes to each matched concept.
for concept in quantum_concepts:
    print(concept['id'], concept['works_count'])


https://openalex.org/C58053490 56711
https://openalex.org/C144901912 10538
https://openalex.org/C89143813 7709
https://openalex.org/C187483380 3278
https://openalex.org/C192353077 1966
https://openalex.org/C183968085 1156
https://openalex.org/C108277079 1228
https://openalex.org/C19271316 1529


In [6]:
def get_concept_frame(concepts_list:list, i:int):
    """
    Download every work (published after 2016) tagged with one concept.

    Parameters
    ----------
    concepts_list : list
        Concepts() results; only ``concepts_list[i]['id']`` is read.
    i : int
        Position of the concept to fetch within ``concepts_list``.

    Returns
    -------
    pd.DataFrame
        All pages converted by ``process_works_list`` and stacked, with
        repeated ``id`` values removed (first occurrence kept). An empty
        frame is returned when no non-empty page was produced.
    """
    pager = (
        Works()
        .filter(publication_year='>2016',
                concepts={"id": f"{concepts_list[i]['id']}"})
        # NOTE(review): n_max=None presumably lifts the result cap -- confirm
        # against the pyalex paginate documentation.
        .paginate(per_page=200, n_max=None)
    )
    # Collect the per-page frames first and concatenate once: re-concatenating
    # and re-deduplicating the growing frame on every page is quadratic.
    pages = [process_works_list(page) for page in tqdm(pager)]
    pages = [page_frame for page_frame in pages if not page_frame.empty]
    if not pages:
        return pd.DataFrame()
    df = pd.concat(pages, ignore_index=True)
    return df.drop_duplicates(subset='id', keep='first')

In [7]:
# One dataframe of works per matched concept, in the same order as quantum_concepts.
frames_list = [
    get_concept_frame(quantum_concepts, position)
    for position in range(len(quantum_concepts))
]

152it [04:35,  1.82s/it]
22it [00:36,  1.67s/it]
22it [00:41,  1.88s/it]
6it [00:10,  1.67s/it]
4it [00:06,  1.54s/it]
2it [00:03,  1.77s/it]
4it [00:06,  1.55s/it]
3it [00:06,  2.14s/it]


In [8]:
# Stack the per-concept frames, keep one row per work, and index by the work id
# (the 'id' column is kept alongside the index).
dftop = (
    pd.concat(frames_list, ignore_index=True)
    .drop_duplicates(subset='id', keep='first')
    .set_index('id', drop=False)
)

# dfall is an alias of dftop, not a copy: the 'content' column added below is
# visible through both names.
dfall = dftop
print(dfall.shape)

# Text that gets embedded later: "<title>. <abstract>".
dfall['content'] = dfall['title'] + ". " + dfall['abstract']

# Keep only works for which 'content' could be built (not missing).
dfrecords = dfall[dfall['content'].notna()].copy()

(37892, 40)


In [9]:
def get_keywords(text:str, top:int=7, stopwords=None):
    """
    Extract keyphrases from a blob of text with YAKE.

    A fresh ``yake.KeywordExtractor`` is configured with ``top`` and
    ``stopwords`` on every call; only the first element of each entry
    returned by ``extract_keywords`` is kept, so the result is a plain list.
    """
    extractor = yake.KeywordExtractor(top=top, stopwords=stopwords)
    ranked = extractor.extract_keywords(text)
    return [entry[0] for entry in ranked]

In [10]:
def get_top_concepts(concept_list:list,score:float=.6):
    """
    Return the display_name of every concept dictionary in
    concept_list whose 'score' is at least **score**,
    preserving the input order.
    """
    selected = []
    for concept in concept_list:
        if concept['score'] >= score:
            selected.append(concept['display_name'])
    return selected

In [11]:
dfrecords['keywords'] = dfrecords['content'].apply(get_keywords)
dfrecords['top_concepts'] = dfrecords['concepts'].apply(get_top_concepts)

In [12]:
texts = dfrecords['content'].str.lower().values.tolist()
#dfrecords.to_csv('jamming.csv')

In [13]:
def get_content_embeddings(dfrecords:pd.DataFrame) -> pd.DataFrame:
    """
    Embed the lower-cased 'content' string of every row with the
    module-level DOC_EMBEDDINGS and return one vector per row.

    A throwaway sentence is embedded first purely to learn the
    embedding width, which sizes the output array. The returned
    dataframe shares the index of dfrecords.
    """
    probe = Sentence("The grass is green.")
    DOC_EMBEDDINGS.embed(probe)
    lowered = dfrecords["content"].str.lower().values.tolist()
    vectors = np.empty((len(lowered), len(probe.embedding)))
    for row_number, text in enumerate(tqdm(lowered)):
        sentence = Sentence(text)
        DOC_EMBEDDINGS.embed(sentence)
        # Copy the embedding to host memory before storing it in the numpy array.
        vectors[row_number, :] = sentence.embedding.cpu().numpy()
        # Called once per document, as in the original loop.
        torch.cuda.empty_cache()
    return pd.DataFrame.from_records(vectors, index=dfrecords.index)

In [14]:
dfcontentvectors = get_content_embeddings(dfrecords)


100%|███████████████████████████████████████████████████████████████████| 32703/32703 [07:54<00:00, 68.89it/s]


In [15]:
N_COMPONENTS = 2  # two dimensions so the projection can be plotted directly

# The fitted reducer is kept: it is reused later to project the per-cluster mean
# vectors into the same 2-D space. The metric and other parameters are open to
# experimentation.
umap_reducer = umap.UMAP(
    n_components=N_COMPONENTS,
    random_state=1234,
    metric='cosine',
)

# Apply UMAP to the content embeddings; rows keep the index of dfcontentvectors.
reduced_vectors = umap_reducer.fit_transform(dfcontentvectors.to_numpy())
dfreduced = pd.DataFrame.from_records(
    reduced_vectors,
    index=dfcontentvectors.index,
)
dfreduced.columns = ['x','y']

## Cluster the 2-D UMAP projection with HDBSCAN

In [16]:
import hdbscan

hdbscan_args = dict(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    cluster_selection_epsilon=0.1,
)

# Cluster on the 2-D UMAP coordinates only.
cluster = hdbscan.HDBSCAN(**hdbscan_args)
cluster.fit(dfreduced[['x','y']].to_numpy())

# Attach each point's cluster label and membership strength to its coordinates.
dfreduced['cluster'] = cluster.labels_
dfreduced['probability'] = cluster.probabilities_

# Join the clustering result back onto the full work records via the shared index.
dfpapers = dfrecords.merge(
    dfreduced,
    left_index=True,
    right_index=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [17]:
#help(dfpapers.explode)
del dfpapers['id']
dfstart = dfpapers.reset_index()
dfstart.head()

Unnamed: 0,id,doi,title,display_name,publication_year,publication_date,ids,language,primary_location,type,...,created_date,abstract,is_authors_truncated,content,keywords,top_concepts,x,y,cluster,probability
0,https://openalex.org/W3101479050,https://doi.org/10.1038/s41586-019-1666-5,Quantum supremacy using a programmable superco...,Quantum supremacy using a programmable superco...,2019,2019-10-23,{'openalex': 'https://openalex.org/W3101479050...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2020-11-23,The promise of quantum computers is that certa...,,Quantum supremacy using a programmable superco...,"[programmable superconducting processor, Quant...","[Quantum computer, Qubit, Computer science]",1.306056,4.435403,-1,0.0
1,https://openalex.org/W2781738013,https://doi.org/10.22331/q-2018-08-06-79,Quantum Computing in the NISQ era and beyond,Quantum Computing in the NISQ era and beyond,2018,2018-08-06,{'openalex': 'https://openalex.org/W2781738013...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2018-01-12,Noisy Intermediate-Scale Quantum (NISQ) techno...,,Quantum Computing in the NISQ era and beyond. ...,"[NISQ era, Quantum, NISQ, Quantum Computing, q...","[Quantum computer, Computer science]",0.693603,3.592004,158,1.0
2,https://openalex.org/W2559394418,https://doi.org/10.1038/nature23474,Quantum machine learning,Quantum machine learning,2017,2017-09-01,{'openalex': 'https://openalex.org/W2559394418...,en,"{'is_oa': False, 'landing_page_url': 'https://...",article,...,2016-12-08,Fuelled by increasing computer power and algor...,,Quantum machine learning. Fuelled by increasin...,"[machine learning, Quantum machine learning, m...","[Quantum machine learning, Computer science, Q...",5.575125,4.895947,118,1.0
3,https://openalex.org/W3037447387,https://doi.org/10.1103/revmodphys.89.035002,Quantum sensing,Quantum sensing,2017,2017-07-25,{'openalex': 'https://openalex.org/W3037447387...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2020-07-02,"""Quantum sensing"" describes the use of a quant...",,"Quantum sensing. ""Quantum sensing"" describes t...","[Quantum sensing, Quantum, sensing, quantum sy...","[Physics, Quantum sensor, Quantum technology, ...",-5.109255,5.719063,-1,0.0
4,https://openalex.org/W2755255888,https://doi.org/10.1038/nature23879,Hardware-efficient variational quantum eigenso...,Hardware-efficient variational quantum eigenso...,2017,2017-09-01,{'openalex': 'https://openalex.org/W2755255888...,en,"{'is_oa': False, 'landing_page_url': 'https://...",article,...,2017-09-25,Quantum computers can be used to address molec...,,Hardware-efficient variational quantum eigenso...,"[Monte Carlo methods, quantum, Hardware-effici...","[Quantum computer, Pauli exclusion principle]",2.870998,6.35053,153,0.925042


In [18]:
dfstart.shape

(32703, 47)

In [19]:
dfbig = dfstart.explode(column='authorships')
dfbig.shape, dfstart.shape

((144602, 47), (32703, 47))

In [20]:
def add_extra_to_authorships(row: pd.DataFrame):
    """
    Copy paper-level fields of row onto row[authorships].

    When row[authorships] is a plain dict it is updated IN PLACE
    (the same object is returned) with the paper id, x/y position,
    cluster, cluster_score (taken from row[probability]), title,
    abstract, doi, publication date/year, grants and locations.
    Any other value (e.g. a float NaN) is returned untouched.
    """
    authorship = row["authorships"]
    if type(authorship) != dict:
        return row["authorships"]

    # (key written into the authorship dict, column read from the row)
    copied_fields = (
        ("id", "id"),
        ("x", "x"),
        ("y", "y"),
        ("cluster", "cluster"),
        ("cluster_score", "probability"),
        ("title", "title"),
        ("abstract", "abstract"),
        ("doi", "doi"),
        ("publication_date", "publication_date"),
        ("publication_year", "publication_year"),
        ("grants", "grants"),
        ("locations", "locations"),
    )
    for target_key, source_column in copied_fields:
        authorship[target_key] = row[source_column]
    return authorship

In [21]:
dfbig['big_authorships'] = dfbig.apply(add_extra_to_authorships, axis=1)

In [22]:
# NOTE(review): this reads the original 'authorships' column rather than the
# 'big_authorships' column created in the previous cell. The dicts still carry
# the paper-level keys only because add_extra_to_authorships updates each
# authorship dict in place and returns that same object.
bigvals = dfbig['authorships'].tolist()

In [23]:
# Drop float entries; presumably these are the NaN placeholders that explode()
# leaves for works with an empty authorships list -- TODO confirm.
dictvals = [c for c in bigvals if type(c) != float]

In [24]:
# Flatten to one row per (paper, author, institution): every entry of an
# authorship's 'institutions' list becomes a row, and the listed meta keys are
# repeated on it with a 'paper_' prefix (nested author keys joined with '_').
# Fix: the meta key was spelled 'is_corrresponding'; with errors='ignore' a
# missing key is silently filled with NaN, so the column was always empty.
# NOTE(review): this renames the output column from 'paper_is_corrresponding'
# to 'paper_is_corresponding' -- update any later cell that used the old name.
dftriple = pd.json_normalize(dictvals,
                  record_path=['institutions'],
                  meta=['id','raw_affiliation_string','author_position', 'doi',
                        'title','abstract','publication_date', 'publication_year',
                        'grants','locations',
                        'is_corresponding','x','y','cluster','cluster_score',
                       ['author','id'], ['author', 'display_name'],
                       ['author','orcid']],
                  errors='ignore',
                  sep='_',
                  meta_prefix='paper_',
                 )

In [25]:
# Label every content vector with the cluster of its paper (aligned on the index).
dftopics = dfcontentvectors.assign(cluster=dfpapers['cluster'])

# Mean embedding per cluster, projected with the already-fitted UMAP reducer so
# the cluster positions live in the same 2-D space as the papers.
dfmeantopics = dftopics.groupby('cluster').mean().copy()
reduced_topics = umap_reducer.transform(dfmeantopics.to_numpy())
df_reduced_topics = pd.DataFrame.from_records(
    reduced_topics,
    index=dfmeantopics.index,
)
df_reduced_topics.columns = ['x','y']
df_reduced_topics['topic'] = df_reduced_topics.index

def get_cluster_concepts(topic_num:int, n:int=20):
    """
    Return the n most frequent concepts among the papers of one cluster.

    Reads the module-level ``dfpapers``: rows whose ``cluster`` equals
    ``topic_num`` are selected and their ``top_concepts`` lists are pooled.

    Parameters
    ----------
    topic_num : int
        Cluster label to summarise.
    n : int
        Maximum number of concepts to return.

    Returns
    -------
    list
        Concept names ordered by descending frequency; concepts with equal
        counts keep the order in which they were first encountered.
    """
    from collections import Counter  # local import keeps this cell self-contained

    concept_lists = dfpapers[dfpapers['cluster'] == topic_num]['top_concepts'].tolist()
    # Single counting pass instead of calling list.count once per element.
    counts = Counter(name for names in concept_lists for name in names)
    # sorted() is stable, so ties stay in first-encountered order.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in ranked][:n]

def get_yake_cluster_phrases(topic_num:int, n:int=20):
    """
    takes an integer topic_num identifying a cluster and
    returns up to n YAKE keyphrases extracted from the
    'content' of all papers in that cluster, joined into
    one text with ". " between documents
    """
    in_topic = dfpapers['cluster'] == topic_num
    merged_text = ". ".join(dfpapers.loc[in_topic, 'content'].tolist())
    extractor = yake.KeywordExtractor(top=n, stopwords=None)
    ranked = extractor.extract_keywords(merged_text)
    # Keep only the first element of each returned entry.
    return [entry[0] for entry in ranked]

# Per-cluster summaries, indexed by cluster label.
topic_labels = df_reduced_topics['topic']
wikiconcepts = topic_labels.apply(get_cluster_concepts)
wikikeywords = topic_labels.apply(get_yake_cluster_phrases)

# Restore 'id' as a regular column (it was deleted from dfpapers earlier).
dfpapers['id'] = dfpapers.index
info_columns = ['x','y','id','title','doi','cluster','grants',
                'locations',
                'publication_date','keywords','top_concepts']
dfinfo = dfpapers[info_columns].copy()

# Cluster centroid = mean 2-D position of the cluster's papers; the summaries
# are attached by index alignment on the cluster label.
centroids = dfinfo.groupby('cluster')[['x','y']].mean().copy()
centroids['concepts'] = wikiconcepts
centroids['cluster'] = centroids.index
centroids['keywords'] = wikikeywords

In [26]:
def wrap_it(x):
    """
    Wrap the string x at 40 characters and join the resulting
    lines with "<br>" so it can be used as an HTML line break.
    """
    wrapper = textwrap.TextWrapper(width=40)
    return "<br>".join(wrapper.wrap(x))

In [27]:
centroids['wrapped_keywords'] = centroids['keywords'].apply(str).apply(wrap_it)
centroids['wrapped_concepts'] = centroids['concepts'].apply(str).apply(wrap_it)

In [28]:
centroids.to_pickle('quantumcentroids2d.pkl')

In [29]:
dftriple.to_pickle('quantumdftriple2d.pkl')


In [30]:
def get_affils_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl, prints cl,
    and sums paper_cluster_score per (id, display_name,
    country_code, type) group, sorted by that sum descending.
    Also returns the keywords stored for cluster cl in the
    module-level centroids frame.

    Every row of dc in the group contributes to the sum.
    """
    in_cluster = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    group_keys = ['id','display_name','country_code','type']
    totals = in_cluster.groupby(group_keys)['paper_cluster_score'].sum().to_frame()
    totals = totals.sort_values('paper_cluster_score', ascending=False)
    cluster_keywords = centroids.loc[centroids.cluster == cl, 'keywords'].iloc[0]
    return totals, cluster_keywords

In [31]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 1)
print(kw84)
dv84.head(10)

1
['Quantum Battery', 'quantum batteries', 'Quantum', 'Battery', 'Charging', 'open quantum batteries', 'batteries', 'quantum advantage', 'three-level quantum battery', 'quantum battery based', 'energy', 'three-level quantum batteries', 'IBM Quantum', 'energy storage', 'open quantum', 'Charging Process', 'Dicke Quantum Battery', 'quantum devices', 'charging power', 'SYK quantum batteries']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I3045169105,Southern University of Science and Technology,CN,education,15.0
https://openalex.org/I4210150858,Beijing Academy of Quantum Information Sciences,CN,facility,10.0
https://openalex.org/I30771326,Italian Institute of Technology,IT,facility,7.186063
https://openalex.org/I177909021,Federal University of São Carlos,BR,education,7.0
https://openalex.org/I4210139958,Beijing Computational Science Research Center,CN,facility,5.0
https://openalex.org/I37802460,Northwest University,CN,education,4.0
https://openalex.org/I114457229,University of Geneva,CH,education,4.0
https://openalex.org/I83816512,University of Genoa,IT,education,3.062021
https://openalex.org/I157210198,Scuola Normale Superiore di Pisa,IT,other,3.062021
https://openalex.org/I4210104335,Institute for Basic Science,KR,facility,3.0


In [32]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 2)
print(kw84)
dv84.head(10)

2
['Quantum Computer Music', 'quantum', 'quantum computing', 'Quantum Computer', 'Computer Music', 'Music', 'quantum computer hardware', 'computer', 'quantum music', 'quantum music generation', 'computing', 'Music Quantum Computing', 'music composition', 'computers', 'quantum algorithms', 'algorithmic computer music', 'quantum hardware', 'quantum computer algorithm', 'quantum computing tools', 'hardware quantum computers']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I897542642,University of Plymouth,GB,education,10.125563
https://openalex.org/I36234482,University of Bristol,GB,education,2.0
https://openalex.org/I4210161128,Bristol Robotics Laboratory,GB,facility,2.0
https://openalex.org/I51601045,University of Bath,GB,education,1.0
https://openalex.org/I106645853,Changchun University of Science and Technology,CN,education,0.104611
https://openalex.org/I142608572,Prince Sattam Bin Abdulaziz University,SA,education,0.03487


In [33]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 10)
print(kw84)
dv84.head(20)

10
['Shor algorithm', 'Shor factoring algorithm', 'quantum factoring algorithm', 'quantum algorithm', 'Shor quantum algorithm', 'quantum factorization algorithm', 'quantum', 'Shor quantum factoring', 'Shor quantum factorization', 'algorithm', 'Shor', 'Shor factorization algorithm', 'Shor quantum', 'quantum computer', 'quantum computing', 'factoring algorithm', 'quantum factoring', 'factorization algorithm', 'variational quantum factoring', 'Factoring RSA Integers']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I113940042,Shanghai University,CN,education,9.893639
https://openalex.org/I151746483,University of Waterloo,CA,education,6.061485
https://openalex.org/I86987016,Royal Institute of Technology,SE,education,5.37201
https://openalex.org/I37461747,Wuhan University,CN,education,4.983914
https://openalex.org/I169108374,University of the Basque Country,ES,education,4.507125
https://openalex.org/I4210114115,IBM Research - Thomas J. Watson Research Center,US,facility,4.189439
https://openalex.org/I154970844,Jaypee Institute of Information Technology,IN,education,4.085484
https://openalex.org/I161127581,Fluminense Federal University,BR,education,4.0
https://openalex.org/I83205935,Malaviya National Institute of Technology Jaipur,IN,education,3.562361
https://openalex.org/I76571253,Imam Abdulrahman Bin Faisal University,SA,education,3.531676


In [34]:
dfinfo = dfpapers[['x','y','id','title','doi','cluster','probability',
                 'publication_date','grants','locations',
                   'keywords','top_concepts']].copy()

In [35]:
# For every paper, all raw affiliation strings of its rows joined with ' | '.
pap_affils_dict = (
    dftriple
    .groupby('paper_id')['paper_raw_affiliation_string']
    .apply(lambda affiliations: ' | '.join(affiliations.tolist()))
)

In [36]:
pap_authors_dict = dftriple.groupby('paper_id')['paper_author_display_name'].apply(lambda x: x.values)

In [37]:
# NOTE(review): both columns are recomputed from dftriple (as Python lists) in
# the next two cells, so the values assigned here are overwritten.
dfinfo['affil_list'] = pap_affils_dict
dfinfo['author_list'] = pap_authors_dict

In [38]:
dfinfo['affil_list'] = dftriple.groupby('paper_id')['paper_raw_affiliation_string'].\
apply(lambda x: x.tolist())

In [39]:
dfinfo['author_list'] =  dftriple.groupby('paper_id')['paper_author_display_name'].\
apply(lambda x: x.tolist())

In [40]:
dfinfo['wrapped_affil_list'] = dfinfo['affil_list'].apply(str).apply(wrap_it)
dfinfo['wrapped_author_list'] = dfinfo['author_list'].apply(str).apply(wrap_it)

In [41]:
dfinfo['wrapped_keywords'] = dfinfo['keywords'].apply(str).apply(wrap_it)

In [42]:
def get_source_name(loc_list):
    """
    Return the source display name of the first location in loc_list.

    Returns None when the value cannot be read: loc_list is empty or
    not subscriptable (e.g. None / NaN), or the first entry has no
    usable "source" -> "display_name" value.
    """
    try:
        return loc_list[0]["source"]["display_name"]
    # Only lookup failures mean "no source"; anything else (including
    # KeyboardInterrupt) must propagate instead of being swallowed.
    except (TypeError, KeyError, IndexError):
        return None

def get_source_type(loc_list):
    """
    Return the source type of the first location in loc_list.

    Returns None when the value cannot be read: loc_list is empty or
    not subscriptable (e.g. None / NaN), or the first entry has no
    usable "source" -> "type" value.
    """
    try:
        return loc_list[0]["source"]["type"]
    # Only lookup failures mean "no source"; anything else (including
    # KeyboardInterrupt) must propagate instead of being swallowed.
    except (TypeError, KeyError, IndexError):
        return None
    
    

In [43]:
dfinfo["source"] = dfinfo["locations"].apply(get_source_name)
dfinfo["source_type"] = dfinfo["locations"].apply(get_source_type)

In [44]:
dfinfo.to_pickle('quantumdfinfo2d.pkl')

In [45]:
def get_funder_names(funder_list):
    """
    Return the unique **funder_display_name** values found in funder_list.

    funder_list is expected to be a list of dictionaries that each
    carry a 'funder_display_name' key. Names are returned in order of
    first appearance, so the result is the same on every run (a set
    would give an arbitrary order). An empty list is returned when
    funder_list is not iterable (e.g. None / NaN) or an entry lacks
    the key.
    """
    try:
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(f['funder_display_name'] for f in funder_list))
    # Only malformed input means "no funders"; other errors must propagate.
    except (TypeError, KeyError):
        return []

In [46]:
dftriple["source"] = dftriple["paper_locations"].apply(get_source_name)
dftriple["source_type"] = dftriple["paper_locations"].apply(get_source_type)
dftriple["funder_list"] = dftriple["paper_grants"].apply(get_funder_names)

In [47]:
dftriple.to_pickle('quantumdftriple2d.pkl')

In [48]:
def get_journals_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl, prints cl,
    keeps the rows where source_type == 'journal' and sums
    paper_cluster_score per source, sorted by that sum
    descending. Also returns the keywords stored for cluster
    cl in the module-level centroids frame.

    Every row of dc contributes to the sum, so a paper that
    appears on several rows is counted once per row.
    """
    in_cluster = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    journal_rows = in_cluster[in_cluster['source_type'] == 'journal']
    totals = journal_rows.groupby(['source'])['paper_cluster_score'].sum().to_frame()
    totals = totals.sort_values('paper_cluster_score', ascending=False)
    cluster_keywords = centroids.loc[centroids.cluster == cl, 'keywords'].iloc[0]
    return totals, cluster_keywords

In [49]:
def get_conferences_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl, prints cl,
    keeps the rows where source_type == 'conference' and sums
    paper_cluster_score per source, sorted by that sum
    descending. Also returns the keywords stored for cluster
    cl in the module-level centroids frame.

    Every row of dc contributes to the sum, so a paper that
    appears on several rows is counted once per row.
    """
    in_cluster = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    conference_rows = in_cluster[in_cluster['source_type'] == 'conference']
    totals = conference_rows.groupby(['source'])['paper_cluster_score'].sum().to_frame()
    totals = totals.sort_values('paper_cluster_score', ascending=False)
    cluster_keywords = centroids.loc[centroids.cluster == cl, 'keywords'].iloc[0]
    return totals, cluster_keywords

In [50]:
dv84, kw84 = get_journals_cluster_sort(dftriple, 1)
print(kw84)
dv84.head(10)

1
['Quantum Battery', 'quantum batteries', 'Quantum', 'Battery', 'Charging', 'open quantum batteries', 'batteries', 'quantum advantage', 'three-level quantum battery', 'quantum battery based', 'energy', 'three-level quantum batteries', 'IBM Quantum', 'energy storage', 'open quantum', 'Charging Process', 'Dicke Quantum Battery', 'quantum devices', 'charging power', 'SYK quantum batteries']


Unnamed: 0_level_0,paper_cluster_score
source,Unnamed: 1_level_1
Quantum science and technology,39.0
Physical Review Letters,33.0
Physical review,15.903064
Batteries,7.0
Physical review research,7.0
Frontiers of Physics in China,1.119261
Epj Web of Conferences,0.496168


In [51]:
dv84, kw84 = get_conferences_cluster_sort(dftriple, 1)
print(kw84)
dv84.head(10)

1
['Quantum Battery', 'quantum batteries', 'Quantum', 'Battery', 'Charging', 'open quantum batteries', 'batteries', 'quantum advantage', 'three-level quantum battery', 'quantum battery based', 'energy', 'three-level quantum batteries', 'IBM Quantum', 'energy storage', 'open quantum', 'Charging Process', 'Dicke Quantum Battery', 'quantum devices', 'charging power', 'SYK quantum batteries']


Unnamed: 0_level_0,paper_cluster_score
source,Unnamed: 1_level_1


In [52]:
def get_country_collaborations_sort(dc:pd.DataFrame, cl:int):
    """
    Return the papers of cluster cl that involve more than one country.

    dc is restricted to rows whose paper_cluster equals cl and the
    distinct, non-missing country_code values are collected per
    paper_id. Missing country codes are ignored, so a paper with one
    known country plus an institution without a country code is not
    reported as a collaboration.

    Returns the matching rows of the module-level dfinfo (looked up by
    paper_id) with two extra columns: country_count and
    collab_countries (an alphabetically sorted list).
    """
    in_cluster = dc[dc['paper_cluster'] == cl]
    # sorted(...) gives a reproducible order; dropna() keeps None/NaN from
    # being counted as a country.
    countries_per_paper = in_cluster.groupby('paper_id')['country_code'].apply(
        lambda codes: sorted(set(codes.dropna()))
    )
    summary = countries_per_paper.to_frame('collab_countries')
    summary['country_count'] = summary['collab_countries'].apply(len)
    summary = summary.sort_values('country_count', ascending=False)
    di = dfinfo.loc[summary.index].copy()
    di['country_count'] = summary['country_count']
    di['collab_countries'] = summary['collab_countries']
    return di[di['country_count'] > 1]

In [53]:
quantum_concepts

[{'id': 'https://openalex.org/C58053490',
  'wikidata': 'https://www.wikidata.org/wiki/Q176555',
  'display_name': 'Quantum computer',
  'relevance_score': 37201.516,
  'level': 3,
  'description': 'theoretical computation device relying on quantum mechanics',
  'works_count': 56711,
  'cited_by_count': 1084706,
  'summary_stats': {'2yr_mean_citedness': 2.94691092625479,
   'h_index': 402,
   'i10_index': 13002},
  'ids': {'openalex': 'https://openalex.org/C58053490',
   'wikidata': 'https://www.wikidata.org/wiki/Q176555',
   'mag': '58053490',
   'wikipedia': 'https://en.wikipedia.org/wiki/Quantum%20computing'},
  'image_url': 'https://upload.wikimedia.org/wikipedia/commons/6/60/IBM_Q_system_%28Fraunhofer_2%29.jpg',
  'image_thumbnail_url': 'https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/IBM_Q_system_%28Fraunhofer_2%29.jpg/100px-IBM_Q_system_%28Fraunhofer_2%29.jpg',
  'international': {'display_name': {'ar': 'حاسوب كمومي',
    'ast': 'Computación cuántica',
    'az': 'Kvant

In [54]:
import networkx as nx
from pyvis.network import Network
import igraph as ig # for getting a layout w/o relying on slow pyvis physics 

In [55]:
dfinfo["funder_list"] = dfinfo["grants"].apply(get_funder_names)
dfinfo["wrapped_funder_list"] = dfinfo["funder_list"].apply(str).apply(wrap_it)

In [56]:
dfinfo.to_pickle('quantumdfinfo2d.pkl')

In [57]:
kw_dict = dfinfo['keywords'].to_dict()

In [58]:
dc = dftriple[dftriple['paper_cluster'] == 10].copy()
dc.shape

(307, 27)

In [59]:
[x for row in dc['funder_list'].tolist() for x in row]

['Conseil Régional, Île-de-France',
 "Commissariat à l'Énergie Atomique et aux Énergies Alternatives",
 'Conseil Régional, Île-de-France',
 "Commissariat à l'Énergie Atomique et aux Énergies Alternatives",
 'Conseil Régional, Île-de-France',
 "Commissariat à l'Énergie Atomique et aux Énergies Alternatives",
 'Conseil Régional, Île-de-France',
 "Commissariat à l'Énergie Atomique et aux Énergies Alternatives",
 'National Natural Science Foundation of China',
 'National Natural Science Foundation of China',
 'National Natural Science Foundation of China',
 'National Natural Science Foundation of China',
 'National Natural Science Foundation of China',
 'National Natural Science Foundation of China',
 'National Natural Science Foundation of China',
 'National Natural Science Foundation of China',
 'National Natural Science Foundation of China',
 'Science and Technology Commission of Shanghai Municipality',
 'National Natural Science Foundation of China',
 'Eusko Jaurlaritza',
 'Science and

In [60]:
kw_dict = dfinfo['keywords'].to_dict()

# add in the affiliations as nodes as well; that row, author, paper, affil. all three get links. ok.
def create_nx_graph(df: pd.DataFrame, cl:int) -> nx.Graph:
    """
    Build the undirected networkx graph for one paper cluster.

    Only the rows of ``df`` whose ``paper_cluster`` equals ``cl`` are used.
    Each row contributes a paper node (group 'work'), an author node, an
    affiliation node, optionally a source node (group = the row's
    ``source_type``) and one node per entry of the row's ``funder_list``
    (group 'funder'), plus the edges that link them. Author, affiliation,
    source and funder nodes carry a ``value`` equal to how often they occur
    in the cluster. Every node and edge gets a ``title`` string.

    After the graph is assembled, a 2-D igraph UMAP layout is computed and
    stored on every node as ``x`` / ``y`` (scaled by 250).

    Parameters
    ----------
    df : pd.DataFrame
        Table with one row per (paper, author, affiliation) combination; the
        columns read here are paper_cluster, paper_id, paper_title,
        paper_publication_date, paper_raw_affiliation_string,
        paper_author_id, paper_author_display_name, id, display_name,
        country_code, source, source_type and funder_list.
    cl : int
        Cluster number to select.

    Returns
    -------
    nx.Graph
        The graph with layout coordinates attached to its nodes.

    Notes
    -----
    Reads the module-level ``kw_dict`` (looked up by ``paper_id``) for the
    keyword lines of the paper tooltip.
    """
    from collections import Counter

    g = nx.Graph()
    dc = df[df['paper_cluster'] == cl]

    # Occurrence counts within the cluster, used as node sizes ("value").
    author_counts_dict = Counter(dc['paper_author_id'].tolist())
    affiliation_counts_dict = Counter(dc['id'].tolist())
    source_counts_dict = Counter(dc['source'].tolist())
    funder_counts_dict = Counter(
        funder for funders in dc['funder_list'].tolist() for funder in funders
    )

    for _, row in dc.iterrows():
        paper_id = row['paper_id']
        paper_title = row['paper_title']
        paper_date = str(row['paper_publication_date'])
        author_id = row['paper_author_id']
        author_name = row['paper_author_display_name']
        affil_id = row['id']
        affil_label = row['display_name'] + '\n' + row['country_code']
        source = row['source']

        g.add_node(paper_id, group='work', title=paper_title)
        g.add_node(author_id, title=author_name,
                   group='author', value=author_counts_dict[author_id])
        g.add_node(affil_id, group='affiliation',
                   title=affil_label,
                   value=affiliation_counts_dict[affil_id])

        if source:
            source_type = row['source_type']
            g.add_node(source, group=source_type,
                       title=source + ' :\n ' + source_type,
                       value=source_counts_dict[source])
            g.add_edge(
                paper_id,
                source,
                title=paper_title + ' :\n ' + paper_date +
                ' :\n' + source + ' :\n ' + source_type,
            )
            g.add_edge(
                author_id,
                source,
                title=author_name + ':\n' + source,
            )

        # An empty funder list simply skips this loop.
        for f in row['funder_list']:
            funder = str(f)
            g.add_node(f, group='funder',
                       title=funder,
                       value=funder_counts_dict[f])
            g.add_edge(
                paper_id,
                f,
                title=paper_title + ':\n ' + paper_date + ' :\n' + funder,
            )
            g.add_edge(
                f,
                author_id,
                title=author_name + ' :\n ' + funder,
            )
            g.add_edge(
                f,
                affil_id,
                title=affil_label + ' :\n ' + funder,
            )
            if source:
                g.add_edge(
                    f,
                    source,
                    title=source + ' :\n' + funder,
                )

        # Replace the paper tooltip with title, date and one keyword per line.
        g.nodes[paper_id]['title'] = (
            paper_title + ' :\n ' + paper_date + ':\n' +
            '\n'.join(kw_dict[paper_id])
        )
        g.nodes[author_id]['title'] = author_name

        g.add_edge(
            paper_id,
            author_id,
            title=paper_title + ' :\n ' + author_name + ' :\n ' +
            row['paper_raw_affiliation_string'],
        )
        g.add_edge(
            author_id,
            affil_id,
            title=author_name + ' :\n ' +
            row['display_name'] + ' :\n ' + row['country_code'],
        )
        g.add_edge(
            paper_id,
            affil_id,
            title=paper_title + ' :\n ' + paper_date + ':\n' +
            row['display_name'] + ' :\n ' + row['country_code'],
        )

    # Pre-compute positions with igraph so pyvis does not need its (slow) physics.
    # https://igraph.org/python/tutorial/0.9.6/visualisation.html
    g_ig = ig.Graph.from_networkx(g)
    layout = g_ig.layout_umap(min_dist = 2, epochs = 500)
    # Coordinates are matched to nodes by position in g.nodes().
    for node, xy in zip(list(g.nodes()), layout.coords):
        g.nodes[node]['x'] = 250 * xy[0]  # scale factor that spreads the nodes out
        g.nodes[node]['y'] = 250 * xy[1]
    return g

In [61]:
def create_pyvis_html(cl: int, filename: str = "pyvis_coauthorships_graph.html"):
    """
    Build the graph for cluster ``cl`` and write it as a standalone pyvis HTML file.

    Calls ``create_nx_graph`` on the module-level ``dftriple`` frame, loads the
    result into a pyvis ``Network`` with physics disabled (node positions come
    from the x/y attributes set by ``create_nx_graph``), and saves the HTML.

    Parameters
    ----------
    cl : int
        Cluster number passed through to ``create_nx_graph``.
    filename : str
        Name of the HTML file. It is written to ``./tmp/<filename>``; if that
        fails with an OSError (for example the directory does not exist) it is
        written to ``<filename>`` in the current working directory instead.

    Returns
    -------
    pyvis.network.Network
        The populated network object.
    """
    g_nx = create_nx_graph(dftriple, cl)
    h = Network(height="1000px",
                width="100%",
                cdn_resources="remote",  # vis.js is loaded from a CDN, not bundled
                bgcolor="#222222",
                neighborhood_highlight=True,
                font_color="white",
                directed=False,
                filter_menu=True,
                notebook=False,
               )
    h.from_nx(g_nx, show_edge_weights=False)
    h.set_options(
    """
const options = {
  "interaction": {
    "navigationButtons": false
  },
 "physics": {
     "enabled": false
 },
  "edges": {
    "color": {
        "inherit": true
    },
    "setReferenceSize": null,
    "setReference": {
        "angle": 0.7853981633974483
    },
    "smooth": {
        "forceDirection": "none"
    }
  }
  }
    """
    )
    try:
        h.save_graph(f"./tmp/{filename}")
    except OSError:
        # ./tmp is not writable or missing: fall back to the working directory.
        h.save_graph(f"{filename}")
    return h

In [62]:
def get_time_series(dg, cl:int):
    """
    Select the rows of ``dg`` that belong to cluster ``cl`` and order them by date.

    The result keeps the 'cluster', 'probability' and 'publication_date'
    columns, adds a 'date' column holding 'publication_date' parsed with
    ``pd.to_datetime``, and is sorted ascending on 'date'. ``dg`` itself is
    not modified. No per-month aggregation or charting happens here; that is
    left to the caller.
    """
    kept_columns = ['cluster', 'probability', 'publication_date']
    in_cluster = dg['cluster'] == cl
    cluster_rows = dg.loc[in_cluster, kept_columns].copy()
    cluster_rows['date'] = pd.to_datetime(cluster_rows['publication_date'])
    return cluster_rows.sort_values('date')
    

In [63]:
import altair as alt
#alt.data_transformers.enable("data_server")

In [64]:
# List the columns currently present in dfinfo.
dfinfo.columns

Index(['x', 'y', 'id', 'title', 'doi', 'cluster', 'probability',
       'publication_date', 'grants', 'locations', 'keywords', 'top_concepts',
       'affil_list', 'author_list', 'wrapped_affil_list',
       'wrapped_author_list', 'wrapped_keywords', 'source', 'source_type',
       'funder_list', 'wrapped_funder_list'],
      dtype='object')

In [65]:
# Spot-check the source_type value of the row at position 56.
dfinfo['source_type'].iloc[56]

'journal'

In [66]:
# List the columns currently present in dftriple.
dftriple.columns

Index(['id', 'display_name', 'ror', 'country_code', 'type', 'lineage',
       'paper_id', 'paper_raw_affiliation_string', 'paper_author_position',
       'paper_doi', 'paper_title', 'paper_abstract', 'paper_publication_date',
       'paper_publication_year', 'paper_grants', 'paper_locations',
       'paper_is_corrresponding', 'paper_x', 'paper_y', 'paper_cluster',
       'paper_cluster_score', 'paper_author_id', 'paper_author_display_name',
       'paper_author_orcid', 'source', 'source_type', 'funder_list'],
      dtype='object')

In [67]:
# Inspect the raw primary_location record of the dfbig row at position 56.
dfbig['primary_location'].iloc[56]

{'is_oa': True,
 'landing_page_url': 'https://doi.org/10.1038/s41586-019-1666-5',
 'pdf_url': 'https://www.nature.com/articles/s41586-019-1666-5.pdf',
 'source': {'id': 'https://openalex.org/S137773608',
  'display_name': 'Nature',
  'issn_l': '0028-0836',
  'issn': ['1476-4687', '0028-0836'],
  'is_oa': False,
  'is_in_doaj': False,
  'host_organization': 'https://openalex.org/P4310319908',
  'host_organization_name': 'Nature Portfolio',
  'host_organization_lineage': ['https://openalex.org/P4310319908',
   'https://openalex.org/P4310319965'],
  'host_organization_lineage_names': ['Nature Portfolio', 'Springer Nature'],
  'type': 'journal'},
 'license': None,
 'version': 'publishedVersion',
 'is_accepted': True,
 'is_published': True}

In [68]:
def get_source_url(loc_list):
    """
    Return the landing page URL of the first location in ``loc_list``.

    ``loc_list`` is expected to be a list of location dicts; the value stored
    under "landing_page_url" of the first one is returned.

    Returns None when ``loc_list`` is empty or not indexable (for example
    None or NaN), or when the first entry has no "landing_page_url" key.
    """
    try:
        return loc_list[0]["landing_page_url"]
    except (IndexError, KeyError, TypeError):
        # Missing or malformed location data is treated as "no URL".
        return None

In [69]:
# Source id stored in the first location of the dftriple row at position 45.
dftriple['paper_locations'].iloc[45][0]['source']['id']

'https://openalex.org/S137773608'

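The location records do not contain the source's homepage, so we need to query OpenAlex through pyalex again, using the `source` → `id` value of the first location.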

In [70]:
# Fetch the Sources record for that source id through pyalex and read its homepage_url.
check_source = Sources()[dftriple['paper_locations'].iloc[45][0]['source']['id']]
check_source['homepage_url']

'https://www.nature.com/nature/'

In [71]:
def get_source_url(loc_list):
    """
    Return the homepage URL recorded for the source of the first location.

    Takes the first entry of ``loc_list``, reads its ``['source']['id']`` and
    looks that id up in the module-level ``source_homepage_dict``.

    Returns None when ``loc_list`` is empty or not indexable, when the first
    entry has no usable source id, or when the id is not in the dictionary.

    NOTE(review): ``source_homepage_dict`` must be defined before this is
    called. The only mapping built in the visible part of the notebook is
    ``source_page_dict``, keyed by display_name rather than by source id;
    confirm which one is intended. A missing dictionary now raises NameError
    instead of silently producing None for every row.
    """
    try:
        primary = loc_list[0]
        return source_homepage_dict[primary['source']['id']]
    except (IndexError, KeyError, TypeError):
        # Missing or malformed location data is treated as "no URL".
        return None

Get the first location entry (the one holding the `source` → `id` value) from every non-empty **paper_locations** list.

In [72]:
# First location entry of every row whose paper_locations is non-null and non-empty
# (one entry per row, so repeated sources are kept).
sources_list = [val[0] for val in dftriple['paper_locations'].dropna().tolist() if len(val) > 0]

In [73]:
# Number of entries in sources_list.
len(sources_list)

116134

In [76]:
def process_sources_list(sl:list):
    """
    Collect the distinct source display names found in a list of location dicts.

    For every entry ``v`` of ``sl`` the value ``v['source']['display_name']``
    is added to a set. Entries where that lookup is not possible (missing
    key, ``source`` is None, entry is not a dict, ...) are skipped.

    Returns the distinct display names as a list, in no particular order.
    """
    unique_display_names = set()
    for v in sl:
        try:
            unique_display_names.add(v['source']['display_name'])
        except (KeyError, TypeError):
            # Entry without a usable source record: ignore it.
            continue
    return list(unique_display_names)

In [77]:
# Rebinds sources_list: it now holds the unique values of the 'source' column,
# which the helpers below pass to OpenAlex as Sources display_name values.
sources_list = dftriple['source'].unique().tolist()
# This is not the last expression of the cell, so the tuple is computed but not displayed.
type(sources_list), len(sources_list)

def get_source_json(s:str):
    """
    Look up an OpenAlex source by display name and return its homepage URL.

    Queries ``Sources().filter(display_name=s)`` and inspects the first
    result only.

    Returns the "homepage_url" value of that result, or None when the key is
    absent or its value is empty.

    Raises IndexError when the query returns no results; errors raised by the
    request itself are not caught here.
    """
    matches = Sources().filter(display_name = s).get()
    # `or None` maps a missing key and an empty value to the same result.
    return matches[0].get("homepage_url") or None
    
def get_display_page_dict(sl:list):
    """
    Map Sources display names to their homepage URLs.

    Calls ``get_source_json`` once per entry of ``sl`` (with a tqdm progress
    bar) and stores the result under that entry. A name whose lookup raises
    an exception is left out of the mapping.

    Returns a dict display_name -> homepage_url (the value may be None).
    """
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_source_json(s)
        except Exception:
            # Best effort: skip names that cannot be resolved. KeyboardInterrupt
            # is deliberately not caught, so the loop can still be interrupted.
            continue
    return mapping_dict



Now we just want the mapping from these source `display_name` values to their `homepage_url`.

In [78]:
#source_homepage_dict

In [79]:
#dftriple['homepage_url'] = dftriple['paper_locations'].apply(get_source_url)

In [80]:
#dftriple.to_pickle('quantumdftriple2d.pkl')

In [81]:
#dftriple.columns

In [82]:
# Number of dftriple rows per source_type (missing values are not counted by default).
dftriple['source_type'].value_counts()

source_type
journal           86835
repository         5757
conference         4278
book series        2830
ebook platform      669
Name: count, dtype: int64

In [84]:
#dftriple[dftriple['source_type'] == 'conference'][['source','source_type','homepage_url','paper_cluster']].tail()

In [86]:
#dftriple['homepage_url'].value_counts()

In [90]:
#dfinfo["homepage_url"] = dfinfo["locations"].apply(get_source_url)

In [91]:
#dfinfo.to_pickle('quantumdfinfo2d.pkl')

In [92]:
# Inspect the full list of location records of the dftriple row at position 56.
dftriple['paper_locations'].iloc[56]

[{'is_oa': True,
  'landing_page_url': 'https://doi.org/10.1038/s41586-019-1666-5',
  'pdf_url': 'https://www.nature.com/articles/s41586-019-1666-5.pdf',
  'source': {'id': 'https://openalex.org/S137773608',
   'display_name': 'Nature',
   'issn_l': '0028-0836',
   'issn': ['1476-4687', '0028-0836'],
   'is_oa': False,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/P4310319908',
   'host_organization_name': 'Nature Portfolio',
   'host_organization_lineage': ['https://openalex.org/P4310319908',
    'https://openalex.org/P4310319965'],
   'host_organization_lineage_names': ['Nature Portfolio', 'Springer Nature'],
   'type': 'journal'},
  'license': None,
  'version': 'publishedVersion',
  'is_accepted': True,
  'is_published': True},
 {'is_oa': True,
  'landing_page_url': 'http://arxiv.org/abs/1910.11333',
  'pdf_url': 'http://arxiv.org/pdf/1910.11333',
  'source': {'id': 'https://openalex.org/S4306400194',
   'display_name': 'arXiv (Cornell University)',
   'issn

In [93]:
# Unique values of the 'source' column; they are passed to OpenAlex below as
# Sources display_name values.
sources_list = dftriple['source'].unique().tolist()
# This is not the last expression of the cell, so the tuple is computed but not displayed.
type(sources_list), len(sources_list)

def get_source_json(s:str):
    """
    Look up an OpenAlex source by display name and return its homepage URL.

    Queries ``Sources().filter(display_name=s)`` and inspects the first
    result only.

    Returns the "homepage_url" value of that result, or None when the key is
    absent or its value is empty.

    Raises IndexError when the query returns no results; errors raised by the
    request itself are not caught here.
    """
    matches = Sources().filter(display_name = s).get()
    # `or None` maps a missing key and an empty value to the same result.
    return matches[0].get("homepage_url") or None
    
    
def get_display_page_dict(sl:list):
    """
    Map Sources display names to their homepage URLs.

    Calls ``get_source_json`` once per entry of ``sl`` (with a tqdm progress
    bar) and stores the result under that entry. A name whose lookup raises
    an exception is left out of the mapping.

    Returns a dict display_name -> homepage_url (the value may be None).
    """
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_source_json(s)
        except Exception:
            # Best effort: skip names that cannot be resolved. KeyboardInterrupt
            # is deliberately not caught, so the loop can still be interrupted.
            continue
    return mapping_dict

# Build the display_name -> homepage_url mapping. This performs one lookup per
# source name; the recorded run took about 12 minutes for 1639 names.
source_page_dict = get_display_page_dict(sources_list)

import pickle

# Save the mapping to disk (path relative to the working directory),
# presumably so the slow lookup does not have to be repeated.
with open("source_page_dict.pkl", "wb") as f:
    pickle.dump(source_page_dict, f)

100%|█████████████████████████████████████████████████████████████████████| 1639/1639 [12:24<00:00,  2.20it/s]


In [94]:
# List the columns currently present in dftriple.
dftriple.columns

Index(['id', 'display_name', 'ror', 'country_code', 'type', 'lineage',
       'paper_id', 'paper_raw_affiliation_string', 'paper_author_position',
       'paper_doi', 'paper_title', 'paper_abstract', 'paper_publication_date',
       'paper_publication_year', 'paper_grants', 'paper_locations',
       'paper_is_corrresponding', 'paper_x', 'paper_y', 'paper_cluster',
       'paper_cluster_score', 'paper_author_id', 'paper_author_display_name',
       'paper_author_orcid', 'source', 'source_type', 'funder_list'],
      dtype='object')

In [95]:
# Peek at the first ten affiliation display names.
dftriple['display_name'].head(10)

0           Google (United States)
1    Quantum Group (United States)
2           Google (United States)
3    Quantum Group (United States)
4           Google (United States)
5    Quantum Group (United States)
6           Google (United States)
7    Quantum Group (United States)
8           Google (United States)
9    Quantum Group (United States)
Name: display_name, dtype: object

In [96]:
# Unique affiliation display names, used below as Institutions display_name filters.
affils_list = dftriple['display_name'].unique().tolist()
type(affils_list), len(affils_list)

(list, 4645)

In [98]:
# Trial query: fetch the Institutions results for the second affiliation name
# so the structure of the returned records can be inspected.
affil_json = Institutions().filter(display_name = affils_list[1]).get()

In [102]:
def get_affil_json(s:str):
    """
    Look up an OpenAlex institution by display name and return its coordinates.

    Queries ``Institutions().filter(display_name=s)`` and reads the "geo"
    entry of the first result only.

    Returns a ``(latitude, longitude)`` tuple. ``(None, None)`` is returned
    when the first result has no "geo" entry or the entry is empty; a
    coordinate missing from the entry is returned as None.

    Raises IndexError when the query returns no results; errors raised by the
    request itself are not caught here.
    """
    affil_json = Institutions().filter(display_name = s).get()
    geo = affil_json[0].get("geo")
    if not geo:
        # Covers both a missing "geo" key and "geo": None.
        return None, None
    return geo.get("latitude"), geo.get("longitude")

In [101]:
# Inspect the "geo" entry of the first record in the module-level affil_json
# (the result of the trial Institutions query made earlier in the notebook).
affil_json[0]["geo"]

{'city': 'San Diego',
 'geonames_city_id': '5391811',
 'region': 'California',
 'country_code': 'US',
 'country': 'United States',
 'latitude': 32.885925,
 'longitude': -117.17643}

In [103]:
def get_display_geo_dict(sl:list):
    """
    Map Institution display names to their coordinates.

    Calls ``get_affil_json`` once per entry of ``sl`` (with a tqdm progress
    bar) and stores the returned (latitude, longitude) tuple under that
    entry. A name whose lookup raises an exception is left out of the mapping.

    Returns a dict display_name -> (latitude, longitude).
    """
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_affil_json(s)
        except Exception:
            # Best effort: skip names that cannot be resolved. KeyboardInterrupt
            # is deliberately not caught, so the loop can still be interrupted.
            continue
    return mapping_dict

In [104]:
# Build the display_name -> (latitude, longitude) mapping. This performs one lookup
# per affiliation name; the recorded run took about 36 minutes for 4645 names.
affil_geo_dict = get_display_geo_dict(affils_list)

100%|█████████████████████████████████████████████████████████████████████| 4645/4645 [36:00<00:00,  2.15it/s]


In [105]:
import pickle

# Save the affiliation -> coordinates mapping to disk (path relative to the working directory).
with open("affil_geo_dict.pkl", "wb") as f:
    pickle.dump(affil_geo_dict, f)