In [2]:
from pyalex import (
    Works, Authors, Sources,
    Institutions, Concepts, Publishers, Funders
)
import pyalex
import pandas as pd
import numpy as np
pyalex.config.email = "david@rs21.io"

from flair.embeddings import DocumentPoolEmbeddings
from flair.data import Sentence
from flair.embeddings import SentenceTransformerDocumentEmbeddings

# Names of the two sentence-transformer models loaded below.
EMBEDDING_MODEL_1 = "all-mpnet-base-v2" 

# this one is also good: all-MiniLM-L6-v2
EMBEDDING_MODEL_2 = "all-MiniLM-L6-v2"
# Both models are instantiated here, but only SENT_EMBEDDINGS_2 is pooled into
# DOC_EMBEDDINGS. NOTE(review): SENT_EMBEDDINGS_1 looks unused in the visible
# cells — confirm before removing it to save load time and memory.
SENT_EMBEDDINGS_1 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_1)
SENT_EMBEDDINGS_2 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_2)
DOC_EMBEDDINGS= DocumentPoolEmbeddings([SENT_EMBEDDINGS_2])

import torch
from tqdm import tqdm
import yake
import umap.umap_ as umap
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture as GMM
import altair as alt
import math
import plotly.express as px
import textwrap

In [3]:
# Pipe-separated alternatives: any OpenAlex concept whose display name matches
# one of these phrases is returned.
search_term = 'quantum computation|quantum computers|quantum sensors|quantum cryptography'
quantum_concepts = (
    Concepts()
    .search_filter(display_name=search_term)
    .get()
)

In [4]:
# (OpenAlex id, display name) pair for every matched concept.
concepts = [
    (concept['id'], concept['display_name'])
    for concept in quantum_concepts
]
concepts

[('https://openalex.org/C58053490', 'Quantum computer'),
 ('https://openalex.org/C144901912', 'Quantum cryptography'),
 ('https://openalex.org/C89143813', 'Quantum sensor'),
 ('https://openalex.org/C187483380', 'Topological quantum computer'),
 ('https://openalex.org/C192353077', 'Adiabatic quantum computation'),
 ('https://openalex.org/C183968085', 'Trapped ion quantum computer'),
 ('https://openalex.org/C108277079', 'Post-quantum cryptography'),
 ('https://openalex.org/C19271316', 'One-way quantum computer')]

In [5]:
def process_works_list(worklist:list):
    """
    Transform one page of work records into a dataframe.

    Each record's ``"abstract"`` value is looked up per work id and stored in
    an ``abstract`` column; the ``abstract_inverted_index`` column is dropped
    when it is present.

    Parameters
    ----------
    worklist : list
        Dict-like work records exposing at least ``"id"`` and ``"abstract"``.

    Returns
    -------
    pd.DataFrame
        One row per record. An empty dataframe when ``worklist`` is empty.
    """
    # An empty page has no 'id' / 'abstract_inverted_index' columns, so the
    # column operations below would raise KeyError.
    if not worklist:
        return pd.DataFrame()
    abstracts_by_id = {work["id"]: work["abstract"] for work in worklist}
    df = pd.DataFrame.from_records(worklist)
    # Tolerate pages whose records carry no inverted-index field.
    df = df.drop(columns=['abstract_inverted_index'], errors='ignore')
    df['abstract'] = df['id'].map(abstracts_by_id)
    return df

In [6]:
# Print how many works OpenAlex attributes to each matched concept.
for concept in quantum_concepts:
    print(concept['id'], concept['works_count'])


https://openalex.org/C58053490 56517
https://openalex.org/C144901912 10518
https://openalex.org/C89143813 7686
https://openalex.org/C187483380 3277
https://openalex.org/C192353077 1961
https://openalex.org/C183968085 1150
https://openalex.org/C108277079 1217
https://openalex.org/C19271316 1525


In [7]:
def get_concept_frame(concepts_list:list, i:int):
    """
    Download every work published after 2016 that is tagged with one concept.

    Parameters
    ----------
    concepts_list : list
        Concept records (as returned by ``Concepts().get()``); only the
        ``'id'`` of the selected entry is used.
    i : int
        Index into ``concepts_list`` of the concept to fetch.

    Returns
    -------
    pd.DataFrame
        One row per work, de-duplicated on ``'id'`` (first occurrence kept).
        Empty when the query yields no pages.
    """
    # To restrict the query to one country, chain a further filter before
    # paginate, e.g.
    #   .filter(authorships={"institutions": {"country_code": "CN"}})
    pager = (
        Works()
        .filter(publication_year='>2016',
                concepts={"id": f"{concepts_list[i]['id']}"})
        .paginate(per_page=200, n_max=None)
    )
    # Collect the pages and concatenate once: growing a dataframe inside the
    # loop re-copies all previously fetched rows on every page.
    page_frames = [process_works_list(page) for page in tqdm(pager)]
    if not page_frames:
        return pd.DataFrame()
    df = pd.concat(page_frames, ignore_index=True)
    if df.empty:
        return df
    return df.drop_duplicates(subset='id', keep='first')

In [8]:
# Fetch one dataframe of works per matched concept. This pages through the
# OpenAlex API; the recorded run below took several minutes in total.
frames_list = []
for i in range(len(quantum_concepts)):
    df = get_concept_frame(quantum_concepts, i)
    frames_list.append(df)

152it [04:16,  1.68s/it]
22it [00:34,  1.55s/it]
22it [00:37,  1.69s/it]
6it [00:09,  1.51s/it]
4it [00:05,  1.47s/it]
2it [00:03,  1.94s/it]
4it [00:05,  1.38s/it]
3it [00:05,  1.73s/it]


In [9]:
# Stack the per-concept frames, keep each work once (a work can be tagged with
# several of the concepts) and index by work id while keeping 'id' as a column.
dftop = (
    pd.concat(frames_list, ignore_index=True)
    .drop_duplicates(subset='id', keep='first')
    .set_index('id', drop=False)
)

# dfall is the same object as dftop, not a copy.
dfall = dftop
print(dfall.shape)

# Text that gets embedded: title followed by abstract. The result is NaN when
# either part is missing.
dfall['content'] = dfall['title'] + ". " + dfall['abstract']

# Only works that actually have content go forward.
dfrecords = dfall[dfall['content'].notna()].copy()

(37811, 39)


In [10]:
def get_keywords(text:str, top:int=7, stopwords=None):
    """
    Extract keywords from a blob of text with YAKE.

    A new ``yake.KeywordExtractor`` is built on every call with the given
    ``top`` and ``stopwords``. Returns a list holding the first element of
    each entry that the extractor yields for ``text``.
    """
    extractor = yake.KeywordExtractor(top=top, stopwords=stopwords)
    found = []
    for candidate in extractor.extract_keywords(text):
        found.append(candidate[0])
    return found

In [11]:
def get_top_concepts(concept_list:list,score:float=.6):
    """
    Return the display names of the concepts scoring at least ``score``.

    Parameters
    ----------
    concept_list : list
        Concept dictionaries, each with ``'display_name'`` and ``'score'``.
        ``None`` or a float (e.g. a pandas NaN placeholder) is treated as
        "no concepts".
    score : float
        Inclusive lower bound on a concept's ``'score'``.

    Returns
    -------
    list
        Display names in their original order; concepts without a score are
        skipped.
    """
    # Missing concept data arrives as None or a float NaN rather than a list.
    if concept_list is None or isinstance(concept_list, float):
        return []
    return [
        c['display_name']
        for c in concept_list
        if c.get('score') is not None and c['score'] >= score
    ]

In [12]:
# Per-paper YAKE keywords (default top=7) from the title+abstract text, and the
# names of the attached concepts scoring at least the default 0.6.
dfrecords['keywords'] = dfrecords['content'].apply(get_keywords)
dfrecords['top_concepts'] = dfrecords['concepts'].apply(get_top_concepts)

In [13]:
# Lower-cased content strings. NOTE(review): get_content_embeddings rebuilds
# this same list internally, so `texts` looks unused in the visible cells —
# confirm before removing.
texts = dfrecords['content'].str.lower().values.tolist()
#dfrecords.to_csv('jamming.csv')

In [14]:
def get_content_embeddings(dfrecords:pd.DataFrame) -> pd.DataFrame:
    """
    Embed the lower-cased ``content`` text of every row with DOC_EMBEDDINGS.

    Returns a dataframe with one row per input row (sharing the index of
    ``dfrecords``) and one column per embedding dimension.
    """
    # Embed a throwaway sentence first, only to learn the embedding width.
    probe = Sentence("The grass is green.")
    DOC_EMBEDDINGS.embed(probe)
    embedding_dim = len(probe.embedding)

    lowered = dfrecords["content"].str.lower().values.tolist()
    vectors = np.empty((len(lowered), embedding_dim))
    for row_num, text in enumerate(tqdm(lowered)):
        sentence = Sentence(text)
        DOC_EMBEDDINGS.embed(sentence)
        # Move the tensor to host memory before storing it in the numpy array.
        vectors[row_num, :] = sentence.embedding.cpu().numpy()
        torch.cuda.empty_cache()
    return pd.DataFrame.from_records(vectors, index=dfrecords.index)

In [15]:
# Embeds every record one at a time; the recorded run below took roughly
# seven and a half minutes for 32,624 records.
dfcontentvectors = get_content_embeddings(dfrecords)


100%|███████████████████████████████████████████████████████████████████| 32624/32624 [07:34<00:00, 71.81it/s]


In [16]:
# Two output dimensions so the projection can be plotted directly.
N_COMPONENTS = 2 # can visualize this way
umap_reducer = umap.UMAP(n_components=N_COMPONENTS,
                       #  metric='euclidean')
                         random_state=1234,
                         metric='cosine')  # the metric and other parameters are open to experimentation
# To place other literature into the same information space later, keep this
# fitted umap_reducer object as well as the clustering model fitted on its
# output (HDBSCAN in the next cell).

# Fit UMAP on the embedding vectors and keep the 2-D coordinates, indexed by
# the same work ids as dfcontentvectors.
reduced_vectors = umap_reducer.fit_transform(dfcontentvectors.to_numpy())
dfreduced = pd.DataFrame.from_records(reduced_vectors, 
                index=dfcontentvectors.index)
dfreduced.columns = ['x','y']

## Use HDBSCAN to cluster the 2-D UMAP projection

In [17]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
import hdbscan

# Clustering parameters, applied to the 2-D UMAP coordinates (not to the
# original embedding vectors).
hdbscan_args = {'min_cluster_size': 20,
                            'metric': 'euclidean',
                            'cluster_selection_method': 'eom',
                            'cluster_selection_epsilon': 0.1
               }

cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(dfreduced[['x','y']].to_numpy())

# Per-paper cluster label and membership strength. In the recorded output,
# papers labelled -1 carry probability 0.0.
dfreduced['cluster'] = cluster.labels_
dfreduced['probability'] = cluster.probabilities_

# Both frames are indexed by work id, so this attaches x, y, cluster and
# probability to each paper record.
dfpapers = dfrecords.merge(dfreduced, left_index=True,
                           right_index=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [18]:
# 'id' exists both as the index and as a column (set_index(..., drop=False)
# earlier), so the column is removed to let reset_index() restore it.
# NOTE: this mutates dfpapers, so re-running the cell on its own raises
# KeyError.
del dfpapers['id']
dfstart = dfpapers.reset_index()
dfstart.head()

Unnamed: 0,id,doi,title,display_name,publication_year,publication_date,ids,language,primary_location,type,...,created_date,abstract,is_authors_truncated,content,keywords,top_concepts,x,y,cluster,probability
0,https://openalex.org/W3101479050,https://doi.org/10.1038/s41586-019-1666-5,Quantum supremacy using a programmable superco...,Quantum supremacy using a programmable superco...,2019,2019-10-23,{'openalex': 'https://openalex.org/W3101479050...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2020-11-23,The promise of quantum computers is that certa...,,Quantum supremacy using a programmable superco...,"[programmable superconducting processor, Quant...","[Quantum computer, Qubit, Computer science]",1.411983,4.571363,-1,0.0
1,https://openalex.org/W2781738013,https://doi.org/10.22331/q-2018-08-06-79,Quantum Computing in the NISQ era and beyond,Quantum Computing in the NISQ era and beyond,2018,2018-08-06,{'openalex': 'https://openalex.org/W2781738013...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2018-01-12,Noisy Intermediate-Scale Quantum (NISQ) techno...,,Quantum Computing in the NISQ era and beyond. ...,"[NISQ era, Quantum, NISQ, Quantum Computing, q...","[Quantum computer, Computer science]",0.957508,4.270126,-1,0.0
2,https://openalex.org/W2559394418,https://doi.org/10.1038/nature23474,Quantum machine learning,Quantum machine learning,2017,2017-09-01,{'openalex': 'https://openalex.org/W2559394418...,en,"{'is_oa': False, 'landing_page_url': 'https://...",article,...,2016-12-08,Fuelled by increasing computer power and algor...,,Quantum machine learning. Fuelled by increasin...,"[machine learning, Quantum machine learning, m...","[Quantum machine learning, Computer science, Q...",5.271276,3.29148,95,1.0
3,https://openalex.org/W3037447387,https://doi.org/10.1103/revmodphys.89.035002,Quantum sensing,Quantum sensing,2017,2017-07-25,{'openalex': 'https://openalex.org/W3037447387...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2020-07-02,"""Quantum sensing"" describes the use of a quant...",,"Quantum sensing. ""Quantum sensing"" describes t...","[Quantum sensing, Quantum, sensing, quantum sy...","[Physics, Quantum sensor, Quantum technology, ...",-3.297313,6.944196,118,1.0
4,https://openalex.org/W2755255888,https://doi.org/10.1038/nature23879,Hardware-efficient variational quantum eigenso...,Hardware-efficient variational quantum eigenso...,2017,2017-09-01,{'openalex': 'https://openalex.org/W2755255888...,en,"{'is_oa': False, 'landing_page_url': 'https://...",article,...,2017-09-25,Quantum computers can be used to address molec...,,Hardware-efficient variational quantum eigenso...,"[Monte Carlo methods, quantum, Hardware-effici...","[Quantum computer, Pauli exclusion principle]",2.661423,6.710498,-1,0.0


In [19]:
dfstart.shape

(32624, 46)

In [20]:
# One row per (paper, authorship): each entry of the 'authorships' list gets
# its own row, with the paper-level columns repeated.
dfbig = dfstart.explode(column='authorships')
dfbig.shape, dfstart.shape

((144232, 46), (32624, 46))

In [21]:
def add_extra_to_authorships(row: pd.DataFrame):
    """
    Copy paper-level fields from ``row`` into its authorship dictionary.

    ``row`` is a single record (one row when used with ``apply(axis=1)``)
    whose ``"authorships"`` entry is expected to be a dict. The dict is
    updated IN PLACE with the paper's id, coordinates, cluster, cluster score
    (taken from ``"probability"``), title, abstract, doi, dates, grants and
    locations, and the same dict object is returned. When the entry is not a
    plain dict it is returned unchanged.
    """
    authorship = row["authorships"]
    if type(authorship) != dict:
        return row["authorships"]

    # (key written into the authorship dict, column read from the row)
    paper_fields = (
        ("id", "id"),
        ("x", "x"),
        ("y", "y"),
        ("cluster", "cluster"),
        ("cluster_score", "probability"),
        ("title", "title"),
        ("abstract", "abstract"),
        ("doi", "doi"),
        ("publication_date", "publication_date"),
        ("publication_year", "publication_year"),
        ("grants", "grants"),
        ("locations", "locations"),
    )
    for target_key, source_col in paper_fields:
        authorship[target_key] = row[source_col]
    return authorship

In [22]:
# add_extra_to_authorships updates each authorship dict in place and returns
# that same dict, so after this call the 'authorships' column holds the
# enriched dicts as well.
dfbig['big_authorships'] = dfbig.apply(add_extra_to_authorships, axis=1)

In [23]:
# Reads 'authorships' rather than 'big_authorships'. The paper-level keys are
# present only because the previous cell mutated these dicts in place.
bigvals = dfbig['authorships'].tolist()

In [24]:
# Drop float entries. Presumably these are the NaN placeholders left for
# works without any authorship — TODO confirm.
dictvals = [c for c in bigvals if type(c) != float]

In [25]:
# Flatten the enriched authorship dicts to one row per
# (paper, author, institution). Institution fields keep their own names
# (id, display_name, country_code, type, ...); authorship and paper fields are
# carried as meta columns prefixed with 'paper_'.
# The meta key is 'is_corresponding': the earlier misspelling was silently
# turned into an all-NaN column by errors='ignore'.
dftriple = pd.json_normalize(dictvals,
                  record_path=['institutions'],
                  meta=['id','raw_affiliation_string','author_position', 'doi',
                        'title','abstract','publication_date', 'publication_year',
                        'grants','locations',
                        'is_corresponding','x','y','cluster','cluster_score',
                       ['author','id'], ['author', 'display_name'],
                       ['author','orcid']],
                  errors='ignore',  # leave NaN where an authorship lacks a meta key
                  sep='_',
                  meta_prefix='paper_',
                 )

In [26]:
# Mean embedding vector per cluster, projected into the 2-D space with the
# already-fitted UMAP reducer.
dftopics = dfcontentvectors.copy()
dftopics['cluster'] = dfpapers['cluster']  # aligned on the shared work-id index
dfmeantopics = dftopics.groupby('cluster').mean().copy()
reduced_topics = umap_reducer.transform(dfmeantopics.to_numpy())
df_reduced_topics = pd.DataFrame.from_records(reduced_topics, 
                index=dfmeantopics.index)
df_reduced_topics.columns = ['x','y']
df_reduced_topics['topic'] = df_reduced_topics.index
# Not the last expression of the cell, so this head() is never displayed.
df_reduced_topics.head()

def get_cluster_concepts(topic_num:int, n:int=20):
    """
    Return the ``n`` most frequent concept names among one cluster's papers.

    Reads the module-level ``dfpapers`` frame: rows whose ``cluster`` equals
    ``topic_num`` contribute their ``top_concepts`` lists. Names are ordered
    by descending count; ties keep first-seen order.
    """
    from collections import Counter

    concept_lists = dfpapers[dfpapers['cluster'] == topic_num]['top_concepts'].tolist()
    # Single-pass counting. Counter keeps first-seen order, so the stable sort
    # below breaks ties exactly as a first-occurrence dict would.
    counts = Counter(name for sublist in concept_lists for name in sublist)
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    return [name for name, _ in ranked][:n]

def get_yake_cluster_phrases(topic_num:int, n:int=20):
    """
    Extract up to ``n`` YAKE keyphrases describing one cluster.

    Reads the module-level ``dfpapers`` frame: the ``content`` of every paper
    whose ``cluster`` equals ``topic_num`` is joined with ". " into one
    document, which is passed to a ``yake.KeywordExtractor`` built with
    ``top=n`` and no custom stopwords. Returns the phrases only.
    """
    in_cluster = dfpapers['cluster'] == topic_num
    cluster_text = ". ".join(dfpapers[in_cluster]['content'].tolist())
    extractor = yake.KeywordExtractor(top=n, stopwords=None)
    phrases = []
    for candidate in extractor.extract_keywords(cluster_text):
        phrases.append(candidate[0])
    return phrases

# Per-cluster descriptors, indexed by cluster label.
wikiconcepts = df_reduced_topics['topic'].apply(get_cluster_concepts)

wikikeywords = df_reduced_topics['topic'].apply(get_yake_cluster_phrases)

# Restore the 'id' column that an earlier cell deleted from dfpapers.
dfpapers['id'] = dfpapers.index
# NOTE: dfinfo is rebuilt (with 'probability' added) in a later cell.
dfinfo = dfpapers[['x','y','id','title','doi','cluster','grants',
                   'locations',
                 'publication_date','keywords','top_concepts']].copy()

# Centroid = mean 2-D position of each cluster's papers; concepts and keywords
# are attached by aligning on the cluster-label index.
centroids = dfinfo.groupby('cluster')[['x','y']].mean().copy()
centroids['concepts'] = wikiconcepts
centroids['cluster'] = centroids.index
centroids['keywords'] = wikikeywords

In [27]:
def wrap_it(x):
    """
    Wrap the string ``x`` at 40 characters, joining the lines with "<br>".
    """
    wrapped_lines = textwrap.wrap(x, width=40)
    return "<br>".join(wrapped_lines)

In [28]:
centroids['wrapped_keywords'] = centroids['keywords'].apply(str).apply(wrap_it)
centroids['wrapped_concepts'] = centroids['concepts'].apply(str).apply(wrap_it)

In [29]:
centroids.to_pickle('quantumcentroids2d.pkl')

In [30]:
dftriple.to_pickle('quantumdftriple2d.pkl')


In [31]:
def get_affils_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    Rank institutions within one cluster by summed cluster score.

    Restricts ``dc`` to rows whose ``paper_cluster`` equals ``cl``, prints
    ``cl``, groups by institution (id, display_name, country_code, type) and
    sums ``paper_cluster_score``, sorted descending. Also looks up the
    cluster's keywords in the module-level ``centroids`` frame.

    Returns ``(dv, kw)``: the ranked one-column frame and the keyword list.
    Raises IndexError when ``cl`` has no row in ``centroids``.
    """
    in_cluster = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    grouping_cols = ['id','display_name','country_code','type']
    dv = in_cluster.groupby(grouping_cols)['paper_cluster_score'].sum().to_frame()
    dv = dv.sort_values('paper_cluster_score', ascending=False)
    cluster_row = centroids[centroids.cluster == cl]
    kw = cluster_row['keywords'].iloc[0]
    return dv, kw

In [32]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 1)
print(kw84)
dv84.head(10)

1
['Quantum Computer Music', 'quantum', 'quantum computing', 'Quantum Computer', 'Computer Music', 'Music', 'quantum computer hardware', 'computer', 'quantum music', 'quantum music generation', 'computing', 'Music Quantum Computing', 'music composition', 'computers', 'quantum algorithms', 'algorithmic computer music', 'quantum hardware', 'quantum computer algorithm', 'quantum computing tools', 'hardware quantum computers']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I897542642,University of Plymouth,GB,education,11.0
https://openalex.org/I36234482,University of Bristol,GB,education,2.0
https://openalex.org/I4210161128,Bristol Robotics Laboratory,GB,facility,2.0
https://openalex.org/I51601045,University of Bath,GB,education,1.0
https://openalex.org/I106645853,Changchun University of Science and Technology,CN,education,0.406836
https://openalex.org/I142608572,Prince Sattam Bin Abdulaziz University,SA,education,0.135612


In [34]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 2)
print(kw84)
dv84.head(10)

2
['Quantum Natural Language', 'natural language processing', 'Natural Language', 'Quantum', 'language processing', 'Quantum Natural', 'Quantum Language Processing', 'Language', 'quantum computing', 'quantum hardware', 'quantum circuits', 'NLP', 'Natural', 'quantum NLP', 'language processing tasks', 'quantum computers', 'processing', 'Near-Term Quantum Natural', 'Quantum NLP models', 'Noisy Intermediate-Scale Quantum']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I875944469,Koneru Lakshmaiah Education Foundation,IN,education,6.0
https://openalex.org/I193662353,Utrecht University,NL,education,4.0
https://openalex.org/I40120149,University of Oxford,GB,education,4.0
https://openalex.org/I150229711,University of Electronic Science and Technology of China,CN,education,3.0
https://openalex.org/I188760350,Ollscoil na Gaillimhe – University of Galway,IE,education,3.0
https://openalex.org/I2802744477,Irish Centre for High-End Computing,IE,other,3.0
https://openalex.org/I3005160176,Institute for High Performance Computing and Networking,IT,facility,3.0
https://openalex.org/I4210155236,National Research Council,IT,nonprofit,3.0
https://openalex.org/I184999862,University of Salamanca,ES,education,2.0
https://openalex.org/I1341412227,IBM (United States),US,company,1.0


In [36]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 10)
print(kw84)
dv84.head(20)

10
['quantum image processing', 'quantum image', 'quantum image representation', 'image processing', 'Quantum', 'image', 'image processing algorithms', 'quantum information processing', 'quantum computing', 'image representation', 'classical image processing', 'images', 'Quantum Image Algorithm', 'NEQR quantum images', 'Quantum color image', 'quantum image scaling', 'quantum Boolean image', 'quantum algorithm', 'quantum image circuit', 'quantum circuits']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I29739308,Guangxi Normal University,CN,education,8.750289
https://openalex.org/I153230381,Charles Sturt University,AU,education,8.064364
https://openalex.org/I3019415892,Fraunhofer Institute for Industrial Mathematics,DE,facility,6.621495
https://openalex.org/I4210144143,Inspur (China),CN,company,6.0
https://openalex.org/I44468530,Qingdao University of Technology,CN,education,5.94068
https://openalex.org/I143413998,Qingdao University of Science and Technology,CN,education,5.94068
https://openalex.org/I863896202,Delhi Technological University,IN,education,4.998127
https://openalex.org/I4210128053,Institute of Remote Sensing and Digital Earth,CN,facility,4.888538
https://openalex.org/I106645853,Changchun University of Science and Technology,CN,education,3.753255
https://openalex.org/I137534880,Southern Federal University,RU,education,3.726289


In [37]:
# Rebuild dfinfo from dfpapers, replacing the version created in the centroid
# cell; this selection additionally carries 'probability'.
dfinfo = dfpapers[['x','y','id','title','doi','cluster','probability',
                 'publication_date','grants','locations',
                   'keywords','top_concepts']].copy()

In [38]:
# Despite the name this is a Series indexed by paper_id: the raw affiliation
# strings of every (author, institution) row of a paper joined with ' | '.
# NOTE(review): the 'affil_list' column built from it is overwritten by a
# list-valued version a few cells later, so this looks redundant — confirm.
pap_affils_dict = dftriple.groupby('paper_id')['paper_raw_affiliation_string'].\
apply(lambda x: ' | '.join(x.tolist()))

#pap_authors_dict = dftriple.groupby('paper_id')['paper_author_display_name'].\
#apply(lambda x: ' | '.join(x.tolist()))

In [39]:
pap_authors_dict = dftriple.groupby('paper_id')['paper_author_display_name'].apply(lambda x: x.values)

In [40]:
dfinfo['affil_list'] = pap_affils_dict
dfinfo['author_list'] = pap_authors_dict

In [41]:
# Raw affiliation strings per paper as a list, aligned to dfinfo by paper id.
dfinfo['affil_list'] = (
    dftriple
    .groupby('paper_id')['paper_raw_affiliation_string']
    .apply(lambda affils: affils.tolist())
)

In [42]:
# Author display names per paper as a list, aligned to dfinfo by paper id.
dfinfo['author_list'] = (
    dftriple
    .groupby('paper_id')['paper_author_display_name']
    .apply(lambda names: names.tolist())
)

In [43]:
dfinfo['wrapped_affil_list'] = dfinfo['affil_list'].apply(str).apply(wrap_it)
dfinfo['wrapped_author_list'] = dfinfo['author_list'].apply(str).apply(wrap_it)

In [44]:
dfinfo['wrapped_keywords'] = dfinfo['keywords'].apply(str).apply(wrap_it)

In [45]:
def get_source_name(loc_list):
    """
    Return the source display name of the first location in ``loc_list``.

    Returns ``None`` when the list is empty or missing (None/NaN), or when
    the first location has no usable ``"source"`` / ``"display_name"``.
    """
    try:
        primary = loc_list[0]
        return primary["source"]["display_name"]
    # Only lookup failures mean "no source". A bare except would also swallow
    # KeyboardInterrupt and SystemExit.
    except (IndexError, KeyError, TypeError):
        return None

def get_source_type(loc_list):
    """
    Return the source type of the first location in ``loc_list``.

    Returns ``None`` when the list is empty or missing (None/NaN), or when
    the first location has no usable ``"source"`` / ``"type"``.
    """
    try:
        primary = loc_list[0]
        return primary["source"]["type"]
    # Only lookup failures mean "no source". A bare except would also swallow
    # KeyboardInterrupt and SystemExit.
    except (IndexError, KeyError, TypeError):
        return None
    
    

In [46]:
dfinfo["source"] = dfinfo["locations"].apply(get_source_name)
dfinfo["source_type"] = dfinfo["locations"].apply(get_source_type)

In [47]:
dfinfo.to_pickle('quantumdfinfo2d.pkl')

In [48]:
def get_funder_names(funder_list):
    """
    Return the unique ``funder_display_name`` values of a grants list.

    Parameters
    ----------
    funder_list : list
        Grant dictionaries, each expected to carry ``'funder_display_name'``.

    Returns
    -------
    list
        Unique names in first-seen order. ``[]`` when ``funder_list`` is
        missing (None/NaN) or any entry lacks the key.
    """
    try:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is identical from run to run (set ordering of strings is not).
        return list(dict.fromkeys(f['funder_display_name'] for f in funder_list))
    # Only lookup failures mean "no funders"; do not swallow interrupts.
    except (KeyError, TypeError):
        return []

In [49]:
# Per-row publication venue (name and type of the paper's first location) and
# the paper's unique funder names.
dftriple["source"] = dftriple["paper_locations"].apply(get_source_name)
dftriple["source_type"] = dftriple["paper_locations"].apply(get_source_type)
dftriple["funder_list"] = dftriple["paper_grants"].apply(get_funder_names)

In [50]:
dftriple.to_pickle('quantumdftriple2d.pkl')

In [51]:
def get_journals_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    Rank journals within one cluster by summed cluster score.

    Restricts ``dc`` to rows whose ``paper_cluster`` equals ``cl``, prints
    ``cl``, keeps rows with ``source_type == 'journal'``, groups by ``source``
    and sums ``paper_cluster_score``, sorted descending. Also looks up the
    cluster's keywords in the module-level ``centroids`` frame.

    Returns ``(dv, kw)``: the ranked one-column frame and the keyword list.

    NOTE(review): the sum runs over every row passed in. If ``dc`` has several
    rows per paper, a paper contributes once per row — confirm that is
    intended.
    """
    in_cluster = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    journal_rows = in_cluster[in_cluster['source_type'] == 'journal']
    dv = journal_rows.groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv = dv.sort_values('paper_cluster_score', ascending=False)
    cluster_row = centroids[centroids.cluster == cl]
    kw = cluster_row['keywords'].iloc[0]
    return dv, kw

In [52]:
def get_conferences_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    Rank conferences within one cluster by summed cluster score.

    Restricts ``dc`` to rows whose ``paper_cluster`` equals ``cl``, prints
    ``cl``, keeps rows with ``source_type == 'conference'``, groups by
    ``source`` and sums ``paper_cluster_score``, sorted descending. Also looks
    up the cluster's keywords in the module-level ``centroids`` frame.

    Returns ``(dv, kw)``: the ranked one-column frame and the keyword list.
    """
    in_cluster = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    conference_rows = in_cluster[in_cluster['source_type'] == 'conference']
    dv = conference_rows.groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv = dv.sort_values('paper_cluster_score', ascending=False)
    cluster_row = centroids[centroids.cluster == cl]
    kw = cluster_row['keywords'].iloc[0]
    return dv, kw

In [53]:
dv84, kw84 = get_journals_cluster_sort(dftriple, 1)
print(kw84)
dv84.head(10)

1
['Quantum Computer Music', 'quantum', 'quantum computing', 'Quantum Computer', 'Computer Music', 'Music', 'quantum computer hardware', 'computer', 'quantum music', 'quantum music generation', 'computing', 'Music Quantum Computing', 'music composition', 'computers', 'quantum algorithms', 'algorithmic computer music', 'quantum hardware', 'quantum computer algorithm', 'quantum computing tools', 'hardware quantum computers']


Unnamed: 0_level_0,paper_cluster_score
source,Unnamed: 1_level_1
Applied sciences,5.0
Journal of New Music Research,1.0
Muzikologija,1.0
Theoretical Computer Science,0.542447


In [54]:
dv84, kw84 = get_conferences_cluster_sort(dftriple, 1)
print(kw84)
dv84.head(10)

1
['Quantum Computer Music', 'quantum', 'quantum computing', 'Quantum Computer', 'Computer Music', 'Music', 'quantum computer hardware', 'computer', 'quantum music', 'quantum music generation', 'computing', 'Music Quantum Computing', 'music composition', 'computers', 'quantum algorithms', 'algorithmic computer music', 'quantum hardware', 'quantum computer algorithm', 'quantum computing tools', 'hardware quantum computers']


Unnamed: 0_level_0,paper_cluster_score
source,Unnamed: 1_level_1


In [55]:
def get_country_collaborations_sort(dc:pd.DataFrame, cl:int):
    """
    List the papers of one cluster whose institutions span several countries.

    Restricts ``dc`` to rows whose ``paper_cluster`` equals ``cl`` and, per
    ``paper_id``, collects the distinct non-missing ``country_code`` values.
    The matching rows of the module-level ``dfinfo`` frame are returned with
    two extra columns, ``country_count`` and ``collab_countries``, sorted by
    ``country_count`` descending and limited to papers with more than one
    country.
    """
    in_cluster = dc[dc['paper_cluster'] == cl]
    # Missing country codes are dropped first: a None/NaN entry would
    # otherwise be counted as a country and flag single-country papers.
    dv = (
        in_cluster
        .groupby('paper_id')['country_code']
        .apply(lambda codes: list(codes.dropna().unique()))
        .to_frame('collab_countries')
    )
    dv['country_count'] = dv['collab_countries'].apply(len)
    dv = dv.sort_values('country_count', ascending=False)
    di = dfinfo.loc[dv.index].copy()
    di['country_count'] = dv['country_count']
    di['collab_countries'] = dv['collab_countries']
    return di[di['country_count'] > 1]

In [56]:
quantum_concepts

[{'id': 'https://openalex.org/C58053490',
  'wikidata': 'https://www.wikidata.org/wiki/Q176555',
  'display_name': 'Quantum computer',
  'relevance_score': 36806.66,
  'level': 3,
  'description': 'theoretical computation device relying on quantum mechanics',
  'works_count': 56517,
  'cited_by_count': 1080023,
  'summary_stats': {'2yr_mean_citedness': 2.94691092625479,
   'h_index': 402,
   'i10_index': 13002},
  'ids': {'openalex': 'https://openalex.org/C58053490',
   'wikidata': 'https://www.wikidata.org/wiki/Q176555',
   'mag': '58053490',
   'wikipedia': 'https://en.wikipedia.org/wiki/Quantum%20computing'},
  'image_url': 'https://upload.wikimedia.org/wikipedia/commons/6/60/IBM_Q_system_%28Fraunhofer_2%29.jpg',
  'image_thumbnail_url': 'https://upload.wikimedia.org/wikipedia/commons/thumb/6/60/IBM_Q_system_%28Fraunhofer_2%29.jpg/100px-IBM_Q_system_%28Fraunhofer_2%29.jpg',
  'international': {'display_name': {'ar': 'حاسوب كمومي',
    'ast': 'Computación cuántica',
    'az': 'Kvant 

In [57]:
import networkx as nx
from pyvis.network import Network
import igraph as ig # for getting a layout w/o relying on slow pyvis physics 

In [58]:
dfinfo["funder_list"] = dfinfo["grants"].apply(get_funder_names)
dfinfo["wrapped_funder_list"] = dfinfo["funder_list"].apply(str).apply(wrap_it)

In [59]:
dfinfo.to_pickle('quantumdfinfo2d.pkl')

In [60]:
kw_dict = dfinfo['keywords'].to_dict()

In [61]:
dc = dftriple[dftriple['paper_cluster'] == 10].copy()
dc.shape

(291, 26)

In [62]:
[x for row in dc['funder_list'].tolist() for x in row]

['Qatar National Research Fund',
 'National Natural Science Foundation of China',
 'Deutsche Forschungsgemeinschaft',
 'Chinese Academy of Sciences',
 'Natural Science Foundation of Guangdong Province',
 'Qatar National Research Fund',
 'National Natural Science Foundation of China',
 'Deutsche Forschungsgemeinschaft',
 'Chinese Academy of Sciences',
 'Natural Science Foundation of Guangdong Province',
 'Qatar National Research Fund',
 'National Natural Science Foundation of China',
 'Deutsche Forschungsgemeinschaft',
 'Chinese Academy of Sciences',
 'Natural Science Foundation of Guangdong Province',
 'Qatar National Research Fund',
 'National Natural Science Foundation of China',
 'Deutsche Forschungsgemeinschaft',
 'Chinese Academy of Sciences',
 'Natural Science Foundation of Guangdong Province',
 'Qatar National Research Fund',
 'National Natural Science Foundation of China',
 'Deutsche Forschungsgemeinschaft',
 'Chinese Academy of Sciences',
 'Natural Science Foundation of Guangd

In [63]:
kw_dict = dfinfo['keywords'].to_dict()

# add in the affiliations as nodes as well; that row, author, paper, affil. all three get links. ok.
def create_nx_graph(df: pd.DataFrame, cl:int) -> nx.Graph:
    """
    builds the undirected graph for one paper cluster.

    every row of df whose 'paper_cluster' equals cl adds (or updates) a
    work node, an author node and an affiliation node, plus a source node
    when row['source'] is truthy and one funder node per entry of
    row['funder_list']. edges link the work, author and affiliation of a
    row to each other and to that row's source and funders. every node and
    edge carries a 'title' string; author, affiliation, source and funder
    nodes also carry a 'value' equal to how often they occur in the
    cluster's rows.

    args:
        df: dataframe providing the columns read below: paper_cluster,
            paper_id, paper_title, paper_publication_date, paper_author_id,
            paper_author_display_name, paper_raw_affiliation_string, id,
            display_name, country_code, source, source_type, funder_list.
        cl: the 'paper_cluster' value to select.

    returns:
        the networkx graph. when it has at least one node, every node also
        gets 'x' and 'y' attributes taken from an igraph UMAP layout and
        multiplied by 250. an empty graph is returned as is.

    note:
        reads the module-level kw_dict (looked up by paper_id) to append
        the keywords to each work node's title.
    """
    from collections import Counter  # stdlib; local so this cell stands alone

    g = nx.Graph()
    dc = df[df['paper_cluster'] == cl]

    # occurrences of each entity in the cluster, used as the node 'value'.
    # Counter does this in one pass instead of list.count() per element.
    author_counts_dict = Counter(dc['paper_author_id'].tolist())
    affiliation_counts_dict = Counter(dc['id'].tolist())
    source_counts_dict = Counter(dc['source'].tolist())
    funder_counts_dict = Counter(
        x for row in dc['funder_list'].tolist() for x in row
    )

    for _, row in dc.iterrows():
        paper_id = row['paper_id']
        author_id = row['paper_author_id']
        affiliation_id = row['id']
        source = row['source']
        paper_title = row['paper_title']
        author_name = row['paper_author_display_name']
        affiliation_name = row['display_name']
        country_code = row['country_code']
        # str() wraps the date only, so non-string dates concatenate cleanly.
        publication_date = str(row['paper_publication_date'])

        g.add_node(
            paper_id,
            group='work',
            title=paper_title + ' :\n ' + publication_date + ':\n' +
            '\n'.join(kw_dict[paper_id]),
        )
        g.add_node(
            author_id,
            group='author',
            title=author_name,
            value=author_counts_dict[author_id],
        )
        g.add_node(
            affiliation_id,
            group='affiliation',
            title=affiliation_name + '\n' + country_code,
            value=affiliation_counts_dict[affiliation_id],
        )

        if source:
            source_type = row['source_type']
            g.add_node(
                source,
                group=source_type,
                title=source + ' :\n ' + source_type,
                value=source_counts_dict[source],
            )
            g.add_edge(
                paper_id,
                source,
                title=paper_title + ' :\n ' + publication_date +
                ' :\n' + source + ' :\n ' + source_type,
            )
            g.add_edge(
                author_id,
                source,
                title=author_name + ':\n' + source,
            )

        # an empty funder_list simply skips this loop.
        for f in row['funder_list']:
            funder_label = str(f)
            g.add_node(
                f,
                group='funder',
                title=funder_label,
                value=funder_counts_dict[f],
            )
            g.add_edge(
                paper_id,
                f,
                title=paper_title + ':\n ' + publication_date +
                ' :\n' + funder_label,
            )
            g.add_edge(
                f,
                author_id,
                title=author_name + ' :\n ' + funder_label,
            )
            g.add_edge(
                f,
                affiliation_id,
                title=affiliation_name + '\n' + country_code + ' :\n ' +
                funder_label,
            )
            if source:
                g.add_edge(
                    f,
                    source,
                    title=source + ' :\n' + funder_label,
                )

        g.add_edge(
            paper_id,
            author_id,
            title=paper_title + ' :\n ' + author_name + ' :\n ' +
            row['paper_raw_affiliation_string'],
        )
        g.add_edge(
            author_id,
            affiliation_id,
            title=author_name + ' :\n ' +
            affiliation_name + ' :\n ' + country_code,
        )
        g.add_edge(
            paper_id,
            affiliation_id,
            title=paper_title + ' :\n ' + publication_date + ':\n' +
            affiliation_name + ' :\n ' + country_code,
        )

    # nothing to lay out when the cluster has no rows.
    if g.number_of_nodes() == 0:
        return g

    # positions are precomputed with igraph and stored as 'x' / 'y'.
    # coords are matched to nodes by position, which assumes from_networkx
    # keeps the order of g.nodes().
    g_ig = ig.Graph.from_networkx(g)
    layout = g_ig.layout_umap(min_dist = 2, epochs = 500)
    scale = 250  # the scale factor needed
    for node, xy in zip(list(g.nodes()), layout.coords):
        g.nodes[node]['x'] = scale * xy[0]
        g.nodes[node]['y'] = scale * xy[1]
    return g

In [64]:
def create_pyvis_html(cl: int, filename: str = "pyvis_coauthorships_graph.html"):
    """
    wrapper function that calls create_nx_graph on the module-level
    dftriple for cluster cl and writes the result as a standalone
    interactive pyvis html file.

    args:
        cl: cluster number passed on to create_nx_graph.
        filename: name of the html file to write.

    returns:
        the pyvis Network that was saved.

    the file is written to ./tmp/<filename>; if that fails with an OSError
    (for example because ./tmp does not exist) it is written to <filename>
    in the current working directory instead.
    """
    g_nx = create_nx_graph(dftriple, cl)
    h = Network(height="1000px",
                width="100%",
                cdn_resources="remote", # can grab the visjs library to make this local if needed
                bgcolor="#222222",
                neighborhood_highlight=True,
                font_color="white",
                directed=False,
                filter_menu=True,
                notebook=False,
               )
    h.from_nx(g_nx, show_edge_weights=False)
    # physics is switched off here; create_nx_graph already stores 'x' and 'y'
    # on every node.
    h.set_options(
    """
const options = {
  "interaction": {
    "navigationButtons": false
  },
 "physics": {
     "enabled": false
 },
  "edges": {
    "color": {
        "inherit": true
    },
    "setReferenceSize": null,
    "setReference": {
        "angle": 0.7853981633974483
    },
    "smooth": {
        "forceDirection": "none"
    }
  }
  }
    """
    )
    path = './tmp'
    try:
        h.save_graph(f"{path}/{filename}")
    except OSError:
        # best-effort fallback: ./tmp is missing or not writable.
        h.save_graph(f"{filename}")
    return h

In [65]:
def get_time_series(dg, cl:int):
    """
    selects the rows of dg whose cluster equals cl and returns them
    ordered by publication date.

    the result is a new dataframe with the columns 'cluster',
    'probability' and 'publication_date', plus a 'date' column holding
    'publication_date' parsed with pd.to_datetime; rows are sorted by
    'date' ascending. dg itself is not modified.
    """
    wanted_columns = ['cluster','probability','publication_date']
    in_cluster = dg.loc[dg.cluster == cl, wanted_columns]
    parsed_dates = pd.to_datetime(in_cluster['publication_date'])
    return in_cluster.assign(date=parsed_dates).sort_values('date')
    

In [66]:
import altair as alt
#alt.data_transformers.enable("data_server")