In [2]:
from pyalex import (
    Works, Authors, Sources,
    Institutions, Concepts, Publishers, Funders
)
import pyalex
import pandas as pd
import numpy as np
pyalex.config.email = "david@rs21.io"

from flair.embeddings import DocumentPoolEmbeddings
from flair.data import Sentence
from flair.embeddings import SentenceTransformerDocumentEmbeddings

EMBEDDING_MODEL_1 = "all-mpnet-base-v2" 

# this one is also good: all-MiniLM-L6-v2
EMBEDDING_MODEL_2 = "all-MiniLM-L6-v2"
SENT_EMBEDDINGS_1 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_1)
SENT_EMBEDDINGS_2 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_2)
DOC_EMBEDDINGS= DocumentPoolEmbeddings([SENT_EMBEDDINGS_2])

import torch
from tqdm import tqdm
import yake
import umap.umap_ as umap
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture as GMM
import altair as alt
import math
import plotly.express as px
import textwrap

In [3]:
def process_works_list(worklist:list):
    """
    transforms the 
    works list into a dataframe.
    """
    abstracts_dict = {h["id"]:h["abstract"] for h in worklist}
    df = pd.DataFrame.from_records(worklist)
    del df['abstract_inverted_index']
    df['abstract'] = df['id'].map(abstracts_dict)
   # df['author_affils'] = df['authorships'].apply(get_authors_and_affils)
    return df

In [9]:
def get_sg_frame():
    """
    get all works from the given instituion id
    """
    pager = Works().filter(authorships={"institutions": {"country_code": "SG"}}).\
    filter(publication_year='>2021').\
    paginate(per_page=200, n_max = 60_000)
    df = pd.DataFrame()
    for page in tqdm(pager):
      #  print(len(page))
        dfpage = process_works_list(page)
        df = pd.concat([df, dfpage], ignore_index=True)
        df.drop_duplicates(subset='id', keep='first',inplace=True)
    return df

In [10]:
dftop = get_sg_frame()

241it [07:40,  1.91s/it]


In [11]:
dftop.drop_duplicates(subset='id', keep='first', 
                      inplace=True)

dftop.set_index('id', inplace=True, drop=False)

dfall = dftop
print(dfall.shape)

dfall['content'] = dfall['title'] + ". " + dfall['abstract']

dfrecords = dfall[~dfall['content'].isna()].copy()

(48099, 40)


In [12]:
def get_keywords(text:str, top:int=7, stopwords=None):
    """
    takes a blob of text and 
    returns the top **top** 
    keywords as a list
    """
    kw_extractor = yake.KeywordExtractor(top=top, stopwords=stopwords)
    keywords = kw_extractor.extract_keywords(text)
    return [p[0] for p in keywords]

In [13]:
def get_top_concepts(concept_list:list,score:float=.6):
    """
    takes a list of concept dictionaries 
    returns the top **top** display_names;
    concepts whose score is >= score
    """
    return [c['display_name'] for c in concept_list if c['score'] >= score]

In [14]:
dfrecords['keywords'] = dfrecords['content'].apply(get_keywords)
dfrecords['top_concepts'] = dfrecords['concepts'].apply(get_top_concepts)

In [15]:
texts = dfrecords['content'].str.lower().values.tolist()
#dfrecords.to_csv('jamming.csv')


In [16]:
def get_content_embeddings(dfrecords:pd.DataFrame) -> pd.DataFrame:
    """
    passes the preprocessed mitigation strings
    data through the embedding model to produce the vector
    space representation of each pet mitigation.
    """
    sent = Sentence("The grass is green.")
    DOC_EMBEDDINGS.embed(sent)
    texts = dfrecords["content"].str.lower().values.tolist()
    all_descriptions = np.empty((len(texts), len(sent.embedding)))
    for i in tqdm(range(len(texts))):
        sent = Sentence(texts[i])
        DOC_EMBEDDINGS.embed(sent)
        all_descriptions[i, :] = sent.embedding.cpu().numpy()
        # gc.collect()
        torch.cuda.empty_cache()
    dfcontentvectors = pd.DataFrame.from_records(all_descriptions, index=dfrecords.index)
    return dfcontentvectors

In [17]:
dfcontentvectors = get_content_embeddings(dfrecords)


100%|███████████████████████████████████████████████████████████████████| 41813/41813 [11:54<00:00, 58.49it/s]


In [18]:
#umap.UMAP?
N_COMPONENTS = 2 # can visualize this way
umap_reducer = umap.UMAP(n_components=N_COMPONENTS,
                       #  metric='euclidean')
                         random_state=1234,
                         metric='cosine')  # can experiment with this metric as well as the other 
# parameters
# to see what other literature is in the same information space, we need to keep this umap_reducer 
# object as well as the gmm model below.

# Apply UMAP to the vectorized strings
reduced_vectors = umap_reducer.fit_transform(dfcontentvectors.to_numpy())
dfreduced = pd.DataFrame.from_records(reduced_vectors, 
                index=dfcontentvectors.index)
dfreduced.columns = ['x','y']

In [19]:
import hdbscan

hdbscan_args = {'min_cluster_size': 15,
                            'metric': 'euclidean',
                            'cluster_selection_method': 'eom',
                            'cluster_selection_epsilon': 0.1
               }

cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(dfreduced[['x','y']].to_numpy())

dfreduced['cluster'] = cluster.labels_
dfreduced['probability'] = cluster.probabilities_

dfpapers = dfrecords.merge(dfreduced, left_index=True,
                           right_index=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [20]:
#help(dfpapers.explode)
del dfpapers['id']
dfstart = dfpapers.reset_index()
dfstart.head()

Unnamed: 0,id,doi,title,display_name,publication_year,publication_date,ids,language,primary_location,type,...,created_date,is_authors_truncated,abstract,content,keywords,top_concepts,x,y,cluster,probability
0,https://openalex.org/W4283271244,https://doi.org/10.1038/s41375-022-01613-1,The 5th edition of the World Health Organizati...,The 5th edition of the World Health Organizati...,2022,2022-06-22,{'openalex': 'https://openalex.org/W4283271244...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2022-06-23,,Abstract The upcoming 5th edition of the World...,The 5th edition of the World Health Organizati...,"[World Health Organization, Health Organizatio...","[Histiocyte, Myeloid]",8.96454,-0.496561,-1,0.0
1,https://openalex.org/W3003265726,https://doi.org/10.1109/tnnls.2021.3070843,"A Survey on Knowledge Graphs: Representation, ...","A Survey on Knowledge Graphs: Representation, ...",2022,2022-02-01,{'openalex': 'https://openalex.org/W3003265726...,en,"{'is_oa': False, 'landing_page_url': 'https://...",article,...,2020-02-07,,Human knowledge provides a formal understandin...,"A Survey on Knowledge Graphs: Representation, ...","[Knowledge, knowledge graph, knowledge acquisi...",[Computer science],6.940184,9.486319,-1,0.0
2,https://openalex.org/W4283381496,https://doi.org/10.1038/s41375-022-01620-2,The 5th edition of the World Health Organizati...,The 5th edition of the World Health Organizati...,2022,2022-06-22,{'openalex': 'https://openalex.org/W4283381496...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2022-06-25,,We herein present an overview of the upcoming ...,The 5th edition of the World Health Organizati...,"[World Health Organization, Health Organizatio...",[],8.992878,-0.450357,-1,0.0
3,https://openalex.org/W4220838968,https://doi.org/10.1038/s41586-022-04492-9,Reproducible brain-wide association studies re...,Reproducible brain-wide association studies re...,2022,2022-03-16,{'openalex': 'https://openalex.org/W4220838968...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2022-04-03,,Magnetic resonance imaging (MRI) has transform...,Reproducible brain-wide association studies re...,"[MRI, BWAS, functional MRI, task functional MR...","[Sample size determination, Neuroimaging, Repl...",8.807067,4.272074,-1,0.0
4,https://openalex.org/W4310461604,https://doi.org/10.1056/nejmoa2212948,Lecanemab in Early Alzheimer’s Disease,Lecanemab in Early Alzheimer’s Disease,2023,2023-01-05,{'openalex': 'https://openalex.org/W4310461604...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2022-12-10,,The accumulation of soluble and insoluble aggr...,Lecanemab in Early Alzheimer’s Disease. The ac...,"[Alzheimer ’s Disease, Early Alzheimer, Diseas...","[Medicine, Disease, Alzheimer's disease, Monoc...",8.518635,1.673271,-1,0.0


In [21]:
dfstart.shape

(41813, 47)

In [22]:
dfbig = dfstart.explode(column='authorships')
dfbig.shape, dfstart.shape

((295703, 47), (41813, 47))

In [23]:
def add_extra_to_authorships(row: pd.DataFrame):
    """
    row[authorships] is a dictionary;
    add in the id key to that dictionary
    whose value is row[id]
    """
    complete_dict = row["authorships"]
   # assert type(complete_dict) == dict
    #print(type(complete_dict))
    if type(complete_dict) == dict:
        complete_dict["id"] = row["id"]
        complete_dict["x"] = row["x"]
        complete_dict["y"] = row["y"]
        complete_dict["cluster"] = row["cluster"]
        complete_dict["cluster_score"] = row["probability"]
        complete_dict["title"] = row["title"]
        complete_dict["abstract"] = row["abstract"]
        complete_dict["doi"] = row["doi"]
        complete_dict["publication_date"] = row["publication_date"]
        complete_dict["publication_year"] = row["publication_year"]
        complete_dict["grants"] = row["grants"]
        complete_dict["locations"] = row["locations"]
        return complete_dict
    else:
        return row["authorships"]

In [24]:
dfbig['big_authorships'] = dfbig.apply(add_extra_to_authorships, axis=1)

In [25]:
#dfbig['authorships'].tolist()
bigvals = dfbig['authorships'].tolist()

In [26]:
dictvals = [c for c in bigvals if type(c) != float]


In [27]:
dftriple = pd.json_normalize(dictvals,
                  record_path=['institutions'],
                  meta=['id','raw_affiliation_string','author_position', 'doi',
                        'title','abstract','publication_date', 'publication_year',
                        'grants','locations',
                        'is_corrresponding','x','y','cluster','cluster_score',
                       ['author','id'], ['author', 'display_name'],
                       ['author','orcid']],
                  errors='ignore',
                  sep='_',
                  meta_prefix='paper_',
                #  record_prefix='author_'
                 )

In [28]:
dftopics = dfcontentvectors.copy()
dftopics['cluster'] = dfpapers['cluster']
dfmeantopics = dftopics.groupby('cluster').mean().copy()
reduced_topics = umap_reducer.transform(dfmeantopics.to_numpy())
df_reduced_topics = pd.DataFrame.from_records(reduced_topics, 
                index=dfmeantopics.index)
df_reduced_topics.columns = ['x','y']
df_reduced_topics['topic'] = df_reduced_topics.index
df_reduced_topics.head()

def get_cluster_concepts(topic_num:int, n:int=20):
    """
    takes an integer topic_num corresponding to a 
    given topic number and
    returns the list of top n occuring concepts
    from the top_concept field
    """
    top_concepts = dfpapers[dfpapers['cluster'] == topic_num]['top_concepts'].tolist()
    flat_concepts = [item for sublist in top_concepts for item in sublist]
    concepts_dict = {c:flat_concepts.count(c) for c in flat_concepts}
    sorted_concepts = sorted(concepts_dict.items(), key=lambda x:x[1], reverse=True)
    return [c[0] for c in sorted_concepts][:n]

def get_yake_cluster_phrases(topic_num:int, n:int=20):
    """
    takes in an integer n corresponding
    to a given topic number and
    returns the list of keyphrases (TopicRank method)
    """
    documents = dfpapers[dfpapers['cluster'] == topic_num]['content'].tolist()
    topic_input = ". ".join(documents)
    #extractor = pke.unsupervised.TextRank()
    kw_extractor = yake.KeywordExtractor(top=n, stopwords=None)
    keywords = kw_extractor.extract_keywords(topic_input)
    #extractor.load_document(input=topic_input,
    #                    language='en',
    #                    normalization=None)

    #extractor.candidate_selection()

    #window = 2
    #use_stems = False
    #extractor.candidate_weighting(window=window,
    #                          use_stems=use_stems)
    #extractor.candidate_weighting()
    #threshold = 0.8
   # keyphrases = extractor.get_n_best(n=20, threshold=threshold)
    #keyphrases = extractor.get_n_best(n=n)
    return [p[0] for p in keywords]

wikiconcepts = df_reduced_topics['topic'].apply(get_cluster_concepts)

wikikeywords = df_reduced_topics['topic'].apply(get_yake_cluster_phrases)

dfpapers['id'] = dfpapers.index
dfinfo = dfpapers[['x','y','id','title','doi','cluster','grants',
                   'locations',
                 'publication_date','keywords','top_concepts']].copy()

centroids = dfinfo.groupby('cluster')[['x','y']].mean().copy()
centroids['concepts'] = wikiconcepts
centroids['cluster'] = centroids.index
centroids['keywords'] = wikikeywords

In [29]:
def wrap_it(x):
    return "<br>".join(textwrap.wrap(x, width=40))
   # return "<br>".join(textwrap.wrap(x.replace(r'\s+', ' '), width=40))

In [30]:
centroids['wrapped_keywords'] = centroids['keywords'].apply(str).apply(wrap_it)
centroids['wrapped_concepts'] = centroids['concepts'].apply(str).apply(wrap_it)

In [31]:
centroids.to_pickle('sgcentroids2d.pkl')

In [32]:
dftriple.to_pickle('sgdftriple2d.pkl')

In [33]:
def get_affils_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by id, ror sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg.groupby(['id','display_name','country_code',
                     'type'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [35]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 2)
print(kw84)
dv84.head(10)

2
['water', 'water demand management', 'urban water', 'water demand', 'urban water demand', 'water circularity', 'water supply', 'water security', 'water supply systems', 'urban water systems', 'water consumption', 'water systems', 'public water supply', 'urban water circularity', 'management', 'demand management', 'Social dilemmas', 'Private water supply', 'systems', 'water management']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I165932596,National University of Singapore,SG,education,33.645859
https://openalex.org/I7882870,University of Glasgow,GB,education,9.921508
https://openalex.org/I60559429,Nazarbayev University,KZ,education,4.0
https://openalex.org/I107639228,University of Notre Dame,US,education,2.38811
https://openalex.org/I173304897,University of Granada,ES,education,2.0
https://openalex.org/I219193219,Purdue University West Lafayette,US,education,1.294605
https://openalex.org/I172675005,Nanyang Technological University,SG,education,1.250748
https://openalex.org/I55732556,Arizona State University,US,education,1.246665
https://openalex.org/I97018004,Stanford University,US,education,1.19934
https://openalex.org/I152815399,Singapore University of Technology and Design,SG,education,1.156923


In [36]:
dfinfo = dfpapers[['x','y','id','title','doi','cluster','probability',
                 'publication_date','grants','locations',
                   'keywords','top_concepts']].copy()

In [37]:
pap_affils_dict = dftriple.groupby('paper_id')['paper_raw_affiliation_string'].\
apply(lambda x: ' | '.join(x.tolist()))

#pap_authors_dict = dftriple.groupby('paper_id')['paper_author_display_name'].\
#apply(lambda x: ' | '.join(x.tolist()))

In [38]:
pap_authors_dict = dftriple.groupby('paper_id')['paper_author_display_name'].apply(lambda x: x.values)

In [39]:
dfinfo['affil_list'] = pap_affils_dict
dfinfo['author_list'] = pap_authors_dict

In [40]:
dfinfo['affil_list'] = dftriple.groupby('paper_id')['paper_raw_affiliation_string'].\
apply(lambda x: x.tolist())

In [41]:
dfinfo['author_list'] =  dftriple.groupby('paper_id')['paper_author_display_name'].\
apply(lambda x: x.tolist())

In [42]:
dfinfo['wrapped_affil_list'] = dfinfo['affil_list'].apply(str).apply(wrap_it)
dfinfo['wrapped_author_list'] = dfinfo['author_list'].apply(str).apply(wrap_it)

In [43]:
dfinfo['wrapped_keywords'] = dfinfo['keywords'].apply(str).apply(wrap_it)


In [44]:
def get_source_name(loc_list):
    """
    grab the first item in the list;
    retturn the display name
    """
    try:
        primary = loc_list[0]
        return primary["source"]["display_name"]
    except:
        return None

def get_source_type(loc_list):
    """
    grab the first item in the list;
    return the source type
    """
    try:
        primary = loc_list[0]
        return primary["source"]["type"]
    except:
        return None
    

In [45]:
dfinfo["source"] = dfinfo["locations"].apply(get_source_name)
dfinfo["source_type"] = dfinfo["locations"].apply(get_source_type)

In [46]:
dfinfo.to_pickle('sgdfinfo2d.pkl')

In [47]:
def get_funder_names(funder_list):
    """
    funder_list is a list of dictionaries
    with three keys; return the list of 
    unique **funder_display_name**
    values
    """
    try:
        funder_names = list(set([f['funder_display_name'] for f in funder_list]))
        return funder_names    
    except:
        return []
        

In [48]:
dftriple["source"] = dftriple["paper_locations"].apply(get_source_name)
dftriple["source_type"] = dftriple["paper_locations"].apply(get_source_type)
dftriple["funder_list"] = dftriple["paper_grants"].apply(get_funder_names)

In [49]:
dftriple.to_pickle('sgdftriple2d.pkl')

In [50]:
def get_journals_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'journal'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [51]:
def get_conferences_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'conference'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [52]:
dv84, kw84 = get_journals_cluster_sort(dftriple, 10)
print(kw84)
dv84.head(10)

10
['piezoelectric energy harvester', 'energy harvester', 'energy harvesting', 'vibration energy harvesting', 'energy', 'piezoelectric energy', 'vibration energy', 'vibration energy harvester', 'energy harvesting performance', 'vibration', 'harvester', 'triboelectric energy harvester', 'Hybrid Energy Harvester', 'mechanical energy harvesters', 'Energy Harvesting Technology', 'wind energy harvesting', 'harvesting', 'electromagnetic energy harvester', 'wind energy harvesters', 'galloping piezoelectric energy']


Unnamed: 0_level_0,paper_cluster_score
source,Unnamed: 1_level_1
Mechanical Systems and Signal Processing,22.71339
Symmetry,13.0
Micromachines,10.808485
Energy Conversion and Management,10.655077
Science Bulletin,9.594271
International Journal of Mechanical Sciences,9.191444
Nanomaterials,8.850609
Sensors and Actuators A-physical,8.606137
Journal of Physics D,7.0
Applied Energy,7.0


In [53]:
def get_country_collaborations_sort(dc:pd.DataFrame, cl:int):
    """
    resticts the dataframe dc to cluster value cl
    and returns the results of paper_id s where there is 
    more than one country_code
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    dv = dg.groupby('paper_id')['country_code'].apply(lambda x: len(set(x.values))).to_frame()
    dc = dg.groupby('paper_id')['country_code'].apply(lambda x: list(set(x.values))).to_frame()
    dc.columns = ['collab_countries']
    dv.columns = ['country_count']
    dv['collab_countries'] = dc['collab_countries']
    dv.sort_values('country_count',ascending=False, inplace=True)
    di = dfinfo.loc[dv.index].copy()
    di['country_count'] = dv['country_count']
    di['collab_countries'] = dv['collab_countries']
    return di[di['country_count'] > 1]

In [54]:
import networkx as nx
from pyvis.network import Network
import igraph as ig #

In [55]:
dfinfo["funder_list"] = dfinfo["grants"].apply(get_funder_names)
dfinfo["wrapped_funder_list"] = dfinfo["funder_list"].apply(str).apply(wrap_it)

In [56]:

dfinfo.to_pickle('sgdfinfo2d.pkl')


In [57]:
kw_dict = dfinfo['keywords'].to_dict()

In [58]:
sources_list = dftriple['source'].unique().tolist()
type(sources_list), len(sources_list)

(list, 7047)

In [61]:
def get_source_json(s:str):
    """
    s is an openalex Sources display_name
    return that Sources object
    """
    source_json = Sources().filter(display_name = s).get()
    a = source_json[0]['type']
    if "homepage_url" in source_json[0] and source_json[0]['homepage_url']:
      #  print(f"{s} has homepage_url and type {source_json[0]['type']}")
        return source_json[0]["homepage_url"]
    else:
        return None

In [62]:
def get_display_page_dict(sl:list):
    """
    sl is a list of Sources display_name values
    returns the dictionary mapping
    display_names with homepage_url values."""
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_source_json(s)
        except:
            pass
    return mapping_dict

In [63]:
source_page_dict = get_display_page_dict(sources_list)

100%|█████████████████████████████████████████████████████████████████████| 7047/7047 [57:50<00:00,  2.03it/s]


In [64]:
len(source_page_dict)

591

In [65]:
import pickle

with open("source_page_dict.pkl", "wb") as f:
    pickle.dump(source_page_dict, f)

In [66]:
with open("source_page_dict.pkl", "rb") as f:
    source_dict = pickle.load(f)

In [67]:
def get_journals_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'journal'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    dv['journal'] = dv.index
    dv['hompage_url'] = dv['journal'].map(source_dict)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [68]:
dv, kw = get_journals_cluster_sort(dftriple, 4)
dv.head()

4


Unnamed: 0_level_0,paper_cluster_score,journal,hompage_url
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Journal of the American Medical Informatics Association,28.386823,Journal of the American Medical Informatics As...,
Journal of Computing and Information Science in Engineering,11.0,Journal of Computing and Information Science i...,
Expert Systems With Applications,8.0,Expert Systems With Applications,
Journal of Mechanical Design,7.981787,Journal of Mechanical Design,
Proceedings of the Design Society,7.225505,Proceedings of the Design Society,


In [None]:
affils_list = dftriple['display_name'].unique().tolist()
type(affils_list), len(affils_list)

affil_json = Institutions().filter(display_name = affils_list[1]).get()

def get_affil_json(s:str):
    """
    s is an openalex Sources display_name
    return that Sources object
    """
    affil_json = Institutions().filter(display_name = s).get()
    #a = source_json[0]['type']
    if "geo" in affil_json[0]:
       # print(f"{s} has homepage_url and type {source_json[0]['type']}")
        return affil_json[0]["geo"]["latitude"], affil_json[0]["geo"]["longitude"]
    else:
        return None, None
    
def get_display_geo_dict(sl:list):
    """
    sl is a list of Institution display_name values
    returns the dictionary mapping
    display_names with (latitude, longitude) values."""
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_affil_json(s)
        except:
            pass
    return mapping_dict

affil_geo_dict = get_display_geo_dict(affils_list)


import pickle

with open("affil_geo_dict.pkl", "wb") as f:
    pickle.dump(affil_geo_dict, f)

  0%|                                                                    | 21/13127 [00:09<1:32:49,  2.35it/s]