In [1]:
from pyalex import (
    Works, Authors, Sources,
    Institutions, Concepts, Publishers, Funders
)
import pyalex
import pandas as pd
import numpy as np
pyalex.config.email = "david@rs21.io"

from flair.embeddings import DocumentPoolEmbeddings
from flair.data import Sentence
from flair.embeddings import SentenceTransformerDocumentEmbeddings

# Sentence-transformer checkpoints available for embedding the paper text.
EMBEDDING_MODEL_1 = "all-mpnet-base-v2" 

# Alternative checkpoint; this is the one wired into DOC_EMBEDDINGS below.
EMBEDDING_MODEL_2 = "all-MiniLM-L6-v2"
# NOTE(review): SENT_EMBEDDINGS_1 is instantiated here but not referenced again in
# the visible cells — confirm it is needed before paying for the model load.
SENT_EMBEDDINGS_1 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_1)
SENT_EMBEDDINGS_2 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_2)
# Document embedder used by get_content_embeddings; built on SENT_EMBEDDINGS_2 only.
DOC_EMBEDDINGS= DocumentPoolEmbeddings([SENT_EMBEDDINGS_2])

import torch
from tqdm import tqdm
import yake
import umap.umap_ as umap
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture as GMM
import altair as alt
import math
import plotly.express as px
import textwrap

2023-12-19 10:14:39.522681: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
target_authors = Authors().search_filter(display_name='Alex Washburne').get()

In [8]:
len(target_authors)

2

In [9]:
def process_works_list(worklist:list):
    """
    transforms the works list into a dataframe.

    worklist: list of work records; every record must support
        record["id"] and record["abstract"].
    returns: a dataframe with one row per record, holding the record
        fields minus 'abstract_inverted_index', plus an 'abstract'
        column looked up by work id. An empty worklist yields an
        empty frame that still has 'id' and 'abstract' columns, so
        callers that de-duplicate on 'id' keep working.
    """
    if not worklist:
        return pd.DataFrame(columns=['id', 'abstract'])
    abstracts_by_id = {work["id"]: work["abstract"] for work in worklist}
    df = pd.DataFrame.from_records(worklist)
    # the inverted index is redundant once the abstract text is attached;
    # errors='ignore' tolerates records that never carried the field
    df = df.drop(columns=['abstract_inverted_index'], errors='ignore')
    df['abstract'] = df['id'].map(abstracts_by_id)
    return df

In [10]:
# Show the OpenAlex id and works count of every matched author.
for author in target_authors:
    print(author['id'], author['works_count'])

https://openalex.org/A5044568003 54
https://openalex.org/A5008059746 2


In [68]:
from pyalex import Works
from itertools import chain

# Query the works attributed to the first matched author id
query = Works().filter(author={"id": target_authors[0]['id']})

# paginate() yields one list of works per page; flatten all pages into one list
records0 = list(chain(*query.paginate(per_page=25)))


In [69]:
# Query the works attributed to the second matched author id
query = Works().filter(author={"id": target_authors[1]['id']})

# paginate() yields one list of works per page; flatten all pages into one list
records1 = list(chain(*query.paginate(per_page=25)))

In [70]:
len(records0), len(records1)

(54, 2)

In [71]:
# Fold the second author's works into records0 (in place).
# NOTE(review): extend() mutates records0, so re-running this cell appends
# records1 again; the later drop_duplicates on 'id' removes the repeats.
records0.extend(records1)
len(records0)

56

In [74]:
def get_author_frame(authors_list:list, i:int):
    """
    takes a list of Authors() results and an index, pages through
    that author's works published after 1990 and returns them as
    one dataframe.

    authors_list: list of author records; authors_list[i]['id'] is
        used as the author filter.
    i: position of the author of interest in authors_list.
    returns: dataframe built by process_works_list, one row per
        distinct work id (first occurrence kept), with a fresh
        0..n-1 index. Empty dataframe when no page comes back.
    """
    pager = Works().filter(
        publication_year='>1990',
        authorships={"author.id": f"{authors_list[i]['id']}"},
    ).paginate(per_page=200, n_max=None)
    # build every page frame first and concatenate once; growing a frame
    # with concat inside the loop re-copies all earlier rows on every page
    page_frames = [process_works_list(page) for page in tqdm(pager)]
    if not page_frames:
        return pd.DataFrame()
    df = pd.concat(page_frames, ignore_index=True)
    if df.empty:
        return df
    # a single de-duplication keeps the same rows as de-duplicating per page
    df = df.drop_duplicates(subset='id', keep='first')
    return df.reset_index(drop=True)

In [75]:
# Build the works frame from the combined records, keeping the first row per work id
df = process_works_list(records0).drop_duplicates(subset='id', keep='first')

In [77]:
# Separate frame indexed by work id; the 'id' column itself is kept as well
dfall = df.set_index('id', drop=False)

In [78]:
print(dfall.shape)

(56, 43)


In [79]:
# Text that gets embedded: "<title>. <abstract>"
dfall['content'] = dfall['title'] + ". " + dfall['abstract']

# Keep only the works for which that text could be built
has_content = dfall['content'].notna()
dfrecords = dfall[has_content].copy()

In [80]:
def get_keywords(text:str, top:int=7, stopwords=None):
    """
    runs the YAKE keyword extractor over text and
    returns the phrases it reports (at most **top**
    are requested) as a list, scores dropped.
    stopwords is handed to yake.KeywordExtractor as is.
    """
    extractor = yake.KeywordExtractor(top=top, stopwords=stopwords)
    scored_phrases = extractor.extract_keywords(text)
    return [phrase for phrase, *_ in scored_phrases]

In [81]:
def get_top_concepts(concept_list:list,score:float=.6):
    """
    takes a list of concept dictionaries (each with
    'display_name' and 'score' keys) and returns, in
    their original order, the display_names of the
    concepts whose score is >= score
    """
    selected = []
    for concept in concept_list:
        if concept['score'] >= score:
            selected.append(concept['display_name'])
    return selected

In [82]:
dfrecords['keywords'] = dfrecords['content'].apply(get_keywords)
dfrecords['top_concepts'] = dfrecords['concepts'].apply(get_top_concepts)

In [83]:
texts = dfrecords['content'].str.lower().values.tolist()
#dfrecords.to_csv('jamming.csv')

In [84]:
def get_content_embeddings(dfrecords:pd.DataFrame) -> pd.DataFrame:
    """
    lower-cases the 'content' column of dfrecords and passes
    each string through DOC_EMBEDDINGS.
    returns a dataframe with one embedding vector per row,
    indexed like dfrecords.
    """
    # embed a throwaway sentence first, only to learn the vector width
    probe = Sentence("The grass is green.")
    DOC_EMBEDDINGS.embed(probe)
    embedding_dim = len(probe.embedding)

    lowered = dfrecords["content"].str.lower().values.tolist()
    vectors = np.empty((len(lowered), embedding_dim))
    for row_idx, text in enumerate(tqdm(lowered)):
        sentence = Sentence(text)
        DOC_EMBEDDINGS.embed(sentence)
        # copy the embedding to host memory and store it as one row
        vectors[row_idx, :] = sentence.embedding.cpu().numpy()
        torch.cuda.empty_cache()
    return pd.DataFrame.from_records(vectors, index=dfrecords.index)

In [85]:
dfcontentvectors = get_content_embeddings(dfrecords)


100%|█████████████████████████████████████████████████████████████████████████| 53/53 [00:02<00:00, 24.57it/s]


In [86]:
# Project the content embeddings down to N_COMPONENTS dimensions for plotting.
N_COMPONENTS = 2 # can visualize this way
umap_reducer = umap.UMAP(n_components=N_COMPONENTS,
                         # fixed seed; per the warning in this cell's output UMAP then runs with n_jobs=1
                         random_state=1234,
                         metric='cosine')  # 'euclidean' was the commented-out alternative; the metric and the other
# parameters are open to experimentation.
# To see what other literature is in the same information space, keep this fitted umap_reducer
# object as well as the clustering model fitted in the clustering cell below.

# Apply UMAP to the vectorized strings
reduced_vectors = umap_reducer.fit_transform(dfcontentvectors.to_numpy())
dfreduced = pd.DataFrame.from_records(reduced_vectors, 
                index=dfcontentvectors.index)
dfreduced.columns = ['x','y']

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


## Use HDBSCAN to cluster the 2-D UMAP coordinates

In [87]:
import hdbscan

# Settings passed to hdbscan.HDBSCAN for clustering the UMAP coordinates.
hdbscan_args = {'min_cluster_size': 2,
                            'metric': 'euclidean',
                            'cluster_selection_method': 'eom',
                            'cluster_selection_epsilon': 0.1
               }

# Cluster on the projected (x, y) coordinates, not on the full embedding vectors.
cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(dfreduced[['x','y']].to_numpy())

# Per-paper cluster label and membership strength. A label of -1 shows up in the
# value counts further down — presumably HDBSCAN's noise label; confirm in its docs.
dfreduced['cluster'] = cluster.labels_
dfreduced['probability'] = cluster.probabilities_

# Attach coordinates and cluster assignment to the paper metadata; both frames
# carry the work id as their index, so the merge is index-on-index.
dfpapers = dfrecords.merge(dfreduced, left_index=True,
                           right_index=True)

In [88]:
#help(dfpapers.explode)
del dfpapers['id']
dfstart = dfpapers.reset_index()
dfstart.head()

Unnamed: 0,id,doi,title,display_name,publication_year,publication_date,ids,language,primary_location,type,...,updated_date,created_date,fulltext_origin,abstract,content,top_concepts,x,y,cluster,probability
0,https://openalex.org/W3134208712,https://doi.org/10.1126/science.abg3055,Estimated transmissibility and impact of SARS-...,Estimated transmissibility and impact of SARS-...,2021,2021-04-09,{'openalex': 'https://openalex.org/W3134208712...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2023-12-19T16:06:19.867345,2021-03-15,,UK variant transmission Severe acute respirato...,Estimated transmissibility and impact of SARS-...,"[Transmissibility (structural dynamics), Linea...",2.14471,5.364228,3,1.0
1,https://openalex.org/W3044459694,https://doi.org/10.1128/msystems.00614-20,SARS-CoV-2 Titers in Wastewater Are Higher tha...,SARS-CoV-2 Titers in Wastewater Are Higher tha...,2020,2020-08-25,{'openalex': 'https://openalex.org/W3044459694...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2023-12-19T01:39:00.859939,2020-07-29,pdf,Wastewater surveillance represents a complemen...,SARS-CoV-2 Titers in Wastewater Are Higher tha...,[Outbreak],1.750743,5.407535,3,1.0
2,https://openalex.org/W2951061751,https://doi.org/10.1038/s41467-019-10656-5,Establishing microbial composition measurement...,Establishing microbial composition measurement...,2019,2019-06-20,{'openalex': 'https://openalex.org/W2951061751...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2023-12-18T08:08:23.026663,2019-06-27,pdf,Differential abundance analysis is controversi...,Establishing microbial composition measurement...,"[Relative species abundance, Microbiome, False...",6.96731,4.733189,0,1.0
3,https://openalex.org/W2510675082,https://doi.org/10.7554/elife.21887,A phylogenetic transform enhances analysis of ...,A phylogenetic transform enhances analysis of ...,2017,2017-02-15,{'openalex': 'https://openalex.org/W2510675082...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2023-12-17T02:56:01.284622,2016-09-16,ngrams,"Surveys of microbial communities (microbiota),...",A phylogenetic transform enhances analysis of ...,"[Phylogenetic tree, Clade]",6.474397,4.941989,0,1.0
4,https://openalex.org/W3116925452,https://doi.org/10.1101/2020.12.24.20248822,Estimated transmissibility and impact of SARS-...,Estimated transmissibility and impact of SARS-...,2020,2020-12-26,{'openalex': 'https://openalex.org/W3116925452...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2023-12-19T16:05:06.384298,2021-01-05,pdf,"A novel SARS-CoV-2 variant, VOC 202012/01 (lin...",Estimated transmissibility and impact of SARS-...,"[Transmissibility (structural dynamics), Coron...",1.904365,4.902037,4,0.563183


In [89]:
dfstart.shape

(53, 49)

In [90]:
dfstart.cluster.value_counts(dropna=False)

cluster
 0    28
 2     8
 4     7
-1     4
 3     3
 1     3
Name: count, dtype: int64

In [91]:
dfbig = dfstart.explode(column='authorships')
dfbig.shape, dfstart.shape

((399, 49), (53, 49))

In [92]:
def add_extra_to_authorships(row: pd.Series):
    """
    row[authorships] is a dictionary describing one authorship;
    copy the paper-level fields of the row into that dictionary
    (id, coordinates, cluster, title, abstract, doi, dates,
    grants, locations) so each authorship carries its paper's
    metadata.

    row: one row of the exploded works frame (anything that
        supports row[key] works).
    returns: the very same dictionary object, updated IN PLACE;
        later cells depend on that mutation. When row[authorships]
        is not a dictionary (e.g. NaN for a paper without
        authorships) it is returned untouched.
    """
    authorship = row["authorships"]
    if not isinstance(authorship, dict):
        return authorship
    # (key written into the authorship dict, row column it is read from)
    paper_fields = (
        ("id", "id"),
        ("x", "x"),
        ("y", "y"),
        ("cluster", "cluster"),
        ("cluster_score", "probability"),
        ("title", "title"),
        ("abstract", "abstract"),
        ("doi", "doi"),
        ("publication_date", "publication_date"),
        ("publication_year", "publication_year"),
        ("grants", "grants"),
        ("locations", "locations"),
    )
    for target_key, source_column in paper_fields:
        authorship[target_key] = row[source_column]
    return authorship

In [93]:
dfbig['big_authorships'] = dfbig.apply(add_extra_to_authorships, axis=1)

In [94]:
# Collect the enriched authorship dictionaries. add_extra_to_authorships updates
# each dictionary in place and returns it, so 'big_authorships' holds the same
# objects as 'authorships'. Reading the derived column makes this cell fail
# loudly if the enrichment cell has not run, instead of silently passing
# un-enriched dictionaries on to json_normalize.
bigvals = dfbig['big_authorships'].tolist()

In [95]:
# json_normalize needs dictionaries; keep only those (papers without authorships
# explode to NaN, and any other non-dict value is dropped as well)
dictvals = [c for c in bigvals if isinstance(c, dict)]


In [96]:
# Flatten the enriched authorships to one row per (paper, author, institution):
# the institution fields become plain columns and the paper/author fields listed
# in meta become 'paper_'-prefixed columns.
# The authorship key is 'is_corresponding'; the previous spelling
# 'is_corrresponding' never matched, and with errors='ignore' that produced an
# all-NaN column instead of an error.
# NOTE(review): with record_path=['institutions'] an authorship whose institutions
# list is empty contributes no rows — confirm that dropping such authors is intended.
dftriple = pd.json_normalize(dictvals,
                  record_path=['institutions'],
                  meta=['id','raw_affiliation_string','author_position', 'doi',
                        'title','abstract','publication_date', 'publication_year',
                        'grants','locations',
                        'is_corresponding','x','y','cluster','cluster_score',
                       ['author','id'], ['author', 'display_name'],
                       ['author','orcid']],
                  errors='ignore',
                  sep='_',
                  meta_prefix='paper_',
                 )
# Legacy alias under the old misspelled name, kept so existing readers of the
# pickled frame keep working; remove once nothing references it.
dftriple['paper_is_corrresponding'] = dftriple['paper_is_corresponding']

In [97]:
# Mean embedding vector per cluster ("topic"), computed in the full embedding space.
dftopics = dfcontentvectors.copy()
dftopics['cluster'] = dfpapers['cluster']
# NOTE(review): the groupby keeps every label present in 'cluster', including -1
# when it occurs — confirm a mean vector for that group is wanted.
dfmeantopics = dftopics.groupby('cluster').mean().copy()
# Project the cluster means with the already-fitted reducer (transform, no refit).
reduced_topics = umap_reducer.transform(dfmeantopics.to_numpy())
df_reduced_topics = pd.DataFrame.from_records(reduced_topics, 
                index=dfmeantopics.index)
df_reduced_topics.columns = ['x','y']
# Expose the cluster label as a regular column so it can be passed to .apply below.
df_reduced_topics['topic'] = df_reduced_topics.index
# Not the last expression of the cell, so this preview is not displayed.
df_reduced_topics.head()

def get_cluster_concepts(topic_num:int, n:int=20):
    """
    takes an integer topic_num corresponding to a
    given cluster label and returns the list of the
    n most frequently occurring concepts in the
    top_concepts field of that cluster's papers
    (reads the notebook-level dfpapers frame).

    ties keep the order in which the concepts were
    first encountered.
    """
    from collections import Counter

    concept_lists = dfpapers.loc[dfpapers['cluster'] == topic_num, 'top_concepts'].tolist()
    # one counting pass; Counter keeps first-seen order, and sorted() is stable,
    # so the ranking matches a per-concept list.count() without the quadratic cost
    counts = Counter(name for names in concept_lists for name in names)
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return [name for name, _ in ranked][:n]

def get_yake_cluster_phrases(topic_num:int, n:int=20):
    """
    takes in an integer topic_num corresponding to a
    given cluster label, joins the content of that
    cluster's papers (from the notebook-level dfpapers
    frame) into one text and returns the keyphrases
    YAKE extracts from it (at most n are requested)
    """
    in_cluster = dfpapers['cluster'] == topic_num
    cluster_text = ". ".join(dfpapers[in_cluster]['content'].tolist())
    extractor = yake.KeywordExtractor(top=n, stopwords=None)
    scored_phrases = extractor.extract_keywords(cluster_text)
    return [phrase for phrase, *_ in scored_phrases]

# Per-cluster concept and keyphrase lists; both Series are indexed by cluster label.
wikiconcepts = df_reduced_topics['topic'].apply(get_cluster_concepts)

wikikeywords = df_reduced_topics['topic'].apply(get_yake_cluster_phrases)

# Restore 'id' as a column from the index (an earlier cell deleted the column).
dfpapers['id'] = dfpapers.index
# Slim per-paper frame; note that a later cell rebuilds dfinfo with more columns.
dfinfo = dfpapers[['x','y','id','title','doi','cluster','grants',
                   'locations',
                 'publication_date','keywords','top_concepts']].copy()

# Cluster centroids in the 2-D plot: mean x/y of the member papers (this is not
# the projected mean embedding held in df_reduced_topics).
centroids = dfinfo.groupby('cluster')[['x','y']].mean().copy()
# The assignments below align on the cluster-label index.
centroids['concepts'] = wikiconcepts
centroids['cluster'] = centroids.index
centroids['keywords'] = wikikeywords

In [98]:
def wrap_it(x):
    """
    wraps the string x to lines of at most 40 characters
    and joins the lines with the html line break <br>
    """
    pieces = textwrap.wrap(x, width=40)
    return "<br>".join(pieces)

In [99]:
centroids['wrapped_keywords'] = centroids['keywords'].apply(str).apply(wrap_it)
centroids['wrapped_concepts'] = centroids['concepts'].apply(str).apply(wrap_it)

In [100]:
centroids.to_pickle('washburnecentroids2d.pkl')

In [101]:
dftriple.to_pickle('washburnedftriple2d.pkl')

In [102]:
def get_affils_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to the rows whose paper_cluster
    equals cl, sums paper_cluster_score per institution
    (id, display_name, country_code, type) and sorts the sums
    in descending order.
    returns that frame together with the keywords stored for
    cluster cl in the notebook-level centroids frame.
    """
    cluster_rows = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    institution_keys = ['id','display_name','country_code','type']
    scores = cluster_rows.groupby(institution_keys)['paper_cluster_score'].sum().to_frame()
    scores = scores.sort_values('paper_cluster_score', ascending=False)
    cluster_keywords = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return scores, cluster_keywords

In [103]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 1)
print(kw84)
dv84.head(10)

1
['spillover', 'pathogen spillover', 'pathogen', 'spillover risk', 'models', 'Percolation models', 'Percolation', 'risk', 'spillover events', 'events', 'covariates', 'predictions', 'predicting spillover', 'scales', 'Linear models', 'pathogens', 'Predicting pathogen spillover', 'human', 'zoonotic pathogen spillover', 'reservoir']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I23732399,Montana State University,US,education,10.777643
https://openalex.org/I165733156,University of Georgia,US,education,1.755529
https://openalex.org/I4210119109,Indiana University Bloomington,US,education,1.755529
https://openalex.org/I97018004,Stanford University,US,education,1.755529
https://openalex.org/I121980950,Utah State University,US,education,1.0
https://openalex.org/I7882870,University of Glasgow,GB,education,1.0
https://openalex.org/I72951846,Washington State University,US,education,0.755529


In [104]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 0)
print(kw84)
dv84.head(10)

0
['microbial communities', 'microbial', 'Microbial community', 'community ecological data', 'microbiome', 'data', 'community', 'relative abundance data', 'communities', 'phylogenetic', 'ecological data', 'soil microbial communities', 'microbial abundance patterns', 'relative abundance', 'microbial relative abundances', 'human microbiome', 'community ecological', 'abundance data', 'human microbiome data', 'ecological']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I170897317,Duke University,US,education,43.0
https://openalex.org/I5124864,École Polytechnique Fédérale de Lausanne,CH,education,36.711654
https://openalex.org/I23732399,Montana State University,US,education,22.85685
https://openalex.org/I188538660,University of Colorado Boulder,US,education,16.707185
https://openalex.org/I36258959,"University of California, San Diego",US,education,16.0
https://openalex.org/I161675122,Cooperative Institute for Research in Environmental Sciences,US,facility,11.0
https://openalex.org/I20089843,Princeton University,US,education,10.121556
https://openalex.org/I1343871089,Los Alamos National Laboratory,US,facility,9.0
https://openalex.org/I186903577,University of Luxembourg,LU,education,7.713701
https://openalex.org/I4210108719,EP Analytics (United States),US,company,4.564036


In [105]:
dfinfo = dfpapers[['x','y','id','title','abstract', 'doi','cluster','probability',
                 'publication_date','grants','locations',
                   'keywords','top_concepts']].copy()

In [106]:
pap_affils_dict = dftriple.groupby('paper_id')['paper_raw_affiliation_string'].\
apply(lambda x: ' | '.join(x.tolist()))

#pap_authors_dict = dftriple.groupby('paper_id')['paper_author_display_name'].\
#apply(lambda x: ' | '.join(x.tolist()))

In [107]:
pap_authors_dict = dftriple.groupby('paper_id')['paper_author_display_name'].apply(lambda x: x.values)

In [108]:
# Attach the per-paper affiliation/author values; the Series are indexed by
# paper_id and align with dfinfo's work-id index.
# NOTE(review): both columns are overwritten with list-valued versions in the
# next two cells, so these assignments appear to be superseded.
dfinfo['affil_list'] = pap_affils_dict
dfinfo['author_list'] = pap_authors_dict

In [109]:
dfinfo['affil_list'] = dftriple.groupby('paper_id')['paper_raw_affiliation_string'].\
apply(lambda x: x.tolist())

In [110]:
# Author display names per paper, as a Python list.
# NOTE(review): dftriple was normalized with record_path=['institutions'], i.e. one
# row per institution entry, so an author listed with several institutions is
# presumably repeated here — confirm whether de-duplication is wanted.
dfinfo['author_list'] =  dftriple.groupby('paper_id')['paper_author_display_name'].\
apply(lambda x: x.tolist())

In [111]:
dfinfo['wrapped_affil_list'] = dfinfo['affil_list'].apply(str).apply(wrap_it)
dfinfo['wrapped_author_list'] = dfinfo['author_list'].apply(str).apply(wrap_it)

In [112]:
dfinfo['wrapped_keywords'] = dfinfo['keywords'].apply(str).apply(wrap_it)

In [113]:
def _first_location_source_field(loc_list, field):
    """
    reads loc_list[0]["source"][field]; returns None when
    loc_list is not a list (e.g. NaN/None), is empty, has no
    usable "source" entry, or the source lacks the field.
    """
    try:
        return loc_list[0]["source"][field]
    except (TypeError, IndexError, KeyError):
        # only "the data is not there" cases are swallowed; anything else
        # (including KeyboardInterrupt) now propagates
        return None

def get_source_name(loc_list):
    """
    grab the first item in the locations list;
    return the display name of its source, or None
    when it is not available
    """
    return _first_location_source_field(loc_list, "display_name")

def get_source_type(loc_list):
    """
    grab the first item in the locations list;
    return the type of its source, or None
    when it is not available
    """
    return _first_location_source_field(loc_list, "type")

In [114]:
dfinfo["source"] = dfinfo["locations"].apply(get_source_name)
dfinfo["source_type"] = dfinfo["locations"].apply(get_source_type)

In [115]:
dfinfo.to_pickle('washburnedfinfo2d.pkl')

In [116]:
def get_funder_names(funder_list):
    """
    funder_list is a list of dictionaries that each
    carry a **funder_display_name** key; return the
    list of unique funder_display_name values in
    first-seen order (so repeated runs give the same
    list, unlike list(set(...))).

    returns [] when funder_list is not iterable
    (e.g. NaN/None) or an entry lacks the key.
    """
    try:
        names = [f['funder_display_name'] for f in funder_list]
        # dict.fromkeys de-duplicates while preserving order
        return list(dict.fromkeys(names))
    except (TypeError, KeyError):
        return []

In [117]:
dftriple["source"] = dftriple["paper_locations"].apply(get_source_name)
dftriple["source_type"] = dftriple["paper_locations"].apply(get_source_type)
dftriple["funder_list"] = dftriple["paper_grants"].apply(get_funder_names)

In [118]:
dftriple.to_pickle('washburnedftriple2d.pkl')

In [119]:
dfinfo["funder_list"] = dfinfo["grants"].apply(get_funder_names)
dfinfo["wrapped_funder_list"] = dfinfo["funder_list"].apply(str).apply(wrap_it)


In [120]:
dfinfo.to_pickle('washburnedfinfo2d.pkl')

In [121]:
kw_dict = dfinfo['keywords'].to_dict()

In [122]:
sources_list = dftriple['source'].unique().tolist()
type(sources_list), len(sources_list)


(list, 27)

In [123]:
def get_source_json(s:str):
    """
    s is an openalex Sources display_name.
    searches Sources by that display_name and returns the
    homepage_url of the first hit (despite the function
    name, the url is returned, not the Sources object).

    returns None when the search has no results or the
    first hit has no homepage_url.
    """
    matches = Sources().search_filter(display_name = s).get()
    if not matches:
        # nothing found; previously this surfaced as an IndexError
        return None
    best_match = matches[0]
    homepage = best_match.get("homepage_url")
    if homepage:
        print(f"{s} has homepage_url and type {best_match.get('type')}")
        return homepage
    return None

In [125]:
sj0 = get_source_json(sources_list[6])
sj0

Science Translational Medicine has homepage_url and type journal


'http://stm.sciencemag.org'

In [126]:
def get_display_page_dict(sl:list):
    """
    sl is a list of Sources display_name values
    returns the dictionary mapping
    display_names with homepage_url values.

    the lookup is best effort: a name whose lookup raises
    is left out of the dictionary, and the failure is
    reported instead of being dropped silently."""
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_source_json(s)
        except Exception as exc:
            # tqdm.write prints without breaking the progress bar
            tqdm.write(f"skipping source {s!r}: {exc!r}")
    return mapping_dict

In [127]:
source_page_dict = get_display_page_dict(sources_list)

  4%|██▋                                                                       | 1/27 [00:00<00:17,  1.49it/s]

Science has homepage_url and type journal


  7%|█████▍                                                                    | 2/27 [00:01<00:12,  1.99it/s]

MSystems has homepage_url and type journal


 11%|████████▏                                                                 | 3/27 [00:01<00:11,  2.06it/s]

Nature Communications has homepage_url and type journal


 15%|██████████▉                                                               | 4/27 [00:01<00:10,  2.22it/s]

eLife has homepage_url and type journal


 19%|█████████████▋                                                            | 5/27 [00:02<00:09,  2.25it/s]

medRxiv (Cold Spring Harbor Laboratory) has homepage_url and type repository


 22%|████████████████▍                                                         | 6/27 [00:02<00:09,  2.28it/s]

Frontiers in Microbiology has homepage_url and type journal


 26%|███████████████████▏                                                      | 7/27 [00:03<00:08,  2.36it/s]

Science Translational Medicine has homepage_url and type journal


 30%|█████████████████████▉                                                    | 8/27 [00:03<00:08,  2.36it/s]

PeerJ has homepage_url and type journal


 33%|████████████████████████▋                                                 | 9/27 [00:04<00:08,  2.23it/s]

PLOS Neglected Tropical Diseases has homepage_url and type journal


 37%|███████████████████████████                                              | 10/27 [00:04<00:07,  2.17it/s]

Microbiome has homepage_url and type journal


 41%|█████████████████████████████▋                                           | 11/27 [00:05<00:07,  2.06it/s]

Nature Microbiology has homepage_url and type journal


 44%|████████████████████████████████▍                                        | 12/27 [00:05<00:06,  2.17it/s]

Ecological Monographs has homepage_url and type journal


 48%|███████████████████████████████████▏                                     | 13/27 [00:06<00:06,  2.04it/s]

Ecology has homepage_url and type journal


 52%|█████████████████████████████████████▊                                   | 14/27 [00:06<00:06,  2.06it/s]

Proceedings of the National Academy of Sciences of the United States of America has homepage_url and type journal


 56%|████████████████████████████████████████▌                                | 15/27 [00:06<00:05,  2.14it/s]

Philosophical Transactions of the Royal Society B has homepage_url and type journal


 59%|███████████████████████████████████████████▎                             | 16/27 [00:07<00:05,  2.10it/s]

The ISME Journal has homepage_url and type journal


 63%|█████████████████████████████████████████████▉                           | 17/27 [00:07<00:04,  2.17it/s]

Biology Letters has homepage_url and type journal


 67%|████████████████████████████████████████████████▋                        | 18/27 [00:08<00:04,  2.22it/s]

Gut has homepage_url and type journal


 70%|███████████████████████████████████████████████████▎                     | 19/27 [00:08<00:03,  2.26it/s]

Molecular Ecology has homepage_url and type journal


 74%|██████████████████████████████████████████████████████                   | 20/27 [00:09<00:03,  2.14it/s]

Vaccines has homepage_url and type journal


 78%|████████████████████████████████████████████████████████▊                | 21/27 [00:09<00:02,  2.02it/s]

PLOS Computational Biology has homepage_url and type journal


 81%|███████████████████████████████████████████████████████████▍             | 22/27 [00:10<00:02,  2.10it/s]

The American Naturalist has homepage_url and type journal


 85%|██████████████████████████████████████████████████████████████▏          | 23/27 [00:10<00:01,  2.12it/s]

bioRxiv (Cold Spring Harbor Laboratory) has homepage_url and type repository


 89%|████████████████████████████████████████████████████████████████▉        | 24/27 [00:11<00:01,  2.15it/s]

Ecology and Evolution has homepage_url and type journal


 93%|███████████████████████████████████████████████████████████████████▌     | 25/27 [00:11<00:00,  2.20it/s]

Authorea (Authorea) has homepage_url and type repository


 96%|██████████████████████████████████████████████████████████████████████▎  | 26/27 [00:12<00:00,  2.25it/s]

Social Science Research Network has homepage_url and type repository


100%|█████████████████████████████████████████████████████████████████████████| 27/27 [00:12<00:00,  2.17it/s]


In [128]:
import pickle

with open("source_page_dict.pkl", "wb") as f:
    pickle.dump(source_page_dict, f)


In [129]:
# Reload the display_name -> homepage_url mapping pickled in the previous cell
# (same file name); get_journals_cluster_sort reads it as source_dict.
# Only unpickle files this notebook wrote itself: pickle.load can execute
# arbitrary code contained in the file.
with open("source_page_dict.pkl", "rb") as f:
    source_dict = pickle.load(f)

In [130]:
def get_journals_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to the rows whose paper_cluster
    equals cl and whose source_type is 'journal', sums
    paper_cluster_score per source and sorts the sums in
    descending order; adds the journal name and its homepage
    (looked up in the notebook-level source_dict) as columns.
    returns that frame together with the keywords stored for
    cluster cl in the notebook-level centroids frame.
    """
    cluster_rows = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    journal_rows = cluster_rows[cluster_rows['source_type'] == 'journal']
    scores = journal_rows.groupby(['source'])['paper_cluster_score'].sum().to_frame()
    scores = scores.sort_values('paper_cluster_score', ascending=False)
    scores['journal'] = scores.index
    # NOTE(review): the column name 'hompage_url' is misspelled; it is left as is
    # because callers may already select it by this name
    scores['hompage_url'] = scores['journal'].map(source_dict)
    cluster_keywords = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return scores, cluster_keywords

In [131]:
dv, kw = get_journals_cluster_sort(dftriple, 2)
dv.head()

2


Unnamed: 0_level_0,paper_cluster_score,journal,hompage_url
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Molecular Ecology,17.0,Molecular Ecology,http://www.wiley.com/bw/journal.asp?ref=0962-1083
PLOS Neglected Tropical Diseases,7.657034,PLOS Neglected Tropical Diseases,http://www.plosntds.org/
Biology Letters,4.280942,Biology Letters,http://rsbl.royalsocietypublishing.org/
Vaccines,4.0,Vaccines,http://www.mdpi.com/journal/vaccines
PeerJ,3.978182,PeerJ,http://www.peerj.com/


In [132]:
# Distinct institution display names found in dftriple.
affils_list = dftriple['display_name'].unique().tolist()
# NOTE(review): not the last expression of the cell, so this tuple is never displayed.
type(affils_list), len(affils_list)

# NOTE(review): one-off lookup of a single institution; the result is not
# referenced again in the visible cells — presumably leftover exploration,
# confirm before removing.
affil_json = Institutions().filter(display_name = affils_list[1]).get()

def get_affil_json(s:str):
    """
    s is an openalex Institutions display_name.
    searches Institutions by that display_name and returns
    the (latitude, longitude) of the first hit.

    returns (None, None) when the search has no results or
    the first hit carries no geo information; a coordinate
    missing from geo comes back as None.
    """
    # NOTE(review): the first search hit is taken on trust; dftriple also carries
    # the institution 'id', which would presumably identify the record
    # unambiguously — consider looking up by id instead.
    matches = Institutions().search_filter(display_name = s).get()
    if not matches:
        return None, None
    geo = matches[0].get("geo")
    if not geo:
        return None, None
    return geo.get("latitude"), geo.get("longitude")
    
def get_display_geo_dict(sl:list):
    """
    sl is a list of Institution display_name values
    returns the dictionary mapping
    display_names with (latitude, longitude) values.

    the lookup is best effort: a name whose lookup raises
    is left out of the dictionary, and the failure is
    reported instead of being dropped silently."""
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_affil_json(s)
        except Exception as exc:
            # tqdm.write prints without breaking the progress bar
            tqdm.write(f"skipping institution {s!r}: {exc!r}")
    return mapping_dict

# Resolve coordinates for every institution name (one API search per name).
affil_geo_dict = get_display_geo_dict(affils_list)


# pickle is already imported in an earlier cell; the repeat is harmless.
import pickle

# Persist the display_name -> (latitude, longitude) mapping in the working directory.
with open("affil_geo_dict.pkl", "wb") as f:
    pickle.dump(affil_geo_dict, f)

100%|█████████████████████████████████████████████████████████████████████████| 96/96 [00:43<00:00,  2.22it/s]
