In [1]:
from pyalex import (
    Works, Authors, Sources,
    Institutions, Concepts, Publishers, Funders
)
import pyalex
import pandas as pd
import numpy as np
pyalex.config.email = "david@rs21.io"

### ssl problems
# https://support.chainstack.com/hc/en-us/articles/9117198436249-Common-SSL-Issues-on-Python-and-How-to-Fix-it

from flair.embeddings import DocumentPoolEmbeddings
from flair.data import Sentence
from flair.embeddings import SentenceTransformerDocumentEmbeddings

EMBEDDING_MODEL_1 = "all-mpnet-base-v2" 

# this one is also good: all-MiniLM-L6-v2
EMBEDDING_MODEL_2 = "all-MiniLM-L6-v2"
SENT_EMBEDDINGS_1 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_1)
SENT_EMBEDDINGS_2 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_2)
DOC_EMBEDDINGS= DocumentPoolEmbeddings([SENT_EMBEDDINGS_2])

import torch
from tqdm import tqdm
import yake
import umap.umap_ as umap
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture as GMM
import altair as alt
import math
import plotly.express as px
import textwrap

In [2]:
def process_works_list(worklist:list):
    """
    transforms the 
    works list into a dataframe.
    """
    abstracts_dict = {h["id"]:h["abstract"] for h in worklist}
    df = pd.DataFrame.from_records(worklist)
    try: 
        del df['abstract_inverted_index'] # though don't all have abstracts is the problem
        df['abstract'] = df['id'].map(abstracts_dict)
    except:
        pass
   # df['author_affils'] = df['authorships'].apply(get_authors_and_affils)
    return df

In [3]:
from pyalex import config

config.max_retries = 0
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]

https://github.com/J535D165/pyalex

How to get all the works from US companies

In [5]:
#Works().random()

In [7]:
#Works().filter(authorships={"institutions": {"country_code": "US",
#                                             "type": "company"}}).get()

In [15]:
def get_country_type_frame(country_code:str, affil_type:str):
    """
    takes a country_code and affil_type
    and forms the pagination object to retrive the 
    records
    """
    pager = Works().filter(publication_year = '>2021',
        authorships={"institutions": {"country_code": country_code,
                        "type": affil_type}}).paginate(
        per_page=200, n_max=100_000)
   # pager = Works().filter(publication_year='>2018',
   # authorships={"institutions": {"ror": ror}}).paginate(
   #     per_page=200, n_max=None)
    #pager = Works().filter(publication_year='>2016',
    #concepts={"id":f"{concepts_list[i]['id']}"}).filter(authorships={"institutions":{"country_code":"CN"}}).\
    #paginate(per_page=200,n_max=None)
    #concepts={"id":f"{concepts_list[i]['id']}"}).\
    #paginate(per_page=200,n_max=None)
    df = pd.DataFrame()
    for page in tqdm(pager):
        dfpage = process_works_list(page)
        df = pd.concat([df, dfpage], ignore_index=True)
        df.drop_duplicates(subset='id', keep='first',inplace=True)
    return df

In [16]:
df = get_country_type_frame(country_code="US", affil_type="company")

500it [23:05,  2.77s/it]


In [17]:
#dftop = pd.concat(df,
#                  ignore_index=True)
dftop = df
dftop.drop_duplicates(subset='id', keep='first', 
                      inplace=True)

dftop.set_index('id', inplace=True, drop=False)

dfall = dftop
print(dfall.shape)

dfall['content'] = dfall['title'] + ". " + dfall['abstract']

dfrecords = dfall[~dfall['content'].isna()].copy()

(100000, 49)


In [18]:
def get_keywords(text:str, top:int=7, stopwords=None):
    """
    takes a blob of text and 
    returns the top **top** 
    keywords as a list
    """
    kw_extractor = yake.KeywordExtractor(top=top, stopwords=stopwords)
    keywords = kw_extractor.extract_keywords(text)
    return [p[0] for p in keywords]

In [19]:
def get_top_concepts(concept_list:list,score:float=.6):
    """
    takes a list of concept dictionaries 
    returns the top **top** display_names;
    concepts whose score is >= score
    """
    return [c['display_name'] for c in concept_list if c['score'] >= score]

In [20]:
dfrecords['keywords'] = dfrecords['content'].apply(get_keywords)
dfrecords['top_concepts'] = dfrecords['concepts'].apply(get_top_concepts)

In [21]:
texts = dfrecords['content'].str.lower().values.tolist()

In [22]:
def get_content_embeddings(dfrecords:pd.DataFrame) -> pd.DataFrame:
    """
    passes the preprocessed mitigation strings
    data through the embedding model to produce the vector
    space representation of each pet mitigation.
    """
    sent = Sentence("The grass is green.")
    DOC_EMBEDDINGS.embed(sent)
    texts = dfrecords["content"].str.lower().values.tolist()
    all_descriptions = np.empty((len(texts), len(sent.embedding)))
    for i in tqdm(range(len(texts))):
        sent = Sentence(texts[i])
        DOC_EMBEDDINGS.embed(sent)
        all_descriptions[i, :] = sent.embedding.cpu().numpy()
        # gc.collect()
        torch.cuda.empty_cache()
    dfcontentvectors = pd.DataFrame.from_records(all_descriptions, index=dfrecords.index)
    return dfcontentvectors

In [23]:
dfcontentvectors = get_content_embeddings(dfrecords)

100%|███████████████████████████████████████████████████████████████████| 92733/92733 [57:26<00:00, 26.90it/s]


In [24]:
#umap.UMAP?
N_COMPONENTS = 2 # can visualize this way
umap_reducer = umap.UMAP(n_components=N_COMPONENTS,
                       #  metric='euclidean')
                         random_state=1234,
                         metric='cosine')  # can experiment with this metric as well as the other 
# parameters
# to see what other literature is in the same information space, we need to keep this umap_reducer 
# object as well as the gmm model below.

# Apply UMAP to the vectorized strings
reduced_vectors = umap_reducer.fit_transform(dfcontentvectors.to_numpy())
dfreduced = pd.DataFrame.from_records(reduced_vectors, 
                index=dfcontentvectors.index)
dfreduced.columns = ['x','y']

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


# use hdbscan to cluster

In [25]:
import hdbscan

hdbscan_args = {'min_cluster_size': 15,
                            'metric': 'euclidean',
                            'cluster_selection_method': 'eom',
                            'cluster_selection_epsilon': 0.1
               }

cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(dfreduced[['x','y']].to_numpy())

dfreduced['cluster'] = cluster.labels_
dfreduced['probability'] = cluster.probabilities_

dfpapers = dfrecords.merge(dfreduced, left_index=True,
                           right_index=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [26]:
#help(dfpapers.explode)
del dfpapers['id']
dfstart = dfpapers.reset_index()
dfstart.head()

Unnamed: 0,id,doi,title,display_name,publication_year,publication_date,ids,language,primary_location,type,...,created_date,fulltext_origin,is_authors_truncated,abstract,content,top_concepts,x,y,cluster,probability
0,https://openalex.org/W3201073812,https://doi.org/10.1016/j.cpc.2021.108171,LAMMPS - a flexible simulation tool for partic...,LAMMPS - a flexible simulation tool for partic...,2022,2022-02-01,{'openalex': 'https://openalex.org/W3201073812...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2021-09-27,,,Since the classical molecular dynamics simulat...,LAMMPS - a flexible simulation tool for partic...,"[Python (programming language), Computer science]",-2.751676,6.058049,76,1.0
1,https://openalex.org/W4312443924,https://doi.org/10.1109/cvpr52688.2022.01167,A ConvNet for the 2020s,A ConvNet for the 2020s,2022,2022-06-01,{'openalex': 'https://openalex.org/W4312443924...,en,"{'is_oa': False, 'landing_page_url': 'https://...",article,...,2023-01-04,,,The “Roaring 20s” of visual recognition began ...,A ConvNet for the 2020s. The “Roaring 20s” of ...,"[Transformer, Computer science, Artificial int...",3.997951,7.04341,76,1.0
2,https://openalex.org/W4226236384,https://doi.org/10.1056/nejmoa2118542,"Oral Nirmatrelvir for High-Risk, Nonhospitaliz...","Oral Nirmatrelvir for High-Risk, Nonhospitaliz...",2022,2022-04-14,{'openalex': 'https://openalex.org/W4226236384...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2022-05-05,pdf,,Nirmatrelvir is an orally administered severe ...,"Oral Nirmatrelvir for High-Risk, Nonhospitaliz...","[Medicine, Coronavirus disease 2019 (COVID-19)...",3.190367,-1.239761,44,1.0
3,https://openalex.org/W4283271244,https://doi.org/10.1038/s41375-022-01613-1,The 5th edition of the World Health Organizati...,The 5th edition of the World Health Organizati...,2022,2022-06-22,{'openalex': 'https://openalex.org/W4283271244...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2022-06-23,pdf,,The upcoming 5th edition of the World Health O...,The 5th edition of the World Health Organizati...,"[Histiocyte, Myeloid]",1.420206,-4.142851,76,1.0
4,https://openalex.org/W4310461604,https://doi.org/10.1056/nejmoa2212948,Lecanemab in Early Alzheimer’s Disease,Lecanemab in Early Alzheimer’s Disease,2023,2023-01-05,{'openalex': 'https://openalex.org/W4310461604...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2022-12-10,pdf,,The accumulation of soluble and insoluble aggr...,Lecanemab in Early Alzheimer’s Disease. The ac...,"[Medicine, Disease, Alzheimer's disease]",-1.830902,0.093264,76,1.0


In [27]:
dfstart['publication_year'].value_counts(dropna=False)

2022    33223
2024    32322
2023    27184
2025        4
Name: publication_year, dtype: int64

In [28]:
dfstart.shape

(92733, 55)

In [29]:
dfbig = dfstart.explode(column='authorships')
dfbig.shape, dfstart.shape

((1028780, 55), (92733, 55))

In [30]:
def add_extra_to_authorships(row: pd.DataFrame):
    """
    row[authorships] is a dictionary;
    add in the id key to that dictionary
    whose value is row[id]
    """
    complete_dict = row["authorships"]
   # assert type(complete_dict) == dict
    #print(type(complete_dict))
    if type(complete_dict) == dict:
        complete_dict["id"] = row["id"]
        complete_dict["x"] = row["x"]
        complete_dict["y"] = row["y"]
        complete_dict["cluster"] = row["cluster"]
        complete_dict["cluster_score"] = row["probability"]
        complete_dict["title"] = row["title"]
        complete_dict["abstract"] = row["abstract"]
        complete_dict["doi"] = row["doi"]
        complete_dict["publication_date"] = row["publication_date"]
        complete_dict["publication_year"] = row["publication_year"]
        complete_dict["grants"] = row["grants"]
        complete_dict["locations"] = row["locations"]
        return complete_dict
    else:
        return row["authorships"]

In [31]:
dfbig['big_authorships'] = dfbig.apply(add_extra_to_authorships, axis=1)

In [32]:
#dfbig['authorships'].tolist()
bigvals = dfbig['authorships'].tolist()

In [33]:
dictvals = [c for c in bigvals if type(c) != float]

In [34]:
dictvals[0]['author'].keys()

dict_keys(['id', 'display_name', 'orcid'])

In [35]:
dftriple = pd.json_normalize(dictvals,
                  record_path=['institutions'],
                  meta=['id','raw_affiliation_strings','author_position', 'doi',
                        'title','abstract','publication_date', 'publication_year',
                        'grants','locations',
                        'is_corrresponding','x','y','cluster','cluster_score',
                       ['author','id'], ['author', 'display_name'],
                       ['author','orcid']],
                  errors='ignore',
                  sep='_',
                  meta_prefix='paper_',
                #  record_prefix='author_'
                 )

In [36]:
dftopics = dfcontentvectors.copy()
dftopics['cluster'] = dfpapers['cluster']
dfmeantopics = dftopics.groupby('cluster').mean().copy()
reduced_topics = umap_reducer.transform(dfmeantopics.to_numpy())
df_reduced_topics = pd.DataFrame.from_records(reduced_topics, 
                index=dfmeantopics.index)
df_reduced_topics.columns = ['x','y']
df_reduced_topics['topic'] = df_reduced_topics.index
df_reduced_topics.head()

def get_cluster_concepts(topic_num:int, n:int=20):
    """
    takes an integer topic_num corresponding to a 
    given topic number and
    returns the list of top n occuring concepts
    from the top_concept field
    """
    top_concepts = dfpapers[dfpapers['cluster'] == topic_num]['top_concepts'].tolist()
    flat_concepts = [item for sublist in top_concepts for item in sublist]
    concepts_dict = {c:flat_concepts.count(c) for c in flat_concepts}
    sorted_concepts = sorted(concepts_dict.items(), key=lambda x:x[1], reverse=True)
    return [c[0] for c in sorted_concepts][:n]

def get_yake_cluster_phrases(topic_num:int, n:int=20):
    """
    takes in an integer n corresponding
    to a given topic number and
    returns the list of keyphrases (TopicRank method)
    """
    documents = dfpapers[dfpapers['cluster'] == topic_num]['content'].tolist()
    topic_input = ". ".join(documents)
    #extractor = pke.unsupervised.TextRank()
    kw_extractor = yake.KeywordExtractor(top=n, stopwords=None)
    keywords = kw_extractor.extract_keywords(topic_input)
    #extractor.load_document(input=topic_input,
    #                    language='en',
    #                    normalization=None)

    #extractor.candidate_selection()

    #window = 2
    #use_stems = False
    #extractor.candidate_weighting(window=window,
    #                          use_stems=use_stems)
    #extractor.candidate_weighting()
    #threshold = 0.8
   # keyphrases = extractor.get_n_best(n=20, threshold=threshold)
    #keyphrases = extractor.get_n_best(n=n)
    return [p[0] for p in keywords]

wikiconcepts = df_reduced_topics['topic'].apply(get_cluster_concepts)

wikikeywords = df_reduced_topics['topic'].apply(get_yake_cluster_phrases)

dfpapers['id'] = dfpapers.index
dfinfo = dfpapers[['x','y','id','title','doi','cluster','grants',
                   'locations',
                 'publication_date','keywords','top_concepts']].copy()

centroids = dfinfo.groupby('cluster')[['x','y']].mean().copy()
centroids['concepts'] = wikiconcepts
centroids['cluster'] = centroids.index
centroids['keywords'] = wikikeywords

In [37]:
def wrap_it(x):
    return "<br>".join(textwrap.wrap(x, width=40))
   # return "<br>".join(textwrap.wrap(x.replace(r'\s+', ' '), width=40))

In [38]:
centroids['wrapped_keywords'] = centroids['keywords'].apply(str).apply(wrap_it)
centroids['wrapped_concepts'] = centroids['concepts'].apply(str).apply(wrap_it)

In [39]:
centroids.to_pickle('updatejammingcentroids2d.pkl')

https://github.com/doolingdavidrs21/openalex-datalink-gpt4/blob/main/update2d.ipynb

In [40]:
dftriple.to_pickle('updatejammingdftriple2d.pkl')

In [41]:
def get_affils_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by id, ror sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg.groupby(['id','display_name','country_code',
                     'type'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [42]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 1)
print(kw84)
dv84.head(10)

1
['hereditary angioedema', 'hereditary angioedema attacks', 'HAE attacks', 'HAE attack rate', 'HAE', 'HAE patients', 'patients', 'HAE attack management', 'angioedema', 'angioedema attacks', 'Hereditary Angioedema Patients', 'Hereditary Angioedema Association', 'attacks', 'hereditary', 'treatment', 'HAE attack treatment', 'on-demand HAE treatments', 'on-demand treatment', 'HAE type', 'attack']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I4210140176,Takeda (United States),US,company,47.678656
https://openalex.org/I7877124,Charité - Universitätsmedizin Berlin,DE,healthcare,42.111037
https://openalex.org/I39343248,Humboldt-Universität zu Berlin,DE,education,23.995387
https://openalex.org/I4210119176,KalVista Pharmaceuticals (United States),US,company,23.526979
https://openalex.org/I75951250,Freie Universität Berlin,DE,education,22.960934
https://openalex.org/I4210107844,Fraunhofer Institute for Translational Medicine and Pharmacology,DE,facility,22.22071
https://openalex.org/I2800035258,Ionis Pharmaceuticals (United States),US,company,15.599573
https://openalex.org/I36258959,"University of California, San Diego",US,education,15.275113
https://openalex.org/I4210135591,Fresenius Kabi (China),CN,company,14.0
https://openalex.org/I101202996,Semmelweis University,HU,education,10.536383


In [43]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 0)
print(kw84)
dv84.head(10)

0
['ATTR cardiac amyloidosis', 'Transthyretin Cardiac Amyloidosis', 'Transthyretin Amyloid Cardiomyopathy', 'ATTR amyloidosis', 'patients', 'Cardiac Amyloidosis', 'Transthyretin Amyloidosis Outcomes', 'transthyretin amyloidosis', 'amyloidosis', 'hereditary ATTR amyloidosis', 'Hereditary Transthyretin Amyloidosis', 'Transthyretin Amyloid', 'ATTR cardiac', 'Amyloidosis Outcomes Survey', 'Transthyretin Cardiomyopathy Clinical', 'ATTR', 'Cardiac', 'transthyretin', 'Transthyretin Cardiac', 'TTR']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I145311141,Alnylam Pharmaceuticals (United States),US,company,65.873453
https://openalex.org/I180857899,Pfizer (United States),US,company,55.659384
https://openalex.org/I1316902750,Cleveland Clinic,US,healthcare,29.046067
https://openalex.org/I45129253,University College London,GB,education,26.402384
https://openalex.org/I2800035258,Ionis Pharmaceuticals (United States),US,company,23.64438
https://openalex.org/I21250087,Charles University,CZ,education,21.0
https://openalex.org/I1330342723,Mayo Clinic,US,healthcare,18.759827
https://openalex.org/I1283280774,Brigham and Women's Hospital,US,healthcare,17.348459
https://openalex.org/I4210153126,Istituti di Ricovero e Cura a Carattere Scientifico,IT,healthcare,16.471289
https://openalex.org/I4210150756,AstraZeneca (United States),US,company,16.0


In [44]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 52)
print(kw84)
dv84.head(20)

52
['mml', 'National Ignition Facility', 'plasma', 'inertial confinement fusion', 'fusion', 'mrow', 'National Ignition', 'Ignition Facility', 'fusion energy', 'plasmas', 'math xmlns', 'CDATA', 'energy', 'electron', 'confinement fusion', 'Inertial Fusion Energy', 'magnetic field', 'fusion power', 'high', 'fusion plasmas']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I1282311441,Lawrence Livermore National Laboratory,US,facility,1499.407245
https://openalex.org/I63533367,General Atomics (United States),US,company,649.60014
https://openalex.org/I4210149442,Fusion Academy,US,education,427.843547
https://openalex.org/I5388228,University of Rochester,US,education,424.741095
https://openalex.org/I4210125919,Fusion (United States),US,company,416.613076
https://openalex.org/I4210126244,Plasma Technology (United States),US,company,403.429695
https://openalex.org/I4210113689,Energetics (United States),US,company,400.431498
https://openalex.org/I1343871089,Los Alamos National Laboratory,US,facility,357.179734
https://openalex.org/I2799567181,Princeton Plasma Physics Laboratory,US,facility,334.442629
https://openalex.org/I4210094962,Culham Science Centre,GB,facility,293.27274


In [56]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 50)
print(kw84)
dv84.head(20)

50
['Wireless Picosecond Time', 'Distributed Antenna Arrays', 'Picosecond Time Synchronization', 'measurements', 'Wireless', 'Antenna', 'array', 'systems', 'self-interference', 'Distributed Antenna', 'Millimeter Wave', 'GHz', 'Full-Duplex', 'Wireless Picosecond', 'system', 'Picosecond Time', 'Communications', 'channel', 'model', 'Antenna Arrays']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I4210087596,Qualcomm (United States),US,company,12.0
https://openalex.org/I1174212,University of Southern California,US,education,10.0
https://openalex.org/I87216513,Michigan State University,US,education,8.910831
https://openalex.org/I78577930,Columbia University,US,education,8.707157
https://openalex.org/I86519309,The University of Texas at Austin,US,education,7.836615
https://openalex.org/I1283103587,AT&T (United States),US,company,6.92898
https://openalex.org/I2801313472,North Central State College,US,education,6.0
https://openalex.org/I137902535,North Carolina State University,US,education,6.0
https://openalex.org/I55732556,Arizona State University,US,education,5.791988
https://openalex.org/I76130692,Zhejiang University,CN,education,5.0


In [57]:
dfinfo = dfpapers[['x','y','id','title','doi','cluster','probability',
                 'publication_date','grants','locations',
                   'keywords','top_concepts']].copy()

In [58]:
# Group by 'paper_id' and concatenate 'paper_raw_affiliation_strings'
grouped = dftriple.groupby('paper_id')['paper_raw_affiliation_strings'].apply(lambda x: list(set([item for sublist in x for item in sublist]))).reset_index()

# Convert the series back to a dictionary
pap_affils_dict = grouped.set_index('paper_id')['paper_raw_affiliation_strings'].to_dict()

In [59]:
import itertools

In [62]:
pap_authors_dict = dftriple.groupby('paper_id')['paper_author_display_name'].apply(lambda x: x.values)

In [63]:
dfinfo['affil_list'] = dfinfo['id'].map(pap_affils_dict)

In [64]:
dfinfo['author_list'] = dfinfo['id'].map(pap_authors_dict)

In [65]:
dfinfo['wrapped_affil_list'] = dfinfo['affil_list'].apply(str).apply(wrap_it)
dfinfo['wrapped_author_list'] = dfinfo['author_list'].apply(str).apply(wrap_it)

In [66]:
dfinfo['wrapped_keywords'] = dfinfo['keywords'].apply(str).apply(wrap_it)

In [67]:
def get_source_name(loc_list):
    """
    grab the first item in the list;
    retturn the display name
    """
    try:
        primary = loc_list[0]
        return primary["source"]["display_name"]
    except:
        return None

def get_source_type(loc_list):
    """
    grab the first item in the list;
    return the source type
    """
    try:
        primary = loc_list[0]
        return primary["source"]["type"]
    except:
        return None

In [68]:
dfinfo["source"] = dfinfo["locations"].apply(get_source_name)
dfinfo["source_type"] = dfinfo["locations"].apply(get_source_type)

In [69]:
dfinfo.to_pickle('updatejammingdfinfo2d.pkl')

In [70]:
def get_funder_names(funder_list):
    """
    funder_list is a list of dictionaries
    with three keys; return the list of 
    unique **funder_display_name**
    values
    """
    try:
        funder_names = list(set([f['funder_display_name'] for f in funder_list]))
        return funder_names    
    except:
        return []

In [71]:
dftriple["source"] = dftriple["paper_locations"].apply(get_source_name)
dftriple["source_type"] = dftriple["paper_locations"].apply(get_source_type)
dftriple["funder_list"] = dftriple["paper_grants"].apply(get_funder_names)

In [72]:
dftriple.to_pickle('updatejammingdftriple2d.pkl')

In [73]:
def get_journals_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'journal'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [74]:
def get_conferences_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'conference'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [75]:
def get_country_collaborations_sort(dc:pd.DataFrame, cl:int):
    """
    resticts the dataframe dc to cluster value cl
    and returns the results of paper_id s where there is 
    more than one country_code
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    dv = dg.groupby('paper_id')['country_code'].apply(lambda x: len(set(x.values))).to_frame()
    dc = dg.groupby('paper_id')['country_code'].apply(lambda x: list(set(x.values))).to_frame()
    dc.columns = ['collab_countries']
    dv.columns = ['country_count']
    dv['collab_countries'] = dc['collab_countries']
    dv.sort_values('country_count',ascending=False, inplace=True)
    di = dfinfo.loc[dv.index].copy()
    di['country_count'] = dv['country_count']
    di['collab_countries'] = dv['collab_countries']
    return di[di['country_count'] > 1]

In [76]:
dv = get_country_collaborations_sort(dftriple, 0)
dv

Unnamed: 0_level_0,x,y,id,title,doi,cluster,probability,publication_date,grants,locations,...,top_concepts,affil_list,author_list,wrapped_affil_list,wrapped_author_list,wrapped_keywords,source,source_type,country_count,collab_countries
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://openalex.org/W4387966260,8.512614,-0.838894,https://openalex.org/W4387966260,Patisiran Treatment in Patients with Transthyr...,https://doi.org/10.1056/nejmoa2300757,0,1.000000,2023-10-26,[{'funder': 'https://openalex.org/F4320309056'...,"[{'is_oa': False, 'landing_page_url': 'https:/...",...,"[Medicine, Clinical endpoint, Cardiac amyloido...","[Unidade de Pesquisa Clínica–UPC, Hospital Das...","[Mathew S. Maurer, Parag Kale, Marianna Fontan...","['Unidade de Pesquisa Clínica–UPC,<br>Hospital...",['Mathew S. Maurer' 'Parag Kale'<br>'Marianna ...,"['ATTR cardiac amyloidosis', 'secondary<br>end...",New England journal of medicine/The New Engl...,journal,12,"[AU, DK, FR, JP, CZ, PT, TW, GB, BR, IT, MX, US]"
https://openalex.org/W4388571799,8.511369,-0.838894,https://openalex.org/W4388571799,A 15-year consolidated overview of data in ove...,https://doi.org/10.1186/s13023-023-02962-5,0,1.000000,2023-11-10,[{'funder': 'https://openalex.org/F4320307765'...,"[{'is_oa': True, 'landing_page_url': 'https://...",...,"[Transthyretin, Amyloidosis, Asymptomatic, Med...","[Unidade Corino Andrade, Centro Hospitalar Uni...","[Luca Gentile, Teresa Coelho, Angela Dispenzie...","['Unidade Corino Andrade, Centro<br>Hospitalar...",['Luca Gentile' 'Teresa Coelho' 'Angela<br>Dis...,"['Amyloidosis Outcomes Survey',<br>'Transthyre...",Orphanet journal of rare diseases,journal,11,"[DE, FR, PT, BG, ES, SE, BR, IT, MX, US, IN]"
https://openalex.org/W4387115393,8.506588,-0.833740,https://openalex.org/W4387115393,Eplontersen for Hereditary Transthyretin Amylo...,https://doi.org/10.1001/jama.2023.18688,0,1.000000,2023-10-17,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,"[Medicine, Transthyretin, Placebo, Polyneuropa...",[Hospital Universitário Clementino Fraga Filho...,"[Teresa Coelho, Wilson Marques, Noel R. Dasgup...",['Hospital Universitário Clementino<br>Fraga F...,['Teresa Coelho' 'Wilson Marques' 'Noel<br>R. ...,"['Hereditary Transthyretin Amyloidosis',<br>'N...",JAMA,journal,11,"[DE, JP, PT, TW, TR, SE, GB, BR, IT, AR, US]"
https://openalex.org/W4311666688,8.507696,-0.834409,https://openalex.org/W4311666688,Characteristics of Patients with Hereditary Tr...,https://doi.org/10.1007/s40120-022-00414-z,0,1.000000,2022-12-16,[{'funder': 'https://openalex.org/F4320314430'...,"[{'is_oa': True, 'landing_page_url': 'https://...",...,"[Medicine, Transthyretin, Polyneuropathy, Amyl...","[Division of Hematology, Mayo Clinic, Rocheste...","[Teresa Coelho, Márcia Waddington Cruz, Márcia...","['Division of Hematology, Mayo Clinic,<br>Roch...",['Teresa Coelho' 'Márcia Waddington<br>Cruz' '...,['Hereditary Transthyretin Amyloidosis-<br>Pol...,Neurology and therapy,journal,11,"[DE, JP, PT, TW, TR, SE, GB, BR, IT, AR, US]"
https://openalex.org/W4286715944,8.498540,-0.827260,https://openalex.org/W4286715944,Efficacy and safety of vutrisiran for patients...,https://doi.org/10.1080/13506129.2022.2091985,0,0.520235,2022-07-23,[{'funder': 'https://openalex.org/F4320309056'...,"[{'is_oa': True, 'landing_page_url': 'https://...",...,"[Medicine, Placebo, Transthyretin, Clinical en...","[Boston Medical Center, Boston University, Bos...","[David Adams, David Adams, David Adams, Ivailo...","['Boston Medical Center, Boston<br>University,...",['David Adams' 'David Adams' 'David<br>Adams' ...,"['randomized clinical trial',<br>'hereditary t...",Amyloid,journal,10,"[AU, JP, FR, PT, BG, GB, MY, IT, MX, US]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://openalex.org/W4285891814,8.533929,-0.852802,https://openalex.org/W4285891814,Estimating the Effect of Tafamidis on Cardiova...,https://doi.org/10.1159/000525883,0,1.000000,2022-01-01,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,"[Medicine, Placebo, Internal medicine, Confoun...","[Pfizer Inc, Collegeville, PA, USA, Pfizer Inc...","[Huihua Li, Mark H Rozenbaum, Michelle Casey, ...","['Pfizer Inc, Collegeville, PA, USA',<br>'Pfiz...",['Huihua Li' 'Mark H Rozenbaum'<br>'Michelle C...,"['Transthyretin Amyloid Cardiomyopathy',<br>'N...",Cardiology,journal,2,"[NL, US]"
https://openalex.org/W4323275916,8.477757,-0.861890,https://openalex.org/W4323275916,Cardiovascular Disease and Mortality in Black ...,https://doi.org/10.1016/j.jchf.2023.02.003,0,0.198664,2023-09-01,[{'funder': 'https://openalex.org/F4320337338'...,"[{'is_oa': False, 'landing_page_url': 'https:/...",...,"[Medicine, Transthyretin, Body mass index, Int...","[Division of Public Health Sciences, Fred Hutc...","[Bernhard Haring, Bernhard Haring, Bernhard Ha...","['Division of Public Health Sciences,<br>Fred ...",['Bernhard Haring' 'Bernhard Haring'<br>'Bernh...,"['Black Women Carrying', 'Transthyretin<br>Gen...",JACC. Heart failure,journal,2,"[DE, US]"
https://openalex.org/W4283458479,8.536531,-0.853446,https://openalex.org/W4283458479,Response by Elliott et al to Letter Regarding ...,https://doi.org/10.1161/circheartfailure.122.0...,0,0.991728,2022-07-01,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,[Transthyretin],"[Pfizer Inc, Groton, CT (B.G.)., University Co...","[Perry Elliott, Balarama Gundapaneni, Marla B....","['Pfizer Inc, Groton, CT (B.G.).',<br>'Univers...",['Perry Elliott' 'Balarama Gundapaneni'<br>'Ma...,"['Transthyretin Amyloid Cardiomyopathy',<br>'T...",Circulation. Heart failure,journal,2,"[GB, US]"
https://openalex.org/W4225938291,8.540058,-0.857412,https://openalex.org/W4225938291,Long-Term Survival With Tafamidis in Patients ...,https://doi.org/10.1161/circheartfailure.120.0...,0,0.775683,2022-01-01,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,"[Medicine, Placebo]",[University of Michigan and Ann Arbor Veterans...,"[Perry Elliott, Brian Drachman, Brian Drachman...",['University of Michigan and Ann Arbor<br>Vete...,['Perry Elliott' 'Brian Drachman' 'Brian<br>Dr...,"['Transthyretin Amyloid Cardiomyopathy',<br>'T...",Circulation. Heart failure,journal,2,"[GB, US]"


In [77]:
import networkx as nx
from pyvis.network import Network
import igraph as ig # for g

In [78]:
dfinfo["funder_list"] = dfinfo["grants"].apply(get_funder_names)
dfinfo["wrapped_funder_list"] = dfinfo["funder_list"].apply(str).apply(wrap_it)

In [79]:
dfinfo.to_pickle('updatejammingdfinfo2d.pkl')

In [80]:
dfinfo[['id','keywords','wrapped_keywords','wrapped_funder_list']].head()

Unnamed: 0_level_0,id,keywords,wrapped_keywords,wrapped_funder_list
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://openalex.org/W3201073812,https://openalex.org/W3201073812,"[LAMMPS, particle-based materials modeling, to...","['LAMMPS', 'particle-based materials<br>modeli...","['Office of Science', 'National Nuclear<br>Sec..."
https://openalex.org/W4312443924,https://openalex.org/W4312443924,"[Transformers, Swin Transformers, Vision, Visi...","['Transformers', 'Swin Transformers',<br>'Visi...",[]
https://openalex.org/W4226236384,https://openalex.org/W4226236384,"[Nonhospitalized Adults, Oral Nirmatrelvir, No...","['Nonhospitalized Adults', 'Oral<br>Nirmatrelv...",['Pfizer']
https://openalex.org/W4283271244,https://openalex.org/W4283271244,"[World Health Organization, Health Organizatio...","['World Health Organization', 'Health<br>Organ...",[]
https://openalex.org/W4310461604,https://openalex.org/W4310461604,"[Alzheimer ’s Disease, Early Alzheimer, Diseas...","['Alzheimer ’s Disease', 'Early<br>Alzheimer',...",[]


In [81]:
kw_dict = dfinfo['keywords'].to_dict()

In [82]:
dc = dftriple[dftriple['paper_cluster'] == 10].copy()
dc.shape

(355, 27)

In [83]:
[x for row in dc['funder_list'].tolist() for x in row]

['Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Bayer',
 'Merck Sharp and Dohme',
 'Merck Sharp and Dohme',
 'Merck Sharp and Dohme',
 'Merck Sharp and Dohme',
 'Merck Sharp and Dohme',
 'Merck Sharp and Dohme',
 'Merck Sharp and Dohme',
 'Merck',
 'Merck',
 'Merck',
 'Merck',
 'Merck',
 'Merck',
 'Merck',
 'Merck',
 'Merck',
 'Merck',
 'Merck',
 'Merck',
 'Merck',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Research Council',
 'Medical Resear

In [84]:
kw_dict = dfinfo['keywords'].to_dict()

# add in the affiliations as nodes as well; that row, author, paper, affil. all three get links. ok.
def create_nx_graph(df: pd.DataFrame, cl:int) -> nx.Graph:
    """
    takes the dataframe df, and creates the undirected graph
    from the source and target columns for each row.
    """
    g = nx.Graph() # dc['paper_cluster'] == cl
    dc = df[df['paper_cluster'] == cl]
    author_counts = dc['paper_author_id'].tolist()
    author_counts_dict = {c:author_counts.count(c) for c in author_counts}
    affiliation_counts = dc['id'].tolist()
    affiliation_counts_dict = {c:affiliation_counts.count(c) for c in affiliation_counts}
    source_counts = dc['source'].tolist()
    source_counts_dict = {c:source_counts.count(c) for c in source_counts}
    funder_counts = [x for row in dc['funder_list'].tolist() for x in row]
    funder_counts_dict = {c:funder_counts.count(c) for c in funder_counts}
    for index, row in df[df['paper_cluster'] == cl].iterrows():
        g.add_node(row['paper_id'], group='work', title=row['paper_title'])
        g.add_node(row['paper_author_id'], title=row['paper_author_display_name'],
                   group='author',value = author_counts_dict[row['paper_author_id']])
        g.add_node(row['id'], group='affiliation',
                   title=row['display_name'] + '\n' + row['country_code'],
                  value = affiliation_counts_dict[row['id']])
        if row['source']:
            g.add_node(row['source'], group=row['source_type'],
                      title=row['source'] + ' :\n ' + row['source_type'],
                      value=source_counts_dict[row['source']])
            g.add_edge(
                row['paper_id'],
                row['source'],
                title=row['paper_title'] + ' :\n ' + str(row['paper_publication_date']) +  \
                ' :\n' + row['source'] + ' :\n ' + \
                row['source_type'],
              #  weight = df[(df['paper_id'] == row['paper_id']) & \
              #              (df['source'] == row['source'])]['paper_cluster_score'].sum()
               # weight = row['paper_cluster_score']
            )
            g.add_edge(
                row['paper_author_id'],
                row['source'],
                title=row['paper_author_display_name'] + ':\n' + row['source'],
             #   weight = df[(df['paper_author_id'] == row['paper_author_id']) & \
              #              (df['source'] == row['source'])]['paper_cluster_score'].sum()
               # weight = row['paper_cluster_score']
            )
        if len(row['funder_list']) > 0:
            for f in row['funder_list']:
                g.add_node(f, group='funder',
                          title=str(f),
                          value = founder_counts_dict[f]),
                g.add_edge(
                       row['paper_id'],
                       f,
                       title=row['paper_title'] + ':\n ' +  str(row['paper_publication_date']) + \
                       ' :\n' + str(f),
                  #  weight = row['paper_cluster_score']
                   )
                g.add_edge(
                       f,
                       row['paper_author_id'],
                       title=row['paper_author_display_name'] + ' :\n ' + \
                       str(f),
                  #  weight = row['paper_cluster_score']
                       
                   )
                g.add_edge(
                       f,
                       row['id'],
                       title=row['display_name'] + '\n' + row['country_code'] + ' :\n ' + \
                       str(f)  ,
                  #  weight = row['paper_cluster_score']
                   )  
                if row["source"]:
                    g.add_edge(
                        f,
                        row["source"],
                        title=row["source"] + ' :\n' + str(f),
                     #   weight = row['paper_cluster_score']
                    )
        g.nodes[row['paper_id']]['title'] = (
            row['paper_title'] + ' :\n ' + str(row['paper_publication_date'] + ':\n' + 
            '\n'.join(kw_dict[row['paper_id']]))
        )
        g.nodes[row['paper_author_id']]['title'] = (
            row['paper_author_display_name']
        )
        g.add_edge(
            row['paper_id'],
            row['paper_author_id'],
        title=row['paper_title'] + ' :\n ' + row['paper_author_display_name'] + ' :\n ' + \
            row['paper_raw_affiliation_string'],
         #   weight = row['paper_cluster_score']
        )
        g.add_edge(
            row['paper_author_id'],
            row['id'],
            title=row['paper_author_display_name'] + ' :\n ' + \
            row['display_name'] + ' :\n ' + row['country_code'],
          #  weight = row['paper_cluster_score']
        )
        g.add_edge(
            row['paper_id'],
            row['id'],
            title=row['paper_title'] + ' :\n ' + str(row['paper_publication_date']) + ':\n' + 
            row['display_name'] + ' :\n ' + row['country_code'],
         #   weight = row['paper_cluster_score']
        )
        
    g_ig = ig.Graph.from_networkx(g) # assign 'x', and 'y' to g before returning
    #layout = g_ig.layout_auto()
    #layout = g_ig.layout_davidson_harel()
    layout = g_ig.layout_umap(min_dist = 2, epochs = 500)
    # https://igraph.org/python/tutorial/0.9.6/visualisation.html
    coords = layout.coords
    allnodes = list(g.nodes())
    coords_dict = {allnodes[i]:(coords[i][0], coords[i][1]) for i in range(len(allnodes))}
    for i in g.nodes():
        g.nodes[i]['x'] = 250 * coords_dict[i][0] # the scale factor needed 
        g.nodes[i]['y'] = 250 * coords_dict[i][1]
    return g

In [85]:
def create_pyvis_html(cl: int, filename: str = "pyvis_coauthorships_graph.html"):
    """
    wrapper function that calls create_nx_graph to finally 
    produce an interactive pyvis standalone html file
    """
    g_nx = create_nx_graph(dftriple, cl);
    h = Network(height="1000px",
          #  heading="Mitigations and Techniques Relationships",
                width="100%",
                cdn_resources="remote", # can grab the visjs library to make this local if needed
            # probably should
                bgcolor="#222222",
            neighborhood_highlight=True,
              # default_node_size=1,
                font_color="white",
                directed=False,
               # select_menu=True,
                filter_menu=True,
                notebook=False,
               )
    #h.repulsion()
    h.from_nx(g_nx, show_edge_weights=False)
    #h.barnes_hut()
    #h.repulsion(node_distance=40,
    #            central_gravity=-0.2, spring_length=5, spring_strength=0.005, damping=0.09)
    neighbor_map = h.get_adj_list()
   # for node in h.nodes:
   #     if node['group'] == 'author':
   #         a = list(neighbor_map[node["id"]]) # want to insert a "\n" into every third element of a
   #     if node['group'] == 'work':
   #         a = list(neighbor_map[node["id"]])
   #     i = 3
   #     while i < len(a):
   #         a.insert(i, "\n")
   #         i += 4
   #     node["title"] += "\n Neighbors: \n" + " | ".join(a)
   #     node["value"] = len(neighbor_map[node["id"]]) 
# "physics": {
#    "enabled": false
#  },
    h.set_options(
    """
const options = {
  "interaction": {
    "navigationButtons": false
  },
 "physics": {
     "enabled": false
 },
  "edges": {
    "color": {
        "inherit": true
    },
    "setReferenceSize": null,
    "setReference": {
        "angle": 0.7853981633974483
    },
    "smooth": {
        "forceDirection": "none"
    }
  }
  }
    """
    )
    #h.show_buttons(filter_=['physics'])
  #  h.barnes_hut()
    #h.repulsion()
    try:
        path = './tmp'
        h.save_graph(f"{path}/{filename}")
        HtmlFile = open(f"{path}/{filename}","r",
                        encoding='utf-8')
    except:
        h.save_graph(f"{filename}")
        HtmlFile = open(f"{filename}", "r",
                        encoding="utf-8")
    return h

In [86]:
dfinfo.shape

(92733, 21)

In [87]:
dftime = dfinfo[['cluster','probability','publication_date']].copy()

In [88]:
dftime['publication_datetime'] = pd.to_datetime(dftime['publication_date'])

In [89]:
def get_time_series(dg, cl:int):
    """
    takes dg and the cluster number cl
    and returns a time series chart
    by month, y-axis is the article count
    """
    dftime = dg[dg.cluster == cl][['cluster','probability','publication_date']].copy()
    dftime['date'] = pd.to_datetime(dftime['publication_date'])
    dftime.sort_values('date', inplace=True)
    #by_month = pd.to_datetime(dftime['date']).dt.to_period('M').value_counts().sort_index()
    #by_month.index = pd.PeriodIndex(by_month.index)
    #df_month = by_month.rename_axis('month').reset_index(name='counts')
    return dftime

In [90]:
import altair as alt
#alt.data_transformers.enable("data_server")

In [91]:
sources_list = dftriple['source'].unique().tolist()
type(sources_list), len(sources_list)

(list, 9580)

In [92]:
def get_source_json(s:str):
    """
    s is an openalex Sources display_name
    return that Sources object
    """
    source_json = Sources().search_filter(display_name = s).get()
    a = source_json[0]['type']
    if "homepage_url" in source_json[0] and source_json[0]['homepage_url']:
       # print(f"{s} has homepage_url and type {source_json[0]['type']}")
        return source_json[0]["homepage_url"]
    else:
        return None

In [93]:
sources_list[5]

'Nature'

In [94]:
sj0 = get_source_json(sources_list[5])
sj0

'https://www.nature.com/nature/'

In [95]:
def get_display_page_dict(sl:list):
    """
    sl is a list of Sources display_name values
    returns the dictionary mapping
    display_names with homepage_url values."""
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_source_json(s)
        except:
            pass
    return mapping_dict

In [96]:
source_page_dict = get_display_page_dict(sources_list)

100%|███████████████████████████████████████████████████████████████████| 9580/9580 [1:19:59<00:00,  2.00it/s]


In [97]:
len(source_page_dict)

8597

In [98]:
import pickle

with open("updatesource_page_dict.pkl", "wb") as f:
    pickle.dump(source_page_dict, f)

In [99]:
with open("updatesource_page_dict.pkl", "rb") as f:
    source_dict = pickle.load(f)

In [100]:
def get_journals_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'journal'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    dv['journal'] = dv.index
    dv['hompage_url'] = dv['journal'].map(source_dict)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [101]:
dv, kw = get_journals_cluster_sort(dftriple, 4)
dv.head()

4


Unnamed: 0_level_0,paper_cluster_score,journal,hompage_url
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The journal of allergy and clinical immunology/Journal of allergy and clinical immunology/The journal of allergy and clinical immunology,235.510529,The journal of allergy and clinical immunolo...,https://www.sciencedirect.com/journal/journal-...
Journal of allergy and clinical immunology. In practice/The Journal of allergy and clinical immunology. In practice,225.847645,Journal of allergy and clinical immunology. In...,https://www.journals.elsevier.com/the-journal-...
NEJM evidence,77.0,NEJM evidence,
Blood,34.0,Blood,https://www.sciencedirect.com/journal/blood
HemaSphere,34.0,HemaSphere,http://www.hemaspherejournal.com


In [102]:
def get_conferences_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'conference'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    dv['conference'] = dv.index
    dv['homepage_url'] = dv['conference'].map(source_dict)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [103]:
affils_list = dftriple['display_name'].unique().tolist()
type(affils_list), len(affils_list)

(list, 33029)

In [104]:
affil_json = Institutions().search_filter(display_name = affils_list[0]).get()

In [105]:
affils_list = dftriple['display_name'].unique().tolist()
type(affils_list), len(affils_list)

affil_json = Institutions().search_filter(display_name = affils_list[1]).get()

def get_affil_json(s:str):
    """
    s is an openalex Sources display_name
    return that Sources object
    """
    affil_json = Institutions().search_filter(display_name = s).get()
    #a = source_json[0]['type']
    if "geo" in affil_json[0]:
       # print(f"{s} has homepage_url and type {source_json[0]['type']}")
        return affil_json[0]["geo"]["latitude"], affil_json[0]["geo"]["longitude"]
    else:
        return None, None
    
def get_display_geo_dict(sl:list):
    """
    sl is a list of Institution display_name values
    returns the dictionary mapping
    display_names with (latitude, longitude) values."""
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_affil_json(s)
        except:
            pass
    return mapping_dict

affil_geo_dict = get_display_geo_dict(affils_list)


import pickle

with open("updateaffil_geo_dict.pkl", "wb") as f:
    pickle.dump(affil_geo_dict, f)

100%|█████████████████████████████████████████████████████████████████| 33029/33029 [4:19:31<00:00,  2.12it/s]


In [106]:
dfinfo['abstract'] = dfinfo['title'].map(dftriple.set_index('paper_title')['paper_abstract'].to_dict())
dfinfo.to_pickle('updatejammingdfinfo2d.pkl')