In [28]:
from pyalex import (
    Works, Authors, Sources,
    Institutions, Concepts, Publishers, Funders
)
import pyalex
import pandas as pd
import numpy as np
pyalex.config.email = "david@rs21.io"

### ssl problems
# https://support.chainstack.com/hc/en-us/articles/9117198436249-Common-SSL-Issues-on-Python-and-How-to-Fix-it

from flair.embeddings import DocumentPoolEmbeddings
from flair.data import Sentence
from flair.embeddings import SentenceTransformerDocumentEmbeddings

EMBEDDING_MODEL_1 = "all-mpnet-base-v2" 

# this one is also good: all-MiniLM-L6-v2
EMBEDDING_MODEL_2 = "all-MiniLM-L6-v2"
SENT_EMBEDDINGS_1 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_1)
SENT_EMBEDDINGS_2 = SentenceTransformerDocumentEmbeddings(EMBEDDING_MODEL_2)
DOC_EMBEDDINGS= DocumentPoolEmbeddings([SENT_EMBEDDINGS_2])

import torch
from tqdm import tqdm
import yake
import umap.umap_ as umap
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture as GMM
import altair as alt
import math
import plotly.express as px
import textwrap

In [29]:
def process_works_list(worklist:list):
    """
    transforms the 
    works list into a dataframe.
    """
    abstracts_dict = {h["id"]:h["abstract"] for h in worklist}
    df = pd.DataFrame.from_records(worklist)
    try: 
        del df['abstract_inverted_index'] # though don't all have abstracts is the problem
        df['abstract'] = df['id'].map(abstracts_dict)
    except:
        pass
   # df['author_affils'] = df['authorships'].apply(get_authors_and_affils)
    return df

In [30]:
from pyalex import config

config.max_retries = 0
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]

In [31]:
Works().filter(publication_year = '>2021',
        authorships={"institutions": {"country_code": "US",
                        "type": "company"}}).count()

248000

https://github.com/J535D165/pyalex

In [32]:
Works().filter(publication_year = '>2021',
        authorships={"institutions": {"country_code": "US|CN",
                        "type": "company"}}).count()

361080

In [33]:
Works().filter(publication_year = '>2019',
        authorships={"institutions": {"country_code": ["US","CN"]}}).filter(
        authorships={"institutions": {"country_code": "US",
                     "type": "company"}}).count()

37879

In [37]:
Works().filter(publication_year = '>2020',
        authorships={"institutions": {"country_code":"CN|IR|RU"}}).filter(
        authorships={"institutions": {"country_code": "US",
                     "type": "company"}}).count()

33890

In [38]:
def get_us_cn_frame():
    """
    takes a country_code and affil_type
    and forms the pagination object to retrive the 
    records
    """
    pager = Works().filter(publication_year = '>2020',
        authorships={"institutions": {"country_code":"CN|IR|RU"}}).filter(
        authorships={"institutions": {"country_code": "US",
                     "type": "company"}}).paginate(
        per_page = 200, n_max=None
                     )
    df = pd.DataFrame()
    for page in tqdm(pager):
        dfpage = process_works_list(page)
        df = pd.concat([df, dfpage], ignore_index=True)
        df.drop_duplicates(subset='id', keep='first',inplace=True)
    return df

In [39]:
df = get_us_cn_frame()

171it [07:03,  2.48s/it]


In [40]:
#dftop = pd.concat(df,
#                  ignore_index=True)
dftop = df
dftop.drop_duplicates(subset='id', keep='first', 
                      inplace=True)

dftop.set_index('id', inplace=True, drop=False)

dfall = dftop
print(dfall.shape)

dfall['content'] = dfall['title'] + ". " + dfall['abstract']

dfrecords = dfall[~dfall['content'].isna()].copy()

(33890, 49)


In [41]:
def get_keywords(text:str, top:int=7, stopwords=None):
    """
    takes a blob of text and 
    returns the top **top** 
    keywords as a list
    """
    kw_extractor = yake.KeywordExtractor(top=top, stopwords=stopwords)
    keywords = kw_extractor.extract_keywords(text)
    return [p[0] for p in keywords]

In [42]:
def get_top_concepts(concept_list:list,score:float=.6):
    """
    takes a list of concept dictionaries 
    returns the top **top** display_names;
    concepts whose score is >= score
    """
    return [c['display_name'] for c in concept_list if c['score'] >= score]

In [43]:
dfrecords['keywords'] = dfrecords['content'].apply(get_keywords)
dfrecords['top_concepts'] = dfrecords['concepts'].apply(get_top_concepts)

In [44]:
texts = dfrecords['content'].str.lower().values.tolist()

In [45]:
def get_content_embeddings(dfrecords:pd.DataFrame) -> pd.DataFrame:
    """
    passes the preprocessed mitigation strings
    data through the embedding model to produce the vector
    space representation of each pet mitigation.
    """
    sent = Sentence("The grass is green.")
    DOC_EMBEDDINGS.embed(sent)
    texts = dfrecords["content"].str.lower().values.tolist()
    all_descriptions = np.empty((len(texts), len(sent.embedding)))
    for i in tqdm(range(len(texts))):
        sent = Sentence(texts[i])
        DOC_EMBEDDINGS.embed(sent)
        all_descriptions[i, :] = sent.embedding.cpu().numpy()
        # gc.collect()
        torch.cuda.empty_cache()
    dfcontentvectors = pd.DataFrame.from_records(all_descriptions, index=dfrecords.index)
    return dfcontentvectors

In [46]:
dfcontentvectors = get_content_embeddings(dfrecords)

100%|███████████████████████████████████████████████████████████████████| 31109/31109 [14:57<00:00, 34.65it/s]


In [47]:
#umap.UMAP?
N_COMPONENTS = 2 # can visualize this way
umap_reducer = umap.UMAP(n_components=N_COMPONENTS,
                       #  metric='euclidean')
                         random_state=1234,
                         metric='cosine')  # can experiment with this metric as well as the other 
# parameters
# to see what other literature is in the same information space, we need to keep this umap_reducer 
# object as well as the gmm model below.

# Apply UMAP to the vectorized strings
reduced_vectors = umap_reducer.fit_transform(dfcontentvectors.to_numpy())
dfreduced = pd.DataFrame.from_records(reduced_vectors, 
                index=dfcontentvectors.index)
dfreduced.columns = ['x','y']

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


# use hdbscan to cluster

In [48]:
import hdbscan

hdbscan_args = {'min_cluster_size': 15,
                            'metric': 'euclidean',
                            'cluster_selection_method': 'eom',
                            'cluster_selection_epsilon': 0.1
               }

cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(dfreduced[['x','y']].to_numpy())

dfreduced['cluster'] = cluster.labels_
dfreduced['probability'] = cluster.probabilities_

dfpapers = dfrecords.merge(dfreduced, left_index=True,
                           right_index=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [49]:
#help(dfpapers.explode)
del dfpapers['id']
dfstart = dfpapers.reset_index()
dfstart.head()

Unnamed: 0,id,doi,title,display_name,publication_year,publication_date,ids,language,primary_location,type,...,created_date,fulltext_origin,is_authors_truncated,abstract,content,top_concepts,x,y,cluster,probability
0,https://openalex.org/W3041133507,https://doi.org/10.1109/jproc.2020.3004555,A Comprehensive Survey on Transfer Learning,A Comprehensive Survey on Transfer Learning,2021,2021-01-01,{'openalex': 'https://openalex.org/W3041133507...,en,"{'is_oa': False, 'landing_page_url': 'https://...",article,...,2020-07-16,,,Transfer learning aims at improving the perfor...,A Comprehensive Survey on Transfer Learning. T...,"[Transfer of learning, Computer science]",5.702897,4.143548,183,1.0
1,https://openalex.org/W4225854071,https://doi.org/10.1056/nejmoa2114583,Waning Immune Humoral Response to BNT162b2 Cov...,Waning Immune Humoral Response to BNT162b2 Cov...,2021,2021-12-09,{'openalex': 'https://openalex.org/W4225854071...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2022-05-05,pdf,,Despite high vaccine coverage and effectivenes...,Waning Immune Humoral Response to BNT162b2 Cov...,"[Medicine, Neutralizing antibody, Incidence (g...",-0.253714,5.305731,77,1.0
2,https://openalex.org/W4283271244,https://doi.org/10.1038/s41375-022-01613-1,The 5th edition of the World Health Organizati...,The 5th edition of the World Health Organizati...,2022,2022-06-22,{'openalex': 'https://openalex.org/W4283271244...,en,"{'is_oa': True, 'landing_page_url': 'https://d...",article,...,2022-06-23,pdf,,The upcoming 5th edition of the World Health O...,The 5th edition of the World Health Organizati...,"[Histiocyte, Myeloid]",-5.298151,3.350824,-1,0.0
3,https://openalex.org/W3171517418,https://doi.org/10.1016/s0140-6736(21)00797-2,First-line nivolumab plus chemotherapy versus ...,First-line nivolumab plus chemotherapy versus ...,2021,2021-07-01,{'openalex': 'https://openalex.org/W3171517418...,en,"{'is_oa': False, 'landing_page_url': 'https://...",article,...,2021-06-22,,,First-line chemotherapy for advanced or metast...,First-line nivolumab plus chemotherapy versus ...,"[Medicine, Oxaliplatin, Nivolumab, Internal me...",-6.409124,1.994749,188,1.0
4,https://openalex.org/W4220999759,https://doi.org/10.1126/science.abj6987,The complete sequence of a human genome,The complete sequence of a human genome,2022,2022-04-01,{'openalex': 'https://openalex.org/W4220999759...,en,"{'is_oa': False, 'landing_page_url': 'https://...",article,...,2022-04-03,pdf,,"Since its initial release in 2000, the human r...",The complete sequence of a human genome. Since...,"[Euchromatin, Genome, Human genome, Telomere, ...",-1.735086,7.104193,110,1.0


In [50]:
dfstart['publication_year'].value_counts(dropna=False)

2021    9732
2022    9514
2023    8854
2024    3007
2025       2
Name: publication_year, dtype: int64

In [51]:
dfstart.shape

(31109, 55)

In [52]:
dfbig = dfstart.explode(column='authorships')
dfbig.shape, dfstart.shape

((394355, 55), (31109, 55))

In [53]:
def add_extra_to_authorships(row: pd.DataFrame):
    """
    row[authorships] is a dictionary;
    add in the id key to that dictionary
    whose value is row[id]
    """
    complete_dict = row["authorships"]
   # assert type(complete_dict) == dict
    #print(type(complete_dict))
    if type(complete_dict) == dict:
        complete_dict["id"] = row["id"]
        complete_dict["x"] = row["x"]
        complete_dict["y"] = row["y"]
        complete_dict["cluster"] = row["cluster"]
        complete_dict["cluster_score"] = row["probability"]
        complete_dict["title"] = row["title"]
        complete_dict["abstract"] = row["abstract"]
        complete_dict["doi"] = row["doi"]
        complete_dict["publication_date"] = row["publication_date"]
        complete_dict["publication_year"] = row["publication_year"]
        complete_dict["grants"] = row["grants"]
        complete_dict["locations"] = row["locations"]
        return complete_dict
    else:
        return row["authorships"]

In [54]:
dfbig['big_authorships'] = dfbig.apply(add_extra_to_authorships, axis=1)

In [55]:
#dfbig['authorships'].tolist()
bigvals = dfbig['authorships'].tolist()

In [56]:
dictvals = [c for c in bigvals if type(c) != float]

In [57]:
dictvals[0]['author'].keys()

dict_keys(['id', 'display_name', 'orcid'])

In [58]:
dftriple = pd.json_normalize(dictvals,
                  record_path=['institutions'],
                  meta=['id','raw_affiliation_strings','author_position', 'doi',
                        'title','abstract','publication_date', 'publication_year',
                        'grants','locations',
                        'is_corrresponding','x','y','cluster','cluster_score',
                       ['author','id'], ['author', 'display_name'],
                       ['author','orcid']],
                  errors='ignore',
                  sep='_',
                  meta_prefix='paper_',
                #  record_prefix='author_'
                 )

In [59]:
dftopics = dfcontentvectors.copy()
dftopics['cluster'] = dfpapers['cluster']
dfmeantopics = dftopics.groupby('cluster').mean().copy()
reduced_topics = umap_reducer.transform(dfmeantopics.to_numpy())
df_reduced_topics = pd.DataFrame.from_records(reduced_topics, 
                index=dfmeantopics.index)
df_reduced_topics.columns = ['x','y']
df_reduced_topics['topic'] = df_reduced_topics.index
df_reduced_topics.head()

def get_cluster_concepts(topic_num:int, n:int=20):
    """
    takes an integer topic_num corresponding to a 
    given topic number and
    returns the list of top n occuring concepts
    from the top_concept field
    """
    top_concepts = dfpapers[dfpapers['cluster'] == topic_num]['top_concepts'].tolist()
    flat_concepts = [item for sublist in top_concepts for item in sublist]
    concepts_dict = {c:flat_concepts.count(c) for c in flat_concepts}
    sorted_concepts = sorted(concepts_dict.items(), key=lambda x:x[1], reverse=True)
    return [c[0] for c in sorted_concepts][:n]

def get_yake_cluster_phrases(topic_num:int, n:int=20):
    """
    takes in an integer n corresponding
    to a given topic number and
    returns the list of keyphrases (TopicRank method)
    """
    documents = dfpapers[dfpapers['cluster'] == topic_num]['content'].tolist()
    topic_input = ". ".join(documents)
    #extractor = pke.unsupervised.TextRank()
    kw_extractor = yake.KeywordExtractor(top=n, stopwords=None)
    keywords = kw_extractor.extract_keywords(topic_input)
    #extractor.load_document(input=topic_input,
    #                    language='en',
    #                    normalization=None)

    #extractor.candidate_selection()

    #window = 2
    #use_stems = False
    #extractor.candidate_weighting(window=window,
    #                          use_stems=use_stems)
    #extractor.candidate_weighting()
    #threshold = 0.8
   # keyphrases = extractor.get_n_best(n=20, threshold=threshold)
    #keyphrases = extractor.get_n_best(n=n)
    return [p[0] for p in keywords]

wikiconcepts = df_reduced_topics['topic'].apply(get_cluster_concepts)

wikikeywords = df_reduced_topics['topic'].apply(get_yake_cluster_phrases)

dfpapers['id'] = dfpapers.index
dfinfo = dfpapers[['x','y','id','title','doi','cluster','grants',
                   'locations',
                 'publication_date','keywords','top_concepts']].copy()

centroids = dfinfo.groupby('cluster')[['x','y']].mean().copy()
centroids['concepts'] = wikiconcepts
centroids['cluster'] = centroids.index
centroids['keywords'] = wikikeywords

In [60]:
def wrap_it(x):
    return "<br>".join(textwrap.wrap(x, width=40))
   # return "<br>".join(textwrap.wrap(x.replace(r'\s+', ' '), width=40))

In [61]:
centroids['wrapped_keywords'] = centroids['keywords'].apply(str).apply(wrap_it)
centroids['wrapped_concepts'] = centroids['concepts'].apply(str).apply(wrap_it)

In [62]:
centroids.to_pickle('updatejammingcentroids2d.pkl')

In [63]:
dftriple.to_pickle('updatejammingdftriple2d.pkl')

In [64]:
def get_affils_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by id, ror sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg.groupby(['id','display_name','country_code',
                     'type'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [65]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 1)
print(kw84)
dv84.head(10)

1
['Continuing Medical Education', 'Medical Education Institute', 'Lippincott Continuing Medical', 'CME Accreditation Lippincott', 'Lippincott Professional Development', 'NCPD Accreditation Lippincott', 'Accreditation Lippincott Continuing', 'continuing professional development', 'Accreditation Lippincott Professional', 'Continuing Medical', 'Medical Education', 'nursing continuing professional', 'continuing nursing education', 'Professional Development', 'provide continuing medical', 'AMA PRA Category', 'Education Institute', 'Accreditation Lippincott', 'CME Accreditation', 'Lippincott Continuing']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I4210162565,Chongqing Electromechanical Holdings (China),CN,company,138.74078
https://openalex.org/I4210143747,Medical Education Institute,US,nonprofit,138.74078
https://openalex.org/I865767359,Accreditation Council for Graduate Medical Education,US,nonprofit,137.169771
https://openalex.org/I2800773984,American Nurses Credentialing Center,US,other,127.233348
https://openalex.org/I200777214,National and Kapodistrian University of Athens,GR,education,37.936086
https://openalex.org/I40347166,University of Chicago,US,education,37.936086
https://openalex.org/I1337397194,Emerald Group Publishing (United Kingdom),GB,company,37.936086
https://openalex.org/I39422238,University of Illinois at Chicago,US,education,37.936086
https://openalex.org/I4210097017,Universitatea Danubius Galati,RO,education,37.936086
https://openalex.org/I4210098396,Indo-American Center,US,nonprofit,37.936086


In [66]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 0)
print(kw84)
dv84.head(10)

0
['MACIE', 'evolutionarily conserved', 'regulatory functional', 'functional', 'MACIE score', 'conserved', 'Class Integrative Estimation', 'Annotation Class Integrative', 'estimated posterior probability', 'Multi-dimensional Annotation Class', 'evolutionarily', 'regulatory', 'estimated posterior', 'posterior', 'posterior probability', 'MACIE integrates', 'estimated', 'Integrative Estimation', 'Roadmap Epigenomics', 'Class Integrative']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I2801228662,Peking Union Medical College Hospital,CN,healthcare,72.0
https://openalex.org/I200296433,Chinese Academy of Medical Sciences & Peking Union Medical College,CN,education,72.0
https://openalex.org/I136199984,Harvard University,US,education,48.0
https://openalex.org/I157773358,Sun Yat-sen University,CN,education,32.0
https://openalex.org/I4210128615,Chinese Academy of Forestry,CN,government,16.0
https://openalex.org/I204823248,Huazhong Agricultural University,CN,education,16.0
https://openalex.org/I4210093460,Sixth Affiliated Hospital of Sun Yat-sen University,CN,healthcare,16.0
https://openalex.org/I4210098034,Key Laboratory of Guangdong Province,CN,facility,16.0
https://openalex.org/I197869895,Anhui Medical University,CN,education,16.0
https://openalex.org/I78577930,Columbia University,US,education,16.0


In [69]:
dv84, kw84 = get_affils_cluster_sort(dftriple, 51)
print(kw84)
dv84.head(20)

51
['Lithium-ion batteries', 'Battery', 'model', 'lithium-ion batteries based', 'batteries', 'battery SOH estimation', 'estimation', 'SOC estimation method', 'method', 'SOC', 'lithium-ion battery', 'SOC estimation', 'based', 'lithium-ion', 'batteries based', 'fault diagnosis methods', 'electrochemical model', 'models', 'estimation method based', 'lithium-ion battery models']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,paper_cluster_score
id,display_name,country_code,type,Unnamed: 4_level_1
https://openalex.org/I204983213,Harbin Institute of Technology,CN,education,27.843065
https://openalex.org/I4210156703,The Faraday Institution,GB,nonprofit,12.047498
https://openalex.org/I4210156197,Life Cycle Engineering (United States),US,company,10.796044
https://openalex.org/I66946132,"University of Maryland, College Park",US,education,10.269602
https://openalex.org/I99065089,Tsinghua University,CN,education,8.053132
https://openalex.org/I4210157719,Yancheng Institute of Technology,CN,education,7.044918
https://openalex.org/I158842170,Chongqing University,CN,education,7.0
https://openalex.org/I116953780,Tongji University,CN,education,6.9569
https://openalex.org/I74872605,China Southern Power Grid (China),CN,company,6.287389
https://openalex.org/I153473198,North China Electric Power University,CN,education,6.0


In [70]:
dfinfo = dfpapers[['x','y','id','title','doi','cluster','probability',
                 'publication_date','grants','locations',
                   'keywords','top_concepts']].copy()

In [71]:
# Group by 'paper_id' and concatenate 'paper_raw_affiliation_strings'
grouped = dftriple.groupby('paper_id')['paper_raw_affiliation_strings'].apply(lambda x: list(set([item for sublist in x for item in sublist]))).reset_index()

# Convert the series back to a dictionary
pap_affils_dict = grouped.set_index('paper_id')['paper_raw_affiliation_strings'].to_dict()

In [72]:
import itertools

In [73]:
pap_authors_dict = dftriple.groupby('paper_id')['paper_author_display_name'].apply(lambda x: x.values)

In [74]:
dfinfo['affil_list'] = dfinfo['id'].map(pap_affils_dict)

In [75]:
dfinfo['author_list'] = dfinfo['id'].map(pap_authors_dict)

In [76]:
dfinfo['wrapped_affil_list'] = dfinfo['affil_list'].apply(str).apply(wrap_it)
dfinfo['wrapped_author_list'] = dfinfo['author_list'].apply(str).apply(wrap_it)

In [77]:
dfinfo['wrapped_keywords'] = dfinfo['keywords'].apply(str).apply(wrap_it)

In [78]:
def get_source_name(loc_list):
    """
    grab the first item in the list;
    retturn the display name
    """
    try:
        primary = loc_list[0]
        return primary["source"]["display_name"]
    except:
        return None

def get_source_type(loc_list):
    """
    grab the first item in the list;
    return the source type
    """
    try:
        primary = loc_list[0]
        return primary["source"]["type"]
    except:
        return None

In [79]:
dfinfo["source"] = dfinfo["locations"].apply(get_source_name)
dfinfo["source_type"] = dfinfo["locations"].apply(get_source_type)

In [80]:
dfinfo.to_pickle('updatejammingdfinfo2d.pkl')

In [81]:
def get_funder_names(funder_list):
    """
    funder_list is a list of dictionaries
    with three keys; return the list of 
    unique **funder_display_name**
    values
    """
    try:
        funder_names = list(set([f['funder_display_name'] for f in funder_list]))
        return funder_names    
    except:
        return []

In [82]:
dftriple["source"] = dftriple["paper_locations"].apply(get_source_name)
dftriple["source_type"] = dftriple["paper_locations"].apply(get_source_type)
dftriple["funder_list"] = dftriple["paper_grants"].apply(get_funder_names)

In [83]:
dftriple.to_pickle('updatejammingdftriple2d.pkl')

In [84]:
def get_journals_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'journal'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [85]:
def get_conferences_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'conference'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [86]:
def get_country_collaborations_sort(dc:pd.DataFrame, cl:int):
    """
    resticts the dataframe dc to cluster value cl
    and returns the results of paper_id s where there is 
    more than one country_code
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    dv = dg.groupby('paper_id')['country_code'].apply(lambda x: len(set(x.values))).to_frame()
    dc = dg.groupby('paper_id')['country_code'].apply(lambda x: list(set(x.values))).to_frame()
    dc.columns = ['collab_countries']
    dv.columns = ['country_count']
    dv['collab_countries'] = dc['collab_countries']
    dv.sort_values('country_count',ascending=False, inplace=True)
    di = dfinfo.loc[dv.index].copy()
    di['country_count'] = dv['country_count']
    di['collab_countries'] = dv['collab_countries']
    return di[di['country_count'] > 1]

In [87]:
dv = get_country_collaborations_sort(dftriple, 0)
dv

Unnamed: 0_level_0,x,y,id,title,doi,cluster,probability,publication_date,grants,locations,...,top_concepts,affil_list,author_list,wrapped_affil_list,wrapped_author_list,wrapped_keywords,source,source_type,country_count,collab_countries
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://openalex.org/W4393413673,12.451026,8.406316,https://openalex.org/W4393413673,MACIE scores for human genome assembly GRCh37 ...,https://doi.org/10.5281/zenodo.5755656,0,1.0,2021-12-08,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,[],[Department of Pathology and Laboratory Medici...,"[Xihao Li, Godwin Yung, Hufeng Zhou, Ryan Sun,...",['Department of Pathology and Laboratory<br>Me...,['Xihao Li' 'Godwin Yung' 'Hufeng Zhou'<br>'Ry...,"['evolutionarily conserved', 'estimated<br>pos...",Zenodo (CERN European Organization for Nuclear...,repository,3,"[US, CH, CN]"
https://openalex.org/W4393529795,12.45113,8.406391,https://openalex.org/W4393529795,MACIE scores for human genome assembly GRCh37 ...,https://doi.org/10.5281/zenodo.5755655,0,1.0,2021-12-08,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,[],[Department of Pathology and Laboratory Medici...,"[Xihao Li, Godwin Yung, Hufeng Zhou, Ryan Sun,...",['Department of Pathology and Laboratory<br>Me...,['Xihao Li' 'Godwin Yung' 'Hufeng Zhou'<br>'Ry...,"['evolutionarily conserved', 'estimated<br>pos...",Zenodo (CERN European Organization for Nuclear...,repository,3,"[US, CH, CN]"
https://openalex.org/W4393593906,12.451165,8.406417,https://openalex.org/W4393593906,MACIE scores for human genome assembly GRCh37 ...,https://doi.org/10.5281/zenodo.5756449,0,1.0,2021-12-08,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,[],[Department of Pathology and Laboratory Medici...,"[Xihao Li, Godwin Yung, Hufeng Zhou, Ryan Sun,...",['Department of Pathology and Laboratory<br>Me...,['Xihao Li' 'Godwin Yung' 'Hufeng Zhou'<br>'Ry...,"['Class Integrative Estimation',<br>'Annotatio...",Zenodo (CERN European Organization for Nuclear...,repository,3,"[US, CH, CN]"
https://openalex.org/W4393617317,12.451202,8.406441,https://openalex.org/W4393617317,MACIE scores for human genome assembly GRCh37 ...,https://doi.org/10.5281/zenodo.5756448,0,1.0,2021-12-08,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,[],[Department of Pathology and Laboratory Medici...,"[Xihao Li, Godwin Yung, Hufeng Zhou, Ryan Sun,...",['Department of Pathology and Laboratory<br>Me...,['Xihao Li' 'Godwin Yung' 'Hufeng Zhou'<br>'Ry...,"['Class Integrative Estimation',<br>'Annotatio...",Zenodo (CERN European Organization for Nuclear...,repository,3,"[US, CH, CN]"
https://openalex.org/W4393641940,12.451254,8.406468,https://openalex.org/W4393641940,MACIE scores for human genome assembly GRCh37 ...,https://doi.org/10.5281/zenodo.5756478,0,1.0,2021-12-08,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,[],"[Genentech/Roche, School of Statistics, Southw...","[Xihao Li, Godwin Yung, Hufeng Zhou, Ryan Sun,...","['Genentech/Roche', 'School of<br>Statistics, ...",['Xihao Li' 'Godwin Yung' 'Hufeng Zhou'<br>'Ry...,"['Class Integrative Estimation',<br>'Annotatio...",Zenodo (CERN European Organization for Nuclear...,repository,3,"[US, CH, CN]"
https://openalex.org/W4393736355,12.451313,8.406501,https://openalex.org/W4393736355,MACIE scores for human genome assembly GRCh37 ...,https://doi.org/10.5281/zenodo.5756562,0,1.0,2021-12-08,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,[],[Department of Pathology and Laboratory Medici...,"[Xihao Li, Godwin Yung, Hufeng Zhou, Ryan Sun,...",['Department of Pathology and Laboratory<br>Me...,['Xihao Li' 'Godwin Yung' 'Hufeng Zhou'<br>'Ry...,"['Class Integrative Estimation',<br>'Annotatio...",Zenodo (CERN European Organization for Nuclear...,repository,3,"[US, CH, CN]"
https://openalex.org/W4393742852,12.451364,8.406528,https://openalex.org/W4393742852,MACIE scores for human genome assembly GRCh37 ...,https://doi.org/10.5281/zenodo.5756563,0,1.0,2021-12-08,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,[],[Department of Pathology and Laboratory Medici...,"[Xihao Li, Godwin Yung, Hufeng Zhou, Ryan Sun,...",['Department of Pathology and Laboratory<br>Me...,['Xihao Li' 'Godwin Yung' 'Hufeng Zhou'<br>'Ry...,"['Class Integrative Estimation',<br>'Annotatio...",Zenodo (CERN European Organization for Nuclear...,repository,3,"[US, CH, CN]"
https://openalex.org/W4393819632,12.451362,8.406533,https://openalex.org/W4393819632,MACIE scores for human genome assembly GRCh37 ...,https://doi.org/10.5281/zenodo.5756479,0,1.0,2021-12-08,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,[],"[Genentech/Roche, School of Statistics, Southw...","[Xihao Li, Godwin Yung, Hufeng Zhou, Ryan Sun,...","['Genentech/Roche', 'School of<br>Statistics, ...",['Xihao Li' 'Godwin Yung' 'Hufeng Zhou'<br>'Ry...,"['Class Integrative Estimation',<br>'Annotatio...",Zenodo (CERN European Organization for Nuclear...,repository,3,"[US, CH, CN]"
https://openalex.org/W4394071783,11.703955,10.044555,https://openalex.org/W4394071783,Additional file 7 of The spatiotemporal regula...,https://doi.org/10.6084/m9.figshare.20408323,0,1.0,2022-01-01,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,"[Inflorescence, Fagopyrum]","[Anhui Medical University, Shanghai Tenth Peop...","[Xinwei Guo, Xinwei Guo, Xinwei Guo, Zuliang L...","['Anhui Medical University', ""Shanghai<br>Tent...",['Xinwei Guo' 'Xinwei Guo' 'Xinwei Guo'<br>'Zu...,"['continuous inflorescence removal',<br>'inflo...",Figshare,repository,3,"[US, CN, GB]"
https://openalex.org/W4394150963,11.704035,10.044626,https://openalex.org/W4394150963,Additional file 3 of The spatiotemporal regula...,https://doi.org/10.6084/m9.figshare.20408311,0,1.0,2022-01-01,[],"[{'is_oa': True, 'landing_page_url': 'https://...",...,"[Inflorescence, Fagopyrum]","[Anhui Medical University, Shanghai Tenth Peop...","[Xinwei Guo, Xinwei Guo, Xinwei Guo, Xinwei Gu...","['Anhui Medical University', ""Shanghai<br>Tent...",['Xinwei Guo' 'Xinwei Guo' 'Xinwei Guo'<br>'Xi...,"['continuous inflorescence removal',<br>'inflo...",Figshare,repository,3,"[US, CN, GB]"


In [88]:
import networkx as nx
from pyvis.network import Network
import igraph as ig # for g

In [89]:
dfinfo["funder_list"] = dfinfo["grants"].apply(get_funder_names)
dfinfo["wrapped_funder_list"] = dfinfo["funder_list"].apply(str).apply(wrap_it)

In [90]:
dfinfo.to_pickle('updatejammingdfinfo2d.pkl')

In [91]:
dfinfo[['id','keywords','wrapped_keywords','wrapped_funder_list']].head()

Unnamed: 0_level_0,id,keywords,wrapped_keywords,wrapped_funder_list
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://openalex.org/W3041133507,https://openalex.org/W3041133507,"[Transfer Learning, Learning, Transfer, transf...","['Transfer Learning', 'Learning',<br>'Transfer...",['National Natural Science Foundation of<br>Ch...
https://openalex.org/W4225854071,https://openalex.org/W4225854071,"[Immune Humoral Response, Waning Immune Humora...","['Immune Humoral Response', 'Waning<br>Immune ...",[]
https://openalex.org/W4283271244,https://openalex.org/W4283271244,"[World Health Organization, Health Organizatio...","['World Health Organization', 'Health<br>Organ...",[]
https://openalex.org/W3171517418,https://openalex.org/W3171517418,"[chemotherapy versus chemotherapy, chemotherap...","['chemotherapy versus chemotherapy',<br>'chemo...","['Bristol-Myers Squibb', 'Ono<br>Pharmaceutical']"
https://openalex.org/W4220999759,https://openalex.org/W4220999759,"[human reference genome, human genome, genome,...","['human reference genome', 'human<br>genome', ...",[]


In [92]:
kw_dict = dfinfo['keywords'].to_dict()

In [93]:
dc = dftriple[dftriple['paper_cluster'] == 10].copy()
dc.shape

(4730, 27)

In [94]:
[x for row in dc['funder_list'].tolist() for x in row][:10]

['Istituto Nazionale di Astrofisica',
 'Istituto Nazionale di Astrofisica',
 'Istituto Nazionale di Astrofisica',
 'Istituto Nazionale di Astrofisica',
 'Istituto Nazionale di Astrofisica',
 'Istituto Nazionale di Astrofisica',
 'Istituto Nazionale di Astrofisica',
 'Istituto Nazionale di Astrofisica',
 'Istituto Nazionale di Astrofisica',
 'Istituto Nazionale di Astrofisica']

In [95]:
kw_dict = dfinfo['keywords'].to_dict()

# add in the affiliations as nodes as well; that row, author, paper, affil. all three get links. ok.
def create_nx_graph(df: pd.DataFrame, cl:int) -> nx.Graph:
    """
    takes the dataframe df, and creates the undirected graph
    from the source and target columns for each row.
    """
    g = nx.Graph() # dc['paper_cluster'] == cl
    dc = df[df['paper_cluster'] == cl]
    author_counts = dc['paper_author_id'].tolist()
    author_counts_dict = {c:author_counts.count(c) for c in author_counts}
    affiliation_counts = dc['id'].tolist()
    affiliation_counts_dict = {c:affiliation_counts.count(c) for c in affiliation_counts}
    source_counts = dc['source'].tolist()
    source_counts_dict = {c:source_counts.count(c) for c in source_counts}
    funder_counts = [x for row in dc['funder_list'].tolist() for x in row]
    funder_counts_dict = {c:funder_counts.count(c) for c in funder_counts}
    for index, row in df[df['paper_cluster'] == cl].iterrows():
        g.add_node(row['paper_id'], group='work', title=row['paper_title'])
        g.add_node(row['paper_author_id'], title=row['paper_author_display_name'],
                   group='author',value = author_counts_dict[row['paper_author_id']])
        g.add_node(row['id'], group='affiliation',
                   title=row['display_name'] + '\n' + row['country_code'],
                  value = affiliation_counts_dict[row['id']])
        if row['source']:
            g.add_node(row['source'], group=row['source_type'],
                      title=row['source'] + ' :\n ' + row['source_type'],
                      value=source_counts_dict[row['source']])
            g.add_edge(
                row['paper_id'],
                row['source'],
                title=row['paper_title'] + ' :\n ' + str(row['paper_publication_date']) +  \
                ' :\n' + row['source'] + ' :\n ' + \
                row['source_type'],
              #  weight = df[(df['paper_id'] == row['paper_id']) & \
              #              (df['source'] == row['source'])]['paper_cluster_score'].sum()
               # weight = row['paper_cluster_score']
            )
            g.add_edge(
                row['paper_author_id'],
                row['source'],
                title=row['paper_author_display_name'] + ':\n' + row['source'],
             #   weight = df[(df['paper_author_id'] == row['paper_author_id']) & \
              #              (df['source'] == row['source'])]['paper_cluster_score'].sum()
               # weight = row['paper_cluster_score']
            )
        if len(row['funder_list']) > 0:
            for f in row['funder_list']:
                g.add_node(f, group='funder',
                          title=str(f),
                          value = founder_counts_dict[f]),
                g.add_edge(
                       row['paper_id'],
                       f,
                       title=row['paper_title'] + ':\n ' +  str(row['paper_publication_date']) + \
                       ' :\n' + str(f),
                  #  weight = row['paper_cluster_score']
                   )
                g.add_edge(
                       f,
                       row['paper_author_id'],
                       title=row['paper_author_display_name'] + ' :\n ' + \
                       str(f),
                  #  weight = row['paper_cluster_score']
                       
                   )
                g.add_edge(
                       f,
                       row['id'],
                       title=row['display_name'] + '\n' + row['country_code'] + ' :\n ' + \
                       str(f)  ,
                  #  weight = row['paper_cluster_score']
                   )  
                if row["source"]:
                    g.add_edge(
                        f,
                        row["source"],
                        title=row["source"] + ' :\n' + str(f),
                     #   weight = row['paper_cluster_score']
                    )
        g.nodes[row['paper_id']]['title'] = (
            row['paper_title'] + ' :\n ' + str(row['paper_publication_date'] + ':\n' + 
            '\n'.join(kw_dict[row['paper_id']]))
        )
        g.nodes[row['paper_author_id']]['title'] = (
            row['paper_author_display_name']
        )
        g.add_edge(
            row['paper_id'],
            row['paper_author_id'],
        title=row['paper_title'] + ' :\n ' + row['paper_author_display_name'] + ' :\n ' + \
            row['paper_raw_affiliation_string'],
         #   weight = row['paper_cluster_score']
        )
        g.add_edge(
            row['paper_author_id'],
            row['id'],
            title=row['paper_author_display_name'] + ' :\n ' + \
            row['display_name'] + ' :\n ' + row['country_code'],
          #  weight = row['paper_cluster_score']
        )
        g.add_edge(
            row['paper_id'],
            row['id'],
            title=row['paper_title'] + ' :\n ' + str(row['paper_publication_date']) + ':\n' + 
            row['display_name'] + ' :\n ' + row['country_code'],
         #   weight = row['paper_cluster_score']
        )
        
    g_ig = ig.Graph.from_networkx(g) # assign 'x', and 'y' to g before returning
    #layout = g_ig.layout_auto()
    #layout = g_ig.layout_davidson_harel()
    layout = g_ig.layout_umap(min_dist = 2, epochs = 500)
    # https://igraph.org/python/tutorial/0.9.6/visualisation.html
    coords = layout.coords
    allnodes = list(g.nodes())
    coords_dict = {allnodes[i]:(coords[i][0], coords[i][1]) for i in range(len(allnodes))}
    for i in g.nodes():
        g.nodes[i]['x'] = 250 * coords_dict[i][0] # the scale factor needed 
        g.nodes[i]['y'] = 250 * coords_dict[i][1]
    return g

In [96]:
def create_pyvis_html(cl: int, filename: str = "pyvis_coauthorships_graph.html"):
    """
    wrapper function that calls create_nx_graph to finally 
    produce an interactive pyvis standalone html file
    """
    g_nx = create_nx_graph(dftriple, cl);
    h = Network(height="1000px",
          #  heading="Mitigations and Techniques Relationships",
                width="100%",
                cdn_resources="remote", # can grab the visjs library to make this local if needed
            # probably should
                bgcolor="#222222",
            neighborhood_highlight=True,
              # default_node_size=1,
                font_color="white",
                directed=False,
               # select_menu=True,
                filter_menu=True,
                notebook=False,
               )
    #h.repulsion()
    h.from_nx(g_nx, show_edge_weights=False)
    #h.barnes_hut()
    #h.repulsion(node_distance=40,
    #            central_gravity=-0.2, spring_length=5, spring_strength=0.005, damping=0.09)
    neighbor_map = h.get_adj_list()
   # for node in h.nodes:
   #     if node['group'] == 'author':
   #         a = list(neighbor_map[node["id"]]) # want to insert a "\n" into every third element of a
   #     if node['group'] == 'work':
   #         a = list(neighbor_map[node["id"]])
   #     i = 3
   #     while i < len(a):
   #         a.insert(i, "\n")
   #         i += 4
   #     node["title"] += "\n Neighbors: \n" + " | ".join(a)
   #     node["value"] = len(neighbor_map[node["id"]]) 
# "physics": {
#    "enabled": false
#  },
    h.set_options(
    """
const options = {
  "interaction": {
    "navigationButtons": false
  },
 "physics": {
     "enabled": false
 },
  "edges": {
    "color": {
        "inherit": true
    },
    "setReferenceSize": null,
    "setReference": {
        "angle": 0.7853981633974483
    },
    "smooth": {
        "forceDirection": "none"
    }
  }
  }
    """
    )
    #h.show_buttons(filter_=['physics'])
  #  h.barnes_hut()
    #h.repulsion()
    try:
        path = './tmp'
        h.save_graph(f"{path}/{filename}")
        HtmlFile = open(f"{path}/{filename}","r",
                        encoding='utf-8')
    except:
        h.save_graph(f"{filename}")
        HtmlFile = open(f"{filename}", "r",
                        encoding="utf-8")
    return h

In [97]:
dfinfo.shape

(31109, 21)

In [98]:
dftime = dfinfo[['cluster','probability','publication_date']].copy()

In [99]:
dftime['publication_datetime'] = pd.to_datetime(dftime['publication_date'])

In [100]:
def get_time_series(dg, cl:int):
    """
    takes dg and the cluster number cl
    and returns a time series chart
    by month, y-axis is the article count
    """
    dftime = dg[dg.cluster == cl][['cluster','probability','publication_date']].copy()
    dftime['date'] = pd.to_datetime(dftime['publication_date'])
    dftime.sort_values('date', inplace=True)
    #by_month = pd.to_datetime(dftime['date']).dt.to_period('M').value_counts().sort_index()
    #by_month.index = pd.PeriodIndex(by_month.index)
    #df_month = by_month.rename_axis('month').reset_index(name='counts')
    return dftime

In [101]:
import altair as alt
#alt.data_transformers.enable("data_server")

In [102]:
sources_list = dftriple['source'].unique().tolist()
type(sources_list), len(sources_list)

(list, 5247)

In [103]:
def get_source_json(s:str):
    """
    s is an openalex Sources display_name
    return that Sources object
    """
    source_json = Sources().search_filter(display_name = s).get()
    a = source_json[0]['type']
    if "homepage_url" in source_json[0] and source_json[0]['homepage_url']:
       # print(f"{s} has homepage_url and type {source_json[0]['type']}")
        return source_json[0]["homepage_url"]
    else:
        return None

In [104]:
sources_list[5]

'Nature'

In [105]:
sj0 = get_source_json(sources_list[5])
sj0

'https://www.nature.com/nature/'

In [106]:
def get_display_page_dict(sl:list):
    """
    sl is a list of Sources display_name values
    returns the dictionary mapping
    display_names with homepage_url values."""
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_source_json(s)
        except:
            pass
    return mapping_dict

In [107]:

source_page_dict = get_display_page_dict(sources_list)

100%|█████████████████████████████████████████████████████████████████████| 5247/5247 [51:00<00:00,  1.71it/s]


In [108]:
len(source_page_dict)

4667

In [109]:
import pickle

with open("updatesource_page_dict.pkl", "wb") as f:
    pickle.dump(source_page_dict, f)

In [110]:
with open("updatesource_page_dict.pkl", "rb") as f:
    source_dict = pickle.load(f)

In [115]:
dftriple.columns

Index(['id', 'display_name', 'ror', 'country_code', 'type', 'lineage',
       'paper_id', 'paper_raw_affiliation_strings', 'paper_author_position',
       'paper_doi', 'paper_title', 'paper_abstract', 'paper_publication_date',
       'paper_publication_year', 'paper_grants', 'paper_locations',
       'paper_is_corrresponding', 'paper_x', 'paper_y', 'paper_cluster',
       'paper_cluster_score', 'paper_author_id', 'paper_author_display_name',
       'paper_author_orcid', 'source', 'source_type', 'funder_list'],
      dtype='object')

In [118]:
dftriple[['source','source_type','paper_cluster']].head()

Unnamed: 0,source,source_type,paper_cluster
0,Proceedings of the IEEE,journal,183
1,Proceedings of the IEEE,journal,183
2,Proceedings of the IEEE,journal,183
3,Proceedings of the IEEE,journal,183
4,Proceedings of the IEEE,journal,183


In [120]:
#source_dict

In [111]:
def get_journals_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'journal'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    dv['journal'] = dv.index
    dv['hompage_url'] = dv['journal'].map(source_dict)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [121]:
dv, kw = get_journals_cluster_sort(dftriple, 183)
dv.head()

183


Unnamed: 0_level_0,paper_cluster_score,journal,hompage_url
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IEEE transactions on pattern analysis and machine intelligence,470.547493,IEEE transactions on pattern analysis and mach...,http://www.computer.org/portal/web/tpami
IEEE transactions on circuits and systems for video technology,239.0,IEEE transactions on circuits and systems for ...,http://ieeexplore.ieee.org/servlet/opac?punumb...
IEEE transactions on image processing,230.987161,IEEE transactions on image processing,http://ieeexplore.ieee.org/servlet/opac?punumb...
IEEE transactions on multimedia,188.651563,IEEE transactions on multimedia,http://ieeexplore.ieee.org/servlet/opac?punumb...
IEEE transactions on neural networks and learning systems,147.9426,IEEE transactions on neural networks and learn...,http://cis.ieee.org/ieee-transactions-on-neura...


In [122]:
def get_conferences_cluster_sort(dc:pd.DataFrame, cl:int):
    """
    restricts the dataframe dc to cluster value cl
    and returns the results grouped by source (where
    source_type == 'journal') sorted
    by the some of probablity descending
    """
    dg = dc[dc['paper_cluster'] == cl].copy()
    print(cl)
    dv = dg[dg['source_type'] == 'conference'].groupby(['source'])['paper_cluster_score'].sum().to_frame()
    dv.sort_values('paper_cluster_score', ascending=False, inplace=True)
    dv['conference'] = dv.index
    dv['homepage_url'] = dv['conference'].map(source_dict)
    kw = centroids[centroids.cluster == cl]['keywords'].iloc[0]
    return dv, kw

In [123]:
affils_list = dftriple['display_name'].unique().tolist()
type(affils_list), len(affils_list)

(list, 21126)

In [124]:
affil_json = Institutions().search_filter(display_name = affils_list[0]).get()

In [125]:
affils_list = dftriple['display_name'].unique().tolist()
type(affils_list), len(affils_list)

affil_json = Institutions().search_filter(display_name = affils_list[1]).get()

def get_affil_json(s:str):
    """
    s is an openalex Sources display_name
    return that Sources object
    """
    affil_json = Institutions().search_filter(display_name = s).get()
    #a = source_json[0]['type']
    if "geo" in affil_json[0]:
       # print(f"{s} has homepage_url and type {source_json[0]['type']}")
        return affil_json[0]["geo"]["latitude"], affil_json[0]["geo"]["longitude"]
    else:
        return None, None
    
def get_display_geo_dict(sl:list):
    """
    sl is a list of Institution display_name values
    returns the dictionary mapping
    display_names with (latitude, longitude) values."""
    mapping_dict = dict()
    for s in tqdm(sl):
        try:
            mapping_dict[s] = get_affil_json(s)
        except:
            pass
    return mapping_dict

affil_geo_dict = get_display_geo_dict(affils_list)


import pickle

with open("updateaffil_geo_dict.pkl", "wb") as f:
    pickle.dump(affil_geo_dict, f)

100%|█████████████████████████████████████████████████████████████████| 21126/21126 [3:02:08<00:00,  1.93it/s]


In [126]:
dfinfo['abstract'] = dfinfo['title'].map(dftriple.set_index('paper_title')['paper_abstract'].to_dict())
dfinfo.to_pickle('updatejammingdfinfo2d.pkl')