## Setup Environment

### Dimensions API:

In [1]:
# Connect to Google Drive (this cell only works inside Google Colab)
# Load the Colab Drive helper and mount Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

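# Change into the project folder on Drive (presumably where the .env file with the API key lives) and list its files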
%cd /content/drive/MyDrive/NAACL 2024 /Scientometrics
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/NAACL 2024 /Scientometrics
'1. Global Health PMIDs Extraction.ipynb'   gender-api-output.csv    Preprocessing.ipynb
 Data_extraction.ipynb			   'List of economies.csv'
 EDA.ipynb				    papers_2021_2024.csv


In [2]:
#!pip install dimcli
#!pip install python-dotenv

In [3]:
import dimcli
from dimcli.shortcuts import dslquery_json as dslquery
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt
import pandas as pd

# Read the variables defined in the local .env file into the process environment
load_dotenv()

endpoint = "https://app.dimensions.ai"
api_key = os.environ.get("DIMENSIONS_API_KEY")

# Stop with a clear message instead of attempting a login without a key
if api_key is None:
    raise ValueError("API key not found. Make sure to set DIMENSIONS_API_KEY in your .env file.")

# Authenticate against the endpoint, then create the DSL client object
dimcli.login(key=api_key, endpoint=endpoint)
dsl = dimcli.Dsl()

[2mDimcli - Dimensions API Client (v1.2)[0m
[2mConnected to: <https://app.dimensions.ai/api/dsl> - DSL v2.10[0m
[2mMethod: manual login[0m


### Read PMID

In [4]:
# Load the paper table (first CSV column is used as the index) and keep only
# the rows that actually have a PMID, since the PMID is the lookup key later on
df = pd.read_csv('papers_2021_2024.csv', index_col=0).dropna(subset=['PMID'])

In [5]:
# Sanity check: print (rows, columns) of the loaded table, then show the first rows
print(df.shape)
df.head()

(1274, 16)


Unnamed: 0,PMID,JournalTitle,Title,doi,Abstract,Language,Year_A,Month_A,Day_A,Year_PM,Month_PM,Day_PM,Status,MeSH,MeSH_UI,Keyword
0,38876802,Regional anesthesia and pain medicine,Artificial intelligence and regional anesthesi...,10.1136/rapm-2024-105522,Artificial intelligence (AI) has demonstrated ...,eng,2024.0,6.0,14.0,2024,6,15,aheadofprint,,,EDUCATION|REGIONAL ANESTHESIA|TECHNOLOGY
1,38876484,JMIR mental health,Crisis prediction among tele-mental health pat...,10.2196/58129,Due to recent advances in artificial intellige...,eng,2024.0,6.0,14.0,2024,6,15,aheadofprint,,,
2,38875696,Journal of medical Internet research,Triage Performance Across Large Language Model...,10.2196/53297,Large language models (LLMs) have demonstrated...,eng,2024.0,6.0,14.0,2024,6,14,epublish,Triage|Humans|Emergency Medicine|Physicians|Em...,D014218|D006801|D004635|D010820|D004636|D00780...,ChatGPT|German|Germany|artificial intelligence...
3,38875562,JMIR AI,"Cost, Usability, Credibility, Fairness, Accoun...",10.2196/51834,The world has witnessed increased adoption of ...,eng,2024.0,4.0,23.0,2024,6,14,epublish,,,AHP|CUC-FATE framework|ChatGPT|LLM|TISM|adopti...
4,38875551,JMIR AI,Online Health Search Via Multidimensional Info...,10.2196/42630,Widespread misinformation in web resources can...,eng,2024.0,5.0,2.0,2024,6,14,epublish,,,deep learning|health misinformation|infodemic|...


In [6]:
# NOTE: this expression is not the last line of the cell, so its value is not displayed
df.shape
# Optional single-year filter on Year_PM; left commented out, so no rows are removed here
#year = 2023 # Change this to the year you want to filter by 2018 - 2021
#df = df[df['Year_PM'] == year]
# Displayed shape; identical to the one above while the filter stays disabled
df.shape

(1274, 16)

### Set the Query

To set your query you need four things:

1. The type of search. Could be:
* Publications
* Datasets
* Grants
* Patents
* Clinical trials
* Policy documents
* Reports
* Source titles
* Researchers
* Organizations

2. The Topic: Use any topic for your search. You can also use boolean operations such as 'and' or 'or'

3. Where: To limit your search, use the `where` clause to filter by year, citations, author, and many other fields.

4. Columns of the resulting query: Which are the columns that we want to return from the query. Example:

| Fieldset                     | Fields                                  |
|------------------------------|-----------------------------------------|
| extras                       | altmetric date doi funders open_access pmcid pmid relative_citation_ratio research_org_cities research_org_countries research_org_country_names research_org_state_codes research_org_state_names research_orgs researchers times_cited |
| categories                   | category_bra category_for category_hra category_hrcs_hc category_hrcs_rac category_icrp_cso category_icrp_ct category_rcdc category_sdg category_uoa |
| book                         | book_doi book_series_title book_title   |
| basics                       | authors id issue journal pages title type volume year |


For more information check the [Publication's documentation](https://docs.dimensions.ai/dsl/datasource-publications.html)



In [7]:
from dimcli.shortcuts import dslquery_json as dslquery

# Fetch paper details by PMID (single definition; helpers below keep each step small)
def _author_name(author):
    """Return "<first_name> <last_name>" for one author record (missing parts become '')."""
    return (author.get('first_name') or '') + ' ' + (author.get('last_name') or '')


def _first_affiliation(author):
    """Return (name, country) of the author's first listed affiliation, or ('', '') if none."""
    affiliations = author.get('affiliations') or []
    if not affiliations:
        return '', ''
    affiliation = affiliations[0]
    return affiliation.get('name', ''), affiliation.get('country', '')


def _funding_fields(paper):
    """Return (funder name, funder country) taken from the first entry of paper['funders'].

    The string 'None' is used as the "no information" marker, as in the
    original error path: both values are 'None' when the record has no
    'funders' field, and the country is 'None' when the first funder has no
    'country_name' field. A present-but-empty 'funders' value yields ('', '').
    """
    if 'funders' not in paper:
        return 'None', 'None'
    funders = paper['funders']
    if not funders:
        return '', ''
    funder = funders[0]
    name = funder.get('name', '')
    if 'country_name' not in funder:
        return name, 'None'
    return name, funder['country_name'] or ''


def get_paper_details(pmid):
    """Query Dimensions for one publication by PMID and summarise it.

    Parameters
    ----------
    pmid : int, float or str
        PubMed identifier. A float with an integral value (e.g. 38876802.0)
        is converted to int so the query does not contain a trailing ".0".

    Returns
    -------
    tuple(dict, dict) or None
        ``(info, paper)`` where ``paper`` is the first raw publication record
        returned by the API and ``info`` is a flat dict with the keys 'ID',
        'Title', 'Year', 'Journal', 'DOI', first/last author name, affiliation
        and country, 'Funding Country' and 'Funder'.
        ``None`` when the query returns no publication.
    """
    # f"{38876802.0}" would render as "38876802.0", which cannot match a PMID
    if isinstance(pmid, float) and pmid.is_integer():
        pmid = int(pmid)

    query = f'''
    search publications where pmid="{pmid}"
    return publications[id+title+authors+year+source_title+journal+date+doi+linkout+funder_countries+funders]
    '''
    response = dslquery(query)
    paper_details = response['publications']
    if not paper_details:
        # Explicit "no match" result (the original fell off the end of the function)
        return None

    paper = paper_details[0]
    info = {
        'ID': paper.get('id', ''),
        'Title': paper.get('title', ''),
        'Year': paper.get('year', ''),
        # `or {}` also covers a journal field that is present but empty
        'Journal': (paper.get('journal') or {}).get('title', ''),
        'DOI': paper.get('doi', ''),
        'First Author': '',
        'First Author Affiliation': '',
        'First Author Country': '',
        'Last Author': '',
        'Last Author Affiliation': '',
        'Last Author Country': '',
        'Funding Country': '',
        'Funder': ''
    }

    # First and last author; for a single-author paper both are the same record
    authors = paper.get('authors') or []
    if authors:
        first_author = authors[0]
        last_author = authors[-1]
        info['First Author'] = _author_name(first_author)
        info['Last Author'] = _author_name(last_author)
        info['First Author Affiliation'], info['First Author Country'] = _first_affiliation(first_author)
        # Read the LAST author's own affiliation (previously the first author's was copied here)
        info['Last Author Affiliation'], info['Last Author Country'] = _first_affiliation(last_author)

    # Funding details are written under the existing 'Funder' key (was misspelled 'Founder')
    info['Funder'], info['Funding Country'] = _funding_fields(paper)

    return info, paper



# Unique PMIDs to look up, taken from the loaded paper table
pmids = pd.unique(df['PMID'])

# Accumulators: flattened summaries, raw API records, and PMIDs that yielded nothing
papers_data = []
papers_data_raw = []
no_info = []
i = 0  # number of PMIDs handled without a failure; drives the progress message
for pmid in pmids:
    try:
        result = get_paper_details(pmid)
    except Exception:
        # `except Exception` instead of a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) still stops this long-running loop
        result = None

    # get_paper_details returns None when the query matches no publication;
    # treat that exactly like a failed lookup
    if result is None:
        print('No Info...')
        no_info.append(pmid)
        continue

    paper_info, info_raw = result
    if paper_info:
        papers_data.append(paper_info)
        papers_data_raw.append(info_raw)

    if i % 100 == 0:
        print(f'{i} papers extracted')
    i += 1

# Build both tables once from the collected records
papers_df = pd.DataFrame(papers_data_raw)
papers_df_prep = pd.DataFrame(papers_data)

# Display the DataFrame
# papers_df

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Field current_organization_id of the authors field is deprecated and will be removed in the next major release.
Returned Publications: 1 (total = 1)
[2mTime: 4.90s[0m
Field current_organization_id of the authors field is deprecated and will be removed in the next major release.
Returned Publications: 1 (total = 1)
[2mTime: 0.79s[0m
Field current_organization_id of the authors field is deprecated and will be removed in the next major release.
Returned Publications: 1 (total = 1)
[2mTime: 5.71s[0m
Field current_organization_id of the authors field is deprecated and will be removed in the next major release.
Returned Publications: 1 (total = 1)
[2mTime: 1.29s[0m
Field current_organization_id of the authors field is deprecated and will be removed in the next major release.
Returned Publications: 1 (total = 1)
[2mTime: 5.54s[0m
Field current_organization_id of the authors field is deprecated and will be removed in th

In [8]:
# Write the raw API records and the flattened summary table to CSV in the
# current working directory, without the DataFrame index column
papers_df.to_csv('Papers_Info_raw.csv', index=False)
papers_df_prep.to_csv('Papers_Info.csv', index=False)