<a href="https://colab.research.google.com/github/dkisselev-zz/mmc-pipeline/blob/main/Authors_List.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================================================================
# STEP 1: SETUP (IMPORTS AND AUTHENTICATION)
# ==============================================================================
import os
import io
import requests
import time
import pandas as pd
import re
from collections import defaultdict
import xml.etree.ElementTree as ET
from google.colab import auth
from google.colab import userdata
from google.colab.data_table import DataTable
from google.auth import default
import google.generativeai as genai
import gspread

# --- Authenticate to access Google Sheet ---
# This will prompt you to log in and authorize access.
# On success, `gc` is the authorized gspread client used later to open the sheet.
try:
    auth.authenticate_user()
    creds, _ = default()
    gc = gspread.authorize(creds)
    print("Authentication successful.")
except Exception as e:
    print(f"Authentication failed. Please ensure you are in a Google Colab environment. Error: {e}")

# Configure Gemini API
# Make sure you have your GOOGLE_API_KEY stored as a secret in Colab
try:
    API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=API_KEY)
    print("Gemini API configured successfully.")
except Exception as e:
    print(f"Could not configure Gemini API. Please add GOOGLE_API_KEY to your Colab secrets. Error: {e}")

# EMAIL is mandatory: the configuration cell reads it, so fail loudly here.
try:
    EMAIL = userdata.get('EMAIL')
except Exception as e:
    # userdata.get reports a missing or inaccessible secret with its own
    # exception classes rather than ValueError/FileNotFoundError, so catch
    # broadly and re-raise with the actionable message (original cause chained).
    raise ValueError("EMAIL not found in Colab secrets. Please add it.") from e

# Load NCBI API Key if it exists (optional; without it requests are paced more slowly)
try:
    NCBI_API_KEY = userdata.get('NCBI_API_KEY')
    print("NCBI API Key loaded successfully.")
except Exception:
    NCBI_API_KEY = None
    print("NCBI API Key not found in Colab secrets. Proceeding with lower rate limits.")

Authentication successful.
Gemini API configured successfully.
NCBI API Key loaded successfully.


In [None]:
# Central Configuration
# Settings read by MetadataProcessor: the Gemini model and prompt used to turn an
# affiliation string into a country, and the NCBI E-utilities endpoint details.
config = {
    "llm": {
        # Model name handed to genai.GenerativeModel when the processor is created.
        "model_name": "gemini-2.5-flash-lite",
        # Prompt template; {affiliation_text} is filled in with str.format per affiliation.
        "prompt_get_country": (
            "From the following affiliation text from a scientific paper, extract ONLY the country name. "
            "Do not add any explanation or prefixes like 'Country:'. If no country is mentioned, respond with 'Unknown'.\n\n"
            "Affiliation: {affiliation_text}"
        )
    },
    "api": {
        # Base URL to which 'esearch.fcgi' / 'efetch.fcgi' are appended.
        "ncbi_base_url": "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/",
        # Value of the EMAIL Colab secret.
        # NOTE(review): no request code visible in this notebook sends it — confirm whether it should be.
        "ncbi_api_email": EMAIL
    }
}

In [None]:
import urllib.parse # Make sure this import is at the top of your script

class MetadataProcessor:
    """
    Collects author and sample-location metadata for a study.

    Author names and affiliations are read from PubMed (looked up by DOI) and
    each affiliation is reduced to a country name by a Gemini model. Sample
    countries are read from the ``geo_loc_name`` attribute of the BioSamples
    linked to a project's SRA records.
    """
    def __init__(self, config, api_key=None):
        """
        Args:
            config: Dict with ``config["llm"]["model_name"]``,
                ``config["llm"]["prompt_get_country"]`` and
                ``config["api"]["ncbi_base_url"]``.
            api_key: Optional NCBI API key; when set it is added to every
                NCBI request and requests are paced faster.
        """
        self.config = config
        self.model = genai.GenerativeModel(config["llm"]["model_name"])
        self.base_url = config["api"]["ncbi_base_url"]
        self.api_key = api_key
        # Alternate spellings mapped to the canonical country name used downstream.
        self.country_synonyms = {
            'USA': 'United States', 'U.S.A': 'United States', 'U.S.A.': 'United States', 'United States of America': 'United States',
            'U.S.': 'United States', 'UK': 'United Kingdom', 'U.K.': 'United Kingdom',
            'P.R.C.': 'China', 'PRC': 'China',
            'The Netherlands': 'Netherlands'
        }

    def _call_gemini(self, prompt, retries=3, delay=5):
        """
        Send ``prompt`` to the Gemini model and return the stripped text reply.

        Failed attempts are retried with exponential backoff (``delay``,
        ``2*delay``, ...). After ``retries`` failures a string starting with
        "LLM Error" is returned instead of raising, so callers can test for it.
        """
        print(prompt)
        print("--------------------")
        for attempt in range(retries):
            try:
                request_options = {"timeout": 30}  # Give up on a single request after 30 seconds
                response = self.model.generate_content(
                    prompt,
                    request_options=request_options
                )
                answer = response.text.strip()
                print(answer)
                return answer
            except Exception as e:
                # Timeouts are recognized by class name so that no extra import is
                # needed here (the previous `exceptions.DeadlineExceeded` clause
                # referenced an undefined name and raised NameError on any failure).
                if type(e).__name__ == "DeadlineExceeded":
                    print(f"\n  > LLM Error: Request timed out on attempt {attempt + 1}/{retries}.")
                else:
                    print(f"\n  > LLM Error (Attempt {attempt + 1}/{retries}): {e}")

            if attempt < retries - 1:
                wait_time = delay * (2 ** attempt) # Exponential backoff
                print(f"  > Retrying LLM call in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print("  > LLM call failed after all retries.")
                return "LLM Error: Failed to get a response from the model."

        # Only reached when retries <= 0 (the loop body never ran).
        return "LLM Error: An unexpected issue occurred."

    def _make_ncbi_request(self, base_url, params=None, data=None, retries=3, delay=5):
        """
        Perform one NCBI request with pacing and retries.

        A POST is sent when ``data`` is given, otherwise a GET with ``params``.
        Status codes 429/500/502/503/504 and request exceptions are retried
        with a linearly growing wait.

        Returns:
            The raw response body (bytes), or None when every attempt failed.
        """
        request_delay = 0.1 if self.api_key else 0.4
        if self.api_key:
            # Work on copies so the caller's dicts are never modified.
            if data:
                data = {**data, 'api_key': self.api_key}
            if params:
                params = {**params, 'api_key': self.api_key}
        for attempt in range(retries):
            time.sleep(request_delay)
            try:
                if data:
                    response = requests.post(base_url, data=data, timeout=30)
                else:
                    response = requests.get(base_url, params=params, timeout=30)
                if response.status_code in [429, 500, 502, 503, 504]:
                    wait_time = delay * (attempt + 1)
                    print(f"  > WARNING: Received status {response.status_code}. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                response.raise_for_status()
                return response.content
            except requests.exceptions.RequestException as e:
                print(f"  > ERROR: Request failed on attempt {attempt + 1}/{retries}: {e}")
                if attempt < retries - 1:
                    wait_time = delay * (attempt + 1)
                    print(f"  > Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                continue
        print(f"  > All {retries} retry attempts failed.")
        return None

    def _parse_xml(self, payload, description):
        """Parse an XML payload; print an error and return None if it is malformed."""
        try:
            return ET.fromstring(payload)
        except ET.ParseError as e:
            print(f"  > ERROR: Could not parse {description}: {e}")
            return None

    def _standardize_country(self, country):
        """Map a country spelling to its canonical name, ignoring letter case."""
        name = country.strip()
        if name in self.country_synonyms:
            return self.country_synonyms[name]
        # The synonym keys are mixed-case ('The Netherlands', 'USA'), so compare
        # case-insensitively instead of upper-casing only the input.
        folded = name.casefold()
        for synonym, canonical in self.country_synonyms.items():
            if synonym.casefold() == folded:
                return canonical
        return name

    def get_country_from_affiliation(self, affiliation_text):
        """
        Uses an LLM to extract a country from author affiliation text.

        Returns "Unknown" for empty input or when the LLM call failed;
        otherwise the LLM answer normalized through ``country_synonyms``.
        """
        if not affiliation_text:
            return "Unknown"
        prompt = self.config["llm"]["prompt_get_country"].format(affiliation_text=affiliation_text)
        llm_country = self._call_gemini(prompt)
        if not llm_country or "LLM Error" in llm_country:
            return "Unknown"
        return self._standardize_country(llm_country)

    def get_author_metadata_by_doi(self, doi):
        """
        Look up a paper in PubMed by DOI and return its authors with countries.

        Args:
            doi: Any string containing a DOI (a URL or surrounding text is fine).

        Returns:
            A list of ``{'name': 'Last, First', 'country': str}`` dicts; an
            empty list when the DOI is invalid, nothing is found, or a request
            or XML parse fails.
        """
        print(f"  Fetching author data for raw DOI: {doi}")
        author_data = []
        if not isinstance(doi, str):
            print(f"  > Invalid DOI format: Not a string.")
            return []
        match = re.search(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', doi, re.IGNORECASE)
        if not match:
            print(f"  > Could not parse a valid DOI from input: '{doi}'")
            return []
        clean_doi = match.group(0)

        # Step 1: resolve the DOI to a PubMed ID.
        params = {'db': 'pubmed', 'term': f'"{clean_doi}"[aid]'}
        search_response = self._make_ncbi_request(f"{self.base_url}esearch.fcgi", params=params)
        if not search_response: return []

        pmid_root = self._parse_xml(search_response, f"PubMed search response for DOI {clean_doi}")
        if pmid_root is None: return []
        pmid = pmid_root.findtext('.//Id')
        if not pmid:
            print(f"  > No article found for cleaned DOI: {clean_doi}")
            return []

        # Step 2: fetch the article record and read its author list.
        fetch_data = {'db': 'pubmed', 'id': pmid, 'retmode': 'xml'}
        fetch_response = self._make_ncbi_request(f"{self.base_url}efetch.fcgi", data=fetch_data)
        if not fetch_response: return []

        article_root = self._parse_xml(fetch_response, f"PubMed record {pmid}")
        if article_root is None: return []

        # Co-authors frequently share an affiliation; remember resolved countries
        # for this paper so each distinct affiliation costs a single LLM call.
        affiliation_countries = {}
        for author_node in article_root.findall('.//Author'):
            lastname = author_node.findtext('LastName', '')
            forename = author_node.findtext('ForeName', '')
            author_name = f"{lastname}, {forename}".strip(', ')
            if not author_name:
                # Nameless entries (presumably group/collective authors) are never
                # reported, so skip them before spending an LLM call.
                continue
            affiliation = author_node.findtext('.//Affiliation', '')
            country = affiliation_countries.get(affiliation)
            if country is None:
                country = self.get_country_from_affiliation(affiliation)
                if country != "Unknown":
                    # "Unknown" may come from a transient LLM failure; do not cache it.
                    affiliation_countries[affiliation] = country
            author_data.append({'name': author_name, 'country': country})
        print(f"  > Found {len(author_data)} authors.")
        return author_data

    def get_sample_countries_by_project(self, project_accession):
        """
        Return the distinct sample countries recorded for a project.

        Searches SRA for ``project_accession``, collects the BioSample IDs
        referenced by those SRA records, then reads each BioSample's
        ``geo_loc_name`` attribute and keeps the part before the first ':'.

        Returns:
            A list of unique country strings (unordered); empty when nothing
            is found or a request or XML parse fails.
        """
        print(f"  Fetching sample data for Project: {project_accession}")
        countries = []
        search_params = {'db': 'sra', 'term': f"{project_accession}[BioProject]", 'retmax': '10000'}
        sra_search_response = self._make_ncbi_request(f"{self.base_url}esearch.fcgi", params=search_params)
        if not sra_search_response: return []

        sra_root = self._parse_xml(sra_search_response, f"SRA search response for {project_accession}")
        if sra_root is None: return []
        sra_ids = [elem.text for elem in sra_root.findall('.//Id')]
        if not sra_ids:
            print(f"  > No SRA records found for project {project_accession}")
            return []
        print(f"  > Found {len(sra_ids)} SRA records. Finding linked BioSamples...")

        all_biosample_ids = set()
        chunk_size = 200  # IDs per efetch call
        for i in range(0, len(sra_ids), chunk_size):
            chunk = sra_ids[i:i + chunk_size]
            fetch_data = {'db': 'sra', 'id': ','.join(chunk), 'retmode': 'xml'}
            sra_fetch_response = self._make_ncbi_request(f"{self.base_url}efetch.fcgi", data=fetch_data)
            if not sra_fetch_response: continue

            sra_data_root = self._parse_xml(sra_fetch_response, "SRA record chunk")
            if sra_data_root is None: continue  # best effort: skip the unreadable chunk
            for biosample_id_node in sra_data_root.findall(".//EXTERNAL_ID[@namespace='BioSample']"):
                if biosample_id_node.text:
                    all_biosample_ids.add(biosample_id_node.text)

        if not all_biosample_ids:
            print(f"  > No BioSample links found in SRA records.")
            return []
        biosample_id_list = list(all_biosample_ids)
        print(f"  > Found {len(biosample_id_list)} unique BioSamples. Fetching attributes...")

        for i in range(0, len(biosample_id_list), chunk_size):
            chunk = biosample_id_list[i:i + chunk_size]
            fetch_data = {'db': 'biosample', 'id': ','.join(chunk), 'retmode': 'xml'}
            fetch_response = self._make_ncbi_request(f"{self.base_url}efetch.fcgi", data=fetch_data)
            if not fetch_response: continue

            samples_root = self._parse_xml(fetch_response, "BioSample record chunk")
            if samples_root is None: continue  # best effort: skip the unreadable chunk
            for sample in samples_root.findall('BioSample'):
                for attr in sample.findall('.//Attribute[@attribute_name="geo_loc_name"]'):
                    if attr.text:
                        # Keep only the text before the first ':' (e.g. "China: Beijing" -> "China").
                        country = attr.text.split(':')[0].strip()
                        countries.append(country)

        print(f"  > Extracted {len(countries)} country data points from samples.")
        return list(set(countries))

In [None]:
# Alternative sheet configuration, kept for reference (currently disabled):
# spreadsheet_url = "https://docs.google.com/spreadsheets/d/1TfMcVgAiwPI0hDVjeYrI-z7gy7t8CxQ6I1MnWkvElfA" # @param {"type":"string"}
# worksheet_name = "Study List" # @param {"type":"string"}
# header_indx = 0 # @param {"type":"integer"}
# accession_col_name = "Accession #" # @param {"type":"string"}
# Active sheet configuration (Colab form fields).
# header_indx is the 0-based position of the header row within the sheet's values;
# every row after it is loaded as data. accession_col_name is the header of the
# column holding the project accession.
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1tBjpV_GoXIjx4_3o73Qml0h-BzFBZTD5bc3QHx1l1_4" # @param {"type":"string"}
worksheet_name = "Main Data Sheet" # @param {"type":"string"}
header_indx = 1 # @param {"type":"integer"}
accession_col_name = "AccessionCode" # @param {"type":"string"}

In [None]:
# ==============================================================================
# STEP 3: LOAD DATA FROM GOOGLE SHEET
# ==============================================================================
# Reads the configured worksheet into `input_df` (one row per sheet row after the
# header, plus a 0-based 'row_index' column) and selects the rows that still need
# processing into `rows_to_process`.
print("\n--- Loading data from Google Sheet ---")
try:
    spreadsheet = gc.open_by_url(spreadsheet_url)
    worksheet = spreadsheet.worksheet(worksheet_name)
    all_values = worksheet.get_all_values()
    header, data_rows = all_values[header_indx], all_values[header_indx + 1:]

    # Expose the positional index as an explicit 'row_index' column.
    input_df = (
        pd.DataFrame(data_rows, columns=header)
        .reset_index()
        .rename(columns={'index': 'row_index'})
    )

    if 'Processed' not in input_df.columns:
        input_df['Processed'] = ''

    # A row is pending when it is not marked processed and has an accession value.
    not_yet_processed = input_df['Processed'] == ''
    has_accession = input_df[accession_col_name] != ''
    rows_to_process = input_df[not_yet_processed & has_accession].copy()

    print(f"Loaded {len(input_df)} total records.")
    print(f"Found {len(rows_to_process)} new records to process.")
    print("--- Successfully loaded data from Google Sheet ---")
    display(DataTable(rows_to_process.head()))
except Exception as e:
    print(f"Could not load Google Sheet. Error: {e}")
    rows_to_process = pd.DataFrame()


--- Loading data from Google Sheet ---
Loaded 2329 total records.
Found 2308 new records to process.
--- Successfully loaded data from Google Sheet ---


Unnamed: 0,row_index,Author1Author2name (LastName1LastName2),Has someone done this study?,YourName,EmilysHost,Khanh Host,Host Check,DOI,Environment,StudyLink,...,Bias,Log 2 Fold Change (L2FC),number_of_authors,number_of_authors_countries,authors,authors_countries,number_of_samples_countries,sample_countries,authors_complete,Processed
0,0,ZhangLiu2020,you're good!,SamuelDegregori,Humans,human,Homo sapiens,10.3389/fcimb.2019.00476,Oral,https://pmc.ncbi.nlm.nih.gov/articles/pmid/320...,...,,,4,1,"Zhang, Ling; Liu, Yuan; Zheng, Hua Jun; Zhang,...",China,1,China,Yes,
1,1,Lev-SagieGoldman-Wohl2019,you're good!,EmilySong,Humans,human,Homo sapiens,10.1038/s41591-019-0600-6,Vaginal,https://www.nature.com/articles/s41591-019-060...,...,805.0,2.967543788,11,1,"Lev-Sagie, Ahinoam; Goldman-Wohl, Debra; Cohen...",Israel,0,,Yes,
2,2,BaldiBraat2024,you're good!,SolanaCallaway,Humans,human,Homo sapiens,10.1038/s41467-024-53013-x,Gut,https://pubmed.ncbi.nlm.nih.gov/39367018/,...,-1615.0,-3.842687995,19,4,"Baldi, Andrew; Braat, Sabine; Hasan, Mohammed ...",Australia; Bangladesh; United Kingdom; United ...,1,Bangladesh,Yes,
3,3,HosangCanals2022,you're good!,VictoriaXu,Rats,rat,Rattus norvegicus,10.1038/s41586-022-04427-4,Pulmonary,https://www.nature.com/articles/s41586-022-044...,...,54.0,1.700439718,7,1,"Hosang, Leon; Canals, Roger Cugota; van der Fl...",Germany,1,Germany,Yes,
4,4,XiaoCai2022,you're good!,AnjaliVinodh,Humans,human,Homo sapiens,10.1128/spectrum.01901-21,Pulmonary,https://pmc.ncbi.nlm.nih.gov/articles/PMC88654...,...,-8.0,-0.202816883,15,1,"Xiao, Guohui; Cai, Zhao; Guo, Qinglong; Ye, Ta...",China,1,China,Yes,


In [None]:
def find_first_valid_doi(row_of_dois):
    """Return the first DOI found among the given cell values, or "" if none contains one."""
    for entry in row_of_dois:
        # Use regex to find a valid DOI pattern within the entry's string representation
        match = re.search(r'(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', str(entry), re.IGNORECASE)
        if match:
            return match.group(0)
    return ""


print("Searching for and consolidating duplicate 'DOI' columns...")

if 'DOI' in input_df.columns:
    # Select by mask so the result is always a DataFrame, whether the sheet has
    # one 'DOI' column or several with the same header. (Previously a single
    # 'DOI' column skipped this step entirely and no 'doi' column was created.)
    doi_column_mask = input_df.columns == 'DOI'
    doi_cols_df = input_df.loc[:, doi_column_mask]

    # One clean DOI per row: the first valid DOI across that row's 'DOI' cells.
    # Plain tuples are used because duplicate column names rule out named access.
    doi_values = [
        find_first_valid_doi(cells)
        for cells in doi_cols_df.itertuples(index=False, name=None)
    ]

    # Drop every original 'DOI' column and append the single clean 'doi' column.
    input_df = input_df.loc[:, ~doi_column_mask].copy()
    input_df['doi'] = doi_values

    if doi_cols_df.shape[1] > 1:
        print("Successfully consolidated duplicate 'DOI' columns into a single 'doi' column.")
    else:
        print("Cleaned the single 'DOI' column into a 'doi' column.")

elif 'doi' in input_df.columns:
    # Happens when this cell is re-run after a previous consolidation.
    print("No duplicate 'DOI' columns found, or 'DOI' column is already clean. Skipping consolidation.")

else:
    raise KeyError("The sheet has neither a 'DOI' nor a 'doi' column; cannot determine DOIs.")

# Display the first few rows with the new, clean 'doi' column
display(input_df[['row_index', 'doi', accession_col_name]].head())

Searching for and consolidating duplicate 'DOI' columns...
Successfully consolidated duplicate 'DOI' columns into a single 'doi' column.


Unnamed: 0,row_index,doi,AccessionCode
0,0,10.3389/fcimb.2019.00476,PRJNA533177
1,1,10.1038/s41591-019-0600-6,PRJEB34085
2,2,10.1038/s41467-024-53013-x,PRJNA1081952
3,3,10.1038/s41586-022-04427-4,PRJNA789820
4,4,10.1128/spectrum.01901-21,PRJNA655567


In [None]:
# 3.1: Initialize the processor
processor = MetadataProcessor(config, api_key=NCBI_API_KEY)

# 3.2: Input expectations
# input_df must provide a 'row_index' column, a 'doi' column and the accession
# column named by `accession_col_name`.

# 3.3: Main processing loop (with accession validation)

# Accession prefixes accepted for the sample lookup
NCBI_PREFIXES = ('PRJDB', 'PRJNA', 'PRJEB', 'SRP', 'ERP', 'DRP')

results = []
for _, row in input_df.iterrows():
    print(f"\n--- Processing Row {row['row_index']} ---")
    # if row['row_index'] < 1608:
    #   continue

    # Get values from the current row. The accession column comes from the sheet
    # configuration rather than a hard-coded header, matching the loading cell.
    doi_object = row['doi']
    project_id_raw = row[accession_col_name]

    # Ensure the doi value is a string before processing.
    doi_as_string = str(doi_object)

    # --- Author data can be processed regardless of the project ID ---
    authors_metadata = processor.get_author_metadata_by_doi(doi_as_string)
    author_names = [author['name'] for author in authors_metadata]
    author_countries = sorted(
        {author['country'] for author in authors_metadata if author['country'] != 'Unknown'}
    )

    # --- Sample data processing with VALIDATION ---
    # Only the first accession of a comma-separated cell is used.
    project_id = str(project_id_raw).split(',')[0].strip().upper()

    sample_countries = [] # Default to an empty list

    # Check if the (now single) accession is valid before processing
    if project_id and project_id.startswith(NCBI_PREFIXES):
        sample_countries = sorted(processor.get_sample_countries_by_project(project_id))
    else:
        # Handle invalid or missing project IDs gracefully
        print(f"  > Skipping sample data search: Invalid or missing Project Accession '{project_id_raw}'")

    # --- Compile results for this row ---
    results.append({
        'row_index': row['row_index'],
        'doi': doi_as_string,
        'number_of_authors': len(author_names),
        'number_of_authors_countries': len(author_countries),
        'authors': "; ".join(author_names),
        'authors_countries': "; ".join(author_countries),
        'number_of_samples_countries': len(sample_countries),
        'sample_countries': "; ".join(sample_countries)
    })

print("\n--- Processing complete. ---")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

--- Processing Row 1378 ---

--- Processing Row 1379 ---

--- Processing Row 1380 ---

--- Processing Row 1381 ---

--- Processing Row 1382 ---

--- Processing Row 1383 ---

--- Processing Row 1384 ---

--- Processing Row 1385 ---

--- Processing Row 1386 ---

--- Processing Row 1387 ---

--- Processing Row 1388 ---

--- Processing Row 1389 ---

--- Processing Row 1390 ---

--- Processing Row 1391 ---

--- Processing Row 1392 ---

--- Processing Row 1393 ---

--- Processing Row 1394 ---

--- Processing Row 1395 ---

--- Processing Row 1396 ---

--- Processing Row 1397 ---

--- Processing Row 1398 ---

--- Processing Row 1399 ---

--- Processing Row 1400 ---

--- Processing Row 1401 ---

--- Processing Row 1402 ---

--- Processing Row 1403 ---

--- Processing Row 1404 ---

--- Processing Row 1405 ---

--- Processing Row 1406 ---

--- Processing Row 1407 ---

--- Processing Row 1408 ---

--- Processing Row 1409 ---

--- Pr

NameError: name 'exceptions' is not defined

In [None]:
# 1. Create a DataFrame from the results list generated by your main loop
# (one dict per processed row: 'row_index', 'doi', and the author/sample summary fields).
results_df = pd.DataFrame(results)

# --- Define the mapping and cleaning functions ---

# A dictionary to standardize common country name variations
country_name_variations = {
    'USA': 'United States',
    'United States of America': 'United States',
    'U.S.A': 'United States',
    'U.S.A.': 'United States',
    'U.S.': 'United States',
    'UK': 'United Kingdom',
    'U.K.': 'United Kingdom',
    'England': 'United Kingdom',
    'P.R.C.': 'China',
    'PRC': 'China',
    'People\'s Republic of China': 'China',
    'The Netherlands': 'Netherlands',
    'Korea': 'South Korea', # General 'Korea' maps to South Korea
    'Republic of Korea': 'South Korea',
    'Korea, Rep.': 'South Korea'
}

# The World Bank country-to-region mapping
country_to_region = {
    'Aruba': 'Latin America & Caribbean', 'Afghanistan': 'South Asia',
    'Angola': 'Sub-Saharan Africa', 'Albania': 'Europe & Central Asia', 'Andorra': 'Europe & Central Asia',
    'United Arab Emirates': 'Middle East & North Africa', 'Argentina': 'Latin America & Caribbean',
    'Armenia': 'Europe & Central Asia', 'American Samoa': 'East Asia & Pacific', 'Antigua and Barbuda': 'Latin America & Caribbean',
    'Australia': 'East Asia & Pacific', 'Austria': 'Europe & Central Asia', 'Azerbaijan': 'Europe & Central Asia',
    'Burundi': 'Sub-Saharan Africa', 'Belgium': 'Europe & Central Asia', 'Benin': 'Sub-Saharan Africa',
    'Burkina Faso': 'Sub-Saharan Africa', 'Bangladesh': 'South Asia', 'Bulgaria': 'Europe & Central Asia',
    'Bahrain': 'Middle East & North Africa', 'Bahamas, The': 'Latin America & Caribbean',
    'Bosnia and Herzegovina': 'Europe & Central Asia', 'Belarus': 'Europe & Central Asia', 'Belize': 'Latin America & Caribbean',
    'Bermuda': 'North America', 'Bolivia': 'Latin America & Caribbean', 'Brazil': 'Latin America & Caribbean',
    'Barbados': 'Latin America & Caribbean', 'Brunei Darussalam': 'East Asia & Pacific', 'Bhutan': 'South Asia',
    'Botswana': 'Sub-Saharan Africa', 'Central African Republic': 'Sub-Saharan Africa', 'Canada': 'North America',
    'Switzerland': 'Europe & Central Asia', 'Channel Islands': 'Europe & Central Asia', 'Chile': 'Latin America & Caribbean',
    'China': 'East Asia & Pacific', "Côte d'Ivoire": 'Sub-Saharan Africa', 'Cameroon': 'Sub-Saharan Africa',
    'Congo, Dem. Rep.': 'Sub-Saharan Africa', 'Congo, Rep.': 'Sub-Saharan Africa', 'Colombia': 'Latin America & Caribbean',
    'Comoros': 'Sub-Saharan Africa', 'Cabo Verde': 'Sub-Saharan Africa', 'Costa Rica': 'Latin America & Caribbean',
    'Cuba': 'Latin America & Caribbean', 'Curaçao': 'Latin America & Caribbean', 'Cayman Islands': 'Latin America & Caribbean',
    'Cyprus': 'Europe & Central Asia', 'Czechia': 'Europe & Central Asia', 'Germany': 'Europe & Central Asia',
    'Djibouti': 'Middle East & North Africa', 'Dominica': 'Latin America & Caribbean', 'Denmark': 'Europe & Central Asia',
    'Dominican Republic': 'Latin America & Caribbean', 'Algeria': 'Middle East & North Africa', 'Ecuador': 'Latin America & Caribbean',
    'Egypt, Arab Rep.': 'Middle East & North Africa', 'Eritrea': 'Sub-Saharan Africa', 'Spain': 'Europe & Central Asia',
    'Estonia': 'Europe & Central Asia', 'Ethiopia': 'Sub-Saharan Africa', 'Finland': 'Europe & Central Asia',
    'Fiji': 'East Asia & Pacific', 'France': 'Europe & Central Asia', 'Faroe Islands': 'Europe & Central Asia',
    'Micronesia, Fed. Sts.': 'East Asia & Pacific', 'Gabon': 'Sub-Saharan Africa', 'United Kingdom': 'Europe & Central Asia',
    'Georgia': 'Europe & Central Asia', 'Ghana': 'Sub-Saharan Africa', 'Gibraltar': 'Europe & Central Asia',
    'Guinea': 'Sub-Saharan Africa', 'Gambia, The': 'Sub-Saharan Africa', 'Guinea-Bissau': 'Sub-Saharan Africa',
    'Equatorial Guinea': 'Sub-Saharan Africa', 'Greece': 'Europe & Central Asia', 'Grenada': 'Latin America & Caribbean',
    'Greenland': 'Europe & Central Asia', 'Guatemala': 'Latin America & Caribbean', 'Guam': 'East Asia & Pacific',
    'Guyana': 'Latin America & Caribbean', 'Hong Kong SAR, China': 'East Asia & Pacific', 'Honduras': 'Latin America & Caribbean',
    'Croatia': 'Europe & Central Asia', 'Haiti': 'Latin America & Caribbean', 'Hungary': 'Europe & Central Asia',
    'Indonesia': 'East Asia & Pacific', 'Isle of Man': 'Europe & Central Asia', 'India': 'South Asia',
    'Ireland': 'Europe & Central Asia', 'Iran': 'Middle East & North Africa', 'Iran, Islamic Rep.': 'Middle East & North Africa',
    'Iraq': 'Middle East & North Africa', 'Iceland': 'Europe & Central Asia', 'Israel': 'Middle East & North Africa',
    'Italy': 'Europe & Central Asia', 'Jamaica': 'Latin America & Caribbean', 'Jordan': 'Middle East & North Africa',
    'Japan': 'East Asia & Pacific', 'Kazakhstan': 'Europe & Central Asia', 'Kenya': 'Sub-Saharan Africa',
    'Kyrgyz Republic': 'Europe & Central Asia', 'Cambodia': 'East Asia & Pacific', 'Kiribati': 'East Asia & Pacific',
    'St. Kitts and Nevis': 'Latin America & Caribbean', 'South Korea': 'East Asia & Pacific', 'Kuwait': 'Middle East & North Africa',
    'Lao PDR': 'East Asia & Pacific', 'Lebanon': 'Middle East & North Africa', 'Liberia': 'Sub-Saharan Africa',
    'Libya': 'Middle East & North Africa', 'St. Lucia': 'Latin America & Caribbean', 'Liechtenstein': 'Europe & Central Asia',
    'Sri Lanka': 'South Asia', 'Lesotho': 'Sub-Saharan Africa', 'Lithuania': 'Europe & Central Asia',
    'Luxembourg': 'Europe & Central Asia', 'Latvia': 'Europe & Central Asia', 'Macao SAR, China': 'East Asia & Pacific',
    'St. Martin (French part)': 'Latin America & Caribbean', 'Morocco': 'Middle East & North Africa',
    'Monaco': 'Europe & Central Asia', 'Moldova': 'Europe & Central Asia', 'Madagascar': 'Sub-Saharan Africa',
    'Maldives': 'South Asia', 'Mexico': 'Latin America & Caribbean', 'Marshall Islands': 'East Asia & Pacific',
    'North Macedonia': 'Europe & Central Asia', 'Mali': 'Sub-Saharan Africa', 'Malta': 'Middle East & North Africa',
    'Myanmar': 'East Asia & Pacific', 'Montenegro': 'Europe & Central Asia', 'Mongolia': 'East Asia & Pacific',
    'Northern Mariana Islands': 'East Asia & Pacific', 'Mozambique': 'Sub-Saharan Africa', 'Mauritania': 'Sub-Saharan Africa',
    'Mauritius': 'Sub-Saharan Africa', 'Malawi': 'Sub-Saharan Africa', 'Malaysia': 'East Asia & Pacific',
    'Namibia': 'Sub-Saharan Africa', 'New Caledonia': 'East Asia & Pacific', 'Niger': 'Sub-Saharan Africa',
    'Nigeria': 'Sub-Saharan Africa', 'Nicaragua': 'Latin America & Caribbean', 'Netherlands': 'Europe & Central Asia',
    'Norway': 'Europe & Central Asia', 'Nepal': 'South Asia', 'Nauru': 'East Asia & Pacific',
    'New Zealand': 'East Asia & Pacific', 'Oman': 'Middle East & North Africa',
    'Pakistan': 'South Asia', 'Panama': 'Latin America & Caribbean', 'Peru': 'Latin America & Caribbean',
    'Philippines': 'East Asia & Pacific', 'Palau': 'East Asia & Pacific', 'Papua New Guinea': 'East Asia & Pacific',
    'Poland': 'Europe & Central Asia', 'Puerto Rico': 'Latin America & Caribbean', "Korea, Dem. People's Rep.": 'East Asia & Pacific',
    'Portugal': 'Europe & Central Asia', 'Paraguay': 'Latin America & Caribbean', 'West Bank and Gaza': 'Middle East & North Africa',
    'French Polynesia': 'East Asia & Pacific', 'Qatar': 'Middle East & North Africa',
    'Romania': 'Europe & Central Asia', 'Russia': 'Europe & Central Asia', 'Rwanda': 'Sub-Saharan Africa',
    'Saudi Arabia': 'Middle East & North Africa', 'Sudan': 'Sub-Saharan Africa', 'Senegal': 'Sub-Saharan Africa',
    'Singapore': 'East Asia & Pacific', 'Solomon Islands': 'East Asia & Pacific', 'Sierra Leone': 'Sub-Saharan Africa',
    'El Salvador': 'Latin America & Caribbean', 'San Marino': 'Europe & Central Asia', 'Somalia': 'Sub-Saharan Africa',
    'Serbia': 'Europe & Central Asia', 'South Sudan': 'Sub-Saharan Africa', 'São Tomé and Principe': 'Sub-Saharan Africa',
    'Suriname': 'Latin America & Caribbean', 'Slovak Republic': 'Europe & Central Asia', 'Slovenia': 'Europe & Central Asia',
    'Sweden': 'Europe & Central Asia', 'Eswatini': 'Sub-Saharan Africa', 'Sint Maarten (Dutch part)': 'Latin America & Caribbean',
    'Seychelles': 'Sub-Saharan Africa', 'Syrian Arab Republic': 'Middle East & North Africa',
    'Turks and Caicos Islands': 'Latin America & Caribbean', 'Chad': 'Sub-Saharan Africa', 'Togo': 'Sub-Saharan Africa',
    'Thailand': 'East Asia & Pacific', 'Tajikistan': 'Europe & Central Asia', 'Turkmenistan': 'Europe & Central Asia',
    'Timor-Leste': 'East Asia & Pacific', 'Tonga': 'East Asia & Pacific', 'Trinidad and Tobago': 'Latin America & Caribbean',
    'Tunisia': 'Middle East & North Africa', 'Türkiye': 'Europe & Central Asia',
    'Tuvalu': 'East Asia & Pacific', 'Tanzania': 'Sub-Saharan Africa', 'Uganda': 'Sub-Saharan Africa',
    'Ukraine': 'Europe & Central Asia', 'Uruguay': 'Latin America & Caribbean', 'United States': 'North America',
    'Uzbekistan': 'Europe & Central Asia', 'St. Vincent and the Grenadines': 'Latin America & Caribbean',
    'Venezuela, RB': 'Latin America & Caribbean', 'British Virgin Islands': 'Latin America & Caribbean',
    'Virgin Islands (U.S.)': 'Latin America & Caribbean', 'Viet Nam': 'East Asia & Pacific', 'Vanuatu': 'East Asia & Pacific',
    'Samoa': 'East Asia & Pacific', 'Kosovo': 'Europe & Central Asia', 'Yemen, Rep.': 'Middle East & North Africa',
    'South Africa': 'Sub-Saharan Africa', 'Zambia': 'Sub-Saharan Africa', 'Zimbabwe': 'Sub-Saharan Africa'
}

# --- Main function to map countries to regions ---
def map_countries_to_regions(country_string, name_variations=None, region_lookup=None):
    """Translate a ';'-separated list of countries into a '; '-joined list of regions.

    Args:
        country_string: Country names separated by ';'. Any value that is not a
            non-empty string yields "".
        name_variations: Optional mapping from alternate spellings to the
            canonical country name. Defaults to the module-level
            ``country_name_variations``.
        region_lookup: Optional mapping from canonical country name to region.
            Defaults to the module-level ``country_to_region``.

    Returns:
        The distinct regions in first-seen order, joined with "; ". A country
        that is missing from the lookup contributes 'Unknown'. Empty fragments
        (e.g. from a trailing ';') are ignored.
    """
    if not isinstance(country_string, str) or not country_string:
        return ""  # Return an empty string if input is invalid

    # Resolve the defaults at call time so the notebook-level tables are used
    # unless the caller supplies its own.
    if name_variations is None:
        name_variations = country_name_variations
    if region_lookup is None:
        region_lookup = country_to_region

    regions = []
    for country in country_string.split(';'):
        clean_country = country.strip()
        if not clean_country:
            # A trailing or doubled ';' leaves an empty fragment; it is not a
            # country, so it must not be reported as 'Unknown'.
            continue
        standardized_country = name_variations.get(clean_country, clean_country)
        regions.append(region_lookup.get(standardized_country, 'Unknown'))

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return "; ".join(dict.fromkeys(regions))

# Add an 'authors_regions' column to final_df: every value of
# 'authors_countries' is passed through map_countries_to_regions.
# Note that this modifies final_df in place.
final_df['authors_regions'] = final_df['authors_countries'].apply(map_countries_to_regions)

# Disabled step, kept for reference: rebuild final_df by left-merging
# results_df onto the row ids of input_df and blanking missing values.
# final_df = pd.merge(input_df[['row_index']], results_df, on='row_index', how='left')
# final_df.fillna('', inplace=True)

# Show the first rows of the country and region columns side by side to verify the mapping
print("--- Final Output Data (with authors_regions) ---")
display(final_df[['authors_countries', 'authors_regions']].head())

# Disabled step, kept for reference: save final_df (including the new column) to CSV
# output_filename = 'final_metadata_output_with_regions.csv'
# final_df.to_csv(output_filename, index=False)
# print(f"\\n✅ Successfully saved final data to '{output_filename}'")

--- Final Output Data (with authors_regions) ---


Unnamed: 0_level_0,authors_countries,authors_regions
row_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,China,East Asia & Pacific
1,Israel,Middle East & North Africa
2,Australia; Bangladesh; United Kingdom; United ...,East Asia & Pacific; South Asia; Europe & Cent...
3,Germany,Europe & Central Asia
4,China,East Asia & Pacific


In [None]:
# Preview the first rows of results_df (head() defaults to five rows).
results_df.head()

Unnamed: 0,row_index,doi,number_of_authors,number_of_authors_countries,authors,authors_countries,number_of_samples_countries,sample_countries,authors_regions
0,1608,10.3389/fcimb.2021.823262,9,1,"Yan, Hang; Qin, Qian; Chen, Jengfeng; Yan, Su;...",China,0,,East Asia & Pacific
1,1609,10.3390/nu11010051,9,2,"de la Cuesta-Zuluaga, Jacobo; Mueller, Noel T;...",Colombia; United States,0,,Latin America & Caribbean; North America
2,1610,10.3390/microorganisms9050897,14,3,"Therdtatha, Phatthanaphong; Song, Yayi; Tanaka...",Indonesia; Japan; Singapore,0,,East Asia & Pacific
3,1611,10.1186/s40168-021-01069-y,18,1,"Newman, Tiffany M; Shively, Carol A; Register,...",United States,0,,North America
4,1612,10.3389/fonc.2022.837525,22,2,"Fang, Chao; Fang, Wenfeng; Xu, Liqin; Gao, Fan...",China; Denmark,0,,East Asia & Pacific; Europe & Central Asia


In [None]:
# Spot check for a single DOI: build a MetadataProcessor from the notebook's
# config and the NCBI API key, then request the author metadata for one paper.
# The return value of the last call is what the cell displays.
# NOTE(review): the recorded output prints the same affiliation prompt several
# times in a row -- presumably one model call per author even when the
# affiliation text repeats; confirm in MetadataProcessor and consider caching.
processor = MetadataProcessor(config, api_key=NCBI_API_KEY)
processor.get_author_metadata_by_doi('10.3389/fcimb.2021.823262')

  Fetching author data for raw DOI: 10.3389/fcimb.2021.823262
From the following affiliation text from a scientific paper, extract ONLY the country name. Do not add any explanation or prefixes like 'Country:'. If no country is mentioned, respond with 'Unknown'.

Affiliation: Health Management Center, The First Affiliated Hospital of Zhengzhou University, Zhengzhou, China.
--------------------
China
From the following affiliation text from a scientific paper, extract ONLY the country name. Do not add any explanation or prefixes like 'Country:'. If no country is mentioned, respond with 'Unknown'.

Affiliation: Health Management Center, The First Affiliated Hospital of Zhengzhou University, Zhengzhou, China.
--------------------
China
From the following affiliation text from a scientific paper, extract ONLY the country name. Do not add any explanation or prefixes like 'Country:'. If no country is mentioned, respond with 'Unknown'.

Affiliation: Health Management Center, The First Affiliat

[{'name': 'Yan, Hang', 'country': 'China'},
 {'name': 'Qin, Qian', 'country': 'China'},
 {'name': 'Chen, Jengfeng', 'country': 'China'},
 {'name': 'Yan, Su', 'country': 'China'},
 {'name': 'Li, Tiantian', 'country': 'China'},
 {'name': 'Gao, Xinxin', 'country': 'China'},
 {'name': 'Yang, Yang', 'country': 'China'},
 {'name': 'Li, Ang', 'country': 'China'},
 {'name': 'Ding, Suying', 'country': 'China'}]

In [None]:
    # Checkpoint the kernel state with dill so it can be reloaded later via
    # dill.load_session (see the disabled line at the bottom of this cell).
    import dill

    # Save the current session
    dill.dump_session('session.pkl')

    # Later, load the session. Session files are pickles: only load a
    # 'session.pkl' that you created yourself.
    # dill.load_session('session.pkl')

In [None]:
# Snapshot files, ordered oldest -> newest.
# On conflicts the later file wins; earlier files only fill in blanks.
files = [
    'final_metadata_output_0.csv',
    'final_metadata_output_1.csv',
    'final_metadata_output_2.csv'
]

# Read every snapshot keyed by 'row_index'. Only truly empty cells are treated
# as missing (NaN); the default NA markers are switched off so that other
# strings are kept as real values.
all_dfs = [
    pd.read_csv(csv_path, index_col='row_index', keep_default_na=False, na_values=[''])
    for csv_path in files
]

# The newest snapshot is the base; walk the remaining ones from newer to older
# and let each fill only the cells that are still blank.
final_df = all_dfs[-1]
for older_df in reversed(all_dfs[:-1]):
    final_df = final_df.combine_first(older_df)

print("Final DataFrame with Blanks Filled:")
print(final_df)

# Persist the combined frame ('row_index' is written as the index column)
final_df.to_csv('final_complete_data.csv')

Final DataFrame with Blanks Filled:
                                  doi  number_of_authors  \
row_index                                                  
0            10.3389/fcimb.2019.00476                4.0   
1           10.1038/s41591-019-0600-6               11.0   
2          10.1038/s41467-024-53013-x               19.0   
3          10.1038/s41586-022-04427-4                7.0   
4           10.1128/spectrum.01901-21               15.0   
...                               ...                ...   
2324                              NaN                NaN   
2325                              NaN                NaN   
2326                              NaN                NaN   
2327                              NaN                NaN   
2328                              NaN                NaN   

           number_of_authors_countries  \
row_index                                
0                                  1.0   
1                                  1.0   
2              

In [None]:
# final_df can reach this cell in two shapes: with 'row_index' as its index
# (when it was built from CSVs read with index_col='row_index') or with
# 'row_index' as an ordinary column (when it was built with pd.merge).
# Writing with index=False in the first case would silently drop 'row_index'
# from the file, so the index is written whenever it carries a name.
final_df.to_csv('final_merged_output.csv', index=final_df.index.name is not None)


In [None]:
# Show the first ten rows of df
df.head(10)

Unnamed: 0_level_0,doi,number_of_authors,number_of_authors_countries,authors,authors_countries,number_of_samples_countries,sample_countries,authors_regions
row_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10.3389/fcimb.2019.00476,4.0,1.0,"Zhang, Ling; Liu, Yuan; Zheng, Hua Jun; Zhang,...",China,1.0,China,East Asia & Pacific
1,10.1038/s41591-019-0600-6,11.0,1.0,"Lev-Sagie, Ahinoam; Goldman-Wohl, Debra; Cohen...",Israel,0.0,,Middle East & North Africa
2,10.1038/s41467-024-53013-x,19.0,4.0,"Baldi, Andrew; Braat, Sabine; Hasan, Mohammed ...",Australia; Bangladesh; United Kingdom; United ...,1.0,Bangladesh,East Asia & Pacific; South Asia; Europe & Cent...
3,10.1038/s41586-022-04427-4,7.0,1.0,"Hosang, Leon; Canals, Roger Cugota; van der Fl...",Germany,1.0,Germany,Europe & Central Asia
4,10.1128/spectrum.01901-21,15.0,1.0,"Xiao, Guohui; Cai, Zhao; Guo, Qinglong; Ye, Ta...",China,1.0,China,East Asia & Pacific
5,10.1038/s41598-022-07995-7,7.0,2.0,"Zuo, Wenxuan; Wang, Beibei; Bai, Xin; Luan, Yi...",China; United States,1.0,USA,East Asia & Pacific; North America
6,10.1038/s41522-021-00185-9,32.0,1.0,"Opron, Kristopher; Begley, Lesa A; Erb-Downwar...",United States,2.0,not applicable; not collected,North America
7,10.1164/rccm.202308-1326OC,12.0,2.0,"Combs, Michael P; Luth, Jenna E; Falkowski, Ni...",Canada; United States,1.0,USA,North America
8,10.1371/journal.pone.0137318,8.0,1.0,"Koopman, Jessica E; van der Kaaij, Nicoline C ...",The Netherlands,1.0,Netherlands,Europe & Central Asia
9,10.1186/s40168-019-0683-9,17.0,1.0,"Liu, Honghong; Chen, Xi; Hu, Xiaomin; Niu, Hai...",China,1.0,China,East Asia & Pacific


In [None]:
# 4.1: Create a DataFrame from the results
results_df = pd.DataFrame(results)

# 4.2: Left-merge onto the input row ids so every input row is kept, in input
# order, even when it produced no result; missing values become ''.
# fillna is chained instead of being called with inplace=True: filling a string
# into numeric columns in place is flagged by pandas as deprecated, while the
# chained form returns a new frame with the same values.
final_df = pd.merge(
    input_df[['row_index']], results_df, on='row_index', how='left'
).fillna('')

# 4.3: Save the final DataFrame to a CSV file ('row_index' is a regular column
# here, so the positional index is not written)
output_filename = 'final_metadata_output.csv'
final_df.to_csv(output_filename, index=False)

print(f"\n✅ Successfully saved final data to '{output_filename}'")

# 4.4: Display the final result as an interactive Colab table
print("--- Final Output Data ---")
DataTable(final_df)


✅ Successfully saved final data to 'final_metadata_output.csv'
--- Final Output Data ---


  final_df.fillna('', inplace=True)


Unnamed: 0,row_index,doi,number_of_authors,number_of_authors_countries,authors,authors_countries,number_of_samples_countries,sample_countries
0,0,10.3389/fcimb.2019.00476,4.0,1.0,"Zhang, Ling; Liu, Yuan; Zheng, Hua Jun; Zhang,...",China,1.0,China
1,1,10.1038/s41591-019-0600-6,11.0,1.0,"Lev-Sagie, Ahinoam; Goldman-Wohl, Debra; Cohen...",Israel,0.0,
2,2,10.1038/s41467-024-53013-x,19.0,4.0,"Baldi, Andrew; Braat, Sabine; Hasan, Mohammed ...",Australia; Bangladesh; United Kingdom; United ...,1.0,Bangladesh
3,3,10.1038/s41586-022-04427-4,7.0,1.0,"Hosang, Leon; Canals, Roger Cugota; van der Fl...",Germany,1.0,Germany
4,4,10.1128/spectrum.01901-21,15.0,1.0,"Xiao, Guohui; Cai, Zhao; Guo, Qinglong; Ye, Ta...",China,1.0,China
...,...,...,...,...,...,...,...,...
2324,2324,,,,,,,
2325,2325,,,,,,,
2326,2326,,,,,,,
2327,2327,,,,,,,
