In [1]:
import pandas as pd
from ete3 import NCBITaxa
from tqdm import tqdm

In [2]:
def is_valid_scientific_name(name, ncbi):
    """Report whether ``name`` resolves to at least one taxid through ``ncbi``.

    Parameters
    ----------
    name : object
        Candidate scientific name, typically a cell value read from a
        spreadsheet. Anything that is not a non-blank string (for example the
        float NaN pandas uses for empty cells, or ``None``) is treated as
        invalid without querying the taxonomy backend.
    ncbi : object
        Object exposing ``get_name_translator(list_of_names)`` and returning a
        mapping from each recognised name to its taxids.

    Returns
    -------
    bool
        ``True`` when the translator returned an entry for ``name``; ``False``
        when it did not, when ``name`` is not a usable string, or when the
        lookup raised.
    """
    # Empty / non-text cells cannot be scientific names; rejecting them here
    # avoids sending them to the lookup and printing a spurious "Error: ..."
    # line for every blank cell.
    if not isinstance(name, str) or not name.strip():
        return False

    try:
        taxid = ncbi.get_name_translator([name]).get(name)
        return taxid is not None
    except Exception as e:
        # Best-effort: a failed lookup is reported and counted as invalid so a
        # single bad name does not abort a whole validation run.
        print(f"Error: {e}")
        return False

def validate_scientific_names(file_path, column_name):
    """Split the names in one Excel column into recognised and unrecognised.

    Parameters
    ----------
    file_path : str
        Path of the Excel workbook, read with ``pd.read_excel``.
    column_name : str
        Header of the column whose values are checked.

    Returns
    -------
    tuple[list, list]
        ``(valid_names, invalid_names)``; each list keeps the order in which
        its entries appear in the column.

    Raises
    ------
    ValueError
        If ``column_name`` is not a column of the workbook.
    """
    table = pd.read_excel(file_path)
    if column_name not in table.columns:
        raise ValueError(f"Column '{column_name}' not found in the Excel file.")

    taxonomy = NCBITaxa()
    # Keyed by the validity verdict so each name is filed with one lookup.
    buckets = {True: [], False: []}

    progress = tqdm(table[column_name], desc="Validating names", unit="name")
    for candidate in progress:
        verdict = bool(is_valid_scientific_name(candidate, taxonomy))
        buckets[verdict].append(candidate)

    return buckets[True], buckets[False]

# Input workbook and the column holding the names to check.
file_path = "C:/Users/USER/FPIDatabase_RA_100.xlsx"
column_name = "ScientificName"

try:
    valid_names, invalid_names = validate_scientific_names(file_path, column_name)

    # One print call per section; sep="\n" places each list on the line
    # directly below its heading.
    print("Valid Scientific Names:", valid_names, sep="\n")
    print("\nInvalid Scientific Names:", invalid_names, sep="\n")
except Exception as err:
    # Show only the failure message so the cell still runs to completion.
    print(f"Error: {err}")

Validating names: 100%|██████████| 99/99 [00:00<00:00, 1697.63name/s]

Valid Scientific Names:
['Abelmoschus ficulneus', 'Abelmoschus manihot', 'Abelmoschus moschatus', 'Abelmoschus moschatus subsp. tuberosus', 'Abies firma', 'Abies grandis', 'Abobra tenuifolia', 'Abronia fragrans', 'Abronia latifolia', 'Abrus precatorius', 'Bakeridesia esculenta', 'Abutilon indicum', 'Abutilon otocarpum', 'Abutilon theophrasti', 'Acacia acradenia', 'Acacia acuminata', 'Acacia adoxa', 'Acacia adsurgens', 'Faidherbia albida', 'Acacia ammobia', 'Acacia ampliceps', 'Acacia anaticeps', 'Acacia ancistrocarpa', 'Acacia aneura', 'Senegalia ataxacantha', 'Acacia auriculiformis', 'Acacia beauverdiana', 'Acacia bivenosa', 'Acacia brachystachya', 'Acacia calcicola', 'Acacia cambagei', 'Acacia citrinoviridis', 'Acacia cochliacantha', 'Acacia colei', 'Acacia complanata', 'Acacia concinna', 'Acacia concurrens', 'Acacia coriacea', 'Acacia cowleana', 'Acacia craspedocarpa', 'Acacia crassicarpa', 'Acacia decora', 'Acacia decurrens', 'Acacia dictyophleba', 'Acacia difficilis', 'Acacia drep




In [1]:
import pandas as pd
from ete3 import NCBITaxa
from tqdm import tqdm

In [2]:
def is_valid_scientific_name(name, ncbi):
    """Report whether ``name`` resolves to at least one taxid through ``ncbi``.

    Parameters
    ----------
    name : object
        Candidate scientific name, typically a cell value read from a
        spreadsheet. Anything that is not a non-blank string (for example the
        float NaN pandas uses for empty cells, or ``None``) is treated as
        invalid without querying the taxonomy backend.
    ncbi : object
        Object exposing ``get_name_translator(list_of_names)`` and returning a
        mapping from each recognised name to its taxids.

    Returns
    -------
    bool
        ``True`` when the translator returned an entry for ``name``; ``False``
        when it did not, when ``name`` is not a usable string, or when the
        lookup raised.
    """
    # Empty / non-text cells cannot be scientific names; rejecting them here
    # avoids sending them to the lookup and printing a spurious "Error: ..."
    # line for every blank cell.
    if not isinstance(name, str) or not name.strip():
        return False

    try:
        taxid = ncbi.get_name_translator([name]).get(name)
        return taxid is not None
    except Exception as e:
        # Best-effort: a failed lookup is reported and counted as invalid so a
        # single bad name does not abort a whole validation run.
        print(f"Error: {e}")
        return False

def validate_scientific_names(file_path, column_name,
                              output_file_path="validation_results_ete3_Corr.xlsx"):
    """Validate the names in one Excel column and optionally write a report.

    Parameters
    ----------
    file_path : str
        Path of the Excel workbook, read with ``pd.read_excel``.
    column_name : str
        Header of the column whose values are checked.
    output_file_path : str or None, optional
        Where the report workbook is written. Defaults to the file name this
        function has always used; pass ``None`` to skip writing a report.

    Returns
    -------
    tuple[list, list]
        ``(valid_names, invalid_names)``; each list keeps the order in which
        its entries appear in the column.

    Raises
    ------
    ValueError
        If ``column_name`` is not a column of the workbook.

    Notes
    -----
    The report has two columns, ``ScientificName_Cor`` and ``Validity``. Its
    rows are grouped (all valid names first, then all invalid ones), so they
    are not in the input's row order.
    """
    df = pd.read_excel(file_path)
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the Excel file.")

    ncbi = NCBITaxa()
    valid_names = []
    invalid_names = []

    for name in tqdm(df[column_name], desc="Validating names", unit="name"):
        target = valid_names if is_valid_scientific_name(name, ncbi) else invalid_names
        target.append(name)

    if output_file_path is not None:
        validation_results = pd.DataFrame({
            "ScientificName_Cor": valid_names + invalid_names,
            "Validity": [True] * len(valid_names) + [False] * len(invalid_names)
        })
        validation_results.to_excel(output_file_path, index=False)

    return valid_names, invalid_names

# Input workbook and the column holding the names to re-check.
file_path = "C:/Users/USER/Downloads/validation_results_ete3_1.xlsx"
column_name = "ScientificName_Cor"

try:
    valid_names, invalid_names = validate_scientific_names(file_path, column_name)

    # One print call per section; sep="\n" places each list on the line
    # directly below its heading.
    print("Valid Scientific Names:", valid_names, sep="\n")
    print("\nInvalid Scientific Names:", invalid_names, sep="\n")
except Exception as err:
    # Show only the failure message so the cell still runs to completion.
    print(f"Error: {err}")

Validating names: 100%|██████████| 33511/33511 [00:06<00:00, 4817.40name/s]


Valid Scientific Names:
['Abelmoschus ficulneus', 'Abelmoschus manihot', 'Abelmoschus moschatus', 'Abelmoschus moschatus subsp. tuberosus', 'Abies firma', 'Abies grandis', 'Abobra tenuifolia', 'Abronia fragrans', 'Abronia latifolia', 'Abrus precatorius', 'Bakeridesia esculenta', 'Abutilon indicum', 'Abutilon otocarpum', 'Abutilon theophrasti', 'Acacia acradenia', 'Acacia acuminata', 'Acacia adoxa', 'Acacia adsurgens', 'Faidherbia albida', 'Acacia ammobia', 'Acacia ampliceps', 'Acacia anaticeps', 'Acacia ancistrocarpa', 'Acacia aneura', 'Senegalia ataxacantha', 'Acacia auriculiformis', 'Acacia beauverdiana', 'Acacia bivenosa', 'Acacia brachystachya', 'Acacia calcicola', 'Acacia cambagei', 'Acacia citrinoviridis', 'Acacia cochliacantha', 'Acacia colei', 'Acacia complanata', 'Acacia concinna', 'Acacia concurrens', 'Acacia coriacea', 'Acacia cowleana', 'Acacia craspedocarpa', 'Acacia crassicarpa', 'Acacia decora', 'Acacia decurrens', 'Acacia dictyophleba', 'Acacia difficilis', 'Acacia drep

In [2]:
# Construct an NCBITaxa object on its own; as the cell's last expression, its
# repr is what the cell displays.
# NOTE(review): the instance is not bound to a name, so no other cell can reuse
# it — presumably a quick check that the taxonomy backend loads; confirm intent.
NCBITaxa()

<ete3.ncbi_taxonomy.ncbiquery.NCBITaxa at 0x1ad673020a0>