#### Description:The script is to read Bat1k sequencing status table and map it to GoaT format before importing into the GoaT database.


In [1]:
# GoaT curation spreadsheet link:
# https://docs.google.com/spreadsheets/d/1vsV7OTU-BAeOkBSrsESGCHaGuLcFW6U9mUluy6II0tY/edit?gid=0#gid=0

# Bat1k sequencing status table:
csv_link = "https://urldefense.proofpoint.com/v2/url?u=https-3A__research.st-2Dandrews.ac.uk_bat1k_goat.csv&amp;d=DwIGaQ&amp;c=D7ByGjS34AllFgecYw0iC6Zq7qlm8uclZFI0SqQnqBo&amp;r=ROFPW9s86BDaHzmK9wYUbmvRhmF6aEC2Q3PZs5L7P9o&amp;m=jM-boN6vlSWlPdGLVjQQZzpw7PNs2oKYQHyh4OBVWihTymqJOPdUPpCU0RsiwTjV&amp;s=Ysq_9r0PCehJDGp4j4z0O1CAha0luPXBSU31vqHWH7Y&amp;e="

# Mapping guide:
"https://docs.google.com/spreadsheets/d/1TctmGGvjI7otqTozBqvckOvhv0WmvHXOODzlzWmsF2M/edit?gid=0#gid=0"

'https://docs.google.com/spreadsheets/d/1TctmGGvjI7otqTozBqvckOvhv0WmvHXOODzlzWmsF2M/edit?gid=0#gid=0'

In [2]:
import pandas as pd
import numpy as np
import os


In [3]:
# Select colums to import
columns = [
    "family",
    "species",
    "ncbi_taxon_id",
    "commonname",
    "country of collection",
    "collected",
    "sequencing",
    "assembly",
    "annotation",
    "goat_summary",
]
# Read the table from the link
df = pd.read_csv(csv_link,
                    delimiter=",",
                    dtype=object,
                    usecols=columns,
                    )

print('bat1k file successfuly opened. Starting cleanup...')

# validate the data loading
print(f"Loaded {df.shape[0]} rows and {df.shape[1]} columns")
print("Available columns:", df.columns.tolist())


bat1k file successfuly opened. Starting cleanup...
Loaded 1469 rows and 10 columns
Available columns: ['family', 'species', 'ncbi_taxon_id', 'commonname', 'country of collection', 'collected', 'sequencing', 'assembly', 'annotation', 'goat_summary']


In [4]:
def table_cleanup(df):
    """
    Cleans up a pandas DataFrame by performing the following actions:
    - Replaces empty or whitespace-only strings with NaN.
    - Strips leading and trailing spaces from all string values.
    - Drops columns and rows where all values are NaN.
    
    Args:
        df (pandas.DataFrame): The input DataFrame to be cleaned.

    Returns:
        pandas.DataFrame: The cleaned DataFrame.
    """
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = df.replace(r"^ +| +$", r"", regex=True)
    df.dropna(how="all", axis=1, inplace=True)
    df.dropna(how="all", axis=0, inplace=True)
    return df

def headers_cleanup(df):
    """
    Cleans up the headers by performing the following actions:
    - Replaces spaces with underscores.
    - Converts all characters to lowercase.
    - Removes parentheses.

    Args:

        df (pandas.DataFrame): The input DataFrame to be cleaned.

    Returns:
        pandas.DataFrame: The cleaned DataFrame.
    """
    df.columns = (
        df.columns
        .str.replace(' ', '_')
        .str.replace(r'\(', '',regex=True)
        .str.replace(r'\)', '',regex=True)
        .str.lower()
    )
    return df


In [5]:
# Clean up the table:
df_cleaned = headers_cleanup(table_cleanup(df))

print('bat1k file successfuly cleaned. Treating project columns...')

# Add a sequencing_status column to the table
df_cleaned["sequencing_status"] = df_cleaned["goat_summary"]


bat1k file successfuly cleaned. Treating project columns...


  df = df.replace(r'^\s*$', np.nan, regex=True)


In [9]:
# Define the BAT1K constant
BAT1K = "BAT1K"

# Create new columns using possible sequencing status
possible_seq_status = ["sample_collected","sample_acquired","in_progress","data_generation","in_assembly","insdc_submitted","open","insdc_open","published"]
for item in possible_seq_status:
    if item not in df_cleaned:
        df_cleaned[item] = pd.Series(dtype='object')

# Assign values to each status column
for item in possible_seq_status:
    df_cleaned.loc[df_cleaned['sequencing_status'] == item, item] = BAT1K

# Populate the status columns with the project names based on the hierarchy of the status
df_cleaned.loc[df_cleaned["published"] == BAT1K, "insdc_open"] = BAT1K
df_cleaned.loc[df_cleaned['insdc_open'] == BAT1K, 'open'] = BAT1K
df_cleaned.loc[df_cleaned['open'] == BAT1K, 'in_progress'] = BAT1K
df_cleaned.loc[df_cleaned['data_generation'] == BAT1K, 'in_progress'] = BAT1K
df_cleaned.loc[df_cleaned['in_assembly'] == BAT1K, 'in_progress'] = BAT1K
df_cleaned.loc[df_cleaned['in_progress'] == BAT1K, 'sample_acquired'] = BAT1K
df_cleaned.loc[df_cleaned['sample_acquired'] == BAT1K, 'sample_collected'] = BAT1K

# Create mandatory columns
mandatory_fields = ["ncbi_taxon_id", "species", "family", "synonym", "publication_id", "contributing_project_lab"]

for item in mandatory_fields:
    if item not in df_cleaned:
        df_cleaned[item] = np.nan

print("Generating BAT1K.tsv file...")
df_cleaned.to_csv("tsv/BAT1K_expanded.tsv",sep="\t", index=False)

Generating BAT1K.tsv file...
