#### Description:The script is to read AEGIS long_list table and map it to GoaT format before importing into the GoaT database. the AEGIS status will be imported from Sanger STS system.


In [4]:
# GoaT curation spreadsheet link:
# https://docs.google.com/spreadsheets/d/1VDkgBFOg7ELqr3B_OXLvWOFlTYyB_A-zlq1Rbolo93c/edit?gid=1968802073#gid=1968802073

# AEGIS sequencing status table:
csv_link = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTT8yYFN0sEMxzBRWv1sPE0b9G-sB6ua5PZuCW2rcdI1ttVG3YIAmz-7E0miHc6Shwz-68k5svT1YU5/pub?gid=1968802073&single=true&output=tsv"



In [14]:
import pandas as pd
import numpy as np
import os
import import_status_lib as isl


In [8]:
df = pd.read_csv(csv_link,
                delimiter=",",
                dtype=object,
                skiprows=7,
                header=0)
print("Available columns:", df.columns.tolist())

Available columns: ['ncbi_taxon_id\tspecies\tsubspecies\tfamily\ttarget_list_status\tsequencing_status\tsynonym\tpublication_id\tAEGIS sub-project\t\tTropical Crop Pests']


In [None]:
# Select colums to import
columns = [
    "ncbi_taxon_id",
    "species",
    "family",
    "AEGIS sub-project",
]
# Read the table from the link
df = pd.read_csv(csv_link,
                    delimiter="\t",
                    dtype=object,
                    usecols=columns,
                    skiprows=7,
                    header=0,
                    )

print('aegis file successfuly opened. Starting cleanup...')

# validate the data loading
print(f"Loaded {df.shape[0]} rows and {df.shape[1]} columns")
print("Available columns:", df.columns.tolist())


aegis file successfuly opened. Starting cleanup...
Loaded 669 rows and 5 columns
Available columns: ['ncbi_taxon_id', 'species', 'family', 'sequencing_status', 'AEGIS sub-project']


In [33]:
df_cleaned = isl.general_cleanup_for_table(df)
df_cleaned = isl.cleanup_headers_specific_units(df_cleaned)

  project_table = project_table.replace(r"^\s*$", np.nan, regex=True)


In [34]:
print('aegis file successfuly cleaned. Treating project columns...')

# Define the AEGIS variabl
AEGIS = "AEGIS"

# Add a column that has all values as AEGIS
df_cleaned['long_list'] = AEGIS


aegis file successfuly cleaned. Treating project columns...


In [36]:
# Create new columns using possible sequencing status
possible_seq_status = ["sample_collected","sample_acquired","in_progress","data_generation","in_assembly","insdc_submitted","open","insdc_open","published"]
for item in possible_seq_status:
    if item not in df_cleaned:
        df_cleaned[item] = pd.Series(dtype='object')

# Create mandatory columns
mandatory_fields = ["ncbi_taxon_id", "species", "family", "sequencing_status", "sequencing_status_aegis", "synonym", "publication_id", "contributing_project_lab"]

for item in mandatory_fields:
    if item not in df_cleaned:
        df_cleaned[item] = np.nan

df_cleaned['contributing_project_lab'] = df_cleaned['aegis_sub-project'] 
  
print("Generating AEGIS.tsv file...")
df_cleaned.to_csv("tsv/AEGIS_expanded.tsv",sep="\t", index=False)

Generating AEGIS.tsv file...
