In [1]:
import pandas as pd
import random
import os



### Download prokaryotic database and info

https://zenodo.org/records/10457006

    VMGC_prokaryote_MAG.info

    VMGC_prokaryote_MAG.tar.gz

    VMGC_prokaryote_SGB.info

### Make genomes.tsv file for MIDAS input


Needs columns: genome, species, representative, genome_is_representative

In [2]:
df = pd.read_csv('VMGC_orig_files/VMGC_prokaryote_MAG.info', sep='\t')

In [3]:
df.head()

Unnamed: 0,Genome_ID,BioSample_ID,Collection/isolation_source,Type,Genome_size_(bp),No._of_contigs,N50_length_(bp),N90_length_(bp),Max_length_(bp),%_GC_content,%_Completeness,%_Contamination,Quality_score,Clade_separation_score_(GUNC),Presence_of_5S_rRNA,Presence_of_16S_rRNA,Presence_of_23S_rRNA,No._of_tRNA,Genome_quality,Species-level_genomic_bin_(95%_ANI)
0,ERR10897564.mbin.1,ERR10897564,vaginal swab,MAG,1087166,64,21321,9170,106681,32.47,92.37,0.36,90.57,0.0,Y,N,N,16,high-quality,SGB001
1,ERR10897570.mbin.1,ERR10897570,vaginal swab,MAG,946879,36,35494,16114,91025,44.15,69.36,0.05,69.11,0.03,N,N,N,14,medium-quality,SGB020
2,ERR10897572.mbin.2,ERR10897572,vaginal swab,MAG,634492,160,4095,2310,18976,32.86,51.82,0.05,51.57,0.0,Y,N,N,12,medium-quality,SGB001
3,ERR10897572.mbin.3,ERR10897572,vaginal swab,MAG,1042602,73,18747,7657,84304,37.32,59.34,0.17,58.49,0.0,N,N,N,16,medium-quality,SGB006
4,ERR10897573.mbin.4,ERR10897573,vaginal swab,MAG,2066518,406,5736,2674,38890,42.4,75.82,1.43,68.67,0.0,N,N,N,16,medium-quality,SGB014


In [4]:
len(df['Genome_ID'].unique()), df.shape[0]

(19542, 19542)

In [5]:
#get representative genomes from SGB file

In [6]:
sgb = pd.read_csv('VMGC_orig_files/VMGC_prokaryote_SGB.info', sep='\t', index_col=0)

In [7]:
sgb.shape

(786, 11)

In [8]:
rep_genomes = sgb['Representative_genome_ID'].to_dict()

In [9]:
genomes = df[['Genome_ID','Species-level_genomic_bin_(95%_ANI)']]
genomes = genomes.rename(columns={'Genome_ID':'genome', 'Species-level_genomic_bin_(95%_ANI)':'SGB'})
genomes['representative'] = genomes['SGB'].map(rep_genomes)
genomes

Unnamed: 0,genome,SGB,representative
0,ERR10897564.mbin.1,SGB001,SRR17284223.mbin.1
1,ERR10897570.mbin.1,SGB020,ERR10897780.mbin.8
2,ERR10897572.mbin.2,SGB001,SRR17284223.mbin.1
3,ERR10897572.mbin.3,SGB006,GCF_000162255.1
4,ERR10897573.mbin.4,SGB014,P10708985.mbin.1
...,...,...,...
19537,SRR6748217.mbin.3,SGB026,SRR13167550.mbin.1
19538,SRR6748218.mbin.1,SGB025,MG238.mbin.3
19539,SRR6748219.mbin.1,SGB004,ERR10897722.mbin.1
19540,GCF_029101565.1,SGB009,GCF_029101565.1


In [10]:
genomes['genome_is_representative'] = (genomes['genome'] == genomes['representative']).astype(int)

In [11]:
genomes['genome_is_representative'].sum()

786

In [12]:
sp_list = genomes['SGB'].unique()

In [13]:
#randomly assign 6 digit numbers to species

random.seed(1)
nums = random.sample(range(100000,999999), len(sp_list))

sp_to_nums = {sp_list[i]:nums[i] for i in range(len(sp_list))}

In [14]:
genomes['species'] = genomes['SGB'].map(sp_to_nums)

In [15]:
genomes = genomes.set_index('genome')

In [16]:
genomes.to_csv('VMGC_orig_files/genomes_and_SGBs.csv')

In [17]:
genomes.drop(columns='SGB').to_csv('VMGC_db/genomes.tsv', sep='\t')

### Organize genomes

Needs to be in the format: "cleaned_genomes/species/genome/genome.fna"

In [18]:
base_dir = 'VMGC_db/cleaned_genomes/'
!mkdir {base_dir}

mkdir: VMGC_db/cleaned_genomes/: File exists


In [19]:
fasta_dir='VMGC_orig_files/VMGC_prokaryote_MAG'
fastas = !ls {fasta_dir}

In [20]:
!ls VMGC_db/cleaned_genomes

[34m109652[m[m [34m187277[m[m [34m278624[m[m [34m342081[m[m [34m536396[m[m [34m653259[m[m [34m805810[m[m [34m940775[m[m [34m988598[m[m
[34m130451[m[m [34m207192[m[m [34m285819[m[m [34m367695[m[m [34m553789[m[m [34m674974[m[m [34m897926[m[m [34m941235[m[m [34m992625[m[m
[34m134035[m[m [34m223646[m[m [34m320153[m[m [34m447222[m[m [34m595185[m[m [34m696853[m[m [34m927036[m[m [34m956096[m[m [34m994737[m[m
[34m173875[m[m [34m240891[m[m [34m339874[m[m [34m508744[m[m [34m611554[m[m [34m766234[m[m [34m937223[m[m [34m982633[m[m


In [None]:
moved = []

for f in fastas:
    
    old_path = fasta_dir + f
    
    genome = f.strip('.fa')
    
    if genome not in genomes.index:
        print(genome)
        
    species = genomes.loc[genome]['species']
    
    species_dir = 'VMGC_db/cleaned_genomes/{}/'.format(species)
    genome_dir = species_dir+genome
    new_path = genome_dir + '/' + genome + '.fna'
    
    if not os.path.exists(species_dir):
        
        !mkdir {species_dir}
        
    !mkdir {genome_dir}
    !mv {old_path} {new_path}


mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG1.mbin.1.fa to VMGC_db/cleaned_genomes/927036/MG1.mbin.1/MG1.mbin.1.fna: No such file or directory
mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG1.mbin.2.fa to VMGC_db/cleaned_genomes/653259/MG1.mbin.2/MG1.mbin.2.fna: No such file or directory
mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG1.mbin.4.fa to VMGC_db/cleaned_genomes/696853/MG1.mbin.4/MG1.mbin.4.fna: No such file or directory
mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG10.mbin.1.fa to VMGC_db/cleaned_genomes/240891/MG10.mbin.1/MG10.mbin.1.fna: No such file or directory
mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG10.mbin.2.fa to VMGC_db/cleaned_genomes/223646/MG10.mbin.2/MG10.mbin.2.fna: No such file or directory
mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG10.mbin.3.fa to VMGC_db/cleaned_genomes/696853/MG10.mbin.3/MG10.mbin.3.fna: No such file or directory
mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG10.sbin.5.fa to VMGC_db/cleaned_genomes/342081/MG10.sbin.5/MG

mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG101.mbin.18.fa to VMGC_db/cleaned_genomes/109652/MG101.mbin.18/MG101.mbin.18.fna: No such file or directory
mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG101.mbin.19.fa to VMGC_db/cleaned_genomes/653259/MG101.mbin.19/MG101.mbin.19.fna: No such file or directory
mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG101.mbin.20.fa to VMGC_db/cleaned_genomes/927036/MG101.mbin.20/MG101.mbin.20.fna: No such file or directory
mv: rename VMGC_orig_files/VMGC_prokaryote_MAGMG101.mbin.21.fa to VMGC_db/cleaned_genomes/285819/MG101.mbin.21/MG101.mbin.21.fna: No such file or directory


In [48]:
len(fastas)

811

### Select a few genomes for a test run

In [49]:
!mkdir test_db

In [51]:
!mv VMGC_db/cleaned_genomes test_db/

In [39]:
genome_names = []

for genome, row in genomes.iterrows():
    
    sp = row['species']
    if os.path.exists('test_db/cleaned_genomes/{}/{}'.format(sp, genome)):
        genome_names += [genome]

In [49]:
test_set = genomes.loc[genome_names]
test_set = test_set.merge(df.set_index('Genome_ID')[['Quality_score']], left_index=True, right_index=True)

In [51]:
#resetting representative genomes from test set

test_rep_genomes = test_set.sort_values('Quality_score', ascending=False).drop_duplicates('species').reset_index().set_index('species')['index'].to_dict()

test_set['representative'] = test_set['species'].map(test_rep_genomes)
test_set['genome_is_representative'] = (test_set.index == test_set['representative']).astype(int)
test_set = test_set.drop(columns='Quality_score')
test_set.index.name = 'genome'

In [53]:
test_set

Unnamed: 0_level_0,species,representative,genome_is_representative
genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MG100.sbin.5,897926,MG106.mbin.2,0
MG100.mbin.11,223646,MG100.mbin.11,1
MG100.mbin.2,339874,MG1030.mbin.15,0
MG100.mbin.5,447222,MG1063.mbin.8,0
MG100.mbin.1,109652,MG1035.mbin.3,0
...,...,...,...
MG1070.mbin.31,332460,MG103.mbin.9,0
MG1070.mbin.26,956728,MG1069.mbin.26,0
MG1070.mbin.33,620611,MG1070.mbin.33,1
MG1070.mbin.38,460020,MG1038.sbin.18,0


In [54]:
test_set.to_csv('test_db/genomes.tsv', sep='\t')