In [1]:
import pandas as pd
import os

## Analyzing kaiju.names output from main experiment files

In [2]:
def get_more_common_taxon_level(input_file,
                                columns = ["file","percent","reads","taxon_id","taxon_name"],
                                taxon_level = 2,
                                search_query = "Bacteria"):
    def filter_bacteria(x):
        line_list = x.split(';')
        if search_query in x:
            return line_list[taxon_level]
        else:
            return 'NA'

    
    df = pd.read_csv(input_file, names=columns, sep='\t', header=0)

    paths = df['taxon_name']
    more_commons = paths.apply(filter_bacteria).value_counts()
    return more_commons

In [3]:
kaiju_names_files = ["D:\\code\\CassBERT\\references\\kaiju-names\\P26-4C-FrogSkin_S11_L001.kaiju.table.txt",
                     "D:\\code\\CassBERT\\references\\kaiju-names\\DSCG1_S1_L001.kaiju.table.txt",
                     "D:\\code\\CassBERT\\references\\kaiju-names\\DSCG12_S12_L001.kaiju.table.txt"]

for input_file in kaiju_names_files:
    commons = get_more_common_taxon_level(input_file, taxon_level=2, search_query="Bacteria")
    print(input_file)
    print(commons)


D:\code\CassBERT\references\kaiju-names\P26-4C-FrogSkin_S11_L001.kaiju.table.txt
taxon_name
NA                       73
Terrabacteria group      38
Pseudomonadota           37
FCB group                11
environmental samples     1
Name: count, dtype: int64
D:\code\CassBERT\references\kaiju-names\DSCG1_S1_L001.kaiju.table.txt
taxon_name
Terrabacteria group    13
NA                     13
Pseudomonadota          8
FCB group               4
Name: count, dtype: int64
D:\code\CassBERT\references\kaiju-names\DSCG12_S12_L001.kaiju.table.txt
taxon_name
NA                     21
Terrabacteria group    17
Pseudomonadota         12
FCB group               4
Name: count, dtype: int64


In [7]:
## Checking the AJB1 File
input_file = "D:\\docker_projects\\bioinformatics\\sequences\\kaiju_dbs\\out\\kaiju.classified.names.out"
cols = ['classified', 'sequence_id', 'unknow_number_1', 'unknow_number_2', 'unknow_number_3',
        'ncbi_id', 'sequence', 'taxonomy_path']
commons = get_more_common_taxon_level(input_file, taxon_level=2)
pd.set_option('display.max_rows', None)
commons

taxon_name
NA                                 938907
 NA                                518352
 Streptomyces                      126051
 Pseudomonas                       107392
 Acinetobacter                      52396
 Bacillus                           40121
 Vibrio                             31973
 Staphylococcus                     31236
 Escherichia                        29490
 Klebsiella                         21199
 Paenarthrobacter                   18209
 Enterococcus                       14022
 Labilibacter                       11059
 Actinomadura                       10081
 Ancylomarina                       10012
 Frankia                             9779
 Corynebacterium                     9386
 Halomonas                           8625
 Nocardioides                        6931
 Steroidobacter                      6926
 Microbacterium                      5687
 Soehngenia                          4979
 Muricauda                           4803
 Flectobacillus        

## Download the most frequent taxonomy level from ncbi

In [6]:
more_commons = ["Pseudomonas", "Terrabacteria group", "FCB group", "Streptomyces", "Acinetobacter"]
for val in list(more_commons[more_commons > 1000][2:].keys()):
    command = f'datasets.exe download genome taxon {val} --reference --include protein --assembly-level complete --filename {val}.zip'
    os.system(command)
    os.system(f'move {val}.zip bacterias\\')
    print("OK", val)

OK  Streptomyces
OK  Pseudomonas
OK  Acinetobacter
OK  Bacillus
OK  Vibrio
OK  Staphylococcus
OK  Escherichia
OK  Klebsiella
OK  Paenarthrobacter
OK  Enterococcus
OK  Labilibacter
OK  Actinomadura
OK  Ancylomarina
OK  Frankia
OK  Corynebacterium
OK  Halomonas
OK  Nocardioides
OK  Steroidobacter
OK  Microbacterium
OK  Soehngenia
OK  Muricauda
OK  Flectobacillus
OK  Solirubrobacter
OK  Modestobacter
OK  Mesorhizobium
OK  Ectobacillus
OK  Photobacterium
OK  Marinifilum
OK  Putridiphycobacter
OK  Kribbella
OK  Paraclostridium
OK  Synechococcus
OK  Paenibacillus
OK  Listeria
OK  Salmonella
OK  Blastococcus
OK  Pasteurella
OK  Methylobacterium
OK  Thermogemmatispora
OK  Rhizobium
OK  Enterobacter
OK  Westiellopsis
OK  Alteromonas
OK  Nitrosospira
OK  Alteraurantiacibacter
OK  Geminicoccus
OK  Arthrobacter
OK  Gluconobacter
OK  Ruegeria
OK  Shigella
OK  Myroides
OK  Paraburkholderia
OK  Tissierella
OK  Capnocytophaga
OK  Mycobacteroides
