In [None]:
import os
import pandas as pd

In [None]:
# Tab-separated hit table without a header row (presumably BLAST tabular output,
# judging by the file name and field names - confirm against the blastn call).
blast_columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'staxids']
blast_output = pd.read_csv(
    "blastn_results.out",
    delimiter="\t",
    header=None,
    # 'staxids' is the key for the merge with taxid_sci_name, whose key is cast to
    # str. Reading it as text here keeps both sides the same type instead of
    # relying on type inference (which may yield integers or a mixed column).
    dtype={blast_columns.index('staxids'): str},
)
# Assigning the names afterwards still fails loudly if the file does not have
# exactly len(blast_columns) fields.
blast_output.columns = blast_columns
blast_output

In [None]:
# Tab-separated taxonomy lookup without a header row.
# Passing the names to read_csv fixes the table at five columns. With header=None
# alone the width is taken from the first line, so a first line that has fewer
# fields would break the parse or the column assignment; shorter rows are padded
# with NaN instead.
taxid_columns = ['staxids', 'genus_species', 'common_name', 'rank', 'division']
taxid_sci_name = pd.read_csv(
    "taxid_scientific_name.txt",
    delimiter="\t",
    header=None,
    names=taxid_columns,
)
taxid_sci_name

In [None]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Realign rows in which a missing common name pulled the later fields left.

    When no common name is available, the rank ('species' or 'subspecies') ends up
    in the ``common_name`` column and every following value sits one column too
    far to the left. For those rows the values from ``common_name`` onward are
    moved one column to the right (the previous last value is discarded) and
    ``common_name`` is set to ''.

    Rows are selected with a boolean mask, so the frame may carry any index
    (for example after filtering), not only the default 0..n-1 one.

    Args:
        df: Frame containing a ``common_name`` column followed by the columns to
            realign. It is modified in place.

    Returns:
        The same ``df`` object after realignment.
    """
    misplaced = df['common_name'].isin(['species', 'subspecies']).to_numpy()
    if not misplaced.any():
        return df

    name_pos = df.columns.get_loc('common_name')
    # Take a snapshot before writing: the source and target column ranges overlap.
    shifted_values = df.iloc[misplaced, name_pos:-1].to_numpy()
    df.iloc[misplaced, name_pos + 1:] = shifted_values
    df.iloc[misplaced, name_pos] = ''
    return df

In [None]:
# Realign the rows that lack a common name.
taxid_sci_name = clean_df(taxid_sci_name)
# Report the key's dtype as loaded, then convert the key to str.
print(taxid_sci_name.dtypes['staxids'])
taxid_sci_name = taxid_sci_name.astype({'staxids': str})

In [None]:
# Left join on 'staxids': every row of blast_output is kept, and the taxonomy
# columns are NaN where taxid_sci_name has no matching key.
blast_plus_sci_name = pd.merge(
    blast_output,
    taxid_sci_name,
    on='staxids',
    how='left',
)
# blast_plus_sci_name.to_csv("blast_plus_sci_name.csv")
blast_plus_sci_name

In [None]:
# Distinct 'division' values present in the merged hit table.
blast_plus_sci_name.loc[:, 'division'].unique()

In [None]:
# Distinct 'division' values present in the taxonomy lookup table.
taxid_sci_name.loc[:, 'division'].unique()

In [None]:
# Distinct 'common_name' values present in the taxonomy lookup table.
taxid_sci_name.loc[:, 'common_name'].unique()

In [None]:
# Only the last expression of a cell is rendered, so an expression placed before
# it is computed and thrown away. Collect the distinct names once, print how many
# there are, and leave the array itself as the cell's output.
genus_species_names = taxid_sci_name['genus_species'].unique()
print(len(genus_species_names))
genus_species_names

In [None]:
# 'division' labels used to select non-fish BLAST hits.
non_fish_list = [
    'g-proteobacteria',
    'birds',
    'bats',
    'unclassified sequences',
    'carnivores',
    'bacteria',
    'primates',
    'even-toed ungulates & whales',
]

In [None]:
# Keep only the hits whose 'division' is one of the non-fish labels.
non_fish_blast_hits = blast_plus_sci_name[
    blast_plus_sci_name['division'].isin(non_fish_list)
]
# non_fish_blast_hits.to_csv("non_fish_blast_hits.csv")

In [None]:
# Distinct query IDs that have at least one non-fish hit.
non_fish_sequence_ids = non_fish_blast_hits['qseqid'].unique()
# The count goes last so that it is the expression the cell renders; placed
# before the assignment its value was computed and discarded.
len(non_fish_sequence_ids)

In [None]:
# Show the distinct query IDs collected from the non-fish hits.
non_fish_sequence_ids

In [None]:
# Copy dna-sequences.fasta to dna-sequences-filtered.fasta, leaving out every
# record (header line plus its sequence lines) whose ID is in non_fish_sequence_ids.
excluded_ids = set(non_fish_sequence_ids)  # hashed lookup instead of scanning the array per record
n_records = 0
n_kept = 0
with open("dna-sequences.fasta", "r") as fasta:
    with open("dna-sequences-filtered.fasta", "w") as filtered_fasta:
        # Lines before the first header (if any) are copied through unchanged.
        write_sequence = True
        for line in fasta:
            if line.startswith(">"):
                # Compare only the first whitespace-delimited token of the header:
                # that token is the record ID, and a description after it would
                # otherwise prevent the match with the 'qseqid' values.
                header_fields = line[1:].split(maxsplit=1)
                hashid = header_fields[0] if header_fields else ""
                write_sequence = hashid not in excluded_ids
                n_records += 1
                n_kept += write_sequence
            if write_sequence:
                filtered_fasta.write(line)
# One summary line instead of printing every record ID.
print(f"kept {n_kept} of {n_records} sequences; removed {n_records - n_kept}")

In [None]:
# Count the lines starting with '>' (one per FASTA record) in the input file.
!grep -c '^>' dna-sequences.fasta 

In [None]:
# Count the lines starting with '>' (one per FASTA record) in the filtered file.
!grep -c '^>' dna-sequences-filtered.fasta 