In [1]:
# check blastn of 16genomes against pangenome

In [1]:
import pandas as pd
import os
import re
from Bio.Seq import Seq
from Bio import SeqIO

In [45]:
file_dir = '../data/blastn/'

# column names: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
blastn_cols = ['query','target','percent_id','alignment_length','mismatch','gapopen','qstart','qend','tstart','tend','evalue','bitscore']

df_list = []

for n,file in enumerate(os.listdir(file_dir)):
    df = pd.read_csv(f"{file_dir}/{file}", names=blastn_cols, sep='\t')
    df_list.append(df)

df = pd.concat(df_list, axis=0, ignore_index=True)

In [23]:
# map length of target (ref) sequences
input_file = '../data/1011_pangenome/allORFs_pangenome.fasta'
handle = []
seq = []

for record in SeqIO.parse(input_file, "fasta"):
    handle.append(record.id)
    seq.append(str(record.seq))
    
ref_df = pd.DataFrame({'ref_id':handle,'ref_seq':seq})

ref_df['ref_len'] = ref_df.ref_seq.str.len()

ref_map = dict(zip(ref_df.ref_id, ref_df.ref_len))

In [46]:
df['tseq_len'] = df.target.map(ref_map)

In [53]:
# percent id whole
df['percent_id_whole'] = (((df.alignment_length) / df.tseq_len) * df.percent_id/100)

In [56]:
idx = df.groupby('query')['percent_id_whole'].transform(max) == df['percent_id_whole']
top_df = df[idx]

In [78]:
top_df['qstrain'] = top_df['query'].str.split('_').str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_df['qstrain'] = top_df['query'].str.split('_').str[0]


In [80]:
top_df.insert(0,'qstrain', top_df.pop('qstrain'))

In [82]:
top_df.to_csv('output/16strains_blastn_percent-whole.csv',index=False)

In [83]:
# there are 13 query ORFs that have identical hits in the pangenome
top_df[top_df.groupby('query')['target'].transform('count') > 1].to_csv('output/16strains_blastn_percent-whole-duplciates.csv', index=False)

In [87]:
# check that all 16strain ORFs are present in blastn hits

file_path = '../data/16_genomes/orf_fastas'
files = files = [ 
 'orfs_MSY24.fasta',
 'orfs_MSY25.fasta',
 'orfs_MSY26.fasta',
 'orfs_MSY27.fasta',
 'orfs_MSY28.fasta',
 'orfs_MSY29.fasta',
 'orfs_MSY30.fasta',
 'orfs_MSY31.fasta',
 'orfs_MSY32.fasta',
 'orfs_MSY33.fasta',
 'orfs_MSY34.fasta',
 'orfs_MSY35.fasta',
 'orfs_MSY36.fasta',
 'orfs_MSY37.fasta',
 'orfs_MSY38.fasta',
 'orfs_MSY39.fasta'
]

strain = []
orf = []
seq = []

for file in files:
    input_file = f"{file_path}/{file}"

    for record in SeqIO.parse(input_file, "fasta"):
        strain.append(re.split('\.|_', file)[1])
        orf.append(record.id)
        seq.append(str(record.seq))

df_16 = pd.DataFrame({'strain':strain, 'orf':orf, 'seq':seq})

In [97]:
df_16

Unnamed: 0,strain,orf,seq
0,MSY24,MSY24_gene1_additional.copy,ATGGACTTGAATCAAAGAAAGGAAAAAAGGGCCAGCATGTTGGATG...
1,MSY24,MSY24_gene2_dubious,ATGATGCCTGCTAAACTGCAGCTTGACGTACTGCGGACCCTGCAGT...
2,MSY24,MSY24_gene3_dubious,ATGGAATCTATTATCCTCAGCATTGCCATCTTTATTGGCGTCCTCC...
3,MSY24,MSY24_gene4_retained,ATGCGAGTTCTCATCGGAACGAAATTAGTGACTGAAGGAATTGACA...
4,MSY24,MSY24_gene5_retained,ATGCGTACTTTCACTGACTTTGTTTCTGGCGCACCTATTGTAAGGA...
...,...,...,...
108345,MSY39,MSY39_ORF114_ORF,TTGGTACCGGTGACAGTAGTCATTTCAGTGGAAGTCGAAGTGAAAG...
108346,MSY39,MSY39_ORF115_ORF,ATGTCTCAAACAAAGCTGATCAAGCCGCTGTATTTATATGAAACTT...
108347,MSY39,MSY39_ORF116_ORF,ATGCAATCTCCCTTGGAGAGTAACGATGCTGAATCGGATACGCTCA...
108348,MSY39,MSY39_ORF117_ORF,ATGCAAGCCCCTTCGGAAAATACCGATACCAAATTGGATACATCCA...


In [101]:
hit_set = set(top_df['query'].unique().tolist())

In [105]:
orf_set = set(df_16.orf.unique().tolist())

In [109]:
unique_orfs = list(orf_set - hit_set)

In [116]:
df_unique = df_16[df_16['orf'].isin(unique_orfs)]

In [117]:
df_unique.to_csv('output/16strains_blastn_unique-orfs.csv', index=False)

In [118]:
df_unique.shape

(1122, 3)

In [119]:
df_unique.orf.nunique()

1122