/
hits.py
81 lines (63 loc) · 3.08 KB
/
hits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
from __future__ import print_function
import pandas as pd
class BestHits(object):
    '''Find best hits and reciprocal best hits in alignment DataFrames.

    The names of the columns holding the query name, subject name, sequence
    lengths, and the comparison score are stored on the instance, so the
    methods work on any DataFrame that carries equivalent columns.

    Args:
        comparison_col (str): Column used to rank hits. Hits are sorted in
            ascending order on it, so the smallest value per query is kept
            (appropriate for e-values).
        query_name_col (str): Column holding the query sequence name.
        subject_name_col (str): Column holding the subject sequence name.
        query_length_col (str): Column holding the query sequence length.
        subject_length_col (str): Name given to the subject length column
            in the output of reciprocal_best_hits().
    '''

    def __init__(self, comparison_col='E', query_name_col='q_name',
                 subject_name_col='s_name', query_length_col='q_len',
                 subject_length_col='s_len'):
        self.comparison_col = comparison_col
        self.query_name_col = query_name_col
        self.subject_name_col = subject_name_col
        self.query_length_col = query_length_col
        # Must differ from query_length_col; otherwise the final rename in
        # reciprocal_best_hits() would produce two identically named columns.
        self.subject_length_col = subject_length_col

    def best_hits(self, aln_df):
        '''Get the best hit for each query in the alignment DataFrame.

        Sorts the hits by query name and then by the comparison column
        (ascending), and then uses drop_duplicates() to remove all but the
        first (lowest-valued) hit for each query. The input DataFrame is
        not modified; a new DataFrame is returned.

        Args:
            aln_df (DataFrame): The alignment DataFrame. Must contain the
                query name and comparison columns.

        Returns:
            DataFrame with one row per distinct query name.
        '''
        ranked = aln_df.sort_values(
            by=[self.query_name_col, self.comparison_col]
        )
        return ranked.drop_duplicates(subset=self.query_name_col)

    def reciprocal_best_hits(self, aln_df_A, aln_df_B):
        '''Given two DataFrames with reciprocal alignments, get the
        reciprocal best hits.

        Uses best_hits() to get the best hit per query in each DataFrame,
        and then does an inner join to find the pairs where A's best subject
        has A's query as its own best subject in B. Neither input DataFrame
        is modified.

        Both DataFrames are expected to share the same column names; the
        join relies on pandas adding the '_x' / '_y' suffixes to those
        overlapping columns.

        Args:
            aln_df_A (DataFrame): The query hits.
            aln_df_B (DataFrame): The subject hits.

        Returns:
            DataFrame with the reciprocal best hits. The query name,
            comparison, and query length columns come from A; the subject
            name and subject length columns come from B's query name and
            query length. Any other column keeps its '_x' / '_y' suffix.
        '''
        best_A = self.best_hits(aln_df_A)
        best_B = self.best_hits(aln_df_B)

        # Join between subject A and query B
        rbh_df = pd.merge(best_A, best_B, how='inner',
                          left_on=self.subject_name_col,
                          right_on=self.query_name_col)

        # Renamed columns after join
        left_q_col = self.query_name_col + '_x'
        left_s_col = self.subject_name_col + '_x'
        left_len_col = self.query_length_col + '_x'
        left_comp_col = self.comparison_col + '_x'
        right_q_col = self.query_name_col + '_y'
        right_s_col = self.subject_name_col + '_y'
        # B's query is A's subject, so B's query length is the subject length.
        right_len_col = self.query_length_col + '_y'

        # Select those where query A is the same as subject B
        rbh_df = rbh_df[rbh_df[left_q_col] == rbh_df[right_s_col]]

        # Drop extra columns and rename for clarity. Non-mutating calls are
        # used so the filtered frame above is never modified in place.
        rbh_df = rbh_df.drop([left_s_col, right_s_col], axis=1)
        return rbh_df.rename(
            columns={left_q_col: self.query_name_col,
                     right_q_col: self.subject_name_col,
                     left_comp_col: self.comparison_col,
                     left_len_col: self.query_length_col,
                     right_len_col: self.subject_length_col})