Skip to content

Commit

Permalink
Added ssu_erroneous method and updated method for identifying taxonom…
Browse files Browse the repository at this point in the history
…ic outliers.
  • Loading branch information
donovan-h-parks committed Jul 25, 2016
1 parent dfb5cd0 commit 0a9c842
Show file tree
Hide file tree
Showing 12 changed files with 1,707 additions and 167 deletions.
3 changes: 2 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ include README.rst
include MANIFEST.in
include bin refinem
include refinem VERSION
include *.txt
recursive-include refinem *.py *.txt

recursive-include docs *.txt
173 changes: 126 additions & 47 deletions bin/refinem

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion refinem/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.13
0.0.14
24 changes: 24 additions & 0 deletions refinem/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
###############################################################################
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
# #
###############################################################################

import os

def version():
    """Read program version from file.

    Returns
    -------
    str
        Contents of the VERSION file located in the package directory
        (the first entry of the package's __path__), with leading and
        trailing whitespace removed.
    """

    # the context manager guarantees the file handle is released, even if
    # the read fails; the previous self-import of the package was unused
    with open(os.path.join(__path__[0], 'VERSION')) as version_file:
        return version_file.read().strip()
69 changes: 69 additions & 0 deletions refinem/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,3 +207,72 @@ def dbscan(self, scaffold_stats, num_clusters, num_components, K, no_coverage, n
"""

pass

def split(self, scaffold_stats, criteria1, criteria2, genome_file, output_dir):
    """Split genome into two based on genomic feature.

    A criteria is a Python expression written in terms of one or more of
    the per-scaffold features 'gc', 'coverage', 'pc1', 'pc2' and 'pc3'
    (e.g. 'gc > 0.5'). Scaffolds satisfying every criteria are written to
    <genome_id>_c1.fna and all remaining scaffolds to <genome_id>_c2.fna,
    both inside the output directory.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    criteria1 : str
        First criteria used for splitting genome.
    criteria2 : str
        Second criteria used for splitting genome. An empty criteria
        places no constraint on the split.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.

    Raises
    ------
    ValueError
        If a non-empty criteria references none of the supported features.
    """

    features = ('gc', 'coverage', 'pc1', 'pc2', 'pc3')

    # validate the criteria before any output file is created so a typo
    # fails loudly instead of silently reusing the previous result
    active_criteria = []
    for criteria in (criteria1, criteria2):
        if not criteria:
            continue

        if not any(feature in criteria for feature in features):
            raise ValueError("Criteria '%s' must reference one of: %s"
                             % (criteria, ', '.join(features)))

        active_criteria.append(criteria)

    seqs = seq_io.read(genome_file)

    # fix the iteration order once so rows of the signature matrix and
    # the principal components returned by the PCA stay aligned
    seq_ids = list(seqs)

    # calculate PCA if necessary
    if any('pc' in criteria for criteria in active_criteria):
        self.logger.info('Performing PCA.')
        signature_matrix = [scaffold_stats.stats[seq_id].signature
                            for seq_id in seq_ids]

        pc, _variance = self.pca(signature_matrix)
        for i, seq_id in enumerate(seq_ids):
            stats = scaffold_stats.stats[seq_id]
            stats.pc1 = pc[i][0]
            stats.pc2 = pc[i][1]
            stats.pc3 = pc[i][2]

    # split bin
    genome_id = remove_extension(genome_file)
    c1_file = os.path.join(output_dir, genome_id + '_c1.fna')
    c2_file = os.path.join(output_dir, genome_id + '_c2.fna')

    with open(c1_file, 'w') as fout1, open(c2_file, 'w') as fout2:
        for seq_id in seq_ids:
            stats = scaffold_stats.stats[seq_id]

            meet_criteria = True
            for criteria in active_criteria:
                # bind feature values as names rather than pasting their
                # text into the expression: textual substitution changes
                # operator precedence for negative values (e.g. pc1**2)
                # and fails outright for nan/inf
                values = dict((feature, getattr(stats, feature))
                              for feature in features
                              if feature in criteria)

                # NOTE(security): eval() executes arbitrary expressions and
                # emptying __builtins__ is not a sandbox; criteria must only
                # ever come from a trusted source such as the command line
                v = eval(criteria, {"__builtins__": {}}, values)

                meet_criteria = meet_criteria and v

            fout = fout1 if meet_criteria else fout2
            fout.write('>' + seq_id + '\n')
            fout.write(seqs[seq_id] + '\n')
3 changes: 1 addition & 2 deletions refinem/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@

import biolib.seq_io as seq_io


def concatenate_gene_files(gene_files, concatenated_gene_file):
"""Combine all gene files into a single file.
Expand All @@ -47,7 +46,7 @@ def concatenate_gene_files(gene_files, concatenated_gene_file):
genome_id = remove_extension(gf)

for seq_id, seq in seq_io.read_seq(gf):
fout.write('>' + seq_id + '~' + genome_id + '\n')
fout.write('>' + genome_id + '~' + seq_id + '\n')
if seq[-1] == '*':
seq = seq[0:-1]
fout.write(seq + '\n')
Expand Down
Loading

0 comments on commit 0a9c842

Please sign in to comment.