diff --git a/epic/bigwig/create_bigwigs.py b/epic/bigwig/create_bigwigs.py
index 1acb4bd..3a9a8ff 100644
--- a/epic/bigwig/create_bigwigs.py
+++ b/epic/bigwig/create_bigwigs.py
@@ -2,13 +2,16 @@
 import numpy as np
 from os.path import join, basename, splitext
 from subprocess import call
-
+from argparse import Namespace
+import pandas as pd
+from typing import Any, Dict, Iterable, List
 import pyBigWig
 from joblib import Parallel, delayed
 
 
 def create_bigwigs(matrix, outdir, args):
+    # type: (pd.DataFrame, str, Namespace) -> None
     """Create bigwigs from matrix."""
     call("mkdir -p {}".format(outdir), shell=True)
     genome_size_dict = args.chromosome_sizes
@@ -25,10 +28,12 @@ def create_bigwigs(matrix, outdir, args):
 
 
 def _to_int(l):
+    # type: (Iterable[Any]) -> List[int]
     return [int(i) for i in l]
 
 
 def _create_bigwig(bed_column, outpath, genome_size_dict):
+    # type: (pd.Series, str, Dict[str, int]) -> None
 
     logging.info("Creating bigwig " + outpath)
@@ -52,6 +57,7 @@ def _create_bigwig(bed_column, outpath, genome_size_dict):
 
 
 def create_sum_bigwigs(matrix, outdir, args):
+    # type: (pd.DataFrame, str, Namespace) -> None
     call("mkdir -p {}".format(outdir), shell=True)
 
     chip = matrix[args.treatment].sum(axis=1)
diff --git a/epic/config/genomes.py b/epic/config/genomes.py
index 291c65b..b935a0f 100644
--- a/epic/config/genomes.py
+++ b/epic/config/genomes.py
@@ -2,6 +2,7 @@
 from collections import OrderedDict
 import pkg_resources
 import logging
+from typing import Dict
 
 from epic.config import logging_settings
 from epic.utils.find_readlength import (find_readlength,
@@ -12,6 +13,7 @@
 
 def get_genome_size_file(genome):
+    # type: (str) -> str
 
     genome_names = pkg_resources.resource_listdir("epic", "scripts/chromsizes")
     name_dict = {n.lower().replace(".chromsizes", ""): n for n in genome_names}
@@ -25,6 +27,7 @@ def get_genome_size_file(genome):
 
 def create_genome_size_dict(genome):
+    # type: (str) -> Dict[str, int]
     """Creates genome size dict from string containing data."""
 
     size_file = get_genome_size_file(genome)
@@ -39,10 +42,11 @@ def create_genome_size_dict(genome):
 
 def create_genome_size_dict_custom_genome(chromsizes):
+    # type: (str) -> OrderedDict[str, int]
 
     chromosome_lengths = [l.split() for l in open(chromsizes).readlines()]
 
-    od = OrderedDict()
+    od = OrderedDict()  # type: OrderedDict[str, int]
 
     for c, l in natsorted(chromosome_lengths):
         od[c] = int(l)
@@ -51,6 +55,7 @@ def create_genome_size_dict_custom_genome(chromsizes):
 
 def get_effective_genome_length(genome, read_length):
+    # type: (str, int) -> float
 
     genome_names = pkg_resources.resource_listdir("epic",
                                                   "scripts/effective_sizes")
@@ -59,7 +64,7 @@ def get_effective_genome_length(genome, read_length):
 
     try:
         genome_exact = name_dict[genome.lower()]
-        egf = pkg_resources.resource_string(
+        egf = pkg_resources.resource_string(  # type: ignore
             "epic", "scripts/effective_sizes/{}_{}.txt".format(
                 genome_exact, read_length)).split()[-1].decode()
     except KeyError:
diff --git a/epic/matrixes/matrixes.py b/epic/matrixes/matrixes.py
index 4d797a3..3fdcd72 100644
--- a/epic/matrixes/matrixes.py
+++ b/epic/matrixes/matrixes.py
@@ -1,6 +1,9 @@
 import logging
 from os.path import dirname, join, basename
 from subprocess import call
+from itertools import chain
+from typing import Dict, Iterable, List, Sequence, Set, Tuple
+from argparse import Namespace
 
 import pandas as pd
 
@@ -14,6 +17,7 @@
 
 def write_matrix_files(chip_merged, input_merged, df, args):
+    # type: (Dict[str, pd.DataFrame], Dict[str, pd.DataFrame], pd.DataFrame, Namespace) -> None
 
     matrixes = create_matrixes(chip_merged, input_merged, df, args)
@@ -29,6 +33,7 @@ def write_matrix_files(chip_merged, input_merged, df, args):
 
     # TODO: remove out of bounds bins
+
     if args.bigwig:
         # defer initialization so not run during travis
         from epic.bigwig.create_bigwigs import create_bigwigs
@@ -40,8 +45,9 @@ def write_matrix_files(chip_merged, input_merged, df, args):
         create_sum_bigwigs(matrix, args.sum_bigwig, args)
 
 
-def _create_matrixes(chromosome, chip, input, islands, chromosome_size,
-                     window_size):
+def _create_matrixes(chromosome, chip, input, islands,
+                     chromosome_size, window_size):
+    # type: (str, Dict[str, pd.DataFrame], Dict[str, pd.DataFrame], pd.DataFrame, int, int) -> pd.DataFrame
 
     chip_df = get_chromosome_df(chromosome, chip)
     input_df = get_chromosome_df(chromosome, input)
@@ -66,7 +72,7 @@ def _create_matrixes(chromosome, chip, input, islands, chromosome_size,
 
 def create_matrixes(chip, input, df, args):
-
+    # type: (Iterable[pd.DataFrame], Iterable[pd.DataFrame], pd.DataFrame, Namespace) -> List[pd.DataFrame]
     "Creates matrixes which can be written to file as is (matrix) or as bedGraph."
 
     genome = args.chromosome_sizes
@@ -86,7 +92,7 @@ def create_matrixes(chip, input, df, args):
 
 def print_matrixes(matrixes, args):
-
+    # type: (Iterable[pd.DataFrame], Namespace) -> None
     outpath = args.store_matrix
 
     dir = dirname(outpath)
@@ -110,13 +116,14 @@ def print_matrixes(matrixes, args):
                  chunksize=1e6)
 
 
-def get_island_bins(df, window_size, genome):
+def get_island_bins(df, window_size, genome, args):
+    # type: (pd.DataFrame, int, str, Namespace) -> Dict[str, Set[int]]
     """Finds the enriched bins in a df."""
 
     # need these chromos because the df might not have islands in all chromos
     chromosomes = natsorted(list(args.chromosome_sizes))
 
-    chromosome_island_bins = {}
+    chromosome_island_bins = {}  # type: Dict[str, Set[int]]
     df_copy = df.reset_index(drop=False)
     for chromosome in chromosomes:
         cdf = df_copy.loc[df_copy.Chromosome == chromosome]
@@ -134,7 +141,7 @@ def get_island_bins(df, window_size, genome):
 
 def put_dfs_in_dict(dfs):
-
+    # type: (Iterable[pd.DataFrame]) -> Dict[str, pd.DataFrame]
     sample_dict = {}
     for df in dfs:
@@ -148,8 +155,9 @@ def put_dfs_in_dict(dfs):
 
 def put_dfs_in_chromosome_dict(dfs):
+    # type: (Iterable[pd.DataFrame]) -> Dict[str, pd.DataFrame]
 
-    chromosome_dict = {}
+    chromosome_dict = {}  # type: Dict[str, pd.DataFrame]
 
     for df in dfs:
         if df.empty:
@@ -162,6 +170,7 @@ def put_dfs_in_chromosome_dict(dfs):
 
 def get_chromosome_df(chromosome, df_dict):
+    # type: (str, Dict[str, pd.DataFrame]) -> pd.DataFrame
 
     if chromosome in df_dict:
         df = df_dict[chromosome]
@@ -172,6 +181,7 @@ def get_chromosome_df(chromosome, df_dict):
 
 def enriched_bins(df, args):
+    # type: (pd.DataFrame, Namespace) -> pd.DataFrame
 
     df = df.loc[df.FDR < args.false_discovery_rate_cutoff]
diff --git a/epic/run/run_epic.py b/epic/run/run_epic.py
index 3ad033d..4df7864 100644
--- a/epic/run/run_epic.py
+++ b/epic/run/run_epic.py
@@ -10,9 +10,11 @@
 from subprocess import call
 import logging
+from argparse import Namespace
 
 import pandas as pd
 from numpy import log2
+from typing import Iterable, List
 
 from natsort import natsorted
 from joblib import Parallel, delayed
@@ -27,6 +29,7 @@
 
 def run_epic(args):
+    # type: (Namespace) -> pd.DataFrame
 
     chip_windows = multiple_files_count_reads_in_windows(args.treatment, args)
     input_windows = multiple_files_count_reads_in_windows(args.control, args)
@@ -46,6 +49,7 @@ def run_epic(args):
     score_threshold, island_enriched_threshold, average_window_readcount = \
         compute_background_probabilities(nb_chip_reads, args)
 
+    dfs = []  # type: Iterable[pd.DataFrame]
     dfs = count_to_pvalue(merged_dfs, island_enriched_threshold,
                           average_window_readcount, args.number_cores)
@@ -75,6 +79,7 @@ def run_epic(args):
 
 def df_to_bed(df):
+    # type: (pd.DataFrame) -> pd.DataFrame
     # '''Chromosome Start End ChIP Input Score Fold_change P FDR
     # chr5 53000 55399 121 13 77.6075622841774 13.655736573980159 6.040968494897508e-92 1.9241805908359603e-91\''
@@ -91,6 +96,7 @@ def df_to_bed(df):
 
 def sum_columns(dfs):
+    # type: (Iterable[pd.DataFrame]) -> List[pd.DataFrame]
 
     new_dfs = []
     for df in dfs:
@@ -104,11 +110,12 @@ def sum_columns(dfs):
 
 def multiple_files_count_reads_in_windows(bed_files, args):
+    # type: (Iterable[str], Namespace) -> OrderedDict[str, List[pd.DataFrame]]
     """Use count_reads on multiple files and store result in dict.
 
     Untested since does the same thing as count reads."""
 
-    bed_windows = OrderedDict()
+    bed_windows = OrderedDict()  # type: OrderedDict[str, List[pd.DataFrame]]
     for bed_file in bed_files:
         logging.info("Binning " + bed_file)
         if args.paired_end:
@@ -121,6 +128,7 @@ def multiple_files_count_reads_in_windows(bed_files, args):
 
 def _merge_files(windows, nb_cpu):
+    # type: (OrderedDict[str, List[pd.DataFrame]], int) -> pd.DataFrame
     """Merge lists of chromosome bin df chromosome-wise.
 
     windows is an OrderedDict where the keys are files, the values are lists of
diff --git a/epic/scripts/effective_genome_size.py b/epic/scripts/effective_genome_size.py
index 25de45a..a1e000f 100644
--- a/epic/scripts/effective_genome_size.py
+++ b/epic/scripts/effective_genome_size.py
@@ -13,6 +13,7 @@
 
 def effective_genome_size(fasta, read_length, nb_cores, tmpdir="/tmp"):
+    # type: (str, int, int, str) -> None
     """Compute effective genome size for genome."""
 
     idx = Fasta(fasta)
diff --git a/epic/scripts/overlaps/files_to_chromosome_coverage.py b/epic/scripts/overlaps/files_to_chromosome_coverage.py
index ec116b7..4bd6f74 100644
--- a/epic/scripts/overlaps/files_to_chromosome_coverage.py
+++ b/epic/scripts/overlaps/files_to_chromosome_coverage.py
@@ -11,24 +11,28 @@
 from io import StringIO
+from typing import DefaultDict, Dict, Iterable
 
 # from helper.functions
 import logging
 
 from rpy2.robjects import r, pandas2ri
 pandas2ri.activate()
+from rpy2.robjects.robject import RObject
 from rpy2.robjects.packages import importr
+
 importr("S4Vectors")
 bioc = importr("GenomicRanges")
 
 
 def files_to_chromosome_coverage(all_files, nb_cpu):
+    # type: (Iterable[str], int) -> DefaultDict[str, Dict[str, RObject]]
 
     df_to_coverage = r("function(x) coverage(GRanges(x$Chromosome, IRanges(x$Start, x$End)))")
 
     logging.info("Finding nucleotide coverage of files.")
-    coverages = defaultdict(dict)
+    coverages = defaultdict(dict)  # type: DefaultDict[str, Dict[str, RObject]]
     for f in all_files:
         df = pd.read_table(f, usecols=[0, 1, 2], header=None, names="Chromosome Start End".split())
         cv = df_to_coverage(df)
@@ -40,14 +44,14 @@ def files_to_chromosome_coverage(all_files, nb_cpu):
     max_per_chromosome_coverage = r("function(x) sum(runLength(x))")
     remove_duplicate_list_entries = r("function(x) x[unique(names(x))]")
 
-    maxlengths = defaultdict(int)
+    maxlengths = defaultdict(int)  # type: DefaultDict[str, int]
     for f, data in coverages.items():
         for chromosome, rle in data.items():
             current_len = max_per_chromosome_coverage(rle)[0]
             maxlengths[chromosome] = max(maxlengths[chromosome], current_len)
 
     extend_rle = r('function(cvg, maxlen) c(cvg,Rle(0,maxlen-length(cvg)))')
-    extended_rles = defaultdict(dict)
+    extended_rles = defaultdict(dict)  # type: DefaultDict[str, Dict[str, RObject]]
 
     for f, d in coverages.items():
         for chromosome, cv in d.items():
             maxlength = maxlengths[chromosome]
diff --git a/epic/scripts/overlaps/nucleotide_bargraph.py b/epic/scripts/overlaps/nucleotide_bargraph.py
index d8092d3..a4827ea 100644
--- a/epic/scripts/overlaps/nucleotide_bargraph.py
+++ b/epic/scripts/overlaps/nucleotide_bargraph.py
@@ -1,9 +1,10 @@
 
 from joblib import Parallel, delayed
 from collections import defaultdict
 
 import pandas as pd
 import numpy as np
+from typing import DefaultDict, Dict, Iterable
 
 import pkg_resources, os
 from natsort import natsorted
@@ -11,22 +12,23 @@
 from io import StringIO
 
 # from helper.functions
-from epic.scripts.overlaps.files_to_chromosome_coverage import (files_to_chromosome_coverage)
 import logging
 
 from rpy2.robjects import r, pandas2ri
 pandas2ri.activate()
+from rpy2.robjects.robject import RObject
 from rpy2.robjects.packages import importr
 
 importr("S4Vectors")
 bioc = importr("GenomicRanges")
 
-
+from epic.scripts.overlaps.files_to_chromosome_coverage import files_to_chromosome_coverage
 
 __author__ = "Endre Bakken Stovner https://github.com/endrebak/"
 __license__ = "MIT"
 
 
 def overlap_matrix_nucleotides(all_files, nb_cpu):
+    # type: (Iterable[str], int) -> pd.DataFrame
 
     rles = files_to_chromosome_coverage(all_files, nb_cpu)
 
     nucleotide_overlaps = Parallel(n_jobs=nb_cpu)(delayed(_overlap_matrix_nucleotides)(
@@ -38,6 +40,7 @@ def overlap_matrix_nucleotides(all_files, nb_cpu):
 
 def _overlap_matrix_nucleotides(bed_file, extended_rles):
+    # type: (str, Dict[str, Dict[str, RObject]]) -> pd.DataFrame
 
     overlaps = _create_overlap_matrix_nucleotides(bed_file, extended_rles)
 
     return _counts_runlengths(bed_file, overlaps)
@@ -47,6 +50,7 @@ def _overlap_matrix_nucleotides(bed_file, extended_rles):
 
 def _create_overlap_matrix_nucleotides(bed_file, coverages):
+    # type: (str, Dict[str, Dict[str, RObject]]) -> Dict[str, RObject]
 
     base_bed_other = bed_file.split("/")[-1]
     logging.info("Processing {} at nucleotide level".format(base_bed_other))
@@ -72,10 +76,11 @@ def _create_overlap_matrix_nucleotides(bed_file, coverages):
 
 def _counts_runlengths(bed_file, cvs):
+    # type: (str, Dict[str, RObject]) -> pd.DataFrame
 
     base_bed = bed_file.split("/")[-1].split(".")[0]
-    overlaps = defaultdict(int)
+    overlaps = defaultdict(int)  # type: DefaultDict[str, int]
 
     get_runlength = r("function(x, v) runLength(x[x == v])")
     for chromosome, overlap in cvs.items():
         run_values = set(r["runValue"](overlap))
diff --git a/epic/scripts/overlaps/nucleotides_heatmap.py b/epic/scripts/overlaps/nucleotides_heatmap.py
index 26629d6..143f8e9 100644
--- a/epic/scripts/overlaps/nucleotides_heatmap.py
+++ b/epic/scripts/overlaps/nucleotides_heatmap.py
@@ -4,6 +4,7 @@
 from collections import defaultdict
 
 import pandas as pd
 import numpy as np
+from typing import Dict, Iterable
 
 import pkg_resources, os
 from natsort import natsorted
@@ -15,6 +16,7 @@
 import logging
 
 from rpy2.robjects import r, pandas2ri
 pandas2ri.activate()
+from rpy2.robjects.robject import RObject
 from rpy2.robjects.packages import importr
 
 importr("S4Vectors")
@@ -30,6 +32,7 @@
 
 def nucleotide_overlaps_per_file(all_files, nb_cpu):
+    # type: (Iterable[str], int) -> pd.DataFrame
 
     rles = files_to_chromosome_coverage(all_files, nb_cpu)
 
@@ -41,6 +44,7 @@ def nucleotide_overlaps_per_file(all_files, nb_cpu):
 
 def _nucleotide_overlaps_per_file(bed_file, extended_rles):
+    # type: (str, Dict[str, Dict[str, RObject]]) -> pd.DataFrame
 
     base_bed = bed_file.split("/")[-1].split(".")[0]
     logging.info("Finding the number of nucleotides in " + base_bed +
                  " overlapping other files.")
diff --git a/epic/scripts/overlaps/overlaps.py b/epic/scripts/overlaps/overlaps.py
index 7157d47..3b05178 100644
--- a/epic/scripts/overlaps/overlaps.py
+++ b/epic/scripts/overlaps/overlaps.py
@@ -2,12 +2,14 @@
 from subprocess import check_output
 from os.path import basename
 import logging
+from typing import Iterable
 
 import pandas as pd
 
 from joblib import Parallel, delayed
 
 
 def overlap_matrix_region_counts(all_files, nb_cpu):
+    # type: (Iterable[str], int) -> pd.DataFrame
     # bargraph
 
     regions_matrixes = Parallel(n_jobs=nb_cpu)(
@@ -24,6 +26,7 @@ def overlap_matrix_region_counts(all_files, nb_cpu):
 
 def _create_overlap_matrix_regions(bed_file, all_files):
+    # type: (str, Iterable[str]) -> pd.DataFrame
 
     all_files_str = " ".join(all_files)
 
@@ -41,6 +44,7 @@ def _create_overlap_matrix_regions(bed_file, all_files):
 
 def _compute_region_overlap(df):
+    # type: (pd.DataFrame) -> pd.DataFrame
 
     main_file = df.Main.ix[0,0]
 
@@ -54,6 +58,7 @@ def _compute_region_overlap(df):
 
 def overlap_matrix_regions(all_files, nb_cpu):
+    # type: (Iterable[str], int) -> pd.DataFrame
 
     #heatmap
diff --git a/epic/statistics/add_to_island_expectations.py b/epic/statistics/add_to_island_expectations.py
index 8a48f7c..e9a8aa5 100644
--- a/epic/statistics/add_to_island_expectations.py
+++ b/epic/statistics/add_to_island_expectations.py
@@ -3,13 +3,17 @@
 E_VALUE = 1000
 E_VALUE_THRESHOLD = E_VALUE * .0000001
 
+from typing import Dict, Sequence
+
 from epic.statistics.compute_window_score import compute_window_score
 from epic.statistics.compute_poisson import _poisson
 
 
-def add_to_island_expectations_dict(
-        average_window_readcount, current_max_scaled_score,
-        island_eligibility_threshold, island_expectations, gap_contribution):
+def add_to_island_expectations_dict(average_window_readcount,
+                                    current_max_scaled_score,
+                                    island_eligibility_threshold,
+                                    island_expectations, gap_contribution):
+    # type: (float, int, float, Dict[int, float], float) -> Dict[int, float]
     """Can probably be heavily optimized.
     Time required to run can be seen from logging info."""
diff --git a/epic/statistics/compute_background_probabilites.py b/epic/statistics/compute_background_probabilites.py
index 3bba431..ce7c673 100644
--- a/epic/statistics/compute_background_probabilites.py
+++ b/epic/statistics/compute_background_probabilites.py
@@ -8,7 +8,8 @@
 """
 
 from __future__ import print_function
-
+from argparse import Namespace
+from typing import Tuple
 import logging
 
 from epic.statistics.compute_values_needed_for_recurrence import (
@@ -18,6 +19,7 @@
 
 # @MEMORY.cache(verbose=0)
 def compute_background_probabilities(total_chip_count, args):
+    # type: (int, Namespace) -> Tuple[float, int, float]
 
     effective_genome_size = args.effective_genome_size
     logging.debug(str(effective_genome_size) + " effective_genome_size")
diff --git a/epic/statistics/compute_poisson.py b/epic/statistics/compute_poisson.py
index c62dae8..fac7f91 100644
--- a/epic/statistics/compute_poisson.py
+++ b/epic/statistics/compute_poisson.py
@@ -4,6 +4,7 @@
 
 @lru_cache()
 def _factln(num):
+    # type: (int) -> float
     """ Computes logfactorial regularly for tractable numbers,
     uses Ramanujan's approximation otherwise.
""" @@ -19,6 +20,7 @@ def _factln(num): @lru_cache() def _poisson(i, average): + # type: (int, float) -> float """ """ exponent = -average + i * log(average) - _factln(i) diff --git a/epic/statistics/compute_score_threshold.py b/epic/statistics/compute_score_threshold.py index 3de177c..c9855c3 100644 --- a/epic/statistics/compute_score_threshold.py +++ b/epic/statistics/compute_score_threshold.py @@ -8,8 +8,10 @@ def compute_score_threshold(average_window_readcount, - island_enriched_threshold, gap_contribution, - boundary_contribution, genome_length_in_bins): + island_enriched_threshold, + gap_contribution, boundary_contribution, + genome_length_in_bins): + # type: (float, int, float, float, float) -> float """ What does island_expectations do? """ @@ -23,7 +25,7 @@ def compute_score_threshold(average_window_readcount, current_scaled_score = int(round(score / BIN_SIZE)) - island_expectations_d = {} + island_expectations_d = {} # type: Dict[int, float] island_expectations_d[current_scaled_score] = prob * genome_length_in_bins island_expectations_d[ 0] = boundary_contribution * genome_length_in_bins / gap_contribution @@ -31,7 +33,7 @@ def compute_score_threshold(average_window_readcount, current_max_scaled_score = current_scaled_score interval = int(1 / BIN_SIZE) - partial_cumu = 0 + partial_cumu = 0.0 logging.info("Finding the score required to consider an island enriched.") while (partial_cumu > E_VALUE_THRESHOLD or partial_cumu < 1e-100): diff --git a/epic/statistics/compute_values_needed_for_recurrence.py b/epic/statistics/compute_values_needed_for_recurrence.py index 7a9cd10..f882bb9 100644 --- a/epic/statistics/compute_values_needed_for_recurrence.py +++ b/epic/statistics/compute_values_needed_for_recurrence.py @@ -5,6 +5,7 @@ def compute_enriched_threshold(average_window_readcount): + # type: (float) -> int """ Computes the minimum number of tags required in window for an island to be enriched. 
""" @@ -21,10 +22,11 @@ def compute_enriched_threshold(average_window_readcount): return island_enriched_threshold -def compute_gap_factor(island_enriched_threshold, gap_intervals_allowed, - poisson_distribution_parameter): +def compute_gap_factor(island_enriched_threshold, + gap_intervals_allowed, poisson_distribution_parameter): + # type: (int, int, float) -> float - max_gap_score = 1 + max_gap_score = 1.0 gap_factor = single_gap_factor(island_enriched_threshold, poisson_distribution_parameter) max_gap_score += sum([pow(gap_factor, i) @@ -34,6 +36,7 @@ def compute_gap_factor(island_enriched_threshold, gap_intervals_allowed, def single_gap_factor(island_enriched_threshold, poisson_distribution_parameter): + # type: (int, float) -> float poisson_scores = [poisson.pmf(i, poisson_distribution_parameter) for i in range(island_enriched_threshold)] @@ -42,6 +45,7 @@ def single_gap_factor(island_enriched_threshold, def compute_boundary(island_enriched_threshold, gap_intervals_allowed, average): + # type: (int, int, float) -> float single_gap = single_gap_factor(island_enriched_threshold, average) single_boundary_score = pow(single_gap, gap_intervals_allowed + 1) diff --git a/epic/statistics/compute_window_score.py b/epic/statistics/compute_window_score.py index db24a03..6b8aba3 100644 --- a/epic/statistics/compute_window_score.py +++ b/epic/statistics/compute_window_score.py @@ -5,6 +5,7 @@ @lru_cache() def compute_window_score(i, poisson_parameter): + # type: (int, float) -> float # No enrichment; poisson param also average if i < poisson_parameter: diff --git a/epic/statistics/count_to_pvalue.py b/epic/statistics/count_to_pvalue.py index 34c2d54..2424268 100644 --- a/epic/statistics/count_to_pvalue.py +++ b/epic/statistics/count_to_pvalue.py @@ -5,19 +5,22 @@ from functools import partial from joblib import Parallel, delayed import logging - +from typing import List, Sequence def count_to_pvalue(merged_dfs, island_enriched_threshold, average_window_readcount, nb_cpu): + # type: (Sequence[pd.DataFrame], int, float, int) -> List[pd.DataFrame] logging.info("Giving bins poisson score.") - parallel_count = partial(_count_to_pvalue, island_enriched_threshold, + parallel_count_to_pvalue = partial(_count_to_pvalue, island_enriched_threshold, average_window_readcount) - return Parallel(n_jobs=nb_cpu)(delayed(parallel_count)(df) + return Parallel(n_jobs=nb_cpu)(delayed(parallel_count_to_pvalue)(df) for df in merged_dfs) -def _count_to_pvalue(island_enriched_threshold, average_window_readcount, df): +def _count_to_pvalue(island_enriched_threshold, + average_window_readcount, df): + # type: (int, float, pd.DataFrame) -> pd.DataFrame df = df.loc[df["ChIP"] >= island_enriched_threshold] scores = df["ChIP"].apply( diff --git a/epic/statistics/fdr.py b/epic/statistics/fdr.py index c7cfafd..aeb4e20 100644 --- a/epic/statistics/fdr.py +++ b/epic/statistics/fdr.py @@ -1,8 +1,9 @@ -from scipy.stats import poisson, rankdata import pandas as pd - +from scipy.stats import poisson, rankdata +from argparse import Namespace def compute_fdr(df, total_chip_reads, total_input_reads, args): + # type: (pd.DataFrame, int, int, Namespace) -> pd.DataFrame total_island_input_reads = df.Input.sum() diff --git a/epic/statistics/generate_cumulative_distribution.py b/epic/statistics/generate_cumulative_distribution.py index b4d25e4..71687f3 100644 --- a/epic/statistics/generate_cumulative_distribution.py +++ b/epic/statistics/generate_cumulative_distribution.py @@ -1,12 +1,14 @@ from epic.config.constants import E_VALUE, BIN_SIZE 
+from typing import Dict, List, Sequence
 
 
 def generate_cumulative_dist(island_expectations_d, total_length):
+    # type: (Dict[int, float], int) -> float
     """
     Generate cumulative distribution: a list of tuples (bins, hist).
     """
 
-    cumulative = [0] * (total_length + 1)
+    cumulative = [0.0] * (total_length + 1)
     partial_sum = 0.0
 
     island_expectations = []
diff --git a/epic/utils/find_readlength.py b/epic/utils/find_readlength.py
index 169275d..a904802 100644
--- a/epic/utils/find_readlength.py
+++ b/epic/utils/find_readlength.py
@@ -3,6 +3,7 @@
 from re import search, IGNORECASE
 from io import BytesIO
 from subprocess import check_output
+from argparse import Namespace
 
 import pandas as pd
 
@@ -13,6 +14,7 @@
 
 def find_readlength(args):
+    # type: (Namespace) -> int
     """Estimate length of reads based on 10000 first."""
 
     bed_file = args.treatment[0]
@@ -51,6 +53,7 @@ def find_readlength(args):
 
 def get_closest_readlength(estimated_readlength):
+    # type: (int) -> int
     """Find the predefined readlength closest to the estimated readlength.
 
     In the case of a tie, choose the shortest readlength."""
diff --git a/epic/utils/helper_functions.py b/epic/utils/helper_functions.py
index 8ff35c4..1f2905e 100644
--- a/epic/utils/helper_functions.py
+++ b/epic/utils/helper_functions.py
@@ -4,13 +4,15 @@
 
 from joblib import Parallel, delayed
 import pandas as pd
+from typing import Dict, Iterable, List, Sequence, Tuple
 
 try:
-    from functools import lru_cache
+    from functools import lru_cache  # type: ignore
 except ImportError:
-    from functools32 import lru_cache
+    from functools32 import lru_cache  # type: ignore
 
 
 def _merge_chip_and_input(chip_df, input_df):
+    # type: (pd.DataFrame, pd.DataFrame) -> pd.DataFrame
 
     chip_df = chip_df.set_index("Chromosome Bin".split())
     input_df = input_df.set_index("Chromosome Bin".split())
@@ -39,13 +41,14 @@ def _merge_chip_and_input(chip_df, input_df):
         merged_df.head().to_csv(sep=" "), "Tail of merged df: ",
         merged_df.tail().to_csv(sep=" ")
     ]
-    assertion_message = "\n".join(assertion_message)
+    assertion_message = "\n".join(assertion_message)  # type: ignore
     assert len(merged_df) == chip_df_nb_bins, assertion_message
 
     return merged_df
 
 
 def merge_chip_and_input(chip_dfs, input_dfs, nb_cpu):
+    # type: (Iterable[pd.DataFrame], Iterable[pd.DataFrame], int) -> Sequence[pd.DataFrame]
 
     # should be same length, since missing chromos get empty df
     # assert len(chip_dfs) == len(input_dfs)
@@ -61,10 +64,12 @@ def merge_chip_and_input(chip_dfs, input_dfs, nb_cpu):
 
 def get_total_number_of_reads(dfs):
+    # type: (Iterable[pd.DataFrame]) -> int
 
     return sum([df.Count.sum() for df in dfs])
 
 
 def ensure_same_chromosomes_in_list(sample1_dfs, sample2_dfs):
+    # type: (List[pd.DataFrame], List[pd.DataFrame]) -> Tuple[Dict[str, pd.DataFrame], Dict[str, pd.DataFrame]]
 
     d1 = create_chromsome_df_map(sample1_dfs)
     d2 = create_chromsome_df_map(sample2_dfs)
@@ -77,6 +82,7 @@ def ensure_same_chromosomes_in_list(sample1_dfs, sample2_dfs):
 
 def create_chromsome_df_map(dfs):
+    # type: (Iterable[pd.DataFrame]) -> Dict[str, pd.DataFrame]
 
     sample_dict = {}
     for df in dfs:
@@ -91,6 +97,7 @@ def create_chromsome_df_map(dfs):
 
 def fill_missing_chromosomes(d1, d2):
+    # type: (Dict[str, pd.DataFrame], Dict[str, pd.DataFrame]) -> Tuple[Dict[str, pd.DataFrame], Dict[str, pd.DataFrame]]
 
     all_chromosomomes = set(d1.keys()).union(d2.keys())
@@ -107,23 +114,25 @@ def fill_missing_chromosomes(d1, d2):
 
 def merge_same_files(sample1_dfs, sample2_dfs, nb_cpu):
+    # type: (List[pd.DataFrame], List[pd.DataFrame], int) -> List[pd.DataFrame]
 
     # if one list is missing a chromosome, we might pair up the wrong dataframes
     # therefore creating dicts beforehand to ensure they are paired up properly
-    sample1_dfs, sample2_dfs = ensure_same_chromosomes_in_list(sample1_dfs,
-                                                               sample2_dfs)
+    d1, d2 = ensure_same_chromosomes_in_list(sample1_dfs,
+                                             sample2_dfs)
 
-    assert len(sample1_dfs) == len(sample2_dfs)
+    assert len(d1) == len(d2)
 
     logging.info("Merging same class data.")
     merged_chromosome_dfs = Parallel(n_jobs=nb_cpu)(delayed(_merge_same_files)(
-        sample1_dfs[chromosome],
-        sample2_dfs[chromosome]) for chromosome in sample1_dfs.keys())
+        d1[chromosome],
+        d2[chromosome]) for chromosome in d1.keys())
 
     return merged_chromosome_dfs
 
 
 def _merge_same_files(sample1_df, sample2_df):
+    # type: (pd.DataFrame, pd.DataFrame) -> pd.DataFrame
 
     merged_df = sample1_df.merge(sample2_df,
                                  how="outer",
diff --git a/epic/utils/separate_input_and_chip_infiles.py b/epic/utils/separate_input_and_chip_infiles.py
index 96d8138..625148c 100644
--- a/epic/utils/separate_input_and_chip_infiles.py
+++ b/epic/utils/separate_input_and_chip_infiles.py
@@ -1,7 +1,8 @@
 from fnmatch import fnmatch
-
+from typing import Iterable, Sequence, Tuple
 
 def separate_input_and_chip_infiles(bed_files, input_pattern="input"):
+    # type: (Iterable[str], str) -> Tuple[Sequence[str], Sequence[str]]
     """Split list of files depending on whether name contains input_pattern.
 
     Since docopt cannot take two lists of variadic arguments, splitting it
diff --git a/epic/windows/cluster/find_islands.py b/epic/windows/cluster/find_islands.py
index ae7e54d..33ceddc 100644
--- a/epic/windows/cluster/find_islands.py
+++ b/epic/windows/cluster/find_islands.py
@@ -5,8 +5,11 @@
 
 from joblib import Parallel, delayed
 
+from typing import Iterable
+from argparse import Namespace
+
 def find_islands(dfs, score_threshold, args):
+    # type: (Iterable[pd.DataFrame], float, Namespace) -> Iterable[pd.DataFrame]
     logging.info("Clustering bins into islands.")
     parallel_find_islands = partial(_find_islands, args.window_size,
                                     args.gaps_allowed, score_threshold)
@@ -16,6 +19,7 @@ def find_islands(dfs, score_threshold, args):
 
 def _find_islands(window_size, gaps_allowed, score_threshold, df):
+    # type: (int, int, float, pd.DataFrame) -> pd.DataFrame
     if df.empty:
         return df
diff --git a/epic/windows/count/count_reads_in_windows.py b/epic/windows/count/count_reads_in_windows.py
index 9cb044e..64f9132 100644
--- a/epic/windows/count/count_reads_in_windows.py
+++ b/epic/windows/count/count_reads_in_windows.py
@@ -3,6 +3,8 @@
 from itertools import product
 from logging import info
 from subprocess import check_output, Popen, PIPE
+from typing import Any, Iterable, List, Tuple
+from argparse import Namespace
 
 import pandas as pd
 from joblib import Parallel, delayed
@@ -15,6 +17,7 @@
 
 def _options(bed_file, keep_duplicates):
+    # type: (str, bool) -> Tuple[str, str]
 
     if not keep_duplicates:
         duplicate_handling = " uniq | "
@@ -32,6 +35,7 @@ def _options(bed_file, keep_duplicates):
 
 def count_reads_in_windows(bed_file, args):
+    # type: (str, Namespace) -> List[pd.DataFrame]
 
     chromosome_size_dict = args.chromosome_sizes
     chromosomes = natsorted(list(chromosome_size_dict.keys()))
@@ -56,8 +60,9 @@ def count_reads_in_windows(bed_file, args):
     return merged_chromosome_dfs
 
 
-def _count_reads_in_windows(bed_file, args, chromosome_size, chromosome,
-                            strand):
+def _count_reads_in_windows(bed_file, args, chromosome_size,
+                            chromosome, strand):
+    # type: (str, Namespace, int, str, str) -> pd.DataFrame
 
     halved_fragment_size = args.fragment_size // 2
     idx = 1 if strand == "+" else 2  # fragment start indices
@@ -96,6 +101,7 @@ def _count_reads_in_windows(bed_file, args, chromosome_size, chromosome,
 
 def count_reads_in_windows_paired_end(bed_file, args):
+    # type: (str, Namespace) -> List[pd.DataFrame]
 
     chromosome_size_dict = args.chromosome_sizes
     chromosomes = natsorted(list(chromosome_size_dict.keys()))
@@ -115,6 +121,7 @@ def count_reads_in_windows_paired_end(bed_file, args):
 
 def _count_reads_in_windows_paired_end(bed_file, keep_duplicates,
                                        chromosome_size, chromosome):
+    # type: (str, bool, int, str) -> pd.DataFrame
 
     grep, duplicate_handling = _options(bed_file, keep_duplicates)
@@ -151,5 +158,6 @@ def _count_reads_in_windows_paired_end(bed_file, keep_duplicates,
 
 def _pairwise(iterable):
+    # type: (Iterable[Any]) -> Iterable[Tuple[Any, Any]]
     col = iter(iterable)
     return zip(col, col)
diff --git a/epic/windows/count/merge_chromosome_dfs.py b/epic/windows/count/merge_chromosome_dfs.py
index 89cb66b..f632b4e 100644
--- a/epic/windows/count/merge_chromosome_dfs.py
+++ b/epic/windows/count/merge_chromosome_dfs.py
@@ -1,8 +1,9 @@
 import pandas as pd
 from numpy import int32
-
+from typing import Any, Sequence, Tuple
 
 def merge_chromosome_dfs(df_tuple):
+    # type: (Tuple[pd.DataFrame, pd.DataFrame]) -> pd.DataFrame
     """Merges data from the two strands into strand-agnostic counts."""
 
     plus_df, minus_df = df_tuple
@@ -32,6 +33,7 @@ def merge_chromosome_dfs(df_tuple):
 
 def return_other(df, count_column, index_cols):
+    # type: (pd.DataFrame, Any, Sequence[Any]) -> pd.DataFrame
 
     df[[count_column, "Bin"]] = df[[count_column, "Bin"]].astype(int32)
     df = df.groupby(index_cols).sum().reset_index()
diff --git a/epic/windows/count/remove_out_of_bounds_bins.py b/epic/windows/count/remove_out_of_bounds_bins.py
index e5e1776..5787fbf 100644
--- a/epic/windows/count/remove_out_of_bounds_bins.py
+++ b/epic/windows/count/remove_out_of_bounds_bins.py
@@ -1,4 +1,7 @@
+import pandas as pd
+
 def remove_out_of_bounds_bins(df, chromosome_size):
+    # type: (pd.DataFrame, int) -> pd.DataFrame
     """Remove all reads that were shifted outside of the genome endpoints."""
 
     # The dataframe is empty and contains no bins out of bounds
@@ -10,7 +13,9 @@ def remove_out_of_bounds_bins(df, chromosome_size):
 
     return df.drop(df[df.Bin < 0].index)
 
-def remove_bins_with_ends_out_of_bounds(df, chromosome_size, window_size):
+def remove_bins_with_ends_out_of_bounds(df, chromosome_size,
+                                        window_size):
+    # type: (pd.DataFrame, int, int) -> pd.DataFrame
     """Remove all reads that were shifted outside of the genome endpoints."""
 
     # The dataframe is empty and contains no bins out of bounds
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..9725f57
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,6 @@
+[mypy]
+fast_parser = True
+silent_imports = True
+check_untyped_defs = True
+warn_redundant_casts = True
+disallow_untyped_defs = True
diff --git a/setup.py b/setup.py
index 60eeb82..d1fcd87 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 # from Cython.Build import cythonize
 from epic.version import __version__
 
-install_requires = ["scipy", "pandas", "numpy", "natsort", "joblib", "pyfaidx"]
+install_requires = ["scipy", "pandas", "numpy", "natsort", "joblib", "pyfaidx", "typing"]
 
 try:
     os.getenv("TRAVIS")
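For readers unfamiliar with the annotation style used throughout this patch: because epic still supports Python 2, the types are written as PEP 484 comments rather than inline annotations. A minimal sketch of the pattern follows; the function and its names are hypothetical illustrations, not code from the patch:

    from typing import Dict, List

    def bins_per_chromosome(sizes, window_size):
        # type: (Dict[str, int], int) -> List[int]
        """Count whole windows per chromosome, sorted by chromosome name."""
        counts = []  # type: List[int]  # variable annotation, also as a comment
        for _chromosome, size in sorted(sizes.items()):
            counts.append(size // window_size)
        return counts

With a mypy release contemporary with the fast_parser/silent_imports options in the new mypy.ini, the package would be checked with something like `mypy --py2 epic/` from the repository root (`--py2` because the code targets Python 2/3 compatibility); the exact invocation is an assumption, not part of this changeset.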