Project rename#48 (#67)
* Initial commit

* Removed

* UPD: Replaced orgs_ids ref pickle with CSV. Depends on dtypes from apis.Columns

* Initial commit

* UPD: Read CSV, not pickle. Prepare GENES and ORG cols

* UPD: Replaced pickled KEGG name-ID dicts with hard-coded ones.

* Rename

* Rename

* UPD: Read CSV, not pickle. Prepare the dtypes.

* UPD: Extension change

* FIX: Typo

* Initial commit

* Deleted

* UPD: Read CSV, not pickle

* UPD: Read CSV, not pickle

* Removed pickled KEGG_ID_name and name_KEGG_ID dicts. Hard-coded in tests.tests.DatabasesTests.setUp

* Removed databases.ProfInt

* Removed databases.ProfInt test data after the class was removed

* Removed databases.ProfInt-related tests

* Removed SGA1 test_data pickle

* UPD: Removed unused big file

* Merge master into UPD#48

* UPD: Removed prowler.stats.Selector class and permutation-related functions

* Removed after prowler.stats.Selector removal

* UPD: Removed stats.Selector-related tests

* UPD: Removed permutation-related tests

* Removed pickled kegg database test data

* UPD: PEP

* Removed pickled reference kegg database

* Removed old CSV kegg database

* Initial commit: updated CSV ref kegg database

* Initial commit: test text kegg database

* UPD: Proper test for databases.KEGG.parse_database

* UPD: Removed pickled data. UPD: Changed nwrk ref data to a minified version so there is no need to manage the ORGS or GENES cols. UPD: Changed reading of the ref file to CSV.
dizak committed Sep 12, 2018
1 parent 475f16b commit 976ee33
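
Several of the squashed commits above describe one recurring migration: pickled reference data replaced with CSV files read back with explicit dtypes. A minimal sketch of that pattern, assuming a hypothetical DTYPES mapping in place of the project's apis.Columns dtypes and a placeholder file name:

import pandas as pd

# Hypothetical dtypes mapping standing in for apis.Columns dtypes.
DTYPES = {"ENTRY": "object", "PSS": "float32"}

# Before: reference data was unpickled directly.
# ref = pd.read_pickle("ref_kegg_database.pickle")

# After: read the CSV and apply the expected dtypes explicitly,
# since CSV does not preserve dtypes the way pickle does.
ref = pd.read_csv("ref_kegg_database.csv")
ref = ref.astype({k: v for k, v in DTYPES.items() if k in ref.columns})

This mirrors the astype pattern visible in the diffs below.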
Showing 30 changed files with 161 additions and 3,926 deletions.
88 changes: 0 additions & 88 deletions prowler/databases.py
@@ -434,91 +434,3 @@ def parse(self,
names=self.names)
self.bioprocesses = self.bioprocesses.astype({k: v for k, v in self.dtypes.items()
if k in self.bioprocesses.columns})


class ProfInt(Columns):
"""
Concatenation of SGA and profilized KO.
"""
def __init__(self):
self.names = {"authors": self.AUTH,
"definition": self.DEF,
"entry": self.ENTRY,
"genes": self.GENES,
"journal": self.JOURN,
"name": self.NAME,
"orgs": self.ORGS,
"profile": self.PROF,
"reference": self.REF,
"sequence": self.SEQ,
"title": self.TITLE}

def merger(self,
KO_df,
ORF_KO,
sga):
"""Return Ortho_Interactions.sga appended by
Ortho_Interactions.KO_df. Merge key: ORF
"""
KO_df.rename(columns=self.names,
inplace=True)
self.merged = pd.merge(sga,
ORF_KO,
left_on=self.ORF_Q,
right_on=self.ORF_ID,
how="left")
self.merged = pd.merge(self.merged,
ORF_KO,
left_on=self.ORF_A,
right_on=self.ORF_ID,
how="left",
suffixes=(self.QUERY_SUF, self.ARRAY_SUF))
self.merged.drop([self.ORF_ID_Q,
self.ORF_ID_A],
axis=1,
inplace=True)
self.merged.dropna(inplace=True)
self.merged = pd.merge(self.merged,
KO_df,
left_on=self.KEGG_ID_Q,
right_on=self.ENTRY,
how="left")
self.merged = pd.merge(self.merged,
KO_df,
left_on=self.KEGG_ID_A,
right_on=self.ENTRY,
how="left",
suffixes=(self.QUERY_SUF, self.ARRAY_SUF))
self.merged.drop([self.KEGG_ID_Q,
self.KEGG_ID_A],
axis=1,
inplace=True)
self.merged.dropna(inplace=True)

def profilize(self,
reference_species,
method="pairwise"):
"""
Append databases.merged with Profiles Similarity Score and/or string
representation of the phylogenetic profiles.
Parameters
-------
reference_species: list of str
Species list compared to contained in the orthogroup. Basis for the
profiles construction.
method: str, default <pairwise>
Distance measure to use in Profiles Similarity Score calculation.
"""
if method != "pairwise":
self.dtypes[self.PSS] = "float32"
self.merged[self.PROF_A] = self.merged[self.ORGS_A].apply(lambda x:
_Profile(x, reference_species))
self.merged[self.PROF_Q] = self.merged[self.ORGS_Q].apply(lambda x:
_Profile(x, reference_species))
self.merged[self.PSS] = self.merged.apply(lambda x:
x[self.PROF_Q].calculate_pss(x[self.PROF_A],
method=method),
axis=1).astype(self.dtypes[self.PSS])
self.merged = self.merged.astype({k: v for k, v in self.dtypes.items()
if k in self.merged.columns})
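
For context on what was removed: ProfInt.merger joined the same lookup table twice, once for the query gene and once for the array gene, using merge suffixes to keep the two copies apart. A hedged, self-contained sketch of that double left-merge pattern, with illustrative column names rather than the project's Columns constants:

import pandas as pd

sga = pd.DataFrame({"ORF_Q": ["YAL001C"], "ORF_A": ["YBR001W"], "GIS": [0.2]})
orf_ko = pd.DataFrame({"ORF_ID": ["YAL001C", "YBR001W"],
                       "KEGG_ID": ["K00001", "K00002"]})

# Join the lookup once per gene; overlapping columns get the suffixes.
merged = (sga
          .merge(orf_ko, left_on="ORF_Q", right_on="ORF_ID", how="left")
          .merge(orf_ko, left_on="ORF_A", right_on="ORF_ID", how="left",
                 suffixes=("_Q", "_A"))
          .drop(["ORF_ID_Q", "ORF_ID_A"], axis=1)
          .dropna())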
242 changes: 0 additions & 242 deletions prowler/stats.py
@@ -115,95 +115,6 @@ def calculate_enrichment(selected,
if k in selected_bins.columns})
return selected_bins

def permute_profiles(dataframe,
iterations,
return_series=False,
multiprocessing=False,
mp_backend="joblib"):
"""
Returns list of PSS bins after each permutation.
The algorithm:
1. Extract ORFs and PROFs columns.
2. Make the non-redundant list of ORF-PROF.
4. Shuffle PROF column using pandas.Series.sample method.
5. Merge with the stripped DataFrame on ORF (how="left").
6. Calculate the results.
Parameters
-------
dataframe: pandas.DataFrame
Dataframe on which test is performed.
iterations: int
Number of permutations to perform.
multiprocessing: bool, default <False>
pathos multiprocessing is used if <True>. Divides iterations
between cores.
"""
def _permute_profiles(dataframe,
iteration):
"""
Returns a interactions network with permuted profiles and re-calculated
PSS.
Parameters
------
dataframe: pandas.DataFrame
Dataframe to be permuted.
Return
-------
pandas.DataFrame
Dataframe with the profiles permuted among ORFs names and PSS
re-calculated.
"""
sub_Q = dataframe[[Columns.ORF_Q,
Columns.PROF_Q]].rename(columns={Columns.ORF_Q:
Columns.ORF,
Columns.PROF_Q:
Columns.PROF}).drop_duplicates(subset=[Columns.ORF]).reset_index(drop=True)
sub_A = dataframe[[Columns.ORF_A,
Columns.PROF_A]].rename(columns={Columns.ORF_A:
Columns.ORF,
Columns.PROF_A:
Columns.PROF}).drop_duplicates(subset=[Columns.ORF]).reset_index(drop=True)
sub_QA = pd.concat([sub_Q,
sub_A]).drop_duplicates(subset=[Columns.ORF])
right_df = pd.concat([sub_QA[Columns.ORF].reset_index(drop=True),
sub_QA[Columns.PROF].sample(n=len(sub_QA),
replace=True).reset_index(drop=True)],
axis=1)
del sub_Q, sub_A, sub_QA
permuted = pd.merge(left=dataframe.drop([Columns.PROF_Q, Columns.PROF_A, Columns.PSS], axis=1),
right=right_df,
left_on=[Columns.ORF_Q],
right_on=[Columns.ORF],
how="left").merge(right_df,
left_on=[Columns.ORF_A],
right_on=[Columns.ORF],
how="left",
suffixes=[Columns.QUERY_SUF, Columns.ARRAY_SUF])
permuted[Columns.PSS] = permuted.apply(lambda x:
x[Columns.PROF_Q].calculate_pss(x[Columns.PROF_A]),
axis=1)
del right_df
gc.collect()
return pd.DataFrame(permuted.groupby(by=[Columns.PSS]).size())
if multiprocessing is True:
f = partial(_permute_profiles, dataframe)
chunksize = max(1, iterations // ptmp.cpu_count())  # map() needs an int chunksize
out = ptmp.ProcessingPool().map(f, list(range(iterations)), chunksize=chunksize)
else:
out = []
for i in tqdm(list(range(iterations))):
out.append(_permute_profiles(dataframe, i))
if return_series:
return pd.concat([i[1].rename(columns={0: i[0]})
for i in enumerate(out)],
axis=1).fillna(value=0)
else:
return out
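
The core of the removed permutation: build a non-redundant ORF-to-profile table, shuffle the profile column independently of the ORF names, and re-merge. A minimal sketch of the shuffle step (simplified to a single profile column; note the removed code sampled with replace=True, a bootstrap resample rather than a strict permutation):

import pandas as pd

orf_prof = pd.DataFrame({"ORF": ["A", "B", "C"],
                         "PROF": ["++-", "-+-", "+++"]})

# sample(frac=1) reorders PROF; resetting both indexes detaches the
# profiles from their original ORFs before the columns are rejoined.
shuffled = pd.concat([orf_prof["ORF"].reset_index(drop=True),
                      orf_prof["PROF"].sample(frac=1).reset_index(drop=True)],
                     axis=1)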

def binomial_pss_test(desired_pss,
selected,
total,
@@ -236,156 +147,3 @@ def binomial_pss_test(desired_pss,
test = np.random.binomial(n, p, test_size)
return {"complete": sum(test <= real_val),
"average": sum(test) / len(test)}


class Selector(Columns,
_Profile):
"""
Allows convenient selections of the Interactions Network.
"""
def __init__(self,
dataframe,
profiles_similarity_threshold,
p_value=0.05,
positive_interactions_minimum_GIS=0.16,
negative_interactions_maximum_GIS=-0.12,
all_species_in_query=None,
any_species_in_query=None,
none_species_in_query=None,
all_species_in_array=None,
any_species_in_array=None,
none_species_in_array=None):
if not isinstance(dataframe, pd.DataFrame):
raise TypeError("Must be pandas.DataFrame")
if not all([isinstance(i, float) for i in [p_value,
positive_interactions_minimum_GIS,
negative_interactions_maximum_GIS]]):
raise TypeError("Must be float.")
if not isinstance(profiles_similarity_threshold, int):
raise TypeError("Must be int.")
self._summary_dict = {}
self.dataframe = dataframe
self._profiles_similarity_threshold = profiles_similarity_threshold
self._p_value = p_value
self._GIS_min = positive_interactions_minimum_GIS
self._GIS_max = negative_interactions_maximum_GIS
self._all_species_in_query = all_species_in_query
self._any_species_in_query = any_species_in_query
self._none_species_in_query = none_species_in_query
self._all_species_in_array = all_species_in_array
self._any_species_in_array = any_species_in_array
self._none_species_in_array = none_species_in_array
self._summary_dict["total"] = len(self.dataframe)
try:
self.compensatory_interactions = ((self.dataframe[self.DMF] >
self.dataframe[self.SMF_Q]) &
(self.dataframe[self.DMF] >
self.dataframe[self.SMF_A]))
self.inv_compensatory_interactions = ((self.dataframe[self.DMF] <
self.dataframe[self.SMF_Q]) &
(self.dataframe[self.DMF] <
self.dataframe[self.SMF_A]))
self.SMF_below_one = (self.dataframe[self.SMF_Q] < 1.0) &\
(self.dataframe[self.SMF_A] < 1.0)
self._summary_dict["compensatory_interactions"] = len(self.dataframe[self.compensatory_interactions]),
self._summary_dict["inv_compensatory_interactions"] = len(self.dataframe[self.inv_compensatory_interactions]),
except KeyError:
warnings.warn("Failed to make fitness-based booleans.",
SelectionFailWarning)
try:
self.p_value = (self.dataframe[self.GIS_P] <= self._p_value)
except KeyError:
warnings.warn("Failed to make p-value-based booleans.",
SelectionFailWarning)
try:
self.positive_interactions = (self.dataframe[self.GIS] > self._GIS_min)
self.negative_interactions = (self.dataframe[self.GIS] < self._GIS_max)
except KeyError:
warnings.warn("Failed to make Genetic Interactions Score-based booleans.",
SelectionFailWarning)
try:
self.PSS_bins = pd.DataFrame(self.dataframe.groupby(by=[self.PSS]).size())
self.similar_profiles = (self.dataframe["PSS"] >=
self._profiles_similarity_threshold)
self.dissimilar_profiles = (self.dataframe["PSS"] <=
self._profiles_similarity_threshold)
self.mirror_profiles = (self.dataframe["PSS"] <=
self._profiles_similarity_threshold)
self.no_flat_plu_q = (self.dataframe[self.PROF_Q].apply(lambda x: x.to_string()) !=
_Profile._positive_sign * len(self.dataframe.PROF_Q[0]))
self.no_flat_min_q = (self.dataframe[self.PROF_Q].apply(lambda x: x.to_string()) !=
_Profile._negative_sign * len(self.dataframe.PROF_Q[0]))
self.no_flat_plu_a = (self.dataframe[self.PROF_A].apply(lambda x: x.to_string()) !=
_Profile._positive_sign * len(self.dataframe.PROF_Q[0]))
self.no_flat_min_a = (self.dataframe[self.PROF_A].apply(lambda x: x.to_string()) !=
_Profile._negative_sign * len(self.dataframe.PROF_Q[0]))
self.flat_plu_q = (self.dataframe[self.PROF_Q].apply(lambda x: x.to_string()) ==
_Profile._positive_sign * len(self.dataframe.PROF_Q[0]))
self.flat_min_q = (self.dataframe[self.PROF_Q].apply(lambda x: x.to_string()) ==
_Profile._negative_sign * len(self.dataframe.PROF_Q[0]))
self.flat_plu_a = (self.dataframe[self.PROF_A].apply(lambda x: x.to_string()) ==
_Profile._positive_sign * len(self.dataframe.PROF_Q[0]))
self.flat_min_a = (self.dataframe[self.PROF_A].apply(lambda x: x.to_string()) ==
_Profile._negative_sign * len(self.dataframe.PROF_Q[0]))
self._summary_dict["similar_profiles"] = len(self.dataframe[self.similar_profiles])
self._summary_dict["dissimilar_profiles"] = len(self.dataframe[self.dissimilar_profiles])
self._summary_dict["mirror_profiles"] = len(self.dataframe[self.mirror_profiles])
except KeyError:
warnings.warn("Failed to make phylogenetic profiles-based booleans",
SelectionFailWarning)
self.summary = pd.DataFrame(self._summary_dict,
index=[0])
if self._all_species_in_query is not None:
if not isinstance(self._all_species_in_query, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.all_species_in_query = self.dataframe[self.PROF_Q].apply(lambda x: isiniterable(self._all_species_in_query,
x.get_present(),
all_present=True))
except KeyError:
warnings.warn("Failed to make query-species-based selection.")
if self._any_species_in_query is not None:
if not isinstance(self._any_species_in_query, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.any_species_in_query = self.dataframe[self.PROF_Q].apply(lambda x: isiniterable(self._any_species_in_query,
x.get_present(),
all_present=False))
except KeyError:
warnings.warn("Failed to make query-species-based selection.")
if self._none_species_in_query is not None:
if not isinstance(self._none_species_in_query, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.none_species_in_query = self.dataframe[self.PROF_Q].apply(lambda x: isiniterable(self._none_species_in_query,
x.get_absent(),
all_present=True))
except KeyError:
warnings.warn("Failed to make query-species-based selection.")
if self._all_species_in_array is not None:
if not isinstance(self._all_species_in_array, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.all_species_in_array = self.dataframe[self.PROF_A].apply(lambda x: isiniterable(self._all_species_in_array,
x.get_present(),
all_present=True))
except KeyError:
warnings.warn("Failed to make array-species-based selection.")
if self._any_species_in_array is not None:
if not isinstance(self._any_species_in_array, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.any_species_in_array = self.dataframe[self.PROF_A].apply(lambda x: isiniterable(self._any_species_in_array,
x.get_present(),
all_present=False))
except KeyError:
warnings.warn("Failed to make array-species-based selection.")
if self._none_species_in_array is not None:
if not isinstance(self._none_species_in_array, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.none_species_in_array = self.dataframe[self.PROF_A].apply(lambda x: isiniterable(self._none_species_in_array,
x.get_absent(),
all_present=True))
except KeyError:
warnings.warn("Failed to make array-species-based selection.")
