Project rename#48 (#67)
* Initial commit

* Removed

* UPD: Replaced orgs_ids ref pickle with CSV. Depends on dtypes from apis.Columns

* Initial commit

* UPD: Read CSV, not pickle. Prepare GENES and ORG cols

* UPD: Replaced pickled KEGG name-ID dicts with hard-coded ones.

* Rename

* Rename

* UPD: Read CSV, not pickle. Prepare the dtypes.

* UPD: Extension change

* FIX: Typo

* Initial commit

* Deleted

* UPD: Read CSV, not pickle

* UPD: Read CSV, not pickle

* Removed pickled KEGG_ID_name and name_KEGG_ID dicts. Hard-coded in tests.tests.DatabasesTests.setUp

* Removed databases.ProfInt

* Removed databases.ProfInt test data after the class was removed

* Removed databases.ProfInt-related tests

* Removed SGA1 test_data pickle

* UPD: Removed unused big file

* Merge master into UPD#48

* UPD: Removed prowler.stats.Selector class and permutation-related functions

* Removed after prowler.stats.Selector removal

* UPD: Removed stats.Selector-related tests

* UPD: Removed permutation-related tests

* Removed pickled kegg database test data

* UPD: PEP

* Removed pickled reference kegg database

* Removed old CSV kegg database

* Initial commit: updated CSV ref kegg database

* Initial commit: test text kegg database

* UPD: Proper test for databases.KEGG.parse_database

* UPD: Removed pickled data. UPD: Changed nwrk ref data to a minified version so there is no need to manage the ORGS or GENES cols. UPD: Changed reading of the ref file to CSV.
dizak committed Sep 12, 2018
1 parent 475f16b commit 976ee33
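
Several of the squashed commits above describe one recurring migration: pickled reference data replaced with CSV files read back with explicit dtypes. A minimal sketch of that pattern, assuming a hypothetical DTYPES mapping in place of the project's apis.Columns dtypes and a placeholder file name:

import pandas as pd

# Hypothetical dtypes mapping standing in for apis.Columns dtypes.
DTYPES = {"ENTRY": "object", "PSS": "float32"}

# Before: reference data was unpickled directly.
# ref = pd.read_pickle("ref_kegg_database.pickle")

# After: read the CSV and apply the expected dtypes explicitly,
# since CSV does not preserve dtypes the way pickle does.
ref = pd.read_csv("ref_kegg_database.csv")
ref = ref.astype({k: v for k, v in DTYPES.items() if k in ref.columns})

This mirrors the astype pattern visible in the diffs below.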
Showing 30 changed files with 161 additions and 3,926 deletions.
88 changes: 0 additions & 88 deletions prowler/databases.py
@@ -434,91 +434,3 @@ def parse(self,
names=self.names)
self.bioprocesses = self.bioprocesses.astype({k: v for k, v in self.dtypes.items()
if k in self.bioprocesses.columns})


class ProfInt(Columns):
"""
Concatenation of SGA and profilized KO.
"""
def __init__(self):
self.names = {"authors": self.AUTH,
"definition": self.DEF,
"entry": self.ENTRY,
"genes": self.GENES,
"journal": self.JOURN,
"name": self.NAME,
"orgs": self.ORGS,
"profile": self.PROF,
"reference": self.REF,
"sequence": self.SEQ,
"title": self.TITLE}

def merger(self,
KO_df,
ORF_KO,
sga):
"""Return Ortho_Interactions.sga appended by
Ortho_Interactions.KO_df. Merge key: ORF
"""
KO_df.rename(columns=self.names,
inplace=True)
self.merged = pd.merge(sga,
ORF_KO,
left_on=self.ORF_Q,
right_on=self.ORF_ID,
how="left")
self.merged = pd.merge(self.merged,
ORF_KO,
left_on=self.ORF_A,
right_on=self.ORF_ID,
how="left",
suffixes=(self.QUERY_SUF, self.ARRAY_SUF))
self.merged.drop([self.ORF_ID_Q,
self.ORF_ID_A],
axis=1,
inplace=True)
self.merged.dropna(inplace=True)
self.merged = pd.merge(self.merged,
KO_df,
left_on=self.KEGG_ID_Q,
right_on=self.ENTRY,
how="left")
self.merged = pd.merge(self.merged,
KO_df,
left_on=self.KEGG_ID_A,
right_on=self.ENTRY,
how="left",
suffixes=(self.QUERY_SUF, self.ARRAY_SUF))
self.merged.drop([self.KEGG_ID_Q,
self.KEGG_ID_A],
axis=1,
inplace=True)
self.merged.dropna(inplace=True)

def profilize(self,
reference_species,
method="pairwise"):
"""
Append databases.merged with Profiles Similarity Score and/or string
representation of the phylogenetic profiles.
Parameters
-------
reference_species: list of str
Species list compared to contained in the orthogroup. Basis for the
profiles construction.
method: str, default <pairwise>
Distance measure to use in Profiles Similarity Score calculation.
"""
if method != "pairwise":
self.dtypes[self.PSS] = "float32"
self.merged[self.PROF_A] = self.merged[self.ORGS_A].apply(lambda x:
_Profile(x, reference_species))
self.merged[self.PROF_Q] = self.merged[self.ORGS_Q].apply(lambda x:
_Profile(x, reference_species))
self.merged[self.PSS] = self.merged.apply(lambda x:
x[self.PROF_Q].calculate_pss(x[self.PROF_A],
method=method),
axis=1).astype(self.dtypes[self.PSS])
self.merged = self.merged.astype({k: v for k, v in self.dtypes.items()
if k in self.merged.columns})
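
For context on what was removed: ProfInt.merger joined the same lookup table twice, once for the query gene and once for the array gene, using merge suffixes to keep the two copies apart. A hedged, self-contained sketch of that double left-merge pattern, with illustrative column names rather than the project's Columns constants:

import pandas as pd

sga = pd.DataFrame({"ORF_Q": ["YAL001C"], "ORF_A": ["YBR001W"], "GIS": [0.2]})
orf_ko = pd.DataFrame({"ORF_ID": ["YAL001C", "YBR001W"],
                       "KEGG_ID": ["K00001", "K00002"]})

# Join the lookup once per gene; overlapping columns get the suffixes.
merged = (sga
          .merge(orf_ko, left_on="ORF_Q", right_on="ORF_ID", how="left")
          .merge(orf_ko, left_on="ORF_A", right_on="ORF_ID", how="left",
                 suffixes=("_Q", "_A"))
          .drop(["ORF_ID_Q", "ORF_ID_A"], axis=1)
          .dropna())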
242 changes: 0 additions & 242 deletions prowler/stats.py
@@ -115,95 +115,6 @@ def calculate_enrichment(selected,
if k in selected_bins.columns})
return selected_bins

def permute_profiles(dataframe,
iterations,
return_series=False,
multiprocessing=False,
mp_backend="joblib"):
"""
Returns list of PSS bins after each permutation.
The algorithm:
1. Extract ORFs and PROFs columns.
2. Make the non-redundant list of ORF-PROF.
4. Shuffle PROF column using pandas.Series.sample method.
5. Merge with the stripped DataFrame on ORF (how="left").
6. Calculate the results.
Parameters
-------
dataframe: pandas.DataFrame
Dataframe on which test is performed.
iterations: int
Number of permutations to perform.
multiprocessing: bool, default <False>
pathos multiprocessing is used if <True>. Divides iterations
between cores.
"""
def _permute_profiles(dataframe,
iteration):
"""
Returns a interactions network with permuted profiles and re-calculated
PSS.
Parameters
------
dataframe: pandas.DataFrame
Dataframe to be permuted.
Return
-------
pandas.DataFrame
Dataframe with the profiles permuted among ORFs names and PSS
re-calculated.
"""
sub_Q = dataframe[[Columns.ORF_Q,
Columns.PROF_Q]].rename(columns={Columns.ORF_Q:
Columns.ORF,
Columns.PROF_Q:
Columns.PROF}).drop_duplicates(subset=[Columns.ORF]).reset_index(drop=True)
sub_A = dataframe[[Columns.ORF_A,
Columns.PROF_A]].rename(columns={Columns.ORF_A:
Columns.ORF,
Columns.PROF_A:
Columns.PROF}).drop_duplicates(subset=[Columns.ORF]).reset_index(drop=True)
sub_QA = pd.concat([sub_Q,
sub_A]).drop_duplicates(subset=[Columns.ORF])
right_df = pd.concat([sub_QA[Columns.ORF].reset_index(drop=True),
sub_QA[Columns.PROF].sample(n=len(sub_QA),
replace=True).reset_index(drop=True)],
axis=1)
del sub_Q, sub_A, sub_QA
permuted = pd.merge(left=dataframe.drop([Columns.PROF_Q, Columns.PROF_A, Columns.PSS], axis=1),
right=right_df,
left_on=[Columns.ORF_Q],
right_on=[Columns.ORF],
how="left").merge(right_df,
left_on=[Columns.ORF_A],
right_on=[Columns.ORF],
how="left",
suffixes=[Columns.QUERY_SUF, Columns.ARRAY_SUF])
permuted[Columns.PSS] = permuted.apply(lambda x:
x[Columns.PROF_Q].calculate_pss(x[Columns.PROF_A]),
axis=1)
del right_df
gc.collect()
return pd.DataFrame(permuted.groupby(by=[Columns.PSS]).size())
if multiprocessing is True:
f = partial(_permute_profiles, dataframe)
chunksize = max(1, iterations // ptmp.cpu_count())  # map() needs an int chunksize
out = ptmp.ProcessingPool().map(f, list(range(iterations)), chunksize=chunksize)
else:
out = []
for i in tqdm(list(range(iterations))):
out.append(_permute_profiles(dataframe, i))
if return_series:
return pd.concat([i[1].rename(columns={0: i[0]})
for i in enumerate(out)],
axis=1).fillna(value=0)
else:
return out
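
The core of the removed permutation: build a non-redundant ORF-to-profile table, shuffle the profile column independently of the ORF names, and re-merge. A minimal sketch of the shuffle step (simplified to a single profile column; note the removed code sampled with replace=True, a bootstrap resample rather than a strict permutation):

import pandas as pd

orf_prof = pd.DataFrame({"ORF": ["A", "B", "C"],
                         "PROF": ["++-", "-+-", "+++"]})

# sample(frac=1) reorders PROF; resetting both indexes detaches the
# profiles from their original ORFs before the columns are rejoined.
shuffled = pd.concat([orf_prof["ORF"].reset_index(drop=True),
                      orf_prof["PROF"].sample(frac=1).reset_index(drop=True)],
                     axis=1)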

def binomial_pss_test(desired_pss,
selected,
total,
@@ -236,156 +147,3 @@ def binomial_pss_test(desired_pss,
test = np.random.binomial(n, p, test_size)
return {"complete": sum(test <= real_val),
"average": sum(test) / len(test)}


class Selector(Columns,
_Profile):
"""
Allows convenient selections of the Interactions Network.
"""
def __init__(self,
dataframe,
profiles_similarity_threshold,
p_value=0.05,
positive_interactions_minimum_GIS=0.16,
negative_interactions_maximum_GIS=-0.12,
all_species_in_query=None,
any_species_in_query=None,
none_species_in_query=None,
all_species_in_array=None,
any_species_in_array=None,
none_species_in_array=None):
if not isinstance(dataframe, pd.DataFrame):
raise TypeError("Must be pandas.DataFrame")
if not all([isinstance(i, float) for i in [p_value,
positive_interactions_minimum_GIS,
negative_interactions_maximum_GIS]]):
raise TypeError("Must be float.")
if not isinstance(profiles_similarity_threshold, int):
raise TypeError("Must be int.")
self._summary_dict = {}
self.dataframe = dataframe
self._profiles_similarity_threshold = profiles_similarity_threshold
self._p_value = p_value
self._GIS_min = positive_interactions_minimum_GIS
self._GIS_max = negative_interactions_maximum_GIS
self._all_species_in_query = all_species_in_query
self._any_species_in_query = any_species_in_query
self._none_species_in_query = none_species_in_query
self._all_species_in_array = all_species_in_array
self._any_species_in_array = any_species_in_array
self._none_species_in_array = none_species_in_array
self._summary_dict["total"] = len(self.dataframe)
try:
self.compensatory_interactions = ((self.dataframe[self.DMF] >
self.dataframe[self.SMF_Q]) &
(self.dataframe[self.DMF] >
self.dataframe[self.SMF_A]))
self.inv_compensatory_interactions = ((self.dataframe[self.DMF] <
self.dataframe[self.SMF_Q]) &
(self.dataframe[self.DMF] <
self.dataframe[self.SMF_A]))
self.SMF_below_one = (self.dataframe[self.SMF_Q] < 1.0) &\
(self.dataframe[self.SMF_A] < 1.0)
self._summary_dict["compensatory_interactions"] = len(self.dataframe[self.compensatory_interactions]),
self._summary_dict["inv_compensatory_interactions"] = len(self.dataframe[self.inv_compensatory_interactions]),
except KeyError:
warnings.warn("Failed to make fitness-based booleans.",
SelectionFailWarning)
try:
self.p_value = (self.dataframe[self.GIS_P] <= self._p_value)
except KeyError:
warnings.warn("Failed to make p-value-based booleans.",
SelectionFailWarning)
try:
self.positive_interactions = (self.dataframe[self.GIS] > self._GIS_min)
self.negative_interactions = (self.dataframe[self.GIS] < self._GIS_max)
except KeyError:
warnings.warn("Failed to make Genetic Interactions Score-based booleans.",
SelectionFailWarning)
try:
self.PSS_bins = pd.DataFrame(self.dataframe.groupby(by=[self.PSS]).size())
self.similar_profiles = (self.dataframe["PSS"] >=
self._profiles_similarity_threshold)
self.dissimilar_profiles = (self.dataframe["PSS"] <=
self._profiles_similarity_threshold)
self.mirror_profiles = (self.dataframe["PSS"] <=
self._profiles_similarity_threshold)
self.no_flat_plu_q = (self.dataframe[self.PROF_Q].apply(lambda x: x.to_string()) !=
_Profile._positive_sign * len(self.dataframe.PROF_Q[0]))
self.no_flat_min_q = (self.dataframe[self.PROF_Q].apply(lambda x: x.to_string()) !=
_Profile._negative_sign * len(self.dataframe.PROF_Q[0]))
self.no_flat_plu_a = (self.dataframe[self.PROF_A].apply(lambda x: x.to_string()) !=
_Profile._positive_sign * len(self.dataframe.PROF_Q[0]))
self.no_flat_min_a = (self.dataframe[self.PROF_A].apply(lambda x: x.to_string()) !=
_Profile._negative_sign * len(self.dataframe.PROF_Q[0]))
self.flat_plu_q = (self.dataframe[self.PROF_Q].apply(lambda x: x.to_string()) ==
_Profile._positive_sign * len(self.dataframe.PROF_Q[0]))
self.flat_min_q = (self.dataframe[self.PROF_Q].apply(lambda x: x.to_string()) ==
_Profile._negative_sign * len(self.dataframe.PROF_Q[0]))
self.flat_plu_a = (self.dataframe[self.PROF_A].apply(lambda x: x.to_string()) ==
_Profile._positive_sign * len(self.dataframe.PROF_Q[0]))
self.flat_min_a = (self.dataframe[self.PROF_A].apply(lambda x: x.to_string()) ==
_Profile._negative_sign * len(self.dataframe.PROF_Q[0]))
self._summary_dict["similar_profiles"] = len(self.dataframe[self.similar_profiles])
self._summary_dict["dissimilar_profiles"] = len(self.dataframe[self.dissimilar_profiles])
self._summary_dict["mirror_profiles"] = len(self.dataframe[self.mirror_profiles])
except KeyError:
warnings.warn("Failed to make phylogenetic profiles-based booleans",
SelectionFailWarning)
self.summary = pd.DataFrame(self._summary_dict,
index=[0])
if self._all_species_in_query is not None:
if not isinstance(self._all_species_in_query, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.all_species_in_query = self.dataframe[self.PROF_Q].apply(lambda x: isiniterable(self._all_species_in_query,
x.get_present(),
all_present=True))
except KeyError:
warnings.warn("Failed to make query-species-based selection.")
if self._any_species_in_query is not None:
if not isinstance(self._any_species_in_query, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.any_species_in_query = self.dataframe[self.PROF_Q].apply(lambda x: isiniterable(self._any_species_in_query,
x.get_present(),
all_present=False))
except KeyError:
warnings.warn("Failed to make query-species-based selection.")
if self._none_species_in_query is not None:
if not isinstance(self._none_species_in_query, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.none_species_in_query = self.dataframe[self.PROF_Q].apply(lambda x: isiniterable(self._none_species_in_query,
x.get_absent(),
all_present=True))
except KeyError:
warnings.warn("Failed to make query-species-based selection.")
if self._all_species_in_array is not None:
if not isinstance(self._all_species_in_array, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.all_species_in_array = self.dataframe[self.PROF_A].apply(lambda x: isiniterable(self._all_species_in_array,
x.get_present(),
all_present=True))
except KeyError:
warnings.warn("Failed to make array-species-based selection.")
if self._any_species_in_array is not None:
if not isinstance(self._any_species_in_array, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.any_species_in_array = self.dataframe[self.PROF_A].apply(lambda x: isiniterable(self._any_species_in_array,
x.get_present(),
all_present=False))
except KeyError:
warnings.warn("Failed to make array-species-based selection.")
if self._none_species_in_array is not None:
if not isinstance(self._none_species_in_array, (list, tuple)):
raise TypeError("Must be list or tuple.")
try:
self.none_species_in_array = self.dataframe[self.PROF_A].apply(lambda x: isiniterable(self._none_species_in_array,
x.get_absent(),
all_present=True))
except KeyError:
warnings.warn("Failed to make array-species-based selection.")
