Skip to content

Commit

Permalink
Complete unittests#6 (#22)
Browse files Browse the repository at this point in the history
* UPD@UtilsTests:test_remove_from_list

* UPD@UtilsTests:test_all_possible_combinations_counter

* UPD:Removed unused func

* UPD:Removed KEGG.profilize

* FIX:Accidentally broken indent

* UPD:Changed default sep to \t

* UPD@SGA1:skipping bad lines

* UPD@SGA1:Changed names from dict to tuples as the order of things is needed to give columns correct names. UPD@parse:Added cleanup arg for dropping NaNs, duplicates and so on

* UPD:SGA1Tests

* Initial commit

* UPD@Costanzo_API:Converted ugly big diff to checking dict

* UPD:Renamed Costanzo_API to CostanzoAPI

* ENH:BioprocessesTests

* Initial commit

* Removed

* ENH:ProfIntTests

* Initial commit

* UPD:Moved files to subfolders named after class

* UPD:Changed paths after moving test files to the subfolder. UPD:Renamed class after convention

* UPD:moved to corresponding subfolders

* UPD:Removed as it is not used

* UPD:Changed paths to corresponding subfolders. Removed leading dot slash

* UPD@setUp: ProfInt() at the top. Prep of nwrk for test_profilize. ENH:test_profilize
  • Loading branch information
dizak committed May 3, 2018
1 parent de66cb4 commit d967221
Show file tree
Hide file tree
Showing 26 changed files with 1,188 additions and 13,795 deletions.
109 changes: 17 additions & 92 deletions prowler/apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,49 +119,6 @@ def org_name_2_kegg_id(self,
self.query_ids_found.append(organism)
return organism_ser[self.KEGG_ORG_ID].iloc[0]

def get_id_conv_tbl(self,
                    source_id_type,
                    organism,
                    out_file_name,
                    skip_dwnld=False,
                    strip_pref=True):
    """Get genes or proteins IDs to KEGG IDs conversion table in
    pandas.DataFrame format. Data are downloaded to a local file and then
    read into a pandas.DataFrame stored as <self.id_conversions_df>. The
    file can be reused.

    Args:
        source_id_type (str): determines type of the source IDs. Used as a
        key of <self.id_conversions> and as the first column name
        organism (str): determines name of the organism bound to the
        source IDs
        out_file_name (str): name for file to be downloaded (and read)
        skip_dwnld (bool): read existing file when <True>. Default <False>
        strip_pref (bool): remove the "<org_id>:" and "<database>:"
        prefixes from the table when <True>. Default <True>
    """
    org_id = self.org_name_2_kegg_id(organism)
    if skip_dwnld is not True:
        url = "{0}/{1}/{2}/{3}".format(self.home,
                                       self.operations["conv_2_outside_ids"],
                                       self.id_conversions[source_id_type],
                                       org_id)
        res = rq.get(url)
        # <res.content> is raw bytes: binary mode keeps the payload
        # untouched (no newline translation) and works on Python 2 and 3,
        # whereas text mode raises TypeError on Python 3.
        with open(out_file_name, "wb") as fout:
            fout.write(res.content)
    self.id_conversions_df = pd.read_csv(out_file_name,
                                         names=[source_id_type,
                                                self.KEGG_ID],
                                         header=None,
                                         sep="\t")
    if strip_pref is True:
        # Both prefixes are dropped in one pass; keys are regex patterns.
        prefixes = {"{0}:".format(org_id): "",
                    "{0}:".format(self.id_conversions[source_id_type]): ""}
        self.id_conversions_df.replace(prefixes,
                                       regex=True,
                                       inplace=True)

def get_org_db_X_ref(self,
organism,
target_db,
Expand Down Expand Up @@ -222,7 +179,7 @@ def get_db_entries(self,
fout.write(res.content)


class Costanzo_API:
class CostanzoAPI:
"""Provides connectivity with the Costanzo's SOM website of the Genetic
Landscape of the Cell project, allowing data files download.
Expand All @@ -245,18 +202,19 @@ class Costanzo_API:

def __init__(self):
self.home = "http://drygin.ccbr.utoronto.ca/~costanzo2009"
self.raw = "sgadata_costanzo2009_rawdata_101120.txt.gz"
self.raw_matrix = "sgadata_costanzo2009_rawdata_matrix_101120.txt.gz"
self.lenient_cutoff = "sgadata_costanzo2009_lenientCutoff_101120.txt.gz"
self.intermediate_cutoff = "sgadata_costanzo2009_intermediateCutoff_101120.txt.gz"
self.stringent_cutoff = "sgadata_costanzo2009_stringentCutoff_101120.txt.gz"
self.bioprocesses = "bioprocess_annotations_costanzo2009.xls"
self.chemical_genomics = "chemgenomic_data_costanzo2009.xls"
self.query_list = "sgadata_costanzo2009_query_list_101120.txt"
self.array_list = "sgadata_costanzo2009_array_list.txt"
self.data = {"raw": "sgadata_costanzo2009_rawdata_101120.txt.gz",
"raw_matrix": "sgadata_costanzo2009_rawdata_matrix_101120.txt.gz",
"lenient_cutoff": "sgadata_costanzo2009_lenientCutoff_101120.txt.gz",
"intermediate_cutoff": "sgadata_costanzo2009_intermediateCutoff_101120.txt.gz",
"stringent_cutoff": "sgadata_costanzo2009_stringentCutoff_101120.txt.gz",
"bioprocesses": "bioprocess_annotations_costanzo2009.xls",
"chemical_genomics": "chemgenomic_data_costanzo2009.xls",
"query_list": "sgadata_costanzo2009_query_list_101120.txt",
"array_list": "sgadata_costanzo2009_array_list.txt"}

def get_data(self,
data):
data,
output_directory="."):
"""Get files from Costanzo's SOM website.
Args:
Expand All @@ -273,44 +231,11 @@ def get_data(self,
out_file_name (str): name for file to be downloaded. Automatically
same as appropriate Costanzo_API attrib when set to <None>
"""
if data == "raw":
url = "{0}/{1}".format(self.home,
self.raw)
out_file_name = self.raw
elif data == "raw_matrix":
url = "{0}/{1}".format(self.home,
self.raw_matrix)
out_file_name = self.raw_matrix
elif data == "lenient_cutoff":
url = "{0}/{1}".format(self.home,
self.lenient_cutoff)
out_file_name = self.lenient_cutoff
elif data == "intermediate_cutoff":
url = "{0}/{1}".format(self.home,
self.intermediate_cutoff)
out_file_name = self.intermediate_cutoff
elif data == "stringent_cutoff":
url = "{0}/{1}".format(self.home,
self.stringent_cutoff)
out_file_name = self.stringent_cutoff
elif data == "bioprocesses":
url = "{0}/{1}".format(self.home,
self.bioprocesses)
out_file_name = self.bioprocesses
elif data == "chemical_genomics":
url = "{0}/{1}".format(self.home,
self.chemical_genomics)
out_file_name = self.chemical_genomics
elif data == "query_list":
url = "{0}/{1}".format(self.home,
self.query_list)
out_file_name = self.query_list
elif data == "array_list":
url = "{0}/{1}".format(self.home,
self.array_list)
out_file_name = self.array_list
else:
if data not in self.data.keys():
raise ValueError("unknown option for data arg")
url = "{0}/{1}".format(self.home,
self.data[data])
out_file_name = self.data[data]
res = rq.get(url)
with open(out_file_name, "w") as fout:
with open("{}/{}".format(output_directory, out_file_name), "w") as fout:
fout.write(res.content)
46 changes: 23 additions & 23 deletions prowler/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,13 +183,6 @@ def parse_organism_info(self,
strip_prefix=True)
self.X_reference = self._api.org_db_X_ref_df

def profilize(self):
    """
    Add a phylogenetic-profile column to the database.

    Every value of the <self.ORGS> column is turned into a profile string
    built against <self.reference_species> and stored under <self.PROF>.
    """
    def _as_profile_string(organisms):
        # One profile per database row, serialized to its string form.
        return _Profile(organisms, self.reference_species).to_string()

    organisms_column = self.database[self.ORGS]
    self.database[self.PROF] = organisms_column.apply(_as_profile_string)


class SGA1(Columns):
"""
Expand All @@ -200,24 +193,25 @@ class SGA1(Columns):
"""
def __init__(self):
self.names = {'Array_ORF': self.ORF_A,
'Array_gene_name': self.GENE_A,
'Array_SMF': self.SMF_A,
'Array_SMF_standard_deviation': self.SMF_SD_A,
'Query_ORF': self.ORF_Q,
'Query_gene_name': self.GENE_Q,
'Query_SMF': self.SMF_Q,
'Query_SMF_standard_deviation': self.SMF_SD_Q,
'DMF': self.DMF,
'DMF_standard_deviation': self.DMF_SD,
'Genetic_interaction_score': self.GIS,
'Standard_deviation': self.GIS_SD,
'p-value': self.GIS_P}
self.names = (('Query_ORF', self.ORF_Q),
('Query_gene_name', self.GENE_Q),
('Array_ORF', self.ORF_A),
('Array_gene_name', self.GENE_A),
('Genetic_interaction_score', self.GIS),
('Standard_deviation', self.GIS_SD),
('p-value', self.GIS_P),
('Query_SMF', self.SMF_Q),
('Query_SMF_standard_deviation', self.SMF_SD_Q),
('Array_SMF', self.SMF_A),
('Array_SMF_standard_deviation', self.SMF_SD_A),
('DMF', self.DMF),
('DMF_standard_deviation', self.DMF_SD))

def parse(self,
filename,
remove_white_spaces=True,
in_sep=","):
in_sep="\t",
cleanup=True):
"""Return Ortho_Interactions.interact_df (pandas.DataFrame) from
parsed <csv> file. The minimal filtration is based of a given GIS_P
and presence of DMF value. Further filtration results in DMF
Expand All @@ -237,12 +231,18 @@ def parse(self,
with <_> when True (default)
in_sep (str): separator for pandas.read_csv method
"""
self.sga = pd.read_csv(filename, sep=in_sep)
self.sga = pd.read_csv(filename,
sep=in_sep,
names=[k for k, v in self.names],
error_bad_lines=False,
warn_bad_lines=True)
if remove_white_spaces is True:
self.sga.columns = [i.replace(" ", "_") for i in self.sga.columns]
self.sga.rename(columns=self.names, inplace=True)
self.sga.rename(columns=dict(self.names), inplace=True)
self.sga = self.sga.astype({k: v for k, v in self.dtypes.iteritems()
if k in self.sga.columns})
if cleanup:
self.sga = self.sga.dropna().drop_duplicates().reset_index(drop=True)


class SGA2(Columns):
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file not shown.
Binary file added test_data/BioprocessesTests/test_bioproc_100r.xls
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file added test_data/ProfIntTests/ref_merged.pickle
Binary file not shown.
Binary file added test_data/ProfIntTests/test_X_reference.pickle
Binary file not shown.
Binary file added test_data/ProfIntTests/test_kegg_database.pickle
Binary file not shown.
Binary file added test_data/ProfIntTests/test_sga.pickle
Binary file not shown.
Binary file added test_data/SGA1Tests/ref_sga_v1_1000r.pickle
Binary file not shown.
Loading

0 comments on commit d967221

Please sign in to comment.