Skip to content

Commit

Permalink
Add support for PostgreSQL
Browse files Browse the repository at this point in the history
  • Loading branch information
fujirock committed Aug 25, 2015
1 parent 1497943 commit 89bb17e
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 10 deletions.
4 changes: 4 additions & 0 deletions README.md
Expand Up @@ -26,6 +26,10 @@ If you want to use the default RDBMS (oracle), you'll need:
Note that the client code has been tested against Oracle, but is written in a DB-agnostic way to simplify usage in
other scenarios.

If you want to use PostgreSQL, you'll need:

* Psycopg 2.6.1 (the `psycopg2` Python package)

## Python libraries

The complete 'pinned' library dependencies can be found in the following files, for Ubuntu and Mac OS X:
Expand Down
3 changes: 2 additions & 1 deletion schema/sc_data.sql
@@ -1,6 +1,6 @@
-- DDL for defining client-side database schema of the SureChEMBL data feed.
--
-- Designed for Oracle and MySQL. See per-table comments for minor adjustments
-- Designed for Oracle, PostgreSQL and MySQL. See per-table comments for minor adjustments
-- that may be needed for your RDBMS.

/*** Drop statements (for convenience only)
Expand All @@ -21,6 +21,7 @@ DROP SEQUENCE schembl_document_id;

-- MySQL: Add AUTO_INCREMENT to 'id'
-- Oracle: Uncomment the sequence
-- PostgreSQL: Uncomment the sequence

CREATE TABLE schembl_document (
id INTEGER NOT NULL,
Expand Down
Binary file modified src/scripts/__init__.pyc
Binary file not shown.
19 changes: 17 additions & 2 deletions src/scripts/data_loader.py
Expand Up @@ -192,8 +192,12 @@ def load_biblio(self, file_name, preload_ids=False, chunksize=1000):
sql_alc_conn = self.db.connect()
db_api_conn = sql_alc_conn.connection

title_ins = DBBatcher(db_api_conn, 'insert into schembl_document_title (schembl_doc_id, lang, text) values (:1, :2, :3)')
classes_ins = DBBatcher(db_api_conn, 'insert into schembl_document_class (schembl_doc_id, class, system) values (:1, :2, :3)')
if ("cx_oracle" in str(self.db.dialect)):
title_ins = DBBatcher(db_api_conn, 'insert into schembl_document_title (schembl_doc_id, lang, text) values (:1, :2, :3)')
classes_ins = DBBatcher(db_api_conn, 'insert into schembl_document_class (schembl_doc_id, class, system) values (:1, :2, :3)')
else:
title_ins = DBBatcher(db_api_conn, 'insert into schembl_document_title (schembl_doc_id, lang, text) values (%s, %s, %s)')
classes_ins = DBBatcher(db_api_conn, 'insert into schembl_document_class (schembl_doc_id, class, system) values (%s, %s, %s)')


########################################################################
Expand Down Expand Up @@ -509,6 +513,17 @@ def load_chems(self, file_name, update_mappings, chunksize=1000):
chem_struc_ins = DBBatcher(db_api_conn, 'insert into schembl_chemical_structure (schembl_chem_id, smiles, std_inchi, std_inchikey) values (:1, :2, :3, :4)', self.chem_struc_types)
chem_map_del = DBBatcher(db_api_conn, 'delete from schembl_document_chemistry where schembl_doc_id = :1 and schembl_chem_id = :2 and field = :3 and (:4 > -1)')
chem_map_ins = DBBatcher(db_api_conn, 'insert into schembl_document_chemistry (schembl_doc_id, schembl_chem_id, field, frequency) values (:1, :2, :3, :4)')
if ("cx_oracle" in str(self.db.dialect)):
chem_ins = DBBatcher(db_api_conn, 'insert into schembl_chemical (id, mol_weight, logp, med_chem_alert, is_relevant, donor_count, acceptor_count, ring_count, rot_bond_count, corpus_count) values (:1, :2, :3, :4, :5, :6, :7, :8, :9, :10)')
chem_struc_ins = DBBatcher(db_api_conn, 'insert into schembl_chemical_structure (schembl_chem_id, smiles, std_inchi, std_inchikey) values (:1, :2, :3, :4)', self.chem_struc_types)
chem_map_del = DBBatcher(db_api_conn, 'delete from schembl_document_chemistry where schembl_doc_id = :1 and schembl_chem_id = :2 and field = :3 and (:4 > -1)')
chem_map_ins = DBBatcher(db_api_conn, 'insert into schembl_document_chemistry (schembl_doc_id, schembl_chem_id, field, frequency) values (:1, :2, :3, :4)')
else:
chem_ins = DBBatcher(db_api_conn, 'insert into schembl_chemical (id, mol_weight, logp, med_chem_alert, is_relevant, donor_count, acceptor_count, ring_count, rot_bond_count, corpus_count) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
chem_struc_ins = DBBatcher(db_api_conn, 'insert into schembl_chemical_structure (schembl_chem_id, smiles, std_inchi, std_inchikey) values (%s, %s, %s, %s)', self.chem_struc_types)
chem_map_del = DBBatcher(db_api_conn, 'delete from schembl_document_chemistry where schembl_doc_id = %s and schembl_chem_id = %s and field = %s and (%s > -1)')
chem_map_ins = DBBatcher(db_api_conn, 'insert into schembl_document_chemistry (schembl_doc_id, schembl_chem_id, field, frequency) values (%s, %s, %s, %s)')


chunk = []

Expand Down
Binary file modified src/scripts/data_loader.pyc
Binary file not shown.
Binary file modified src/scripts/helper_funcs.pyc
Binary file not shown.
Binary file modified src/scripts/new_file_reader.pyc
Binary file not shown.
31 changes: 24 additions & 7 deletions src/update.py
Expand Up @@ -13,7 +13,14 @@
from scripts.new_file_reader import NewFileReader
from scripts.data_loader import DataLoader
from scripts.helper_funcs import retry
import cx_Oracle
try:
import cx_Oracle
except ImportError:
cx_Oracle = None
try:
import psycopg2
except ImportError:
psycopg2 = None


logging.basicConfig( format='%(asctime)s %(levelname)s %(name)s %(message)s', level=logging.INFO)
Expand All @@ -32,6 +39,7 @@ def main():
parser.add_argument('ftp_pass', metavar='fp', type=str, help='Password for accessing the EBI FTP site')
parser.add_argument('db_user', metavar='du', type=str, help='Username for accessing the target database')
parser.add_argument('db_pass', metavar='dp', type=str, help='Password for accessing the target database')
parser.add_argument('--db_type', metavar='dt', type=str, help='Database type ("oracle" or "postgres")', default="oracle")
parser.add_argument('--db_host', metavar='dh', type=str, help='Host where the database can be found', default="127.0.0.1")
parser.add_argument('--db_port', metavar='do', type=str, help='Port over which the database is accessed', default="1521")
parser.add_argument('--db_name', metavar='dn', type=str, help='Database name (for connection string)', default="XE")
Expand All @@ -56,6 +64,11 @@ def main():

logger.info("Loading data files into DB")

if args.db_type == 'oracle':
db_pkg = cx_Oracle
elif args.db_type == 'postgres':
db_pkg = psycopg2

try:
db = _get_db_engine(args)
loader = DataLoader(db,
Expand All @@ -75,9 +88,9 @@ def main():

logger.info("Processing complete, exiting")

except cx_Oracle.DatabaseError, exc:
# Specialized display handling for Oracle exceptions
logger.error( "Oracle exception detected: {}".format( exc ) )
except db_pkg.DatabaseError, exc:
# Specialized display handling for Database exceptions
logger.error( "Database exception detected: {}".format( exc ) )
raise

def _prepare_files(args):
Expand Down Expand Up @@ -168,16 +181,20 @@ def _get_db_engine(args):
"""
Create a database connection.
Currently, oracle is the only supported connection type. If there are stability issues, try adding
Currently, Oracle and PostgreSQL are the supported connection types. If there are stability issues, try adding
"implicit_returning=False" to the parameter list
:param args: Command line arguments, which must include database connection parameters.
:return: SQL Alchemy database engine object
"""

os.environ["NLS_LANG"] = ".AL32UTF8"

connection_str = "oracle+cx_oracle://{0}:{1}@{2}:{3}/{4}".format(
args.db_user, args.db_pass, args.db_host, args.db_port, args.db_name)
if args.db_type == 'oracle':
connection_str = "oracle+cx_oracle://{0}:{1}@{2}:{3}/{4}".format(
args.db_user, args.db_pass, args.db_host, args.db_port, args.db_name)
elif args.db_type == 'postgres':
connection_str = "postgresql+psycopg2://{0}:{1}@{2}:{3}/{4}".format(
args.db_user, args.db_pass, args.db_host, args.db_port, args.db_name)

logger.info("DB connection string: [{}]".format(connection_str))

Expand Down

0 comments on commit 89bb17e

Please sign in to comment.