## SQL IV - index, join, regular expressions

In [None]:
from sqlite3 import IntegrityError, connect
# Open the SQLite database file used throughout the notebook and obtain a
# cursor; every later cell runs its SQL through `curs` and commits via `conn`.
DB_PATH = 'small_pfam.sqlite'
conn = connect(DB_PATH)
curs = conn.cursor()

See the indices in the database

In [None]:
# List the names of all indexes recorded in the schema table sqlite_master.
# 'index' is a string literal, so it is written with single quotes: in SQL
# double quotes denote identifiers, and SQLite only tolerates "index" as a
# string through a legacy quirk that can be disabled at build time.
sql = "SELECT name FROM sqlite_master WHERE type = 'index';"
curs.execute(sql)
for row in curs:
    print(row)

CREATE INDEX ON statement - creates an index on table column(s)

In [None]:
# Create an index on gene_ontology.category.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE INDEX raises
# "index go_category_idx already exists" if the cell is executed twice or a
# previous run stopped before the DROP INDEX cell.
sql = '''
CREATE INDEX IF NOT EXISTS go_category_idx ON gene_ontology(category);
'''
curs.execute(sql)
conn.commit()

In [None]:
# List the index names again; go_category_idx should now be among them.
# The string literal 'index' uses single quotes (double quotes are for
# identifiers in SQL and are only accepted as strings by SQLite as a quirk).
sql = "SELECT name FROM sqlite_master WHERE type = 'index';"
curs.execute(sql)
for row in curs:
    print(row)

In [None]:
# Select every GO term whose category is 'process' (an equality filter on the
# column that go_category_idx covers).
# The value is a string literal and therefore single-quoted; "process" in
# double quotes would be parsed as an identifier first and only fall back to a
# string through SQLite's legacy double-quoted-string behaviour.
sql = '''SELECT category, term FROM gene_ontology
WHERE category = 'process';'''
curs.execute(sql)
# Last expression of the cell: the notebook displays the list of rows.
curs.fetchall()

DROP INDEX statement - deletes the index by index name

In [None]:
# Remove the index again, addressing it by name.
# IF EXISTS makes the cell idempotent: re-running it (or running it when the
# index was never created) is a no-op instead of a "no such index" error.
sql = '''
DROP INDEX IF EXISTS go_category_idx;
'''
curs.execute(sql)
conn.commit()

In [None]:
# List the index names once more; go_category_idx should be gone.
# The string literal 'index' uses single quotes (double quotes are for
# identifiers in SQL and are only accepted as strings by SQLite as a quirk).
sql = "SELECT name FROM sqlite_master WHERE type = 'index';"
curs.execute(sql)
for row in curs:
    print(row)
    

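Regular expressions in sqlite3 - SQLite does not provide an implementation of the REGEXP operator by default, so a Python function is registered to supply it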

In [None]:
from re import search, IGNORECASE, MULTILINE, DOTALL
def regexp(pattern, strToSearch):
    """Back the SQL ``REGEXP`` operator with Python's ``re.search``.

    Parameters
    ----------
    pattern : str or None
        Regular expression to look for.
    strToSearch : str or None
        Text to search (the column value when called from SQL).

    Returns
    -------
    bool or None
        ``True`` if ``pattern`` matches anywhere in ``strToSearch`` (matching
        is case-insensitive, ``^``/``$`` match at line breaks and ``.`` matches
        newlines), ``False`` if it does not. ``None`` (SQL NULL) when either
        argument is ``None``, so NULL column values propagate as NULL instead
        of raising ``TypeError`` inside ``re.search`` and aborting the query.
    """
    if pattern is None or strToSearch is None:
        return None
    return bool(search(pattern, strToSearch, IGNORECASE | MULTILINE | DOTALL))
# Expose the Python function to SQL under the name REGEXP, taking 2 arguments,
# so the REGEXP operator in the query below is evaluated by regexp().
conn.create_function("REGEXP", 2, regexp)

# NOTE(review): \B is a NON-word-boundary, so `\Bfactor` only matches "factor"
# glued to a preceding word character (e.g. "cofactor"), not the separate word
# "factor". If whole-word "factor" was intended, `\b` is needed - confirm.
sql = r'''
SELECT * FROM gene_ontology
WHERE term REGEXP 'transcription.*?(\Bfactor|initiation)';
'''
# execute() returns the cursor itself, so the result rows can be iterated directly.
for row in curs.execute(sql):
    print(row)


In [None]:
# Same REGEXP search, with the pattern supplied from Python.
# The pattern is passed as a bound parameter (?) instead of being spliced into
# the SQL text with str.format(): a pattern containing a single quote would
# otherwise terminate the SQL string literal and break (or alter) the query.
sql = '''
SELECT * FROM gene_ontology
WHERE term REGEXP ?;
'''
pat = r'phospho[^egf]'
curs.execute(sql, (pat,))
for row in curs:
    print(row)



A complete join query looks like:<br>
 SELECT column1, column2, ...
 <font color='red'>FROM table1 JOIN table2 ON table1.primarykey = table2.foreignkey </font>
<br><br>
Immediately following <font color='red'>JOIN</font> table2 <font color='red'>ON</font> you specify the condition for the join. <br>
Typically, this is the primary key for table1 and the related foreign key in table2.<br><br>
This can be extended for multiple tables:<br>
FROM table1 <font color='red'>JOIN</font>  table2 <font color='red'>ON</font> table1.primarykey = table2.foreignkey <font color='red'>JOIN</font> table3 <font color='red'>ON</font> table2.primarykey = table3.foreignkey <br><br>


In [None]:
# Count the rows produced by joining pfamA to gene_ontology on pfamA_acc.
sql = '''
SELECT count(*)
FROM pfamA
JOIN gene_ontology
ON pfamA.pfamA_acc = gene_ontology.pfamA_acc
'''
# execute() returns the cursor, so it can be iterated in the same statement.
for row in curs.execute(sql):
    print(row)

In [None]:
# Three-table join through the link table pfamA_literature_reference:
# it is matched to pfamA on pfamA_acc and to literature_reference on auto_lit,
# and the result is restricted to families whose type is 'Coiled-coil'.
sql = '''
SELECT pfamA.pfamA_acc, pfamA.type, literature_reference.pmid, literature_reference.author
FROM pfamA_literature_reference
JOIN pfamA ON pfamA_literature_reference.pfamA_acc = pfamA.pfamA_acc
JOIN literature_reference ON pfamA_literature_reference.auto_lit = literature_reference.auto_lit
WHERE pfamA.type = 'Coiled-coil';
'''
for row in curs.execute(sql):
    print(row)

Joins can also be expressed without JOIN ... ON, by listing the tables in FROM and relying on the WHERE clause to perform all of the key comparisons. <br>
The following query is equivalent to the one above <br>
https://pfam.xfam.org/help#tabview=tab12

In [None]:
# Implicit-join form of the three-table query: the tables are listed in FROM
# and the key comparisons are done in WHERE.
# The column is referenced as pfamA.type (a plain identifier). The previous
# pfamA.'type' used a single-quoted *string* where an identifier is required,
# which SQLite accepts only as a compatibility quirk and which was
# inconsistent with the other references to pfamA.type.
sql = '''
SELECT pfamA.pfamA_acc, pfamA.type, literature_reference.pmid, literature_reference.author
FROM pfamA, pfamA_literature_reference, literature_reference
WHERE pfamA.pfamA_acc = pfamA_literature_reference.pfamA_acc
AND
pfamA_literature_reference.auto_lit = literature_reference.auto_lit
AND
pfamA.type = 'Coiled-coil';
'''
curs.execute(sql)
for row in curs:
    print(row)

INNER JOIN

In [None]:
# INNER JOIN (plain JOIN): count only the pfamA / gene_ontology row pairs
# whose pfamA_acc values match.
sql = '''
SELECT count(*)
FROM pfamA
JOIN gene_ontology
ON pfamA.pfamA_acc = gene_ontology.pfamA_acc
'''
result_rows = curs.execute(sql)  # execute() hands back the same cursor
for row in result_rows:
    print(row)

LEFT JOIN

In [None]:
# LEFT JOIN: every pfamA row is kept, including those with no matching
# gene_ontology row, so this count can be compared with the inner-join count.
sql = '''
SELECT count(*)
FROM pfamA
LEFT JOIN gene_ontology
ON pfamA.pfamA_acc = gene_ontology.pfamA_acc
'''
result_rows = curs.execute(sql)  # execute() hands back the same cursor
for row in result_rows:
    print(row)

SELF JOIN

In [None]:
# Self join: gene_ontology is joined to itself under the aliases go1 and go2
# to list pairs of different pfamA_acc values that share the same term.
# Because the filter is `!=`, each pair is reported in both orders.
sql = '''SELECT go1.pfamA_acc, go2.pfamA_acc, 
go1.term, go2.term
FROM gene_ontology go1 JOIN gene_ontology go2 
ON go1.term=go2.term
WHERE go1.pfamA_acc != go2.pfamA_acc
'''
for row in curs.execute(sql):
    print(row)

CARTESIAN PRODUCT

In [None]:
# JOIN without an ON condition: every selected pfamA row is paired with every
# selected gene_ontology row. The WHERE clause only filters each table
# separately; it does not relate the two tables to each other.
sql = '''SELECT pf.pfamA_acc, go.term 
FROM pfamA pf JOIN gene_ontology go
WHERE pf.pfamA_acc IN ('PF00001','PF00002') 
AND go.term LIKE 'cofactor%'
'''
for row in curs.execute(sql):
    print(row)

TRIGGERS - for data integrity

In [None]:
# --- Create the demo table --------------------------------------------------
# Start from a clean slate so the notebook survives "Restart & Run All":
# without this, a second run fails with "table pfamA_extra already exists"
# (the table was never dropped) and the later insert of Extra_id = 1 would hit
# a duplicate key. Dropping the table also removes any trigger attached to it.
# NOTE: this discards whatever rows a previous run left in pfamA_extra.
curs.execute('DROP TABLE IF EXISTS pfamA_extra;')

# Extra_id is declared INTEGER PRIMARY KEY: an untyped PRIMARY KEY column in
# an ordinary SQLite table would still accept NULL keys.
sql = '''
CREATE TABLE pfamA_extra(
Extra_id INTEGER PRIMARY KEY,
pfamA_id text NOT NULL,
Extra_info text NOT NULL
);
'''
curs.execute(sql)
conn.commit()

# --- Create the trigger -----------------------------------------------------
# Before each INSERT into pfamA_extra, abort the statement with the message
# 'Invalid PFAM accession' unless the new pfamA_id matches LIKE 'PFAM%'.
sql = '''
CREATE TRIGGER validate_pfam_accession
BEFORE INSERT ON pfamA_extra
BEGIN
SELECT CASE WHEN NEW.pfamA_id NOT LIKE 'PFAM%' THEN RAISE ( ABORT, 'Invalid PFAM accession' )
END;
END;
'''
curs.execute(sql)
conn.commit()

In [None]:
# This insert is EXPECTED to be rejected: 'test' does not match 'PFAM%', so
# the validate_pfam_accession trigger aborts the statement.
# The error is caught and printed so the demonstration is visible without
# stopping a "Run All" of the notebook at this cell.
sql = '''
INSERT INTO pfamA_extra(Extra_id , pfamA_id , Extra_info)
VALUES ( 1, 'test', 'Test trigger' );
'''
try:
    curs.execute(sql)
except IntegrityError as err:
    # RAISE(ABORT, ...) surfaces in Python as sqlite3.IntegrityError carrying
    # the trigger's message.
    print('Insert rejected by trigger:', err)

In [None]:
# This insert passes the trigger check because 'PFAMtest' matches 'PFAM%'.
sql = '''
INSERT INTO pfamA_extra(Extra_id , pfamA_id , Extra_info)
VALUES ( 1, 'PFAMtest', 'Test trigger' );
'''
curs.execute(sql)
conn.commit()

# An INSERT produces no result rows, so iterating the cursor right after it
# prints nothing. Read the table back to show what was actually stored.
curs.execute('SELECT Extra_id, pfamA_id, Extra_info FROM pfamA_extra;')
for row in curs:
    print(row)

In [None]:
# Remove the validation trigger.
# IF EXISTS makes the cell idempotent: running it twice, or after the trigger
# has already been removed, is a no-op instead of a "no such trigger" error.
sql = '''
DROP TRIGGER IF EXISTS validate_pfam_accession;
'''
curs.execute(sql)
conn.commit()

Join Exercises<br>
• For protein families find all the literature references<br>
– Hint: use tables pfamA, literature_reference and pfamA_literature_reference<br>
• For each protein family list the database links, GO category and GO terms<br>
– Hint: use tables pfamA, gene_ontology and database_links

In [None]:
# Tear down: persist any pending changes, then release the cursor AND the
# connection. Previously the connection was never closed, leaving the database
# file handle open for the lifetime of the kernel.
# To use the database again afterwards, re-run the first cell.
conn.commit()
curs.close()
conn.close()