## SQL exercises
### BIOINF 575 - Fall 2022



The ALTER TABLE command: changing a table without having to remove it and recreate it.   
A table can be ALTERED to:
* RENAME the TABLE
* RENAME a COLUMN
* ADD a COLUMN
* DROP a COLUMN

Detailed diagram of the command available at:   
https://www.sqlite.org/lang_altertable.html

Examples available at:   
https://www.sqlitetutorial.net/sqlite-alter-table/    
https://www.tutorialspoint.com/sqlite/sqlite_alter_command.htm    
https://www.geeksforgeeks.org/how-to-alter-a-sqlite-table-using-python/

___

<b> <font color = "red">Exercise</font></b>

#### Create a database for genes and gene aliases

Create a SQLite database with a genes table, and a gene_aliases table.   
The genes table should have the following columns: gene_id, gene_symbol, gene_description, start, stop.     
The gene_aliases table should have the following columns: alias_id, gene_id, and alias.   
Create an index on the column gene_id in the table gene_aliases.


In [1]:
from sqlite3 import connect


In [2]:
connect?

[0;31mDocstring:[0m
connect(database[, timeout, detect_types, isolation_level,
        check_same_thread, factory, cached_statements, uri])

Opens a connection to the SQLite database file *database*. You can use
":memory:" to open a database connection to a database that resides in
RAM instead of on disk.
[0;31mType:[0m      builtin_function_or_method


In [14]:
database_name = "gene_aliases.sqlite"

# Open the SQLite database file (it is created if it does not exist yet).
connection = connect(database_name)

# SQLite enforces FOREIGN KEY constraints only when this pragma is switched on,
# and the setting is per connection, so it is issued right after connecting.
# The trailing ";" keeps the notebook from displaying the returned cursor.
connection.execute("PRAGMA foreign_keys = ON");

In [15]:
# The cursor is the object used below to execute SQL statements and fetch their results.
cursor = connection.cursor()

In [17]:
# List what the database already contains: sqlite_master has one row
# per schema object (tables, indexes, ...).
select_sqlite_master = """
SELECT type, name
FROM sqlite_master;
"""
cursor.execute(select_sqlite_master).fetchall()

[('table', 'genes'),
 ('table', 'sqlite_sequence'),
 ('table', 'gene_aliases'),
 ('index', 'gene_aliases_gene_id_idx')]

In [18]:
# The genes table should have the following columns:
# gene_id, gene_symbol, gene_description, start, stop.


# Create the genes table.
# IF NOT EXISTS makes this cell safe to re-run when the table is already there.

sql='''
CREATE TABLE IF NOT EXISTS genes (
      gene_id INTEGER PRIMARY KEY AUTOINCREMENT, 
      gene_symbol TEXT NOT NULL,
      gene_description TEXT,
      start BIGINT NOT NULL,
      stop BIGINT NOT NULL);
'''
try:
    cursor.execute(sql)
except connection.DatabaseError:
    # report the problem, undo any pending changes, then re-raise so the error stays visible
    print("Creating the genes table resulted in a database error!")
    connection.rollback()
    raise
else:
    # reached only when execute() raised nothing: make the change permanent
    connection.commit()
finally:
    # runs on success and on failure alike
    print("done!")
    
    


done!


In [21]:
# Same schema listing, this time with the name column first.
select_sqlite_master = """
SELECT name, type
FROM sqlite_master;
"""
cursor.execute(select_sqlite_master).fetchall()

[('genes', 'table'),
 ('sqlite_sequence', 'table'),
 ('gene_aliases', 'table'),
 ('gene_aliases_gene_id_idx', 'index')]

In [22]:
# Schema listing with the type column first.
select_sqlite_master = """
SELECT type, name 
FROM sqlite_master;
"""
cursor.execute(select_sqlite_master).fetchall()

[('table', 'genes'),
 ('table', 'sqlite_sequence'),
 ('table', 'gene_aliases'),
 ('index', 'gene_aliases_gene_id_idx')]

In [23]:
# The gene_aliases table should have the following columns:
# alias_id, gene_id, and alias.


# Create the gene_aliases table; each alias row points at its gene through gene_id.
# IF NOT EXISTS makes this cell safe to re-run when the table is already there.
# NOTE(review): SQLite checks FOREIGN KEY constraints only when
# PRAGMA foreign_keys is ON for the connection - confirm whether that is intended here.

sql='''
CREATE TABLE IF NOT EXISTS gene_aliases (
      alias_id INTEGER PRIMARY KEY AUTOINCREMENT, 
      gene_id INT NOT NULL, -- this should connect to the gene_id in the genes table
      alias TEXT NOT NULL,
      FOREIGN KEY (gene_id) REFERENCES  genes  (gene_id)

);
'''
try:
    cursor.execute(sql)
except connection.DatabaseError:
    # report the problem, undo any pending changes, then re-raise so the error stays visible
    print("Creating the gene_aliases table resulted in a database error!")
    connection.rollback()
    raise
else:
    # reached only when execute() raised nothing: make the change permanent
    connection.commit()
finally:
    # runs on success and on failure alike
    print("done!")
    
    



done!


In [24]:
# Check the schema again after the CREATE TABLE statement.
select_sqlite_master = """
SELECT type, name 
FROM sqlite_master;
"""
cursor.execute(select_sqlite_master).fetchall()

[('table', 'genes'),
 ('table', 'sqlite_sequence'),
 ('table', 'gene_aliases'),
 ('index', 'gene_aliases_gene_id_idx')]

In [26]:
# Create an index on the column gene_id in the table gene_aliases.
# IF NOT EXISTS makes this cell safe to re-run when the index is already there.

create_index_gene_id = """
CREATE INDEX IF NOT EXISTS gene_aliases_gene_id_idx ON gene_aliases (gene_id);
"""
cursor.execute(create_index_gene_id)
# make the new index permanent
connection.commit()

In [27]:
# Check the schema again after the CREATE INDEX statement.
select_sqlite_master = """
SELECT type, name 
FROM sqlite_master;
"""
cursor.execute(select_sqlite_master).fetchall()

[('table', 'genes'),
 ('table', 'sqlite_sequence'),
 ('table', 'gene_aliases'),
 ('index', 'gene_aliases_gene_id_idx')]

___

<b> <font color = "red">Exercise</font></b>

#### Populate a database 

Insert data into tables in the genes and gene aliases database you just created.   
Use the data from the file `genes_aliases.txt`.


In [30]:
data_file_name = "genes_aliases.txt"

# gene_id is not listed: the AUTOINCREMENT primary key generates it.
insert_gene = """
INSERT INTO genes (gene_symbol, gene_description, start, stop)
VALUES (?, ?, ?, ?);
"""

with open(data_file_name) as genes_file:
    # the first line holds the column names, not data
    header_line = genes_file.readline()
    for line in genes_file:
        if not line.strip():
            # a blank line (e.g. at the end of the file) has no fields to read
            continue
        line_lst = line.strip().split("\t")
        # fields 0, 2, 4, 5 fill gene_symbol, gene_description, start, stop
        gene_data = (line_lst[0], line_lst[2], line_lst[4], line_lst[5])
        print(gene_data)
        cursor.execute(insert_gene, gene_data)
        # id that the database generated for the row just inserted
        gene_id = cursor.lastrowid
        print(gene_id)
        

('TERT', 'telomerase reverse transcriptase', '1253167', '1295068')
1
('TP63', 'tumor protein p63', '189596746', '189897276')
2
('CHRNA5', 'cholinergic receptor nicotinic alpha 5 subunit', '78565520', '78595269')
3
('CADM1', 'cell adhesion molecule 1', '115169236', '115504428')
4
('CHRNA3', 'cholinergic receptor nicotinic alpha 3 subunit', '78593052', '78620996')
5
('RACK1', 'receptor for activated C kinase 1', '181236897', '181243906')
6


In [31]:
# Show every row currently in the genes table.
select_genes = """
SELECT * 
FROM genes;
"""
cursor.execute(select_genes).fetchall()

[(1, 'TERT', 'telomerase reverse transcriptase', 1253167, 1295068),
 (2, 'TP63', 'tumor protein p63', 189596746, 189897276),
 (3,
  'CHRNA5',
  'cholinergic receptor nicotinic alpha 5 subunit',
  78565520,
  78595269),
 (4, 'CADM1', 'cell adhesion molecule 1', 115169236, 115504428),
 (5,
  'CHRNA3',
  'cholinergic receptor nicotinic alpha 3 subunit',
  78593052,
  78620996),
 (6, 'RACK1', 'receptor for activated C kinase 1', 181236897, 181243906)]

In [32]:
# Show every row currently in the gene_aliases table.
select_gene_aliases = """
SELECT * 
FROM gene_aliases;
"""
cursor.execute(select_gene_aliases).fetchall()

[]

In [33]:
# When all goes well and every record you want has been added, call connection.commit().
# Here we also want to add the gene aliases before committing,
# so the uncommitted gene inserts are discarded with connection.rollback().
connection.rollback()

In [36]:
# The rollback returned the database to its state at the last commit,
# so the genes table holds no rows any more.

select_genes = """
SELECT * 
FROM genes;
"""
cursor.execute(select_genes).fetchall()

[]

In [44]:
# Insert every gene together with its aliases (OPTION 1: the database generates the ids).

data_file_name = "genes_aliases.txt"

# gene_id is not listed: the AUTOINCREMENT primary key generates it.
insert_gene = """
INSERT INTO genes (gene_symbol, gene_description, start, stop)
VALUES (?, ?, ?, ?);
"""

# alias_id is not listed: the AUTOINCREMENT primary key generates it.
insert_gene_alias = """
INSERT INTO gene_aliases (gene_id, alias)
VALUES (?, ?);
"""

with open(data_file_name) as genes_file:
    # the first line holds the column names, not data
    header_line = genes_file.readline()
    for line in genes_file:
        if not line.strip():
            # a blank line (e.g. at the end of the file) has no fields to read
            continue
        line_lst = line.strip().split("\t")
        # fields 0, 2, 4, 5 fill gene_symbol, gene_description, start, stop
        gene_data = (line_lst[0], line_lst[2], line_lst[4], line_lst[5])
        print(gene_data)
        cursor.execute(insert_gene, gene_data)
        # id generated for the gene just inserted; its alias rows refer to it
        gene_id = cursor.lastrowid
        print(gene_id)
        # field 1 holds the comma separated aliases of the gene
        aliases_lst = line_lst[1].split(",")
        for alias in aliases_lst:
            alias = alias.strip()
            if not alias:
                # an empty alias field would otherwise be stored as the alias ''
                continue
            alias_data = (gene_id, alias)
            cursor.execute(insert_gene_alias, alias_data)
        print()
        

('TERT', 'telomerase reverse transcriptase', '1253167', '1295068')
1

('TP63', 'tumor protein p63', '189596746', '189897276')
2

('CHRNA5', 'cholinergic receptor nicotinic alpha 5 subunit', '78565520', '78595269')
3

('CADM1', 'cell adhesion molecule 1', '115169236', '115504428')
4

('CHRNA3', 'cholinergic receptor nicotinic alpha 3 subunit', '78593052', '78620996')
5

('RACK1', 'receptor for activated C kinase 1', '181236897', '181243906')
6



In [45]:
# Show the genes that were just inserted.
select_genes = """
SELECT * 
FROM genes;
"""
cursor.execute(select_genes).fetchall()

[(1, 'TERT', 'telomerase reverse transcriptase', 1253167, 1295068),
 (2, 'TP63', 'tumor protein p63', 189596746, 189897276),
 (3,
  'CHRNA5',
  'cholinergic receptor nicotinic alpha 5 subunit',
  78565520,
  78595269),
 (4, 'CADM1', 'cell adhesion molecule 1', 115169236, 115504428),
 (5,
  'CHRNA3',
  'cholinergic receptor nicotinic alpha 3 subunit',
  78593052,
  78620996),
 (6, 'RACK1', 'receptor for activated C kinase 1', 181236897, 181243906)]

In [46]:
# Show the aliases that were just inserted (alias_id, gene_id, alias).
select_gene_aliases = """
SELECT * 
FROM gene_aliases;
"""
cursor.execute(select_gene_aliases).fetchall()

[(1, 1, 'CMM9'),
 (2, 1, 'DKCA2'),
 (3, 1, 'DKCB4'),
 (4, 1, 'EST2'),
 (5, 1, 'PFBMFT1'),
 (6, 1, 'TCS1'),
 (7, 1, 'TP2'),
 (8, 1, 'TRT'),
 (9, 1, 'hEST2'),
 (10, 1, 'hTRT'),
 (11, 2, 'AIS'),
 (12, 2, 'B(p51A)'),
 (13, 2, 'B(p51B)'),
 (14, 2, 'EEC3'),
 (15, 2, 'KET'),
 (16, 2, 'LMS'),
 (17, 2, 'NBP'),
 (18, 2, 'OFC8'),
 (19, 2, 'RHS'),
 (20, 2, 'SHFM4'),
 (21, 2, 'TP53CP'),
 (22, 2, 'TP53L'),
 (23, 2, 'TP73L'),
 (24, 2, 'p40'),
 (25, 2, 'p51'),
 (26, 2, 'p53CP'),
 (27, 2, 'p63'),
 (28, 2, 'p73H'),
 (29, 2, 'p73L'),
 (30, 3, 'LNCR2'),
 (31, 4, 'BL2'),
 (32, 4, 'IGSF4'),
 (33, 4, 'IGSF4A'),
 (34, 4, 'NECL2'),
 (35, 4, 'Necl-2'),
 (36, 4, 'RA175'),
 (37, 4, 'ST17'),
 (38, 4, 'SYNCAM'),
 (39, 4, 'TSLC1'),
 (40, 4, 'sTSLC-1'),
 (41, 4, 'sgIGSF'),
 (42, 4, 'synCAM1'),
 (43, 5, 'BAIPRCK'),
 (44, 5, 'LNCR2'),
 (45, 5, 'NACHRA3'),
 (46, 5, 'PAOD2'),
 (47, 6, 'GNB2L1'),
 (48, 6, 'Gnb2-rs1'),
 (49, 6, 'H12.3'),
 (50, 6, 'HLC-7'),
 (51, 6, 'PIG21')]

In [47]:
# sqlite_sequence is the table where SQLite tracks the AUTOINCREMENT counters.
select_sequence = """
SELECT *
FROM sqlite_sequence;
"""

cursor.execute(select_sequence).fetchall()

[('genes', 6), ('gene_aliases', 51)]

In [48]:
# If the inserts were final, this is where connection.commit() would go.
# Committing after every single insert is expensive and makes the code slower,
# so commit once, after all the inserts.

# We want to try a different option to create the ids for genes and aliases,
# so the uncommitted inserts are discarded with a rollback instead.

connection.rollback()

In [49]:
# Look at the genes table after the rollback.
select_genes = """
SELECT * 
FROM genes;
"""
cursor.execute(select_genes).fetchall()

[]

In [50]:
# Look at the gene_aliases table after the rollback.
select_gene_aliases = """
SELECT * 
FROM gene_aliases;
"""
cursor.execute(select_gene_aliases).fetchall()

[]

In [51]:
# Look at the AUTOINCREMENT counters after the rollback.
select_sequence = """
SELECT *
FROM sqlite_sequence;
"""

cursor.execute(select_sequence).fetchall()

[]

In [52]:
# OPTION 2: the code creates the ids itself instead of relying on AUTOINCREMENT.

data_file_name = "genes_aliases.txt"

insert_gene = """
INSERT INTO genes (gene_id, gene_symbol, gene_description, start, stop)
VALUES (?, ?, ?, ?, ?);
"""

insert_gene_alias = """
INSERT INTO gene_aliases (alias_id, gene_id, alias)
VALUES (?, ?, ?);
"""

# running counters: each is incremented right before it is used, so ids start at 1
gene_id = 0
alias_id = 0

with open(data_file_name) as genes_file:
    # the first line holds the column names, not data
    header_line = genes_file.readline()
    for line in genes_file:
        if not line.strip():
            # a blank line (e.g. at the end of the file) has no fields to read
            # and must not use up a gene id
            continue
        line_lst = line.strip().split("\t")
        gene_id = gene_id + 1
        print(gene_id)
        # fields 0, 2, 4, 5 fill gene_symbol, gene_description, start, stop
        gene_data = (gene_id, line_lst[0], line_lst[2], line_lst[4], line_lst[5])
        print(gene_data)
        cursor.execute(insert_gene, gene_data)

        # field 1 holds the comma separated aliases of the gene
        aliases_lst = line_lst[1].split(",")
        for alias in aliases_lst:
            alias = alias.strip()
            if not alias:
                # an empty alias field would otherwise be stored as the alias ''
                continue
            alias_id = alias_id + 1
            alias_data = (alias_id, gene_id, alias)
            cursor.execute(insert_gene_alias, alias_data)
        print()
        

1
(1, 'TERT', 'telomerase reverse transcriptase', '1253167', '1295068')

2
(2, 'TP63', 'tumor protein p63', '189596746', '189897276')

3
(3, 'CHRNA5', 'cholinergic receptor nicotinic alpha 5 subunit', '78565520', '78595269')

4
(4, 'CADM1', 'cell adhesion molecule 1', '115169236', '115504428')

5
(5, 'CHRNA3', 'cholinergic receptor nicotinic alpha 3 subunit', '78593052', '78620996')

6
(6, 'RACK1', 'receptor for activated C kinase 1', '181236897', '181243906')



In [53]:
# Show the genes inserted with the ids created in the code.
select_genes = """
SELECT * 
FROM genes;
"""
cursor.execute(select_genes).fetchall()

[(1, 'TERT', 'telomerase reverse transcriptase', 1253167, 1295068),
 (2, 'TP63', 'tumor protein p63', 189596746, 189897276),
 (3,
  'CHRNA5',
  'cholinergic receptor nicotinic alpha 5 subunit',
  78565520,
  78595269),
 (4, 'CADM1', 'cell adhesion molecule 1', 115169236, 115504428),
 (5,
  'CHRNA3',
  'cholinergic receptor nicotinic alpha 3 subunit',
  78593052,
  78620996),
 (6, 'RACK1', 'receptor for activated C kinase 1', 181236897, 181243906)]

In [54]:
# Show the aliases inserted with the ids created in the code.
select_gene_aliases = """
SELECT * 
FROM gene_aliases;
"""
cursor.execute(select_gene_aliases).fetchall()

[(1, 1, 'CMM9'),
 (2, 1, 'DKCA2'),
 (3, 1, 'DKCB4'),
 (4, 1, 'EST2'),
 (5, 1, 'PFBMFT1'),
 (6, 1, 'TCS1'),
 (7, 1, 'TP2'),
 (8, 1, 'TRT'),
 (9, 1, 'hEST2'),
 (10, 1, 'hTRT'),
 (11, 2, 'AIS'),
 (12, 2, 'B(p51A)'),
 (13, 2, 'B(p51B)'),
 (14, 2, 'EEC3'),
 (15, 2, 'KET'),
 (16, 2, 'LMS'),
 (17, 2, 'NBP'),
 (18, 2, 'OFC8'),
 (19, 2, 'RHS'),
 (20, 2, 'SHFM4'),
 (21, 2, 'TP53CP'),
 (22, 2, 'TP53L'),
 (23, 2, 'TP73L'),
 (24, 2, 'p40'),
 (25, 2, 'p51'),
 (26, 2, 'p53CP'),
 (27, 2, 'p63'),
 (28, 2, 'p73H'),
 (29, 2, 'p73L'),
 (30, 3, 'LNCR2'),
 (31, 4, 'BL2'),
 (32, 4, 'IGSF4'),
 (33, 4, 'IGSF4A'),
 (34, 4, 'NECL2'),
 (35, 4, 'Necl-2'),
 (36, 4, 'RA175'),
 (37, 4, 'ST17'),
 (38, 4, 'SYNCAM'),
 (39, 4, 'TSLC1'),
 (40, 4, 'sTSLC-1'),
 (41, 4, 'sgIGSF'),
 (42, 4, 'synCAM1'),
 (43, 5, 'BAIPRCK'),
 (44, 5, 'LNCR2'),
 (45, 5, 'NACHRA3'),
 (46, 5, 'PAOD2'),
 (47, 6, 'GNB2L1'),
 (48, 6, 'Gnb2-rs1'),
 (49, 6, 'H12.3'),
 (50, 6, 'HLC-7'),
 (51, 6, 'PIG21')]

In [55]:
# Look at the AUTOINCREMENT counters after inserting rows with explicit ids.
select_sequence = """
SELECT *
FROM sqlite_sequence;
"""

cursor.execute(select_sequence).fetchall()

[('genes', 6), ('gene_aliases', 51)]

In [56]:
# All inserts are done, so commit once to make them permanent.
connection.commit()

___

<b> <font color = "red">Exercise</font></b>

#### Query a database 

How many genes do we have?  
How many aliases do we have?  
Retrieve the aliases for gene TP63. In the result, display the gene symbol and the alias.    
How many aliases are associated with each gene? In the result, display the gene symbol and the count.


In [57]:
def get_header(cursor):
    '''
    Makes a tab delimited header row from the cursor description.
    Arguments:
        cursor: a cursor after a select query
    Returns:
        string: A string consisting of the column names separated by tabs, no new line.
                An empty string when the cursor has no result set
                (no query executed yet, or the last statement returned no rows).
    '''
    # description is None when there is no result set; treat that as "no columns"
    columns = cursor.description or ()
    # each entry describes one column; its first item is the column name
    return '\t'.join(column[0] for column in columns)



In [58]:
# note that if you have a large result 
# this function will try to make a very large string from it
# so it is recommended for results with less than 10 rows and 10 columns
# for other cases use the for loop to go through the rows in the result 

def get_results(cursor):
    '''
    Makes a tab delimited table from the cursor results.
    Arguments:
        cursor: a cursor after a select query
    Returns:
        string: One line per row that is still unfetched on the cursor.
                Values are converted with str (so NULL becomes "None") and
                separated by tabs; rows are separated by new lines, with no
                new line at the end. An empty string when no rows are left.
    '''
    # fetchall() consumes the cursor: a second call on the same result gives ""
    return "\n".join('\t'.join(map(str, row)) for row in cursor.fetchall())

In [59]:
# Print the whole genes table as tab delimited text: header row first, then the rows.
select_genes = """
SELECT * 
FROM genes;
"""
cursor.execute(select_genes)
print(get_header(cursor), get_results(cursor), sep="\n")

gene_id	gene_symbol	gene_description	start	stop
1	TERT	telomerase reverse transcriptase	1253167	1295068
2	TP63	tumor protein p63	189596746	189897276
3	CHRNA5	cholinergic receptor nicotinic alpha 5 subunit	78565520	78595269
4	CADM1	cell adhesion molecule 1	115169236	115504428
5	CHRNA3	cholinergic receptor nicotinic alpha 3 subunit	78593052	78620996
6	RACK1	receptor for activated C kinase 1	181236897	181243906


In [61]:
# Q1: How many genes do we have?
# count() the primary key: one value per row in the genes table.

select_genes_no = """
SELECT count(gene_id) genes_count
FROM genes;

"""
cursor.execute(select_genes_no)
print(get_header(cursor), get_results(cursor), sep="\n")

genes_count
6


In [62]:
# Alternative for Q1 using the largest id instead of counting rows.
# Caveat: max(gene_id) equals the number of genes only while the ids run
# 1, 2, 3, ... without gaps; count(gene_id) is the reliable way to count.
select_genes_no = """
SELECT max(gene_id) genes_count
FROM genes;

"""
cursor.execute(select_genes_no)
print(get_header(cursor), get_results(cursor), sep="\n")

genes_count
6


In [64]:
# Q2: How many aliases do we have?
# count() the primary key: one value per row in the gene_aliases table.

select_aliases_no = """
SELECT count(alias_id) aliases_count
FROM gene_aliases;

"""
cursor.execute(select_aliases_no)
print(get_header(cursor), get_results(cursor), sep="\n")


aliases_count
51


In [69]:
# Q3: Retrieve the aliases for gene TP63.
# In the result, display the gene symbol and the alias.

# This solution is hardcoded: both the symbol and the gene id are written
# into the query, so it does not work for another gene without editing the code.
# We also had to run the earlier query that retrieves all genes
# to visually identify the id of TP63.


select_TP63_aliases = """
SELECT "TP63" symbol, alias 
FROM gene_aliases 
WHERE gene_id = 2;
"""
cursor.execute(select_TP63_aliases)
print(get_header(cursor), get_results(cursor), sep="\n")

symbol	alias
TP63	AIS
TP63	B(p51A)
TP63	B(p51B)
TP63	EEC3
TP63	KET
TP63	LMS
TP63	NBP
TP63	OFC8
TP63	RHS
TP63	SHFM4
TP63	TP53CP
TP63	TP53L
TP63	TP73L
TP63	p40
TP63	p51
TP63	p53CP
TP63	p63
TP63	p73H
TP63	p73L


In [73]:
# A more general solution: the gene symbol is a query parameter (the ? placeholder),
# and the join looks up the gene id, so any gene works without editing the SQL.

gene_symbol = "CADM1"

select_aliases_for_a_gene = """
SELECT gene_symbol, alias 
FROM gene_aliases 
    INNER JOIN genes ON genes.gene_id = gene_aliases.gene_id
WHERE gene_symbol = ?;
"""
# the parameters are passed as a sequence, one item per placeholder
cursor.execute(select_aliases_for_a_gene, (gene_symbol,))
print(get_header(cursor), get_results(cursor), sep="\n")

gene_symbol	alias
CADM1	BL2
CADM1	IGSF4
CADM1	IGSF4A
CADM1	NECL2
CADM1	Necl-2
CADM1	RA175
CADM1	ST17
CADM1	SYNCAM
CADM1	TSLC1
CADM1	sTSLC-1
CADM1	sgIGSF
CADM1	synCAM1


In [76]:
# Q4: How many aliases are associated with each gene?
# In the result, display the gene symbol and the count.

# Start from genes and LEFT JOIN the aliases so that a gene without any alias
# still appears in the result; count(alias) ignores the NULL of the unmatched
# row and reports 0 for it. An INNER JOIN would silently drop such genes.
select_alias_no_per_gene_symbol = """
SELECT gene_symbol, count(alias)
FROM genes
    LEFT JOIN gene_aliases ON gene_aliases.gene_id = genes.gene_id
GROUP BY gene_symbol;

"""
cursor.execute(select_alias_no_per_gene_symbol)
print(get_header(cursor))
print(get_results(cursor))


gene_symbol	count(alias)
CADM1	12
CHRNA3	4
CHRNA5	1
RACK1	5
TERT	10
TP63	19


In [82]:
# Q4 again, grouped by gene id as well as symbol: two genes that happened to
# share a symbol would be counted separately instead of being merged.
# Start from genes and LEFT JOIN the aliases so that a gene without any alias
# still appears in the result with a count of 0.
select_alias_no_per_gene_symbol = """
SELECT genes.gene_id, gene_symbol, count(alias)
FROM genes
    LEFT JOIN gene_aliases ON gene_aliases.gene_id = genes.gene_id
GROUP BY genes.gene_id, gene_symbol;

"""
cursor.execute(select_alias_no_per_gene_symbol)
print(get_header(cursor))
print(get_results(cursor))

gene_id	gene_symbol	count(alias)
1	TERT	10
2	TP63	19
3	CHRNA5	1
4	CADM1	12
5	CHRNA3	4
6	RACK1	5


In [83]:
# Done with this database: close the cursor first, then the connection it belongs to.

cursor.close()
connection.close()