## SQL exercises
### BIOINF 575 - Fall 2021



The ALTER TABLE command: changing a table without having to remove it and recreate it.   
A table can be ALTERED to:
* RENAME the TABLE
* RENAME a COLUMN
* ADD a COLUMN
* DROP a COLUMN

Detailed diagram of the command available at:   
https://www.sqlite.org/lang_altertable.html

Examples available at:   
https://www.sqlitetutorial.net/sqlite-alter-table/    
https://www.tutorialspoint.com/sqlite/sqlite_alter_command.htm    
https://www.geeksforgeeks.org/how-to-alter-a-sqlite-table-using-python/

___

<b> <font color = "red">Exercise</font></b>

#### Create a database for genes and gene aliases

Create a SQLite database with a genes table, and a gene_aliases table.   
The genes table should have the following columns: gene_id, gene_symbol, gene_description, start, stop.     
The gene_aliases table should have the following columns: alias_id, gene_id, alias.   
Create an index on the column gene_id in the table gene_aliases.


In [2]:
from sqlite3 import connect
# Open the SQLite database file and create the cursor that every later cell
# uses to run statements.
database_path = "genes_information.sqlite"
connection = connect(database_path)
cursor = connection.cursor()


In [3]:
# DDL for the genes table. IF NOT EXISTS turns the statement into a no-op when
# the table is already there, so this cell can be re-run safely.
sql='''
CREATE TABLE IF NOT EXISTS genes (
      gene_id INTEGER PRIMARY KEY AUTOINCREMENT,
      gene_symbol TEXT NOT NULL,
      gene_description TEXT NOT NULL,
      start BIGINT NOT NULL,                     
      stop BIGINT NOT NULL
    );
'''
try:
    cursor.execute(sql)
except connection.DatabaseError:
    # Undo anything pending, then re-raise so the failure stays visible.
    print("Creating the genes table resulted in a database error!")
    connection.rollback()
    raise
else:
    connection.commit()
    # Report success only on this path: a `finally` clause would also print
    # "done!" right after the error message when the statement fails.
    print("done!")

done!


In [4]:
# List the name and type of every object recorded in sqlite_master;
# this query string is reused by later cells.
select_master = "SELECT name, type FROM sqlite_master;"
cursor.execute(select_master).fetchall()

[('genes', 'table'), ('sqlite_sequence', 'table')]

In [5]:
# The gene_aliases table should have the following columns: alias_id, gene_id and the alias.
# Each alias row points back to its gene through gene_id.
# NOTE(review): SQLite only enforces FOREIGN KEY clauses when
# `PRAGMA foreign_keys = ON` has been issued on the connection; this notebook
# never issues it - confirm whether enforcement is wanted.

create_gene_aliases='''
CREATE TABLE IF NOT EXISTS gene_aliases (
      alias_id INTEGER PRIMARY KEY AUTOINCREMENT,
      gene_id INTEGER NOT NULL,                             -- REFERENCES  gene_id in the genes table
      alias TEXT NOT NULL, 
      FOREIGN KEY (gene_id) REFERENCES  genes  (gene_id)
    );
'''
try:
    cursor.execute(create_gene_aliases)
except connection.DatabaseError:
    # Undo anything pending, then re-raise so the failure stays visible.
    print("Creating the gene_aliases table resulted in a database error!")
    connection.rollback()
    raise
else:
    connection.commit()
    # Report success only on this path: a `finally` clause would also print
    # "done!" right after the error message when the statement fails.
    print("done!")

done!


In [6]:
# select_master was defined in an earlier cell
# ("SELECT name, type FROM sqlite_master;"); re-run it to confirm that
# gene_aliases now shows up next to genes.
cursor.execute(select_master).fetchall()

[('genes', 'table'), ('sqlite_sequence', 'table'), ('gene_aliases', 'table')]

In [7]:
# Index gene_aliases.gene_id, the column the later queries use to join aliases
# back to genes. IF NOT EXISTS keeps the cell re-runnable, matching the
# CREATE TABLE IF NOT EXISTS cells: a plain CREATE INDEX raises an error when
# the index already exists in the database file.
create_index_ga_geneid = '''
CREATE INDEX IF NOT EXISTS idx_gene_aliases_gene_id
ON gene_aliases (gene_id)
'''
cursor.execute(create_index_ga_geneid)
connection.commit()

In [8]:
# The schema listing should now include the new index.
cursor.execute(select_master).fetchall()

[('genes', 'table'),
 ('sqlite_sequence', 'table'),
 ('gene_aliases', 'table'),
 ('idx_gene_aliases_gene_id', 'index')]

___

<b> <font color = "red">Exercise</font></b>

#### Populate a database 

Insert data into tables in the genes and gene aliases database you just created.   
Use the data from the file `genes_aliases.txt`.


In [24]:
# Trial run of the loader: processes only the first data line of the file
# (see the `break` at the bottom) and commits nothing.

# Parameterised INSERT for one gene; gene_id is not listed, so the
# AUTOINCREMENT primary key assigns it.
insert_genes = """
INSERT INTO genes (gene_symbol, gene_description, start, stop) 
VALUES (?,?,?,?);
"""

# Parameterised INSERT for one alias of a gene; alias_id is assigned automatically.
insert_gene_aliases = """
INSERT INTO gene_aliases (gene_id, alias) 
VALUES (?,?);
"""

with open("genes_aliases.txt") as genes_file:
    # Consume the first line so the loop below starts at the second line.
    headerline = genes_file.readline().strip()
    # print(headerline)
    for line in genes_file:
        # Tab-separated fields; indices 0, 1, 2, 4 and 5 are used, index 3 is not loaded.
        line_list = line.strip().split("\t")
        gene_symbol = line_list[0]
        gene_aliases = line_list[1]
        gene_description = line_list[2]
        gene_start = int(line_list[4])
        gene_stop = int(line_list[5])
        gene_data = (gene_symbol, gene_description, gene_start, gene_stop)
        cursor.execute(insert_genes, gene_data)
        #print(gene_aliases)
        # Placeholder: every alias is attached to gene_id 1, whatever id the
        # gene inserted just above actually received.
        gene_id = 1 # this needs to be updated
        # The aliases field is a comma-separated list; one row per alias.
        for alias in gene_aliases.split(","):
            alias_data = (gene_id, alias.strip())
            cursor.execute(insert_gene_aliases, alias_data)

        # Stop after the first data line - this cell is only a smoke test.
        break

In [25]:
# Query reused by later cells to show the whole genes table.
select_genes = "SELECT * FROM genes;"
cursor.execute(select_genes).fetchall()

[(1, 'TERT', 'telomerase reverse transcriptase', 1253167, 1295068)]

In [26]:
# Query reused by later cells to show the whole gene_aliases table.
select_gene_aliases = "SELECT * FROM gene_aliases;"
cursor.execute(select_gene_aliases).fetchall()

[(1, 1, 'CMM9'),
 (2, 1, 'DKCA2'),
 (3, 1, 'DKCB4'),
 (4, 1, 'EST2'),
 (5, 1, 'PFBMFT1'),
 (6, 1, 'TCS1'),
 (7, 1, 'TP2'),
 (8, 1, 'TRT'),
 (9, 1, 'hEST2'),
 (10, 1, 'hTRT')]

In [27]:
# Discard the uncommitted rows inserted by the one-gene trial run above.
connection.rollback()

In [28]:
# After the rollback the genes table should be empty again.
cursor.execute(select_genes).fetchall()

[]

In [29]:
# ...and so should the gene_aliases table.
cursor.execute(select_gene_aliases).fetchall()

[]

In [30]:
# Full load of the file (the `break` is commented out), still using the
# placeholder gene_id: every alias ends up attached to gene_id 1. A later cell
# repeats the load with cursor.lastrowid instead.

# Parameterised INSERT for one gene; gene_id is not listed, so the
# AUTOINCREMENT primary key assigns it.
insert_genes = """
INSERT INTO genes (gene_symbol, gene_description, start, stop) 
VALUES (?,?,?,?);
"""

# Parameterised INSERT for one alias of a gene; alias_id is assigned automatically.
insert_gene_aliases = """
INSERT INTO gene_aliases (gene_id, alias) 
VALUES (?,?);
"""

with open("genes_aliases.txt") as genes_file:
    # Consume the first line so the loop below starts at the second line.
    headerline = genes_file.readline().strip()
    # print(headerline)
    for line in genes_file:
        # Tab-separated fields; indices 0, 1, 2, 4 and 5 are used, index 3 is not loaded.
        line_list = line.strip().split("\t")
        gene_symbol = line_list[0]
        gene_aliases = line_list[1]
        gene_description = line_list[2]
        gene_start = int(line_list[4])
        gene_stop = int(line_list[5])
        gene_data = (gene_symbol, gene_description, gene_start, gene_stop)
        cursor.execute(insert_genes, gene_data)
        #print(gene_aliases)
        # Placeholder: the id is hard-coded, so aliases of every gene are
        # stored under gene_id 1.
        gene_id = 1 # this needs to be updated
        # The aliases field is a comma-separated list; one row per alias.
        for alias in gene_aliases.split(","):
            alias_data = (gene_id, alias.strip())
            cursor.execute(insert_gene_aliases, alias_data)

        # break

In [31]:
# Inspect the genes table after loading the whole file.
cursor.execute(select_genes).fetchall()

[(1, 'TERT', 'telomerase reverse transcriptase', 1253167, 1295068),
 (2, 'TP63', 'tumor protein p63', 189596746, 189897276),
 (3,
  'CHRNA5',
  'cholinergic receptor nicotinic alpha 5 subunit',
  78565520,
  78595269),
 (4, 'CADM1', 'cell adhesion molecule 1', 115169236, 115504428),
 (5,
  'CHRNA3',
  'cholinergic receptor nicotinic alpha 3 subunit',
  78593052,
  78620996),
 (6, 'RACK1', 'receptor for activated C kinase 1', 181236897, 181243906)]

In [32]:
# Inspect the aliases; the second column shows which gene_id each alias was stored under.
cursor.execute(select_gene_aliases).fetchall()

[(1, 1, 'CMM9'),
 (2, 1, 'DKCA2'),
 (3, 1, 'DKCB4'),
 (4, 1, 'EST2'),
 (5, 1, 'PFBMFT1'),
 (6, 1, 'TCS1'),
 (7, 1, 'TP2'),
 (8, 1, 'TRT'),
 (9, 1, 'hEST2'),
 (10, 1, 'hTRT'),
 (11, 1, 'AIS'),
 (12, 1, 'B(p51A)'),
 (13, 1, 'B(p51B)'),
 (14, 1, 'EEC3'),
 (15, 1, 'KET'),
 (16, 1, 'LMS'),
 (17, 1, 'NBP'),
 (18, 1, 'OFC8'),
 (19, 1, 'RHS'),
 (20, 1, 'SHFM4'),
 (21, 1, 'TP53CP'),
 (22, 1, 'TP53L'),
 (23, 1, 'TP73L'),
 (24, 1, 'p40'),
 (25, 1, 'p51'),
 (26, 1, 'p53CP'),
 (27, 1, 'p63'),
 (28, 1, 'p73H'),
 (29, 1, 'p73L'),
 (30, 1, 'LNCR2'),
 (31, 1, 'BL2'),
 (32, 1, 'IGSF4'),
 (33, 1, 'IGSF4A'),
 (34, 1, 'NECL2'),
 (35, 1, 'Necl-2'),
 (36, 1, 'RA175'),
 (37, 1, 'ST17'),
 (38, 1, 'SYNCAM'),
 (39, 1, 'TSLC1'),
 (40, 1, 'sTSLC-1'),
 (41, 1, 'sgIGSF'),
 (42, 1, 'synCAM1'),
 (43, 1, 'BAIPRCK'),
 (44, 1, 'LNCR2'),
 (45, 1, 'NACHRA3'),
 (46, 1, 'PAOD2'),
 (47, 1, 'GNB2L1'),
 (48, 1, 'Gnb2-rs1'),
 (49, 1, 'H12.3'),
 (50, 1, 'HLC-7'),
 (51, 1, 'PIG21')]

In [33]:
# Make the rows inserted by the full load permanent.
connection.commit()

In [34]:
# Remove every row from gene_aliases (no WHERE clause); the listing above shows
# they were all stored under gene_id 1. Takes effect permanently only once committed.
delete_gene_aliases = "DELETE FROM gene_aliases;"
cursor.execute(delete_gene_aliases)

<sqlite3.Cursor at 0x10aeb7420>

In [35]:
# Make the deletion of the alias rows permanent.
connection.commit()

In [37]:
# Empty the genes table as well (no WHERE clause) and commit, so the load can
# be repeated from scratch.
delete_genes = "DELETE FROM genes;"
cursor.execute(delete_genes)
connection.commit()

In [38]:
# Confirm that the genes table is empty.
cursor.execute(select_genes).fetchall()

[]

In [39]:
# Confirm that the gene_aliases table is empty.
cursor.execute(select_gene_aliases).fetchall()

[]

In [40]:
# Final loader: inserts every gene from genes_aliases.txt and attaches each
# alias to the id that the database assigned to its gene.
# Nothing is committed here; a later cell commits after the rows are inspected.

# Parameterised INSERT for one gene; gene_id is not listed, so the
# AUTOINCREMENT primary key assigns it.
insert_genes = """
INSERT INTO genes (gene_symbol, gene_description, start, stop) 
VALUES (?,?,?,?);
"""

# Parameterised INSERT for one alias of a gene; alias_id is assigned automatically.
insert_gene_aliases = """
INSERT INTO gene_aliases (gene_id, alias) 
VALUES (?,?);
"""

with open("genes_aliases.txt") as genes_file:
    # Consume the first line so the loop below starts at the second line.
    headerline = genes_file.readline().strip()
    try:
        for line in genes_file:
            # A blank line (e.g. at the end of the file) has no fields to index.
            if not line.strip():
                continue
            # Tab-separated fields; indices 0, 1, 2, 4 and 5 are used, index 3 is not loaded.
            line_list = line.strip().split("\t")
            gene_symbol = line_list[0]
            gene_aliases = line_list[1]
            gene_description = line_list[2]
            gene_start = int(line_list[4])
            gene_stop = int(line_list[5])
            gene_data = (gene_symbol, gene_description, gene_start, gene_stop)
            cursor.execute(insert_genes, gene_data)
            # Row id generated by the INSERT just executed, i.e. this gene's gene_id.
            gene_id = cursor.lastrowid
            # The aliases field is a comma-separated list; one row per alias.
            for alias in gene_aliases.split(","):
                alias = alias.strip()
                # An empty aliases field would otherwise store an '' alias.
                if not alias:
                    continue
                alias_data = (gene_id, alias)
                cursor.execute(insert_gene_aliases, alias_data)
    except Exception:
        # Do not leave half of the file pending for the next commit.
        connection.rollback()
        raise

In [41]:
# Inspect the genes table after the corrected load.
cursor.execute(select_genes).fetchall()

[(7, 'TERT', 'telomerase reverse transcriptase', 1253167, 1295068),
 (8, 'TP63', 'tumor protein p63', 189596746, 189897276),
 (9,
  'CHRNA5',
  'cholinergic receptor nicotinic alpha 5 subunit',
  78565520,
  78595269),
 (10, 'CADM1', 'cell adhesion molecule 1', 115169236, 115504428),
 (11,
  'CHRNA3',
  'cholinergic receptor nicotinic alpha 3 subunit',
  78593052,
  78620996),
 (12, 'RACK1', 'receptor for activated C kinase 1', 181236897, 181243906)]

In [42]:
# Inspect the aliases; the gene_id column should now vary from gene to gene.
cursor.execute(select_gene_aliases).fetchall()

[(52, 7, 'CMM9'),
 (53, 7, 'DKCA2'),
 (54, 7, 'DKCB4'),
 (55, 7, 'EST2'),
 (56, 7, 'PFBMFT1'),
 (57, 7, 'TCS1'),
 (58, 7, 'TP2'),
 (59, 7, 'TRT'),
 (60, 7, 'hEST2'),
 (61, 7, 'hTRT'),
 (62, 8, 'AIS'),
 (63, 8, 'B(p51A)'),
 (64, 8, 'B(p51B)'),
 (65, 8, 'EEC3'),
 (66, 8, 'KET'),
 (67, 8, 'LMS'),
 (68, 8, 'NBP'),
 (69, 8, 'OFC8'),
 (70, 8, 'RHS'),
 (71, 8, 'SHFM4'),
 (72, 8, 'TP53CP'),
 (73, 8, 'TP53L'),
 (74, 8, 'TP73L'),
 (75, 8, 'p40'),
 (76, 8, 'p51'),
 (77, 8, 'p53CP'),
 (78, 8, 'p63'),
 (79, 8, 'p73H'),
 (80, 8, 'p73L'),
 (81, 9, 'LNCR2'),
 (82, 10, 'BL2'),
 (83, 10, 'IGSF4'),
 (84, 10, 'IGSF4A'),
 (85, 10, 'NECL2'),
 (86, 10, 'Necl-2'),
 (87, 10, 'RA175'),
 (88, 10, 'ST17'),
 (89, 10, 'SYNCAM'),
 (90, 10, 'TSLC1'),
 (91, 10, 'sTSLC-1'),
 (92, 10, 'sgIGSF'),
 (93, 10, 'synCAM1'),
 (94, 11, 'BAIPRCK'),
 (95, 11, 'LNCR2'),
 (96, 11, 'NACHRA3'),
 (97, 11, 'PAOD2'),
 (98, 12, 'GNB2L1'),
 (99, 12, 'Gnb2-rs1'),
 (100, 12, 'H12.3'),
 (101, 12, 'HLC-7'),
 (102, 12, 'PIG21')]

In [43]:
# Make the corrected load permanent.
connection.commit()

___

<b> <font color = "red">Exercise</font></b>

#### Query a database 

How many genes do we have?  
How many aliases do we have?  
Retrieve the aliases for gene TP63. In the result, display the gene symbol and the alias.    
How many aliases are associated with each gene? In the result, display the gene symbol and the count.


In [45]:
def get_header(cursor):
    '''
    Builds a tab delimited header line from the cursor description.
    Arguments:
        cursor: a cursor after a select query
    Returns:
        string: the column names joined by tabs, without a trailing new line
    '''
    # The first item of each description entry is the column name.
    column_names = (column[0] for column in cursor.description)
    return '\t'.join(column_names)



In [46]:
# note that if you have a large result 
# this function will try to make a very large string from it
# so it is recommended for results with less than 10 rows and 10 columns
# for other cases use the for loop to go through the rows in the result 

def get_results(cursor):
    '''
    Renders the rows remaining on the cursor as tab delimited text.
    Arguments:
        cursor: a cursor after a select query
    Returns:
        string: one line per row, each value converted with str() and the
                values separated by tabs; rows are separated by new lines,
                with no new line after the last row (empty string for no rows)
    '''
    lines = ['\t'.join(str(value) for value in record) for record in cursor.fetchall()]
    return "\n".join(lines)

In [47]:
# Print the alias table as tab delimited text: header line first, then the rows.
cursor.execute(select_gene_aliases)
print(get_header(cursor), get_results(cursor), sep="\n")

alias_id	gene_id	alias
52	7	CMM9
53	7	DKCA2
54	7	DKCB4
55	7	EST2
56	7	PFBMFT1
57	7	TCS1
58	7	TP2
59	7	TRT
60	7	hEST2
61	7	hTRT
62	8	AIS
63	8	B(p51A)
64	8	B(p51B)
65	8	EEC3
66	8	KET
67	8	LMS
68	8	NBP
69	8	OFC8
70	8	RHS
71	8	SHFM4
72	8	TP53CP
73	8	TP53L
74	8	TP73L
75	8	p40
76	8	p51
77	8	p53CP
78	8	p63
79	8	p73H
80	8	p73L
81	9	LNCR2
82	10	BL2
83	10	IGSF4
84	10	IGSF4A
85	10	NECL2
86	10	Necl-2
87	10	RA175
88	10	ST17
89	10	SYNCAM
90	10	TSLC1
91	10	sTSLC-1
92	10	sgIGSF
93	10	synCAM1
94	11	BAIPRCK
95	11	LNCR2
96	11	NACHRA3
97	11	PAOD2
98	12	GNB2L1
99	12	Gnb2-rs1
100	12	H12.3
101	12	HLC-7
102	12	PIG21


In [48]:
# How many genes do we have?
select_genes_no = " SELECT count(*) FROM genes;"

cursor.execute(select_genes_no)
print(get_header(cursor), get_results(cursor), sep="\n")

count(*)
6


In [49]:
# How many aliases do we have? The column alias aliases_no becomes the header.
select_gene_aliases_no = " SELECT count(*) aliases_no FROM gene_aliases;"

cursor.execute(select_gene_aliases_no)
print(get_header(cursor), get_results(cursor), sep="\n")

aliases_no
51


In [52]:
# Retrieve the aliases for gene TP63. In the result, display the gene symbol and the alias.
# The join pairs each alias row with its gene through gene_id; the WHERE
# clause keeps only the rows of TP63.

select_TP63_aliases = """
SELECT gene_symbol, alias
FROM genes g
INNER JOIN gene_aliases ga ON g.gene_id = ga.gene_id
WHERE gene_symbol = 'TP63';
"""

cursor.execute(select_TP63_aliases)
print(get_header(cursor), get_results(cursor), sep="\n")

gene_symbol	alias
TP63	AIS
TP63	B(p51A)
TP63	B(p51B)
TP63	EEC3
TP63	KET
TP63	LMS
TP63	NBP
TP63	OFC8
TP63	RHS
TP63	SHFM4
TP63	TP53CP
TP63	TP53L
TP63	TP73L
TP63	p40
TP63	p51
TP63	p53CP
TP63	p63
TP63	p73H
TP63	p73L


In [56]:
# How many aliases are associated with each gene? In the result, display the gene symbol and the count.
# LEFT JOIN keeps genes that have no alias rows; count(alias) ignores the NULL
# produced for them, so such genes are reported with 0 instead of being dropped.
# ORDER BY makes the row order explicit instead of relying on how GROUP BY
# happens to be evaluated.

select_aliasNo_per_gene = """
SELECT gene_symbol, count(alias)
FROM genes g
LEFT JOIN gene_aliases ga ON g.gene_id = ga.gene_id
GROUP BY gene_symbol
ORDER BY gene_symbol
"""

cursor.execute(select_aliasNo_per_gene)
print(get_header(cursor))
print(get_results(cursor))

gene_symbol	count(alias)
CADM1	12
CHRNA3	4
CHRNA5	1
RACK1	5
TERT	10
TP63	19


In [57]:
# Same per-gene count, restricted with HAVING to genes with at least 10 aliases.
# Note: this rebinds select_aliasNo_per_gene to the filtered query.
select_aliasNo_per_gene = """
SELECT gene_symbol, count(alias)
FROM genes g
JOIN gene_aliases ga ON g.gene_id = ga.gene_id
GROUP BY gene_symbol
HAVING count(alias) >= 10
"""

cursor.execute(select_aliasNo_per_gene)
print(get_header(cursor), get_results(cursor), sep="\n")

gene_symbol	count(alias)
CADM1	12
TERT	10
TP63	19


In [None]:
# Release the cursor and the database connection now that the exercises are
# finished. Closing does not commit, so anything still uncommitted would be lost.
cursor.close()
connection.close()