## SQL exercises
### BIOINF 575 - Fall 2021 



The. ALTER TABLE command: Changing a table without having to remove it and recreate it.   
A table can be ALTERED to:
* RENAME the TABLE
* RENAME a COLUMN
* ADD a COLUMN
* DROP a COLUMN

Detailed diagram of the command available at:   
https://www.sqlite.org/lang_altertable.html

Examples available at:   
https://www.sqlitetutorial.net/sqlite-alter-table/    
https://www.tutorialspoint.com/sqlite/sqlite_alter_command.htm    
https://www.geeksforgeeks.org/how-to-alter-a-sqlite-table-using-python/

___

<b> <font color = "red">Exercise</font></b>

#### Create a database for genes and gene aliases

Create a SQLite database with a genes table, and a gene_aliases table.   
The genes table should have the following columns: gene_id, gene_symbol, the gene_description, start, stop.     
The gene_aliases table should have the following columns: alias_id, gene_id and the alias.   
Create an index on the column gene_id in the table gene_aliases.


In [1]:
from sqlite3 import connect

# create the new database, connect to a filename that does not exist 
# that will create a new empty database
connection = connect("genes_data.sqlite")
cursor = connection.cursor()

In [2]:
# create genes table: gene_id, gene_symbol, the gene_description, start, stop.
create_genes ='''
CREATE TABLE IF NOT EXISTS genes (
      gene_id INTEGER PRIMARY KEY AUTOINCREMENT,
      gene_symbol TEXT NOT NULL,  
      gene_description TEXT NOT NULL,
      start BIGINT NOT NULL,                      
      stop BIGINT NOT NULL
    );
'''
try:
    cursor.execute(create_genes)
except connection.DatabaseError:
    print("Creating the genes table resulted in a database error!")
    connection.rollback()
    raise
else:
    connection.commit()
finally:
    print("done!")

done!


In [3]:
select_master = "SELECT name, type FROM sqlite_master"
cursor.execute(select_master)
cursor.fetchall()

[('genes', 'table'), ('sqlite_sequence', 'table')]

In [4]:
# create gene_aliases table: alias_id, gene_id and the alias.
create_gene_aliases ='''
CREATE TABLE IF NOT EXISTS gene_aliases (
      alias_id INTEGER PRIMARY KEY AUTOINCREMENT,
      gene_id INTEGER NOT NULL,  
      alias TEXT NOT NULL,                  
      FOREIGN KEY (gene_id) REFERENCES  genes  (gene_id)
    );
'''
try:
    cursor.execute(create_gene_aliases)
except connection.DatabaseError:
    print("Creating the gene_aliases table resulted in a database error!")
    connection.rollback()
    raise
else:
    connection.commit()
finally:
    print("done!")

done!


In [5]:
# select_master = "SELECT name, type FROM sqlite_master"
cursor.execute(select_master)
cursor.fetchall()

[('genes', 'table'), ('sqlite_sequence', 'table'), ('gene_aliases', 'table')]

___

<b> <font color = "red">Exercise</font></b>

#### Populate a database 

Insert data into tables in the genes and gene aliases database you just created.   
Use the data from the file `genes_aliases.txt`.


In [14]:
insert_genes = '''
INSERT INTO genes (gene_symbol, gene_description, start, stop) 
VALUES (?, ?, ?, ?);
'''

In [15]:
insert_gene_aliases = '''
INSERT INTO gene_aliases (gene_id, alias) 
VALUES (?, ?);
'''

In [26]:
with open("genes_aliases.txt") as genes_file:
    genes_file.readline()
    for line in genes_file:
        line_list = line.strip().split("\t")
        gene_symbol = line_list[0]
        gene_description = line_list[2]
        gene_start = line_list[4]
        gene_stop = line_list[5]
        gene_info = (gene_symbol, gene_description, gene_start, gene_stop)
        cursor.execute(insert_genes, gene_info)
        gene_id = cursor.lastrowid
        #print(gene_info)
        aliases = line_list[1]
        for alias in aliases.split(","):
            alias_info = (gene_id, alias.strip())
            #print(alias_info)
            cursor.execute(insert_gene_aliases, alias_info)
        #break


In [27]:
connection.commit()
# connection.rollback()

In [28]:
select_genes = "SELECT * FROM genes;"
cursor.execute(select_genes)
cursor.fetchall()

[(1, 'TERT', 'telomerase reverse transcriptase', 1253167, 1295068),
 (2, 'TP63', 'tumor protein p63', 189596746, 189897276),
 (3,
  'CHRNA5',
  'cholinergic receptor nicotinic alpha 5 subunit',
  78565520,
  78595269),
 (4, 'CADM1', 'cell adhesion molecule 1', 115169236, 115504428),
 (5,
  'CHRNA3',
  'cholinergic receptor nicotinic alpha 3 subunit',
  78593052,
  78620996),
 (6, 'RACK1', 'receptor for activated C kinase 1', 181236897, 181243906)]

In [29]:
select_gene_aliases = "SELECT * FROM gene_aliases;"
cursor.execute(select_gene_aliases)
cursor.fetchall()

[(1, 1, 'CMM9'),
 (2, 1, 'DKCA2'),
 (3, 1, 'DKCB4'),
 (4, 1, 'EST2'),
 (5, 1, 'PFBMFT1'),
 (6, 1, 'TCS1'),
 (7, 1, 'TP2'),
 (8, 1, 'TRT'),
 (9, 1, 'hEST2'),
 (10, 1, 'hTRT'),
 (11, 2, 'AIS'),
 (12, 2, 'B(p51A)'),
 (13, 2, 'B(p51B)'),
 (14, 2, 'EEC3'),
 (15, 2, 'KET'),
 (16, 2, 'LMS'),
 (17, 2, 'NBP'),
 (18, 2, 'OFC8'),
 (19, 2, 'RHS'),
 (20, 2, 'SHFM4'),
 (21, 2, 'TP53CP'),
 (22, 2, 'TP53L'),
 (23, 2, 'TP73L'),
 (24, 2, 'p40'),
 (25, 2, 'p51'),
 (26, 2, 'p53CP'),
 (27, 2, 'p63'),
 (28, 2, 'p73H'),
 (29, 2, 'p73L'),
 (30, 3, 'LNCR2'),
 (31, 4, 'BL2'),
 (32, 4, 'IGSF4'),
 (33, 4, 'IGSF4A'),
 (34, 4, 'NECL2'),
 (35, 4, 'Necl-2'),
 (36, 4, 'RA175'),
 (37, 4, 'ST17'),
 (38, 4, 'SYNCAM'),
 (39, 4, 'TSLC1'),
 (40, 4, 'sTSLC-1'),
 (41, 4, 'sgIGSF'),
 (42, 4, 'synCAM1'),
 (43, 5, 'BAIPRCK'),
 (44, 5, 'LNCR2'),
 (45, 5, 'NACHRA3'),
 (46, 5, 'PAOD2'),
 (47, 6, 'GNB2L1'),
 (48, 6, 'Gnb2-rs1'),
 (49, 6, 'H12.3'),
 (50, 6, 'HLC-7'),
 (51, 6, 'PIG21')]

In [25]:
# if I want to clear my table
delete_genes = "DELETE FROM genes;"
#cursor.execute(delete_genes)
#connection.commit()

___

<b> <font color = "red">Exercise</font></b>

#### Query a database 

How many genes we have?  
How many aliases we have?  
Retrieve the aliases for gene TP63. In the result, display the gene symbol and the alias.    
How many aliases are associated which each gene? In the result, display the gene symbol and the count.


In [30]:
def get_header(cursor):
    '''
    Makes a tab delimited header row from the cursor description.
    Arguments:
        cursor: a cursor after a select query
    Returns:
        string: A string consisting of the column names separated by tabs, no new line
    '''
    return '\t'.join([row[0] for row in cursor.description])



In [31]:
# note that if you have a large result 
# this function will try to make a very large string from it
# so it is recommended for results with less than 10 rows and 10 columns
# for other cases use the for loop to go through the rows in the result 

def get_results(cursor):
    '''
    Makes a tab delimited table from the cursor results.
    Arguments:
        cursor: a cursor after a select query
    Returns:
        string: A string consisting of the column names separated by tabs, no new line
    ''' 
    res = list()
    for row in cursor.fetchall():        
        res.append('\t'.join(list(map(str,row))))
    return "\n".join(res)

In [32]:
query1 = "SELECT count(gene_id) no_of_genes FROM genes"
cursor.execute(query1)
print(get_header(cursor))
print(get_results(cursor))

no_of_genes
6


In [33]:
query2 = "SELECT count(*) no_of_aliases FROM gene_aliases"
cursor.execute(query2)
print(get_header(cursor))
print(get_results(cursor))


no_of_aliases
51


In [40]:
query3 = """
SELECT gene_symbol, alias FROM genes g
JOIN gene_aliases ga ON g.gene_id = ga.gene_id
WHERE gene_symbol = 'TP63';
"""
cursor.execute(query3)
print(get_header(cursor))
print(get_results(cursor))

gene_symbol	alias
TP63	AIS
TP63	B(p51A)
TP63	B(p51B)
TP63	EEC3
TP63	KET
TP63	LMS
TP63	NBP
TP63	OFC8
TP63	RHS
TP63	SHFM4
TP63	TP53CP
TP63	TP53L
TP63	TP73L
TP63	p40
TP63	p51
TP63	p53CP
TP63	p63
TP63	p73H
TP63	p73L


In [39]:
query4 = query3 = """
SELECT gene_symbol, count(alias) FROM genes g
JOIN gene_aliases ga ON g.gene_id = ga.gene_id
GROUP BY gene_symbol;
"""
cursor.execute(query4)
print(get_header(cursor))
print(get_results(cursor))

gene_symbol	count(alias)
CADM1	12
CHRNA3	4
CHRNA5	1
RACK1	5
TERT	10
TP63	19
