## SQL exercises
### BIOINF 575 - Fall 2023



The. ALTER TABLE command: Changing a table without having to remove it and recreate it.   
A table can be ALTERED to:
* RENAME the TABLE
* RENAME a COLUMN
* ADD a COLUMN
* DROP a COLUMN

Detailed diagram of the command available at:   
https://www.sqlite.org/lang_altertable.html

Examples available at:   
https://www.sqlitetutorial.net/sqlite-alter-table/    
https://www.tutorialspoint.com/sqlite/sqlite_alter_command.htm    
https://www.geeksforgeeks.org/how-to-alter-a-sqlite-table-using-python/

In [14]:
from sqlite3 import connect
import pandas as pd

def display_results(cursor):
    data = cursor.fetchall()
    header = [tp[0] for tp in cursor.description]
    res_df = pd.DataFrame(data, columns = header)
    return res_df

connection = connect("portal_mammals.sqlite")
cursor = connection.cursor()

sql = """
SELECT name, type 
FROM sqlite_master
"""
cursor.execute(sql)

display_results(cursor)



Unnamed: 0,name,type
0,surveys,table
1,species,table
2,plots,table
3,sqlite_sequence,table
4,survey_test,view


In [15]:
sql = """
SELECT *
FROM surveys
LIMIT 10;
"""
cursor.execute(sql)

display_results(cursor)


Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,
5,6,7,16,1977,1,PF,M,14.0,
6,7,7,16,1977,2,PE,F,,
7,8,7,16,1977,1,DM,M,37.0,
8,9,7,16,1977,1,DM,F,34.0,
9,10,7,16,1977,6,PF,F,20.0,


In [18]:
sql = """
DROP VIEW survey_test;
"""
cursor.execute(sql)



OperationalError: no such view: survey_test

In [19]:
sql = """
SELECT name, type 
FROM sqlite_master
"""
cursor.execute(sql)

display_results(cursor)


Unnamed: 0,name,type
0,surveys,table
1,species,table
2,plots,table
3,sqlite_sequence,table


In [20]:
sql = """
ALTER TABLE surveys
DROP COLUMN weight;
"""

cursor.execute(sql)
connection.commit()

In [21]:
sql = """
SELECT *
FROM surveys
LIMIT 10;
"""
cursor.execute(sql)

display_results(cursor)

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length
0,1,7,16,1977,2,NL,M,32.0
1,2,7,16,1977,3,NL,M,33.0
2,3,7,16,1977,2,DM,F,37.0
3,4,7,16,1977,7,DM,M,36.0
4,5,7,16,1977,3,DM,M,35.0
5,6,7,16,1977,1,PF,M,14.0
6,7,7,16,1977,2,PE,F,
7,8,7,16,1977,1,DM,M,37.0
8,9,7,16,1977,1,DM,F,34.0
9,10,7,16,1977,6,PF,F,20.0


In [22]:
connection.rollback()

In [23]:
sql = """
SELECT *
FROM surveys
LIMIT 10;
"""
cursor.execute(sql)

display_results(cursor)

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length
0,1,7,16,1977,2,NL,M,32.0
1,2,7,16,1977,3,NL,M,33.0
2,3,7,16,1977,2,DM,F,37.0
3,4,7,16,1977,7,DM,M,36.0
4,5,7,16,1977,3,DM,M,35.0
5,6,7,16,1977,1,PF,M,14.0
6,7,7,16,1977,2,PE,F,
7,8,7,16,1977,1,DM,M,37.0
8,9,7,16,1977,1,DM,F,34.0
9,10,7,16,1977,6,PF,F,20.0


In [24]:
cursor.close()
connection.close()

___

<b> <font color = "red">Exercise</font></b>

#### Create a database for genes and gene aliases

Create a SQLite database with a genes table, and a gene_aliases table.   
The genes table should have the following columns: gene_id, gene_symbol, the gene_description, start, stop.     
The gene_aliases table should have the following columns: alias_id, gene_id and the alias.   
Create an index on the column gene_id in the table gene_aliases.


In [25]:
connection = connect("gene_aliases.sqlite")
cursor = connection.cursor()

In [26]:
create_genes = """
CREATE TABLE IF NOT EXISTS genes (
gene_id INTEGER PRIMARY KEY, 
gene_symbol TEXT NOT NULL, 
gene_description TEXT, 
start BIGINT NOT NULL, 
stop BIGINT NOT NULL)
"""
cursor.execute(create_genes)
connection.commit()

In [62]:
drop_aliases = """
DROP TABLE IF EXISTS gene_aliases;
"""
cursor.execute(drop_aliases)
connection.commit()

In [63]:
select_tables = """
SELECT name, type 
FROM sqlite_master;
"""

cursor.execute(select_tables)

display_results(cursor)

Unnamed: 0,name,type
0,genes,table
1,genes_symbol_idx,index
2,genes_start_idx,index
3,genes_stop_idx,index


In [65]:
create_aliases = """
CREATE TABLE IF NOT EXISTS gene_aliases (
alias_id INTEGER PRIMARY KEY, 
gene_id INTEGER NOT NULL, 
alias TEXT NOT NULL,
FOREIGN KEY (gene_id) REFERENCES  genes  (gene_id))

"""
cursor.execute(create_aliases)
connection.commit()

In [66]:
select_tables = """
SELECT name, type 
FROM sqlite_master;
"""

cursor.execute(select_tables)

display_results(cursor)

Unnamed: 0,name,type
0,genes,table
1,genes_symbol_idx,index
2,genes_start_idx,index
3,genes_stop_idx,index
4,gene_aliases,table


In [31]:
# create indices

create_genes_sym_idx = '''
CREATE INDEX genes_symbol_idx 
ON genes (gene_symbol)
'''
cursor.execute(create_genes_sym_idx)
connection.commit()

In [32]:
create_genes_start_idx = '''
CREATE INDEX genes_start_idx 
ON genes (start)
'''
cursor.execute(create_genes_start_idx)
connection.commit()

In [33]:
create_genes_stop_idx = '''
CREATE INDEX genes_stop_idx 
ON genes (stop)
'''
cursor.execute(create_genes_stop_idx)
connection.commit()

In [67]:
create_gene_aliases_gene_id_idx = '''
CREATE INDEX gene_aliases_gene_id_idx 
ON gene_aliases (gene_id)
'''
cursor.execute(create_gene_aliases_gene_id_idx)
connection.commit()

In [68]:
select_tables = """
SELECT name, type 
FROM sqlite_master;
"""

cursor.execute(select_tables)

display_results(cursor)

Unnamed: 0,name,type
0,genes,table
1,genes_symbol_idx,index
2,genes_start_idx,index
3,genes_stop_idx,index
4,gene_aliases,table
5,gene_aliases_gene_id_idx,index


___

<b> <font color = "red">Exercise</font></b>

#### Populate a database 

Insert data into tables in the genes and gene aliases database you just created.   
Use the data from the file `genes_aliases.txt`.


In [42]:
insert_gene = """
INSERT INTO genes VALUES (?,?,?,?,?)
"""

In [43]:
insert_gene_alias = """
INSERT INTO gene_aliases VALUES (?,?,?)
"""

In [77]:
with open("genes_aliases.txt") as gene_aliases_file:
    header = gene_aliases_file.readline()
    gene_id = 0
    alias_id = 0
    for line in gene_aliases_file:
        line_lst = line.strip().split("\t")
        gene_sym = line_lst[0]
        gene_desc = line_lst[2]
        gene_start = int(line_lst[4])
        gene_stop = int(line_lst[5])
        gene_id = gene_id + 1
        gene_row = (gene_id, gene_sym, gene_desc, gene_start, gene_stop)
        cursor.execute(insert_gene, gene_row)
        #print(gene_sym)
        for alias in line_lst[1].split(","):
            alias_id = alias_id + 1
            gene_alias_row = (alias_id, gene_id, alias)
            cursor.execute(insert_gene_alias, gene_alias_row)
            #print(alias)
        
        

In [80]:
# connection.rollback()

In [81]:
# all looks good - ready to commit the changes from insert
connection.commit() 

In [82]:
select_tables = """
SELECT * 
FROM genes;
"""

cursor.execute(select_tables)

display_results(cursor)

Unnamed: 0,gene_id,gene_symbol,gene_description,start,stop
0,1,TERT,telomerase reverse transcriptase,1253167,1295068
1,2,TP63,tumor protein p63,189596746,189897276
2,3,CHRNA5,cholinergic receptor nicotinic alpha 5 subunit,78565520,78595269
3,4,CADM1,cell adhesion molecule 1,115169236,115504428
4,5,CHRNA3,cholinergic receptor nicotinic alpha 3 subunit,78593052,78620996
5,6,RACK1,receptor for activated C kinase 1,181236897,181243906


In [83]:
select_tables = """
SELECT * 
FROM gene_aliases;
"""

cursor.execute(select_tables)

display_results(cursor)

Unnamed: 0,alias_id,gene_id,alias
0,1,1,CMM9
1,2,1,DKCA2
2,3,1,DKCB4
3,4,1,EST2
4,5,1,PFBMFT1
5,6,1,TCS1
6,7,1,TP2
7,8,1,TRT
8,9,1,hEST2
9,10,1,hTRT


In [84]:
select_tables = """
SELECT sql 
FROM sqlite_master
WHERE name = "gene_aliases";
"""

cursor.execute(select_tables)

print(cursor.fetchone()[0])

CREATE TABLE gene_aliases (
alias_id INTEGER PRIMARY KEY, 
gene_id INTEGER NOT NULL, 
alias TEXT NOT NULL,
FOREIGN KEY (gene_id) REFERENCES  genes  (gene_id))


___

<b> <font color = "red">Exercise</font></b>

#### Query a database 

How many genes we have?  
How many aliases we have?  
Retrieve the aliases for gene TP63. In the result, display the gene symbol and the alias.    
How many aliases are associated which each gene? In the result, display the gene symbol and the count.


In [85]:
# How many genes we have?

select_genes_count = """
SELECT count(gene_id) 
FROM genes;
"""

cursor.execute(select_genes_count)

display_results(cursor)

Unnamed: 0,count(gene_id)
0,6


In [87]:
# How many aliases we have?  
select_aliases_count = """
SELECT count(alias_id) 
FROM gene_aliases;
"""

cursor.execute(select_aliases_count)

display_results(cursor)

Unnamed: 0,count(alias_id)
0,51


In [94]:
select_aliases_TP63 = """
SELECT gene_symbol, alias 
FROM gene_aliases
    INNER JOIN genes on genes.gene_id = gene_aliases.gene_id
WHERE gene_symbol = "TP63";
"""

cursor.execute(select_aliases_TP63)

display_results(cursor)

Unnamed: 0,gene_symbol,alias
0,TP63,AIS
1,TP63,B(p51A)
2,TP63,B(p51B)
3,TP63,EEC3
4,TP63,KET
5,TP63,LMS
6,TP63,NBP
7,TP63,OFC8
8,TP63,RHS
9,TP63,SHFM4


In [99]:
select_aliases = """
SELECT gene_symbol, count(alias) alias_no
FROM gene_aliases
    INNER JOIN genes on genes.gene_id = gene_aliases.gene_id
GROUP BY gene_symbol
ORDER BY alias_no DESC;
"""

cursor.execute(select_aliases)

display_results(cursor)

Unnamed: 0,gene_symbol,alias_no
0,TP63,19
1,CADM1,12
2,TERT,10
3,RACK1,5
4,CHRNA3,4
5,CHRNA5,1


In [None]:
def get_header(cursor):
    '''
    Makes a tab delimited header row from the cursor description.
    Arguments:
        cursor: a cursor after a select query
    Returns:
        string: A string consisting of the column names separated by tabs, no new line
    '''
    return '\t'.join([row[0] for row in cursor.description])



In [None]:
# note that if you have a large result 
# this function will try to make a very large string from it
# so it is recommended for results with less than 10 rows and 10 columns
# for other cases use the for loop to go through the rows in the result 

def get_results(cursor):
    '''
    Makes a tab delimited table from the cursor results.
    Arguments:
        cursor: a cursor after a select query
    Returns:
        string: A string consisting of the column names separated by tabs, no new line
    ''' 
    res = list()
    for row in cursor.fetchall():        
        res.append('\t'.join(list(map(str,row))))
    return "\n".join(res)