### Study session 12 - SQL

#### BIOINF 575 - Fall 2021

##### SOLUTION


In [1]:
# put the import statements at the top of the notebook

# bring in the functionality from the sqlite3 module
# specifically the connect function

from sqlite3 import connect

___

<b> <font color = "red">Exercise</font></b>

#### Create a database for gene annotations (gene ontology - GO -  terms annotations)

Create a SQLite database with a genes table, a GO_terms table, and a gene_GO_term table.   
The genes table should have the gene id, gene symbol and the gene description.     
The GO_terms table should have the GO term id and the GO term name/description.     
The gene to GO term table should have the gene id and the GO term id.     


In [2]:
# Write your solution here

# Write the create table statements here - all strings

#-------
create_genes = '''
CREATE TABLE IF NOT EXISTS genes (
      gene_id INTEGER PRIMARY KEY AUTOINCREMENT,
      gene_symbol TEXT NOT NULL,
      gene_description TEXT NOT NULL
    );
'''
#-------

create_GO_terms = '''
CREATE TABLE IF NOT EXISTS GO_terms (
      GO_term_id INTEGER PRIMARY KEY AUTOINCREMENT,
      name TEXT NOT NULL
    );
'''
#-------

create_gene_GO_term = '''
CREATE TABLE IF NOT EXISTS gene_GO_term (
      gene_id INTEGER NOT NULL,                             -- REFERENCES  gene_id in the genes table
      GO_term_id INTEGER NOT NULL,                          -- REFERENCES  GO_term_id in the GO_terms table
      
      PRIMARY KEY (gene_id, GO_term_id),                    -- we set up a multi-column primary key - table constraint
                                                                -- the two columns uniquely indetify a row 
                                                                -- this will automatically create an index in the database
      FOREIGN KEY (gene_id) REFERENCES  genes  (gene_id),
      FOREIGN KEY (GO_term_id) REFERENCES  GO_terms  (GO_term_id)
    );
'''
#-------

In [3]:
# we do the same thing multiple times - we can use a function

def run_create_table(sql, cursor, connection):
    try:
        cursor.execute(sql)
    except connection.DatabaseError:
        print("Creating the table resulted in a database error!")
        connection.rollback()
        raise
    else:
        connection.commit()
    finally:
        print("done!")

In [4]:
# connect to a database ... if the file does not exist it will be created
connection = connect("gene_annotations.sqlite")
cursor = connection.cursor()

In [5]:
# check the sqlite_master table - should be empty at this point

select_master = "SELECT name, type FROM sqlite_master;"
cursor.execute(select_master)
cursor.fetchall()

[]

In [6]:
# run the commands
run_create_table(create_genes, cursor, connection)
run_create_table(create_GO_terms, cursor, connection)
run_create_table(create_gene_GO_term, cursor, connection)

done!
done!
done!


In [7]:
# check the sqlite_master table

# select_master = "SELECT name, type FROM sqlite_master;"
cursor.execute(select_master)
cursor.fetchall()


[('genes', 'table'),
 ('sqlite_sequence', 'table'),
 ('GO_terms', 'table'),
 ('gene_GO_term', 'table'),
 ('sqlite_autoindex_gene_GO_term_1', 'index')]

___

<b> <font color = "red">Exercise</font></b>

#### Populate a database 

Insert data into tables in the GO annotations database you just created.   
Use the data from the file `genes_info.txt`.


In [8]:
# Write your solution here

# Write the insert statements here - all strings
# because we insert values in all columns we do not need to specify the columns

insert_gene = "INSERT INTO genes VALUES(?,?,?);"
insert_GO_term = "INSERT INTO GO_terms VALUES(?,?);"
insert_gene_GO_term = "INSERT INTO gene_GO_term VALUES(?,?);"


In [9]:
file_name = "genes_info.txt"

In [10]:
GO_terms_dict = {}
GO_term_index = 0

with open(file_name) as genes_info:
    # read the header line and ignore it
    header_line = genes_info.readline() 
    
    # read the rest of the lines and process them
    for line in genes_info: 
        # break the line by tab and take the information needed in the table
        line_list = line.strip().split("\t")
        
        # we put the gene id in a variable because we need it in gene_GO_term table
        gene_id = line_list[0]
        
        # we take the gene id, symbol, description, 
        # which are the first 3 elements in the list - see header_line to confirm
        # this will be a row in the genes table
        gene_row = line_list[:3]
        cursor.execute(insert_gene, gene_row)
        
        # we take the go terms which are last in the list
        # we breake them up into a list - they are separated by ";"
        GO_terms_list = line_list[-1].split(";")
        
        # we go through the list of go terms to create rows for the GO_terms table
        # we need to keep track of the GO terms so we do not add them multiple times in the table
        # one GO term can be associated with multiple genes and one genes can be associated with multiple GO terms
        # a dictionary has unique keys and can store values foe each key
        # we will use that to keep track of our GO terms
        for GO_term in GO_terms_list:
            # remove whitespace from the beginning and end of the GO_term
            GO_term_name = GO_term.strip()
            
            # if we did not see this GO term before we add it to the dictionary and create an ID for it
            # otherwise we get the id of the term from the dictionary
            if GO_term_name not in GO_terms_dict:
                GO_term_index += 1
                GO_term_id = GO_term_index
                GO_terms_dict[GO_term_name] = GO_term_index
                # we will add this new GO term data to the GO_term table so prepare a row
                GO_term_row = (GO_term_id, GO_term_name)
                cursor.execute(insert_GO_term, GO_term_row)

            else:
                GO_term_id = GO_terms_dict[GO_term_name]
            
            # we add the connection between the gene and the GO term to the gene_GO_term table
            # we prepare a row
            gene_GO_row = (gene_id, GO_term_id)
            cursor.execute(insert_gene_GO_term, gene_GO_row)
       

In [11]:
# we check the tables and if all is well, we can commit these changes

select_genes = "SELECT * FROM genes;"
select_GO_terms = "SELECT * FROM GO_terms;"
select_gene_GO = "SELECT * FROM gene_GO_term;"


In [12]:
cursor.execute(select_genes)
cursor.fetchall()

[(672, 'BRCA1', 'BRCA1 DNA repair associated'),
 (675, 'BRCA2', 'BRCA2 DNA repair associated'),
 (7157, 'TP53', 'tumor protein p53')]

In [13]:
cursor.execute(select_GO_terms)
cursor.fetchall()

[(1, 'intrinsic apoptotic signaling pathway in response to DNA damage'),
 (2, 'transcription cis-regulatory region binding'),
 (3, 'transcription coactivator activity'),
 (4, 'ubiquitin-protein transferase activity'),
 (5, 'histone acetyltransferase activity'),
 (6, 'protease binding'),
 (7, 'single-stranded DNA binding'),
 (8, 'protein binding'),
 (9, 'negative regulation of transcription, DNA-templated'),
 (10, 'positive regulation of transcription by RNA polymerase II'),
 (11, 'promoter-specific chromatin binding'),
 (12, 'circadian behavior'),
 (13, 'circadian behavior"')]

In [14]:
cursor.execute(select_gene_GO)
cursor.fetchall()

[(672, 1),
 (672, 2),
 (672, 3),
 (672, 4),
 (675, 5),
 (675, 6),
 (675, 7),
 (675, 8),
 (7157, 9),
 (7157, 10),
 (7157, 11),
 (7157, 12),
 (7157, 13)]

In [15]:
# if all is well commit otherwise rollback, correct the issue in the code - rerun the code and if all well commit

connection.commit()
#connection.rollback()

In [16]:
## OPTION 2 - make lists and insert all data at the end

# before we can do that we need to clear the table data
delete_genes = "DELETE FROM genes;"
delete_GO_terms = "DELETE FROM GO_terms;"
delete_gene_GO = "DELETE FROM gene_GO_term;"

cursor.execute(delete_genes)
cursor.execute(delete_GO_terms)
cursor.execute(delete_gene_GO)



<sqlite3.Cursor at 0x1101d7810>

In [17]:
# check all 3 tables are empty

cursor.execute(select_genes)
print(cursor.fetchall())

cursor.execute(select_GO_terms)
print(cursor.fetchall())

cursor.execute(select_gene_GO)
print(cursor.fetchall())



[]
[]
[]


In [18]:
# commit the change

connection.commit()

In [19]:
GO_terms_dict = {}
GO_term_index = 0

genes_rows = []
GO_terms_rows = []
gene_GO_rows = []

with open(file_name) as genes_info:
    # read the header line and ignore it
    header_line = genes_info.readline() 
    
    # read the rest of the lines and process them
    for line in genes_info: 
        # break the line by tab and take the information needed in the table
        line_list = line.strip().split("\t")
        
        # we put the gene id in a variable because we need it in gene_GO_term table
        gene_id = line_list[0]
        
        # we take the gene id, symbol, description, 
        # which are the first 3 elements in the list - see header_line to confirm
        # this will be a row in the genes table
        gene_row = line_list[:3]
        genes_rows.append(gene_row)
        
        # we take the go terms which are last in the list
        # we breake them up into a list - they are separated by ";"
        GO_terms_list = line_list[-1].split(";")
        
        # we go through the list of go terms to create rows for the GO_terms table
        # we need to keep track of the GO terms so we do not add them multiple times in the table
        # one GO term can be associated with multiple genes and one genes can be associated with multiple GO terms
        # a dictionary has unique keys and can store values foe each key
        # we will use that to keep track of our GO terms
        for GO_term in GO_terms_list:
            # remove whitespace from the beginning and end of the GO_term
            GO_term_name = GO_term.strip()
            
            # if we did not see this GO term before we add it to the dictionary and create an ID for it
            # otherwise we get the id of the term from the dictionary
            if GO_term_name not in GO_terms_dict:
                GO_term_index += 1
                GO_term_id = GO_term_index
                GO_terms_dict[GO_term_name] = GO_term_index
                # we will add this new GO term data to the GO_term table so prepare a row
                GO_term_row = (GO_term_id, GO_term_name)
                GO_terms_rows.append(GO_term_row)

            else:
                GO_term_id = GO_terms_dict[GO_term_name]
            
            # we add the connection between the gene and the GO term to the gene_GO_term table
            # we prepare a row
            gene_GO_row = (gene_id, GO_term_id)
            gene_GO_rows.append(gene_GO_row)
       

In [20]:
# check the lists

genes_rows


[['672', 'BRCA1', 'BRCA1 DNA repair associated'],
 ['675', 'BRCA2', 'BRCA2 DNA repair associated'],
 ['7157', 'TP53', 'tumor protein p53']]

In [21]:
GO_terms_rows

[(1, 'intrinsic apoptotic signaling pathway in response to DNA damage'),
 (2, 'transcription cis-regulatory region binding'),
 (3, 'transcription coactivator activity'),
 (4, 'ubiquitin-protein transferase activity'),
 (5, 'histone acetyltransferase activity'),
 (6, 'protease binding'),
 (7, 'single-stranded DNA binding'),
 (8, 'protein binding'),
 (9, 'negative regulation of transcription, DNA-templated'),
 (10, 'positive regulation of transcription by RNA polymerase II'),
 (11, 'promoter-specific chromatin binding'),
 (12, 'circadian behavior'),
 (13, 'circadian behavior"')]

In [22]:
gene_GO_rows

[('672', 1),
 ('672', 2),
 ('672', 3),
 ('672', 4),
 ('675', 5),
 ('675', 6),
 ('675', 7),
 ('675', 8),
 ('7157', 9),
 ('7157', 10),
 ('7157', 11),
 ('7157', 12),
 ('7157', 13)]

In [23]:
# insert the data from the lists 
# because we have many rows in the lists we use execute many

cursor.executemany(insert_gene, genes_rows)

<sqlite3.Cursor at 0x1101d7810>

In [24]:
cursor.executemany(insert_GO_term, GO_terms_rows)

<sqlite3.Cursor at 0x1101d7810>

In [25]:
cursor.executemany(insert_gene_GO_term, gene_GO_rows)

<sqlite3.Cursor at 0x1101d7810>

In [26]:
# check the data

cursor.execute(select_genes)
cursor.fetchall()

[(672, 'BRCA1', 'BRCA1 DNA repair associated'),
 (675, 'BRCA2', 'BRCA2 DNA repair associated'),
 (7157, 'TP53', 'tumor protein p53')]

In [27]:
cursor.execute(select_GO_terms)
cursor.fetchall()

[(1, 'intrinsic apoptotic signaling pathway in response to DNA damage'),
 (2, 'transcription cis-regulatory region binding'),
 (3, 'transcription coactivator activity'),
 (4, 'ubiquitin-protein transferase activity'),
 (5, 'histone acetyltransferase activity'),
 (6, 'protease binding'),
 (7, 'single-stranded DNA binding'),
 (8, 'protein binding'),
 (9, 'negative regulation of transcription, DNA-templated'),
 (10, 'positive regulation of transcription by RNA polymerase II'),
 (11, 'promoter-specific chromatin binding'),
 (12, 'circadian behavior'),
 (13, 'circadian behavior"')]

In [28]:
cursor.execute(select_gene_GO)
cursor.fetchall()

[(672, 1),
 (672, 2),
 (672, 3),
 (672, 4),
 (675, 5),
 (675, 6),
 (675, 7),
 (675, 8),
 (7157, 9),
 (7157, 10),
 (7157, 11),
 (7157, 12),
 (7157, 13)]

In [29]:
# commit the changes
connection.commit()

In [30]:
## OPTION 3 - use the autoincrement for the GO_term id since it is not provided to us in the file

# before we can do that we need to clear the table data
delete_genes = "DELETE FROM genes;"
delete_GO_terms = "DELETE FROM GO_terms;"
delete_gene_GO = "DELETE FROM gene_GO_term;"

cursor.execute(delete_genes)
cursor.execute(delete_GO_terms)
cursor.execute(delete_gene_GO)

<sqlite3.Cursor at 0x1101d7810>

In [31]:
# commit the change
connection.commit()

In [32]:
# the insert statement for the GO_term table changed since we do not need to provide the id
insert_GO_term1 = "INSERT INTO GO_terms (name) VALUES (?);"


In [33]:
GO_terms_dict = {}

with open(file_name) as genes_info:
    header_line = genes_info.readline() 
    for line in genes_info: 
        line_list = line.strip().split("\t")
        gene_id = line_list[0]

        gene_row = line_list[:3]
        cursor.execute(insert_gene, gene_row)

        GO_terms_list = line_list[-1].split(";")
        
        for GO_term in GO_terms_list:
            GO_term_name = GO_term.strip()
            if GO_term_name not in GO_terms_dict:
                cursor.execute(insert_GO_term1, [GO_term_name])
                
                # get the autogenerated id from the database after the insert
                GO_term_id = cursor.lastrowid
                
                GO_terms_dict[GO_term_name] = GO_term_id

            else:
                GO_term_id = GO_terms_dict[GO_term_name]

            gene_GO_row = (gene_id, GO_term_id)
            cursor.execute(insert_gene_GO_term, gene_GO_row)
       

In [34]:
# check the data

cursor.execute(select_genes)
cursor.fetchall()

[(672, 'BRCA1', 'BRCA1 DNA repair associated'),
 (675, 'BRCA2', 'BRCA2 DNA repair associated'),
 (7157, 'TP53', 'tumor protein p53')]

In [35]:
cursor.execute(select_GO_terms)
cursor.fetchall()

[(14, 'intrinsic apoptotic signaling pathway in response to DNA damage'),
 (15, 'transcription cis-regulatory region binding'),
 (16, 'transcription coactivator activity'),
 (17, 'ubiquitin-protein transferase activity'),
 (18, 'histone acetyltransferase activity'),
 (19, 'protease binding'),
 (20, 'single-stranded DNA binding'),
 (21, 'protein binding'),
 (22, 'negative regulation of transcription, DNA-templated'),
 (23, 'positive regulation of transcription by RNA polymerase II'),
 (24, 'promoter-specific chromatin binding'),
 (25, 'circadian behavior'),
 (26, 'circadian behavior"')]

In [36]:
cursor.execute(select_gene_GO)
cursor.fetchall()

[(672, 14),
 (672, 15),
 (672, 16),
 (672, 17),
 (675, 18),
 (675, 19),
 (675, 20),
 (675, 21),
 (7157, 22),
 (7157, 23),
 (7157, 24),
 (7157, 25),
 (7157, 26)]

In [37]:
# commit the changes
connection.commit()

___

<b> <font color = "red">Exercise</font></b>

#### Query a database 

How many genes we have?  
How many go terms we have?   
How many go terms are associated which each gene?   
How many genes are associated with each GO term?   
Select all pairs of gene symbol and go term description that have a record in the gene_GO_term table.

In [38]:
# Write your solution here

# Bring in the functions from class for a nice display of the results

def get_header(cursor):
    '''
    Makes a tab delimited header row from the cursor description.
    Arguments:
        cursor: a cursor after a select query
    Returns:
        string: A string consisting of the column names separated by tabs, no new line
    '''
    return '\t'.join([row[0] for row in cursor.description])

def get_results(cursor):
    '''
    Makes a tab delimited table from the cursor results.
    Arguments:
        cursor: a cursor after a select query
    Returns:
        string: A string consisting of the column names separated by tabs, no new line
    ''' 
    res = list()
    for row in cursor.fetchall():        
        res.append('\t'.join(list(map(str,row))))
    return "\n".join(res)


In [39]:
# query1: How many genes we have?

query1 = "SELECT count(gene_id) 'Number of genes' FROM genes;"

cursor.execute(query1)
result = cursor.fetchall()
result

[(3,)]

In [40]:
# to get the number we index the list and the tuple

result[0][0]

3

In [41]:
cursor.execute(query1)

print(get_header(cursor))
print(get_results(cursor))

Number of genes
3


In [42]:
# query2: How many GO terms we have?

query2 = "SELECT count(*) 'Number of GO terms' FROM GO_terms;"

cursor.execute(query2)

print(get_header(cursor))
print(get_results(cursor))

Number of GO terms
13


In [43]:
# query3: How many go terms are associated which each gene?   

query3 = '''
SELECT gene_id, count(GO_term_id) 'Number of GO terms' 
FROM gene_GO_term
GROUP BY gene_id;
'''

cursor.execute(query3)

print(get_header(cursor))
print(get_results(cursor))

gene_id	Number of GO terms
672	4
675	4
7157	5


In [44]:
# if we want the gene_symbol

query3 = '''
SELECT gene_symbol Gene, count(gt.GO_term_id) 'Number of GO terms' 
FROM gene_GO_term gt
    JOIN genes g ON g.gene_id = gt.gene_id
GROUP BY gene_symbol;
'''

cursor.execute(query3)

print(get_header(cursor))
print(get_results(cursor))

Gene	Number of GO terms
BRCA1	4
BRCA2	4
TP53	5


In [45]:
# query4: How many genes are associated with each GO term?  

query4 = '''
SELECT GO_term_id, count(gene_id) 'Number of genes' 
FROM gene_GO_term
GROUP BY GO_term_id;
'''

cursor.execute(query4)

print(get_header(cursor))
print(get_results(cursor))


GO_term_id	Number of genes
14	1
15	1
16	1
17	1
18	1
19	1
20	1
21	1
22	1
23	1
24	1
25	1
26	1


In [46]:
# if we want the GO term name

query4 = '''
SELECT name 'GO term', count(gene_id) 'Number of genes' 
FROM gene_GO_term gt
    JOIN GO_terms t ON t.GO_term_id = gt.GO_term_id
GROUP BY name;
'''

cursor.execute(query4)

print(get_header(cursor))
print(get_results(cursor))

GO term	Number of genes
circadian behavior	1
circadian behavior"	1
histone acetyltransferase activity	1
intrinsic apoptotic signaling pathway in response to DNA damage	1
negative regulation of transcription, DNA-templated	1
positive regulation of transcription by RNA polymerase II	1
promoter-specific chromatin binding	1
protease binding	1
protein binding	1
single-stranded DNA binding	1
transcription cis-regulatory region binding	1
transcription coactivator activity	1
ubiquitin-protein transferase activity	1


In [47]:
# query5: Select all pairs of gene symbol and go term description that have a record in the gene_GO_term table.

query5 = '''
SELECT gene_symbol Gene, name 'GO term' 
FROM gene_GO_term gt
    JOIN GO_terms t ON t.GO_term_id = gt.GO_term_id
    JOIN genes g on g.gene_id = gt.gene_id;
'''

cursor.execute(query5)

print(get_header(cursor))
print(get_results(cursor))

Gene	GO term
BRCA1	intrinsic apoptotic signaling pathway in response to DNA damage
BRCA1	transcription cis-regulatory region binding
BRCA1	transcription coactivator activity
BRCA1	ubiquitin-protein transferase activity
BRCA2	histone acetyltransferase activity
BRCA2	protease binding
BRCA2	single-stranded DNA binding
BRCA2	protein binding
TP53	negative regulation of transcription, DNA-templated
TP53	positive regulation of transcription by RNA polymerase II
TP53	promoter-specific chromatin binding
TP53	circadian behavior
TP53	circadian behavior"


In [48]:
# we are done working with this database
# close cursor and connection

cursor.close()
connection.close()

___

<b> <font color = "red">Exercise</font></b>

#### Query a database 
Download the database from the following link:
https://www.sqlitetutorial.net/sqlite-sample-database/

<img src = https://www.sqlitetutorial.net/wp-content/uploads/2015/11/sqlite-sample-database-color.jpg width = 700>

https://www.sqlitetutorial.net/wp-content/uploads/2015/11/sqlite-sample-database-color.jpg

Retrieve how many tracks are associated with each artist.    
What is the overall mean number of tracks per album?    
For a given artist, retrieve all the albums (name), and tracks (name), they are associated with.   

Feel free to build other quesries for questions you may have.


In [49]:
# Write your solution here

# connect to the database

connection = connect("chinook.db")
cursor = connection.cursor()


In [50]:
# check the tables in the database

sel_tables = "select name, type from sqlite_master" 
cursor.execute(sel_tables)

print(get_header(cursor))
print(get_results(cursor))

name	type
albums	table
sqlite_sequence	table
artists	table
customers	table
employees	table
genres	table
invoices	table
invoice_items	table
media_types	table
playlists	table
playlist_track	table
sqlite_autoindex_playlist_track_1	index
tracks	table
IFK_AlbumArtistId	index
IFK_CustomerSupportRepId	index
IFK_EmployeeReportsTo	index
IFK_InvoiceCustomerId	index
IFK_InvoiceLineInvoiceId	index
IFK_InvoiceLineTrackId	index
IFK_PlaylistTrackTrackId	index
IFK_TrackAlbumId	index
IFK_TrackGenreId	index
IFK_TrackMediaTypeId	index
sqlite_stat1	table


In [51]:
# task1: Retrieve how many tracks are associated with each artist.

task1 = '''
SELECT artistid, count(trackid) 
FROM tracks t 
   JOIN albums ab ON t.albumid = ab.albumid 
GROUP BY artistid
'''

cursor.execute(task1)

print(get_header(cursor))
print(get_results(cursor))

ArtistId	count(trackid)
1	18
2	4
3	15
4	13
5	12
6	31
7	8
8	40
9	12
10	8
11	18
12	17
13	17
14	11
15	11
16	21
17	34
18	36
19	31
20	10
21	56
22	114
23	9
24	17
27	32
36	17
37	14
41	14
42	26
46	14
50	112
51	45
52	35
53	21
54	34
55	12
56	14
57	14
58	92
59	27
68	37
69	22
70	15
72	15
76	40
77	30
78	16
79	9
80	26
81	48
82	52
83	14
84	44
85	24
86	16
87	12
88	42
89	13
90	213
91	20
92	32
93	13
94	17
95	10
96	12
97	14
98	16
99	31
100	57
101	28
102	10
103	18
104	18
105	10
106	15
108	12
109	17
110	29
111	15
112	14
113	49
114	32
115	12
116	14
117	10
118	67
120	9
121	16
122	11
124	41
125	10
126	14
127	48
128	14
130	23
131	34
132	17
133	10
134	12
135	11
136	7
137	19
138	18
139	30
140	11
141	14
142	41
143	26
144	20
145	30
146	38
147	20
148	23
149	92
150	135
151	14
152	52
153	13
155	19
156	53
157	1
158	24
159	1
179	12
180	19
196	1
197	2
198	2
199	2
200	2
201	1
202	1
203	1
204	10
205	14
206	1
207	1
208	2
209	1
210	1
211	1
212	1
213	1
214	2
215	1
216	1
217	1
218	1
219	1
220	1
221	1
222	1
223	1
224	1
225	1
2

In [52]:
# if we want the srtist name and ordered data

task1 = '''
SELECT  count(trackid) 'No of tracks', a.name
FROM tracks t 
   JOIN albums ab ON t.albumid = ab.albumid 
   JOIN artists a ON ab.artistid = a.artistid
GROUP BY a.name
ORDER BY count(trackid) DESC
'''

cursor.execute(task1)

print(get_header(cursor))
print(get_results(cursor))

No of tracks	Name
213	Iron Maiden
135	U2
114	Led Zeppelin
112	Metallica
92	Lost
92	Deep Purple
67	Pearl Jam
57	Lenny Kravitz
56	Various Artists
53	The Office
52	Van Halen
52	Faith No More
49	Os Paralamas Do Sucesso
48	Red Hot Chili Peppers
48	Eric Clapton
45	Queen
44	Foo Fighters
42	Guns N' Roses
41	The Rolling Stones
41	R.E.M.
40	Creedence Clearwater Revival
40	Audioslave
38	Titãs
37	Miles Davis
36	Chico Science & Nação Zumbi
35	Kiss
34	Smashing Pumpkins
34	Green Day
34	Chico Buarque
32	Ozzy Osbourne
32	Jamiroquai
32	Gilberto Gil
31	Legião Urbana
31	Cidade Negra
31	Antônio Carlos Jobim
30	Tim Maia
30	The Cult
30	Cássia Eller
29	Nirvana
28	Lulu Santos
27	Santana
26	The Tea Party
26	Milton Nascimento
26	Djavan
24	Frank Sinatra
24	Battlestar Galactica (Classic)
23	Skank
23	Heroes
23	Amy Winehouse
22	Gene Krupa
21	Spyro Gyra
21	Caetano Veloso
20	The Who
20	James Brown
20	Battlestar Galactica
19	Zeca Pagodinho
19	The Black Crowes
19	House Of Pain
18	The Clash
18	Marvin Gaye
18	Marisa Monte

In [53]:
# task2: What is the overall mean number of tracks per album?

# first let's get the number of Tracks per album before we do the mean
task2 = '''
SELECT count(trackid) 'Tracks per album'
FROM tracks t 
   JOIN albums ab ON t.albumid = ab.albumid 
GROUP BY ab.albumid
ORDER BY count(trackid) DESC
'''

cursor.execute(task2)

print(get_header(cursor))
print(get_results(cursor))

Tracks per album
57
34
30
26
25
25
24
24
24
23
23
23
22
22
22
21
21
20
20
20
20
20
19
19
19
18
18
18
18
18
18
18
18
17
17
17
17
17
17
17
17
17
17
17
17
17
17
17
16
16
16
16
16
16
16
16
16
16
16
16
16
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
8
8
8
8
8
8
8
8
8
8
8
8
8
8
7
7
7
7
7
7
7
7
6
6
6
5
5
4
4
3
3
3
2
2
2
2
2
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


Check the list of aggregate functions - we see that for mean we use avg()  
https://www.sqlite.org/lang_aggfunc.html

In [54]:
# let's do the mean of the results from the previous select 
# a select returns data in a table format so we can select from the result of a select

task2 = '''
SELECT avg(c) 'Mean number of tracks per album'
FROM
    (SELECT count(trackid) c
    FROM tracks t 
       JOIN albums ab ON t.albumid = ab.albumid 
    GROUP BY ab.albumid)
'''

cursor.execute(task2)

print(get_header(cursor))
print(get_results(cursor))

Mean number of tracks per album
10.095100864553315


In [55]:
# task3: For a given artist, retrieve all the albums (name), 
# and tracks (name), they are associated with.

task3 = '''
SELECT  a.name, title, t.name
FROM tracks t 
   JOIN albums ab ON t.albumid = ab.albumid 
   JOIN artists a ON ab.artistid = a.artistid
WHERE a.name = "Metallica"
ORDER BY title, t.name
'''

cursor.execute(task3)

print(get_header(cursor))
print(get_results(cursor))

Name	Title	Name
Metallica	...And Justice For All	...And Justice For All
Metallica	...And Justice For All	Blackened
Metallica	...And Justice For All	Dyers Eve
Metallica	...And Justice For All	Eye Of The Beholder
Metallica	...And Justice For All	Harvester Of Sorrow
Metallica	...And Justice For All	One
Metallica	...And Justice For All	The Frayed Ends Of Sanity
Metallica	...And Justice For All	The Shortest Straw
Metallica	...And Justice For All	To Live Is To Die
Metallica	Black Album	Don't Tread On Me
Metallica	Black Album	Enter Sandman
Metallica	Black Album	Holier Than Thou
Metallica	Black Album	My Friend Of Misery
Metallica	Black Album	Nothing Else Matters
Metallica	Black Album	Of Wolf And Man
Metallica	Black Album	Sad But True
Metallica	Black Album	The God That Failed
Metallica	Black Album	The Struggle Within
Metallica	Black Album	The Unforgiven
Metallica	Black Album	Through The Never
Metallica	Black Album	Wherever I May Roam
Metallica	Garage Inc. (Disc 1)	Astronomy
Metallica	Garage Inc

In [56]:
# if we want to make a generic select where we supply the artist name
task3 = '''
SELECT  a.name, title, t.name
FROM tracks t 
   JOIN albums ab ON t.albumid = ab.albumid 
   JOIN artists a ON ab.artistid = a.artistid
WHERE a.name = ?
ORDER BY title, t.name
'''

cursor.execute(task3, ["AC/DC"])

print(get_header(cursor))
print(get_results(cursor))

Name	Title	Name
AC/DC	For Those About To Rock We Salute You	Breaking The Rules
AC/DC	For Those About To Rock We Salute You	C.O.D.
AC/DC	For Those About To Rock We Salute You	Evil Walks
AC/DC	For Those About To Rock We Salute You	For Those About To Rock (We Salute You)
AC/DC	For Those About To Rock We Salute You	Inject The Venom
AC/DC	For Those About To Rock We Salute You	Let's Get It Up
AC/DC	For Those About To Rock We Salute You	Night Of The Long Knives
AC/DC	For Those About To Rock We Salute You	Put The Finger On You
AC/DC	For Those About To Rock We Salute You	Snowballed
AC/DC	For Those About To Rock We Salute You	Spellbound
AC/DC	Let There Be Rock	Bad Boy Boogie
AC/DC	Let There Be Rock	Dog Eat Dog
AC/DC	Let There Be Rock	Go Down
AC/DC	Let There Be Rock	Hell Ain't A Bad Place To Be
AC/DC	Let There Be Rock	Let There Be Rock
AC/DC	Let There Be Rock	Overdose
AC/DC	Let There Be Rock	Problem Child
AC/DC	Let There Be Rock	Whole Lotta Rosie


In [57]:
# answer more interesting questions here

In [58]:
# when we are done working with the database
# close the cursor and the connection

cursor.close()
connection.close()
