## SQL - Major SQL commands - sqlite recap and DBeaver and mysql
### BIOINF 575 - Fall 2023

#### Guidelines for database design:

* Normalization is the process of creating or re-arranging data relationships so that it will be easy to store and retrieve data efficiently.  Data is normalized to achieve the following goals: 
    * Eliminate data redundancies and save space 
    * Make it easier to change data 
    * Simplify the enforcement of referential integrity constraints 
    * Produce a design that is a 'good' representation of the real world (one that is intuitively easy to understand and a good base for further growth)

    * Make it easier to change data by avoiding multiple comma-separated values in a single column
    * All columns in a table should depend on the primary key; all extra related information should be in other tables linked by foreign keys

https://support.microsoft.com/en-us/help/283878/description-of-the-database-normalization-basics

##### RESOURCES
https://dev.mysql.com/doc/refman/8.0/en/         
https://www.w3schools.com/mysql/mysql_create_table.asp          
https://www.mysqltutorial.org/mysql-sample-database.aspx       
https://www.tutorialspoint.com/mysql/index.htm    
https://realpython.com/python-mysql/




https://dev.mysql.com/doc/refman/8.0/en/examples.html

#### Connect to the database

```mysql your-database-name```

#### Create a table and add data, then select data from the table

```sql
CREATE TABLE shop (
    article INT UNSIGNED  DEFAULT '0000' NOT NULL,
    dealer  CHAR(20)      DEFAULT ''     NOT NULL,
    price   DECIMAL(16,2) DEFAULT '0.00' NOT NULL,
    PRIMARY KEY(article, dealer));

INSERT INTO shop VALUES
    (1,'A',3.45),(1,'B',3.99),(2,'A',10.99),(3,'B',1.45),
    (3,'C',1.69),(3,'D',1.25),(4,'D',19.95);
    
SELECT * FROM shop ORDER BY article;

```


| article   | dealer   | price   |
| --------- |--------  | ------- |
|       1   | A        |  3.45   |
|       1   | B        |  3.99   |
|       2   | A        | 10.99   |
|       3   | B        |  1.45   |
|       3   | C        |  1.69   |
|       3   | D        |  1.25   |
|       4   | D        | 19.95   |

#### Examples of common queries

https://dev.mysql.com/doc/refman/8.0/en/examples.html

In [None]:
### install pymysql using the following command in a terminal:
# conda install pymysql

In [1]:
from pymysql import connect

# Create a connection object for the Ensembl MySQL server.
# `database=` is used instead of `db=`, which PyMySQL documents as a
# deprecated alias for the same argument.
conn = connect(
    host="ensembldb.ensembl.org",
    user="anonymous",
    port=5306,
    database="saccharomyces_cerevisiae_core_94_4",
)


In [2]:
# Notes on connect(): it may need a password (password = 'password')
# and can request a specific database (db = 'database').
# The cursor sends SQL statements and receives the responses.
cursor = conn.cursor()
sql = "show tables"
cursor.execute(sql)
for row in cursor:
    # each row is a 1-tuple holding a single table name
    (table_name,) = row
    print(table_name)

# Do your queries, work with responses



alt_allele
alt_allele_attrib
alt_allele_group
analysis
analysis_description
assembly
assembly_exception
associated_group
associated_xref
attrib_type
biotype
coord_system
data_file
density_feature
density_type
dependent_xref
ditag
ditag_feature
dna
dna_align_feature
dna_align_feature_attrib
exon
exon_transcript
external_db
external_synonym
gene
gene_archive
gene_attrib
genome_statistics
identity_xref
interpro
intron_supporting_evidence
karyotype
map
mapping_session
mapping_set
marker
marker_feature
marker_map_location
marker_synonym
meta
meta_coord
misc_attrib
misc_feature
misc_feature_misc_set
misc_set
object_xref
ontology_xref
operon
operon_transcript
operon_transcript_gene
peptide_archive
prediction_exon
prediction_transcript
protein_align_feature
protein_feature
repeat_consensus
repeat_feature
seq_region
seq_region_attrib
seq_region_mapping
seq_region_synonym
simple_feature
stable_id_event
supporting_feature
transcript
transcript_attrib
transcript_intron_supporting_evidence
transcri

In [3]:
# Query selecting every column of every row in the gene table.
# NOTE: there is no LIMIT clause, so fetchall() retrieves the whole table.
select_genes = """
SELECT * 
FROM gene;
"""

cursor.execute(select_genes)
# Last expression in the cell, so the notebook displays the fetched rows.
cursor.fetchall()

((35593,
  'protein_coding',
  501,
  28,
  289445,
  290350,
  1,
  None,
  'sgd',
  None,
  1,
  35593,
  'YBR024W',
  None,
  datetime.datetime(2018, 3, 28, 9, 28, 5),
  datetime.datetime(2018, 3, 28, 9, 28, 5)),
 (35592,
  'protein_coding',
  501,
  21,
  11657,
  13360,
  -1,
  None,
  'sgd',
  None,
  1,
  35592,
  'YDL245C',
  None,
  datetime.datetime(2018, 3, 28, 9, 28, 5),
  datetime.datetime(2018, 3, 28, 9, 28, 5)),
 (35591,
  'protein_coding',
  501,
  28,
  683373,
  683732,
  -1,
  None,
  'sgd',
  None,
  1,
  35591,
  'YBR232C',
  None,
  datetime.datetime(2018, 3, 28, 9, 28, 5),
  datetime.datetime(2018, 3, 28, 9, 28, 5)),
 (35590,
  'protein_coding',
  501,
  21,
  1108484,
  1108621,
  1,
  None,
  'sgd',
  None,
  1,
  35590,
  'YDR320W-B',
  None,
  datetime.datetime(2018, 3, 28, 9, 28, 5),
  datetime.datetime(2018, 3, 28, 9, 28, 5)),
 (35589,
  'protein_coding',
  501,
  28,
  281443,
  283344,
  1,
  None,
  'sgd',
  None,
  1,
  35589,
  'YBR021W',
  None,
  dat

In [4]:
cursor.description

(('gene_id', 3, None, 10, 10, 0, False),
 ('biotype', 253, None, 160, 160, 0, False),
 ('analysis_id', 2, None, 5, 5, 0, False),
 ('seq_region_id', 3, None, 10, 10, 0, False),
 ('seq_region_start', 3, None, 10, 10, 0, False),
 ('seq_region_end', 3, None, 10, 10, 0, False),
 ('seq_region_strand', 1, None, 2, 2, 0, False),
 ('display_xref_id', 3, None, 10, 10, 0, True),
 ('source', 253, None, 160, 160, 0, False),
 ('description', 252, None, 262140, 262140, 0, True),
 ('is_current', 1, None, 1, 1, 0, False),
 ('canonical_transcript_id', 3, None, 10, 10, 0, False),
 ('stable_id', 253, None, 512, 512, 0, True),
 ('version', 2, None, 5, 5, 0, True),
 ('created_date', 12, None, 19, 19, 0, True),
 ('modified_date', 12, None, 19, 19, 0, True))

In [5]:
# Query selecting three named columns from the gene table.
# LIMIT 20 caps the result at 20 rows; there is no ORDER BY, so the
# query itself does not fix which rows come back.
select_genes = """
SELECT gene_id, biotype, is_current
FROM gene
LIMIT 20;
"""

cursor.execute(select_genes)
# Last expression in the cell, so the notebook displays the fetched rows.
cursor.fetchall()

((35593, 'protein_coding', 1),
 (35592, 'protein_coding', 1),
 (35591, 'protein_coding', 1),
 (35590, 'protein_coding', 1),
 (35589, 'protein_coding', 1),
 (35588, 'protein_coding', 1),
 (35586, 'tRNA', 1),
 (35587, 'protein_coding', 1),
 (35585, 'protein_coding', 1),
 (35584, 'protein_coding', 1),
 (35583, 'protein_coding', 1),
 (35578, 'protein_coding', 1),
 (35582, 'protein_coding', 1),
 (35581, 'protein_coding', 1),
 (35580, 'protein_coding', 1),
 (35579, 'protein_coding', 1),
 (35577, 'protein_coding', 1),
 (35576, 'protein_coding', 1),
 (35575, 'protein_coding', 1),
 (35574, 'protein_coding', 1))

In [None]:
cursor.description

In [6]:
# Query selecting all columns from the seq_region table, capped at 20 rows.
select_dnaseq = """
SELECT * 
FROM seq_region
LIMIT 20;
"""

cursor.execute(select_dnaseq)
# Last expression in the cell, so the notebook displays the fetched rows.
cursor.fetchall()

((1, 'BK006935.2', 1, 230218),
 (2, 'BK006936.2', 1, 813184),
 (3, 'BK006937.2', 1, 316620),
 (4, 'BK006938.2', 1, 1531933),
 (5, 'BK006939.2', 1, 576874),
 (6, 'BK006940.2', 1, 270161),
 (7, 'BK006941.2', 1, 1090940),
 (8, 'BK006934.2', 1, 562643),
 (9, 'BK006942.2', 1, 439888),
 (10, 'BK006943.2', 1, 745751),
 (11, 'BK006944.2', 1, 666816),
 (12, 'BK006945.2', 1, 1078177),
 (13, 'BK006946.2', 1, 924431),
 (14, 'BK006947.3', 1, 784333),
 (15, 'BK006948.2', 1, 1091291),
 (16, 'BK006949.2', 1, 948066),
 (17, 'AJ011856.1', 1, 85779),
 (19, 'XI', 2, 666816),
 (20, 'IX', 2, 439888),
 (21, 'IV', 2, 1531933))

In [None]:
cursor.description

In [7]:
# Query selecting all columns from the seq_region_synonym table,
# capped at 20 rows.
select_dnaseq_syn = """
SELECT * 
FROM seq_region_synonym
LIMIT 20;
"""

cursor.execute(select_dnaseq_syn)
# Last expression in the cell, so the notebook displays the fetched rows.
cursor.fetchall()

((1, 33, 'BK006935.2', 50710),
 (2, 28, 'BK006936.2', 50710),
 (3, 22, 'BK006937.2', 50710),
 (4, 21, 'BK006938.2', 50710),
 (5, 27, 'BK006939.2', 50710),
 (6, 26, 'BK006940.2', 50710),
 (7, 35, 'BK006941.2', 50710),
 (8, 25, 'BK006934.2', 50710),
 (9, 20, 'BK006942.2', 50710),
 (10, 23, 'BK006943.2', 50710),
 (11, 19, 'BK006944.2', 50710),
 (12, 24, 'BK006945.2', 50710),
 (13, 30, 'BK006946.2', 50710),
 (14, 29, 'BK006947.3', 50710),
 (15, 32, 'BK006948.2', 50710),
 (16, 31, 'BK006949.2', 50710),
 (17, 34, 'AJ011856.1', 50710))

In [8]:
cursor.description

(('seq_region_synonym_id', 3, None, 10, 10, 0, False),
 ('seq_region_id', 3, None, 10, 10, 0, False),
 ('synonym', 253, None, 1000, 1000, 0, False),
 ('external_db_id', 3, None, 10, 10, 0, True))

In [10]:
# Clean up - do this when done with the database
# The cursor is closed first, then the connection it was created from.
cursor.close()
conn.close()

### Using custom objects together with SQL
### sqlalchemy

#### We will follow the tutorial here:
    
https://docs.sqlalchemy.org/en/14/orm/tutorial.html#version-check

In [None]:
### pip install SQLAlchemy

In [32]:
# it allows us to create classes in python that will be 
# translated into tables in a database
# and run queries by calling methods 
# like create() and select() rather than writing the query in a string
import sqlalchemy

In [33]:
sqlalchemy.__version__ 

'1.4.49'

In [34]:
from sqlalchemy import create_engine
# The URL requests an in-memory SQLite database;
# echo=True asks the engine to log the SQL statements it emits.
engine = create_engine("sqlite:///:memory:", echo=True)

In [35]:
from sqlalchemy.orm import declarative_base
# Base class that mapped classes in this notebook (e.g. User) inherit from.
Base = declarative_base()

In [36]:
from sqlalchemy import Column, Integer, String
# creating a class for the table users

class User(Base):
    """Class mapped to the ``users`` table.

    Declares an integer primary key ``id`` and three string columns:
    ``name``, ``fullname`` and ``nickname``.
    """

    __tablename__ = "users"

    id = Column(Integer, primary_key=True)
    name = Column(String)
    fullname = Column(String)
    nickname = Column(String)

    def __repr__(self):
        """Return a readable summary showing the three name fields."""
        shown = (self.name, self.fullname, self.nickname)
        return "<User(name='%s', fullname='%s', nickname='%s')>" % shown

In [37]:
# we see the table structure for the class User
# it is a wrapper that allows us to call functions in python 
# that build the sql commands rather than writing the sql command in a string 
User.__table__  


Table('users', MetaData(), Column('id', Integer(), table=<users>, primary_key=True, nullable=False), Column('name', String(), table=<users>), Column('fullname', String(), table=<users>), Column('nickname', String(), table=<users>), schema=None)

In [38]:
# A notebook with a full example will be posted for this. 