In [2]:
import pandas as pd

from sqlalchemy import (create_engine, 
                        inspect, 
                        text, 
                        select, 
                        MetaData, 
                        Table, 
                        and_,
                        or_,
                        desc,
                        asc,
                        func,
                        case,
                        cast,
                        Float,
                        )


chinook_engine = create_engine("sqlite+pysqlite:////workspaces/ERN-sessions/SQL in python/chinook.db")

gravity_engine = create_engine("sqlite+pysqlite://///workspaces/ERN-sessions/SQL in python/gravity.db")

inspect(gravity_engine).get_table_names()

['address',
 'address_status',
 'author',
 'book',
 'book_author',
 'book_language',
 'country',
 'cust_order',
 'customer',
 'customer_address',
 'order_history',
 'order_line',
 'order_status',
 'publisher',
 'shipping_method']

In [3]:
chinook_metadata = MetaData() 

tracks = Table('tracks', chinook_metadata, autoload_with=chinook_engine)
#repr(tracks)

albums = Table('albums', chinook_metadata, autoload_with=chinook_engine)
artists = Table('artists', chinook_metadata, autoload_with=chinook_engine)
tracks.c.keys()

['TrackId',
 'Name',
 'AlbumId',
 'MediaTypeId',
 'GenreId',
 'Composer',
 'Milliseconds',
 'Bytes',
 'UnitPrice']

In [32]:
gravity_metadata = MetaData() 

books = Table('book', gravity_metadata, autoload_with=gravity_engine)
# books.c.keys()

authors = Table('author', gravity_metadata, autoload_with=gravity_engine)
book_authors = Table('book_author', gravity_metadata, autoload_with=gravity_engine)
books.c.keys()

['book_id',
 'title',
 'isbn13',
 'language_id',
 'num_pages',
 'publication_date',
 'publisher_id']

In [5]:
# Calculations can be done relatively simply. 
# We can use case statements to use conditions to apply conditions to our selects.

with chinook_engine.connect() as con:
    stmt = select(tracks.columns.Name, (tracks.columns.Bytes / tracks.columns.Milliseconds).label(
    'Bytes per millisecond'))
    result_proxy = con.execute(stmt) # only return top 5 results
    results = result_proxy.fetchmany(size=10) 
result_df = pd.DataFrame(results)

result_df

Unnamed: 0,Name,Bytes per millisecond
0,For Those About To Rock (We Salute You),32.49844786
1,Balls to the Wall,16.0859172938
2,Fast As a Shark,17.3055732615
3,Restless and Wild,17.1861210628
4,Princess of the Dawn,16.7560452615
5,Put The Finger On You,32.643128045
6,Let's Get It Up,32.645199764
7,Inject The Venom,32.5035810163
8,Snowballed,32.4931512245
9,Evil Walks,32.6806187547


In [6]:
# Lets calculate how expensive each song is per unit time
# can you extend it to get the 5 most expensive songs? 
# you'll need to finish your statement with a .order_by()
# you also can't pass your new label there, so instead pass the column calculation
# remmeber you can use .asc() and .desc() 

with chinook_engine.connect() as con:
    stmt = select(tracks.columns.Name,
                   (tracks.columns.UnitPrice / tracks.columns.Milliseconds).label(
    'Cost per millisecond')).order_by(
        (tracks.columns.UnitPrice / tracks.columns.Milliseconds).desc())
    result_proxy = con.execute(stmt) # only return top 5 results
    results = result_proxy.fetchmany(size=10) 
result_df = pd.DataFrame(results)

result_df

Unnamed: 0,Name,Cost per millisecond
0,É Uma Partida De Futebol,0.0009243697
1,Now Sports,0.0002027027
2,A Statistic,0.0001553429
3,Oprah,0.0001492087
4,Commercial 1,0.0001246694
5,The Real Problem,8.49785e-05
6,Commercial 2,4.66739e-05
7,Bossa,3.40815e-05
8,Casinha Feliz,3.06625e-05
9,Mateus Enter,2.98652e-05


In [7]:
# We can use case to tell SQL to do something when something is the case, 
# it takes the form case(when this is true, do this)
# If we warp that in a function, we can perform a function, but only
# times where something is the case

# We can also use cast to change the type of a returned variable 
with chinook_engine.connect() as con:
    album_1 = func.sum(case((tracks.columns.TrackId == 1, tracks.columns.Milliseconds),
                           else_=0))
    stmt = select(album_1 / (1000 * 60)) # / 60,000 to get in minutes
    result = con.execute(stmt).scalar()

print(result) # length of album 1

5.7286500000


In [8]:
# Find the total length of the entire itunes library

with chinook_engine.connect() as con:
    total_length = func.sum(tracks.columns.Milliseconds) # cast(func.sum(tracks.columns.Milliseconds), Float) # total length of all tracks
    stmt = select (total_length) 
    result = con.execute(stmt).scalar()
print(result)

# As an extension, find the length of album 1 as a percentage of the length of the entire library
with chinook_engine.connect() as con:
    album_1 = func.sum(case((tracks.columns.TrackId == 1, tracks.columns.Milliseconds),
                           else_=0))
    total_length = func.sum(tracks.columns.Milliseconds)
    stmt = select((album_1 / total_length )*100) #
    result = con.execute(stmt).scalar()
print(result)

1378778040
0.0249292482


In [9]:
# with predefined relationship
with chinook_engine.connect() as con:
    stmt = select(artists.columns.Name, tracks.columns.Name)
    result = con.execute(stmt).fetchmany(size=10)
result_df = pd.DataFrame(result)

print(result_df)

# If there isn't a pre-determined relationship, we can use join to specify it.
with chinook_engine.connect() as con:
    stmt = select(tracks.join(albums, tracks.columns.AlbumId == albums.columns.AlbumId))
    result = con.execute(stmt).fetchmany(size=10)
result_df = pd.DataFrame(result)

print(result_df)

    Name                                   Name_1
0  AC/DC  For Those About To Rock (We Salute You)
1  AC/DC                        Balls to the Wall
2  AC/DC                          Fast As a Shark
3  AC/DC                        Restless and Wild
4  AC/DC                     Princess of the Dawn
5  AC/DC                    Put The Finger On You
6  AC/DC                          Let's Get It Up
7  AC/DC                         Inject The Venom
8  AC/DC                               Snowballed
9  AC/DC                               Evil Walks
   TrackId                                     Name  AlbumId  MediaTypeId  \
0        1  For Those About To Rock (We Salute You)        1            1   
1        6                    Put The Finger On You        1            1   
2        7                          Let's Get It Up        1            1   
3        8                         Inject The Venom        1            1   
4        9                               Snowballed        1     

  result = con.execute(stmt).fetchmany(size=10)


In [10]:
# using a pre-determined relationship, make a dataframe with book title and author of each book
# get the first 10

# with predefined relationship
with gravity_engine.connect() as con:
    stmt = select(books.columns.title, authors.columns.author_name)
    result = con.execute(stmt).fetchmany(size=10)
result_df = pd.DataFrame(result)

print(result_df)

# Join the three tables books, book authors, and authors
# You don't need to, but practice specifying relationships for joins
with gravity_engine.connect() as con:
    stmt = select(books.join(book_authors, books.columns.book_id == book_authors.columns.book_id).join(authors))
    result = con.execute(stmt).fetchmany(size=10)
result_df = pd.DataFrame(result)

print(result_df)

                                         title           author_name
0  The World's First Love: Mary  Mother of God  A. Bartlett Giamatti
1  The World's First Love: Mary  Mother of God   A. Elizabeth Delany
2  The World's First Love: Mary  Mother of God            A. Merritt
3  The World's First Love: Mary  Mother of God      A. Roger Merrill
4  The World's First Love: Mary  Mother of God        A. Walton Litz
5  The World's First Love: Mary  Mother of God         A.B. Yehoshua
6  The World's First Love: Mary  Mother of God         A.D.P. Briggs
7  The World's First Love: Mary  Mother of God       A.E. Cunningham
8  The World's First Love: Mary  Mother of God         A.E. van Vogt
9  The World's First Love: Mary  Mother of God        A.G. Pasquella
   book_id                                              title         isbn13  \
0     1570                          Good Poems for Hard Times  9780143037675   
1    10539                     Baseball: a Literary Anthology  9781931082099   
2

  result = con.execute(stmt).fetchmany(size=10)


In [14]:
# WIth complex joins we might get errors, so we can use select from to say,
# with our select, where we want the columns to come from
# we can obviously get complex with these and other things we've used too!

with chinook_engine.connect() as con:
    stmt = select(artists.columns.Name, albums.columns.Title, tracks.columns.Name)
    stmt = stmt.select_from(artists.join(albums).join(tracks))
    result = con.execute(stmt).fetchmany(size=10)
result_df = pd.DataFrame(result)

print(result_df)



    Name                                  Title  \
0  AC/DC  For Those About To Rock We Salute You   
1  AC/DC  For Those About To Rock We Salute You   
2  AC/DC  For Those About To Rock We Salute You   
3  AC/DC  For Those About To Rock We Salute You   
4  AC/DC  For Those About To Rock We Salute You   
5  AC/DC  For Those About To Rock We Salute You   
6  AC/DC  For Those About To Rock We Salute You   
7  AC/DC  For Those About To Rock We Salute You   
8  AC/DC  For Those About To Rock We Salute You   
9  AC/DC  For Those About To Rock We Salute You   

                                    Name_1  
0  For Those About To Rock (We Salute You)  
1                    Put The Finger On You  
2                          Let's Get It Up  
3                         Inject The Venom  
4                               Snowballed  
5                               Evil Walks  
6                                   C.O.D.  
7                       Breaking The Rules  
8                 Night Of The Lo

In [12]:
# The task is to use SQLAlchemy and pandas and, any way you want, find out: 
# The name of the author who has the most books in the database, and what those books are
# A stretch task for homework is to find out how many orders have been placed which contain books by that author
# for the stretch task, email me your answers.


with gravity_engine.connect() as con:
    stmt = select(authors.columns.author_name, books.columns.title).select_from(
        book_authors.join(books, book_authors.columns.book_id == books.columns.book_id)).join(authors, book_authors.columns.author_id == authors.columns.author_id)
    results = con.execute(stmt).fetchall()

df = pd.DataFrame(results)
df_count = df.value_counts('author_name')
df_count

author_name
Stephen King         70
Sandra Brown         46
P.G. Wodehouse       45
Mercedes Lackey      43
Orson Scott Card     40
                     ..
J.P. Seaton           1
J.M.D. Meiklejohn     1
J.M. Synge            1
J.M. Lelen            1
Jack Higgins          1
Name: count, Length: 9094, dtype: int64