# Lecture 1 - Relating


In [1]:
# Let's import the required libraries
import sqlite3
import pandas as pd

# Open a connection to longlist.db, the SQLite database used in CS50.
conn = sqlite3.connect(database="longlist.db")


In [2]:
# List every entry of type 'table' recorded in sqlite_master.
pd.read_sql_query(
    sql="""
    SELECT *
    FROM sqlite_master
    WHERE type = 'table';
    """,
    con=conn,
)



Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,authors,authors,2,"CREATE TABLE ""authors"" (\n ""id"" INTEGER,\n ..."
1,table,authored,authored,3,"CREATE TABLE ""authored"" (\n ""author_id"" INT..."
2,table,books,books,4,"CREATE TABLE ""books"" (\n ""id"" INTEGER,\n ..."
3,table,publishers,publishers,5,"CREATE TABLE ""publishers"" (\n ""id"" INTEGER,..."
4,table,ratings,ratings,6,"CREATE TABLE ""ratings"" (\n ""book_id"" INTEGE..."
5,table,translators,translators,7,"CREATE TABLE ""translators"" (\n ""id"" INTEGER..."
6,table,translated,translated,8,"CREATE TABLE ""translated"" (\n ""translator_i..."


### Tables

In [3]:
# Preview the first three rows of the authors table.
pd.read_sql_query(
    sql="""
    SELECT *
    FROM authors
    LIMIT 3;
    """,
    con=conn,
)

Unnamed: 0,id,name,country,birth
0,1,Adania Shibli,Palestine,1974
1,2,Ahmed Saadawi,Iraq,1973
2,3,Alia Trabucco Zerán,Chile,1983


In [4]:
# Preview the first three rows of the books table.
pd.read_sql_query(
    sql="""
    SELECT *
    FROM books
    LIMIT 3;
    """,
    con=conn,
)

Unnamed: 0,id,isbn,title,publisher_id,format,pages,published,year
0,1,9788439736967,Boulder,10,paperback,112,2022-08-02,2023
1,2,9781628971538,Whale,3,paperback,368,2023-01-19,2023
2,3,9781642861181,The Gospel According to the New World,32,paperback,184,2023-03-07,2023


In [16]:
# Preview the first three rows of the authored table
# (pairs of author_id and book_id).
pd.read_sql_query(
    sql="""
    SELECT *
    FROM authored
    LIMIT 3;
    """,
    con=conn,
)

Unnamed: 0,author_id,book_id
0,23,1
1,13,2
2,49,3


### Keys

#### Primary key: an identifier that is unique for every item in a table

#### Foreign Keys: a primary key taken from a different table

Note that foreign key values can be repeated within a table, but primary key values are always unique.

### Subqueries or Nested queries

A subquery is a query inside another query. These are also called nested queries.

In [17]:
# Show a single row of books to see which columns it has.
pd.read_sql_query(
    sql="""
    SELECT *
    FROM books
    LIMIT 1;
    """,
    con=conn,
)

Unnamed: 0,id,isbn,title,publisher_id,format,pages,published,year
0,1,9788439736967,Boulder,10,paperback,112,2022-08-02,2023


In the books table, the publisher_id column identifies each book's publisher; it is a foreign key that refers to the id column of the publishers table.

In [18]:
# Show a single row of publishers to see which columns it has.
pd.read_sql_query(
    sql="""
    SELECT *
    FROM "publishers"
    LIMIT 1;
    """,
    con=conn,
)

Unnamed: 0,id,publisher
0,1,And Other Stories


#### - Task 1: to find out the books published by Fitzcarraldo Editions

In [23]:
# Look up the id of the publisher named 'Fitzcarraldo Editions'.
pd.read_sql_query(
    sql="""
SELECT "id" ,"publisher"
FROM "publishers" 
WHERE "publisher"='Fitzcarraldo Editions';
""",
    con=conn,
)

Unnamed: 0,id,publisher
0,5,Fitzcarraldo Editions


In [22]:
# Select the books whose publisher_id equals the id returned by the
# subquery for 'Fitzcarraldo Editions'.
pd.read_sql_query(
    sql="""
SELECT "isbn","title", "publisher_id" 
FROM "books" 
WHERE "publisher_id"= (SELECT "id" FROM "publishers" WHERE "publisher"='Fitzcarraldo Editions');
""",
    con=conn,
)

Unnamed: 0,isbn,title,publisher_id
0,9781804270288,While We Were Dreaming,5
1,9781913097660,Still Born,5
2,9781913097875,Paradais,5
3,9781913097721,A New Name: Septology VI-VII,5
4,9781910695593,The Books of Jacob,5
5,9781913097530,In Memory of Memory,5
6,9781913097172,Minor Detail,5
7,9781910695913,The Other Name: Septology I-II,5
8,9781913097097,Hurricane Season,5
9,9781910695715,Drive Your Plow Over the Bones of the Dead,5


#### - Task 2: To find all the ratings for the book In Memory of Memory

In [24]:
# Show a single row of ratings to see which columns it has.
pd.read_sql_query(
    sql="""
    SELECT *
    FROM "ratings"
    LIMIT 1;
    """,
    con=conn,
)

Unnamed: 0,book_id,rating
0,1,3


In [27]:
# Fetch every rating row whose book_id matches the id that the subquery
# finds for the title 'In Memory of Memory'.
pd.read_sql_query(
    sql="""
    SELECT "rating","book_id"
    FROM "ratings"
    WHERE "book_id"=(
        SELECT "id"
        FROM "books"
        WHERE "title"='In Memory of Memory'
    );
    """,
    con=conn,
)

Unnamed: 0,rating,book_id
0,3,33
1,4,33
2,4,33
3,4,33
4,3,33
...,...,...
1549,4,33
1550,4,33
1551,4,33
1552,4,33


In [29]:
# Show the id stored for the title 'In Memory of Memory'.
pd.read_sql_query(
    sql="""
SELECT "id","title"
FROM "books"
WHERE "title"='In Memory of Memory';
""",
    con=conn,
)

Unnamed: 0,id,title
0,33,In Memory of Memory


#### - Task 3: To compute the average rating for the book In Memory of Memory

In [31]:
# Average the ratings of the book whose id the subquery finds for the
# title 'In Memory of Memory'.
pd.read_sql_query(
    sql="""
    SELECT AVG("rating")
    FROM "ratings"
    WHERE "book_id"=(
        SELECT "id"
        FROM "books"
        WHERE "title"='In Memory of Memory'
    );
    """,
    con=conn,
)

Unnamed: 0,"AVG(""rating"")"
0,3.86036


#### - Task 4: To find the author(s) who wrote the book Flights

In [32]:
# Show a single row of authored to see which columns it has.
pd.read_sql_query(
    sql="""
    SELECT *
    FROM "authored"
    LIMIT 1;
    """,
    con=conn,
)

Unnamed: 0,author_id,book_id
0,23,1


In [35]:
# Resolve the title to book ids, the book ids to author ids, and the author
# ids to names. IN (rather than =) is used at each level: compared with =,
# a subquery in SQLite contributes only its first row, so a book with
# several authors would silently lose all but one of them.
pd.read_sql_query(
    """
    SELECT "name"
    FROM "authors"
    WHERE "id" IN (
        SELECT "author_id"
        FROM "authored"
        WHERE "book_id" IN (
            SELECT "id"
            FROM "books"
            WHERE "title" = 'Flights'
        )
    );
    """,
    conn
)

Unnamed: 0,name
0,Olga Tokarczuk


### IN - check whether a value is in a given list or set of values.

- Task 1: To find the names of all books in the database written by Fernanda Melchor

In [36]:
# Collect the book_id values in authored whose author_id matches the id
# stored for 'Fernanda Melchor'.
pd.read_sql_query(
    sql="""
SELECT "book_id"
FROM "authored"
WHERE "author_id" = (
    SELECT "id"
    FROM "authors"
    WHERE "name" = 'Fernanda Melchor'
);
""",
    con=conn,
)

Unnamed: 0,book_id
0,14
1,48


We have to use the IN keyword as follows:

In [41]:
# Fetch the full book rows written by Fernanda Melchor: the inner subqueries
# yield her book ids, and IN matches each book whose id is in that set.
pd.read_sql_query(
"""
SELECT *
FROM "books"
WHERE "id" IN (
    SELECT "book_id"
    FROM "authored"
    WHERE "author_id" = (
        SELECT "id"
        FROM "authors"
        WHERE "name" = 'Fernanda Melchor'
    )
);
""",
conn
)

Unnamed: 0,id,isbn,title,publisher_id,format,pages,published,year
0,14,9781913097875,Paradais,5,paperback,118,2022-03-23,2022
1,48,9781913097097,Hurricane Season,5,paperback,229,2020-02-19,2020


Use IN to search for multiple authors.
- Task 2: To find the titles of all books in the database written by either Fernanda Melchor or Annie Ernaux.

In [8]:
# Look up the ids of both authors by matching name against a literal list.
pd.read_sql_query(
    sql="""
SELECT "id","name"
FROM "authors" 
WHERE "name" IN ('Fernanda Melchor', 'Annie Ernaux');
""",
    con=conn,
)

Unnamed: 0,id,name
0,7,Annie Ernaux
1,24,Fernanda Melchor


In [None]:
# Collect the (book_id, author_id) pairs whose author_id belongs to either
# of the two named authors.
pd.read_sql_query(
    sql="""
SELECT "book_id","author_id"  
FROM "authored" 
WHERE "author_id" IN (
    SELECT "id" 
    FROM "authors" 
    WHERE "name" IN ('Fernanda Melchor', 'Annie Ernaux')
);
""",
    con=conn,
)

Unnamed: 0,book_id,author_id
0,14,24
1,48,24
2,65,7


In [11]:
# Find the titles of all books written by either of the two authors:
# author names -> author ids -> book ids -> titles, with IN at every level
# because each subquery may return several rows.
pd.read_sql_query(
"""
SELECT "title" 
FROM "books" 
WHERE "id" IN (
    SELECT "book_id"
    FROM "authored" 
    WHERE "author_id" IN (
        SELECT "id" 
        FROM "authors" 
        WHERE "name" IN ('Fernanda Melchor', 'Annie Ernaux')
    )
);
""",
conn 
)

Unnamed: 0,title
0,Paradais
1,Hurricane Season
2,The Years


In [42]:
# Close the current database connection now that these queries are finished.
conn.close()


### JOIN - combine two or more tables together

To join tables, they need a column whose values match between them (here, the shared id column); rows are paired on those matching values.

#### Tables

In [19]:
# Open a connection to sea_lions.db, the SQLite database used in CS50.
conn = sqlite3.connect(database="sea_lions.db")

# List every entry of type 'table' recorded in sqlite_master.
pd.read_sql_query(
    sql="""
    SELECT *
    FROM sqlite_master
    WHERE type = 'table';
    """,
    con=conn,
)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,sea_lions,sea_lions,3,"CREATE TABLE ""sea_lions"" (\n ""id"" INTEGER,\..."
1,table,migrations,migrations,2,"CREATE TABLE ""migrations"" (\n ""id"" INTEGER,..."


In [13]:
# Display every row of the sea_lions table (no LIMIT is applied).
pd.read_sql_query(
    sql="""
    SELECT *
    FROM "sea_lions"
    """,
    con=conn,
)

Unnamed: 0,id,name,species
0,10484,Ayah,Zalophus californianus
1,11728,Spot,Zalophus californianus
2,11729,Tiger,Zalophus californianus
3,11732,Mabel,Zalophus californianus
4,11734,Rick,Zalophus californianus
5,11790,Jolee,Zalophus californianus


In [14]:
# Display every row of the migrations table (no LIMIT is applied).
pd.read_sql_query(
    sql="""
    SELECT *
    FROM "migrations"
    """,
    con=conn,
)

Unnamed: 0,id,distance,days
0,10484,1000,107
1,11728,1531,56
2,11729,1370,37
3,11732,1622,62
4,11734,1491,58
5,11735,2723,82
6,11736,1571,52
7,11737,1957,92


#### JOIN (INNER) - retains only the rows that have matching values in both the left and the right table.

In [50]:
# Plain JOIN: pair sea_lions rows with migrations rows that share the same id.
join_inner = """
SELECT * FROM "sea_lions"
JOIN migrations ON "migrations"."id" = "sea_lions"."id";
"""
df = pd.read_sql_query(sql=join_inner, con=conn)
print(df)



      id   name                 species     id  distance  days
0  10484   Ayah  Zalophus californianus  10484      1000   107
1  11728   Spot  Zalophus californianus  11728      1531    56
2  11729  Tiger  Zalophus californianus  11729      1370    37
3  11732  Mabel  Zalophus californianus  11732      1622    62
4  11734   Rick  Zalophus californianus  11734      1491    58


#### NATURAL JOIN (INNER)

In [None]:
# NATURAL JOIN: no ON clause is written; the join columns are implied by
# the column names the two tables have in common.
join = """
SELECT * FROM "sea_lions"
NATURAL JOIN migrations;
"""
df = pd.read_sql_query(sql=join, con=conn)
print(df)

      id   name                 species  distance  days
0  10484   Ayah  Zalophus californianus      1000   107
1  11728   Spot  Zalophus californianus      1531    56
2  11729  Tiger  Zalophus californianus      1370    37
3  11732  Mabel  Zalophus californianus      1622    62
4  11734   Rick  Zalophus californianus      1491    58


Notice that the result does not have a duplicate id column in this case. Also, this join works similarly to an INNER JOIN.

### WHERE + JOIN

Add a WHERE clause after the JOIN to filter the rows of the joined table.

In [21]:
# Join on id, then keep only the joined rows whose migration distance
# exceeds 1500.
join = """
SELECT * FROM "sea_lions"
JOIN migrations ON "migrations"."id" = "sea_lions"."id"
WHERE "migrations"."distance" > 1500
;
"""
df = pd.read_sql_query(sql=join, con=conn)
print(df)

      id   name                 species     id  distance  days
0  11728   Spot  Zalophus californianus  11728      1531    56
1  11732  Mabel  Zalophus californianus  11732      1622    62


### OUTER JOINS: LEFT JOIN, RIGHT JOIN and FULL JOIN

An OUTER JOIN could lead to empty or NULL values in the joined table

#### LEFT JOIN  - retains all the rows from the left (or first) table.

In [52]:
# LEFT JOIN: every sea_lions row is kept; migrations columns are NULL
# where no migrations row has the same id.
join = """
SELECT * FROM "sea_lions"
LEFT JOIN migrations ON "migrations"."id" = "sea_lions"."id";
"""
df = pd.read_sql_query(sql=join, con=conn)
print(df)



      id   name                 species       id  distance   days
0  10484   Ayah  Zalophus californianus  10484.0    1000.0  107.0
1  11728   Spot  Zalophus californianus  11728.0    1531.0   56.0
2  11729  Tiger  Zalophus californianus  11729.0    1370.0   37.0
3  11732  Mabel  Zalophus californianus  11732.0    1622.0   62.0
4  11734   Rick  Zalophus californianus  11734.0    1491.0   58.0
5  11790  Jolee  Zalophus californianus      NaN       NaN    NaN


#### RIGHT JOIN - retains all the rows from the right (or second) table.

In [53]:
# RIGHT JOIN: every migrations row is kept; sea_lions columns are NULL
# where no sea_lions row has the same id.
# NOTE: RIGHT JOIN needs SQLite 3.39.0 or newer; older builds reject it.
# The query variable is called `join`, like the other outer-join cells,
# rather than the misleading `join_inner`.
join = '''
SELECT * FROM "sea_lions"
RIGHT JOIN migrations ON "migrations"."id" = "sea_lions"."id";
'''
df = pd.read_sql_query(join, conn)
print(df)



        id   name                 species     id  distance  days
0  10484.0   Ayah  Zalophus californianus  10484      1000   107
1  11728.0   Spot  Zalophus californianus  11728      1531    56
2  11729.0  Tiger  Zalophus californianus  11729      1370    37
3  11732.0  Mabel  Zalophus californianus  11732      1622    62
4  11734.0   Rick  Zalophus californianus  11734      1491    58
5      NaN   None                    None  11735      2723    82
6      NaN   None                    None  11736      1571    52
7      NaN   None                    None  11737      1957    92


#### FULL JOIN (OUTER) - retains all the rows from the first and second table.

In [20]:
# FULL JOIN: rows from both tables are kept; columns from the side with
# no matching id are NULL.
# NOTE: FULL JOIN needs SQLite 3.39.0 or newer.
join = """
SELECT * FROM "sea_lions"
FULL JOIN migrations ON "migrations"."id" = "sea_lions"."id";
"""
df = pd.read_sql_query(sql=join, con=conn)
print(df)



        id   name                 species       id  distance   days
0  10484.0   Ayah  Zalophus californianus  10484.0    1000.0  107.0
1  11728.0   Spot  Zalophus californianus  11728.0    1531.0   56.0
2  11729.0  Tiger  Zalophus californianus  11729.0    1370.0   37.0
3  11732.0  Mabel  Zalophus californianus  11732.0    1622.0   62.0
4  11734.0   Rick  Zalophus californianus  11734.0    1491.0   58.0
5  11790.0  Jolee  Zalophus californianus      NaN       NaN    NaN
6      NaN   None                    None  11735.0    2723.0   82.0
7      NaN   None                    None  11736.0    1571.0   52.0
8      NaN   None                    None  11737.0    1957.0   92.0


In [17]:
# Close the current database connection now that the JOIN examples are finished.
conn.close()

### SETS

On running a query, the results we see are called a "result set". This is a kind of set in SQL.

In [22]:
# Open a connection to longlist.db, the SQLite database used in CS50.
conn = sqlite3.connect(database="longlist.db")

#### INTERSECT - keeps only the rows that appear in both result sets.

In [26]:
# Names present in both the translators and the authors result sets.
pd.read_sql_query(
    sql="""
SELECT "name" FROM "translators"
INTERSECT 
SELECT "name" FROM "authors";
""",
    con=conn,
)


Unnamed: 0,name
0,Ngũgĩ wa Thiong'o


#### UNION - combines both result sets, keeping each distinct row only once.

In [27]:
# Names from translators and authors combined into one result set.
pd.read_sql_query(
    sql="""
SELECT "name" FROM "translators"
UNION 
SELECT "name" FROM "authors";
""",
    con=conn,
)

Unnamed: 0,name
0,Adania Shibli
1,Adrian Nathan West
2,Ahmed Saadawi
3,Alia Trabucco Zerán
4,Alison L. Strayer
...,...
140,Willem Anker
141,Wu Ming-Yi
142,Yōko Ogawa
143,Zou Jingzhi


Every author and every translator is included in this result set, but only once!

#### EXCEPT - keeps the rows of the first result set that do not appear in the second.

In [28]:
# Translator names that do not also appear among the author names.
pd.read_sql_query(
    sql="""
SELECT "name" FROM "translators"
EXCEPT 
SELECT "name" FROM "authors";
""",
    con=conn,
)

Unnamed: 0,name
0,Adrian Nathan West
1,Alison L. Strayer
2,Angela Rodel
3,Aniruddhan Vasudevan
4,Anna Moschovakis
...,...
68,Sophie Lewis
69,Sora Kim-Russell
70,Stephen Snyder
71,Susan Bernofsky


### AS

In [44]:
# Select every column and row of authors as the starting point for the
# AS examples.
pd.read_sql_query(
    sql="""
SELECT *
FROM "authors";
""",
    con=conn,
)

Unnamed: 0,id,name,country,birth
0,1,Adania Shibli,Palestine,1974
1,2,Ahmed Saadawi,Iraq,1973
2,3,Alia Trabucco Zerán,Chile,1983
3,4,Amanda Svensson,Sweden,1987
4,5,Andrey Kurkov,Ukraine,1961
...,...,...,...,...
67,68,Virginie Despentes,France,1969
68,69,Willem Anker,South Africa,1979
69,70,Wu Ming-Yi,Taiwan,1971
70,71,Yōko Ogawa,Japan,1962


##### ADD COLUMN using AS

- Task 1: Select all authors and add a column "profession" with a label 'author'

In [47]:
# Add a constant column: the literal 'author' aliased as "profession".
pd.read_sql_query(
    sql="""
SELECT 'author' AS "profession", "name" 
FROM "authors";
""",
    con=conn,
)

Unnamed: 0,profession,name
0,author,Adania Shibli
1,author,Ahmed Saadawi
2,author,Alia Trabucco Zerán
3,author,Amanda Svensson
4,author,Andrey Kurkov
...,...,...
67,author,Virginie Despentes
68,author,Willem Anker
69,author,Wu Ming-Yi
70,author,Yōko Ogawa


- Task 2: Select all translators, labeling them as translators

In [48]:
# Add a constant column: the literal 'translator' aliased as "profession".
pd.read_sql_query(
    sql="""
SELECT 'translator' AS "profession", "name" 
FROM "translators";
""",
    con=conn,
)

Unnamed: 0,profession,name
0,translator,Adrian Nathan West
1,translator,Alison L. Strayer
2,translator,Angela Rodel
3,translator,Aniruddhan Vasudevan
4,translator,Anna Moschovakis
...,...,...
69,translator,Sora Kim-Russell
70,translator,Stephen Snyder
71,translator,Susan Bernofsky
72,translator,Tiffany Tsao


- Task 3: Combine authors and translators into one result set


In [50]:
# Label each row with its profession and combine both SELECTs with UNION.
pd.read_sql_query(
    sql="""
SELECT 'author' AS "profession", "name" 
FROM "authors"
UNION
SELECT 'translator' AS "profession", "name" FROM "translators";
""",
    con=conn,
)

Unnamed: 0,profession,name
0,author,Adania Shibli
1,author,Ahmed Saadawi
2,author,Alia Trabucco Zerán
3,author,Amanda Svensson
4,author,Andrey Kurkov
...,...,...
141,translator,Sophie Lewis
142,translator,Sora Kim-Russell
143,translator,Stephen Snyder
144,translator,Susan Bernofsky


#### RENAME COLUMN using AS

In [51]:
# Rename book_id to "ID" in the output. No LIMIT is applied, so every row
# of ratings is returned.
pd.read_sql_query(
    sql="""
SELECT "book_id" AS "ID"
FROM "ratings";
""",
    con=conn,
)

Unnamed: 0,ID
0,1
1,1
2,1
3,1
4,1
...,...
604168,78
604169,78
604170,78
604171,78


### GROUP BY

In [31]:
# One output row per book_id: the average rating and the number of ratings.
pd.read_sql_query(
    sql="""
SELECT "book_id", AVG("rating"), COUNT("rating")
FROM "ratings"
GROUP BY "book_id";
""",
    con=conn,
)

Unnamed: 0,book_id,"AVG(""rating"")","COUNT(""rating"")"
0,1,3.774019,2779
1,2,3.971429,175
2,3,3.043860,114
3,4,3.568323,322
4,5,4.059198,3142
...,...,...,...
73,74,3.831412,14052
74,75,3.769430,772
75,76,3.893714,12250
76,77,3.539742,4567


In [39]:
# Same per-book aggregation, with the average aliased as "average rating"
# and the groups sorted by that alias, highest first.
pd.read_sql_query(
    sql="""
SELECT "book_id", AVG("rating") AS "average rating", COUNT("rating")
FROM "ratings"
GROUP BY "book_id"
ORDER BY "average rating" DESC;
""",
    con=conn,
)

Unnamed: 0,book_id,average rating,"COUNT(""rating"")"
0,42,4.511254,16350
1,22,4.501044,479
2,45,4.191165,1245
3,65,4.175450,16888
4,11,4.140578,7647
...,...,...,...
73,32,3.294737,95
74,23,3.248219,1684
75,58,3.176692,266
76,62,3.158228,1738


### HAVING + GROUP BY - Conditions on Groups

The HAVING keyword specifies a condition on groups, whereas WHERE can only specify conditions on individual rows.

In [53]:
# Keep only the groups whose "average rating" (the average rounded to
# three decimals) is greater than 4.17.
pd.read_sql_query(
    sql="""
SELECT "book_id", ROUND(AVG("rating"),3) AS "average rating", COUNT("rating")
FROM "ratings"
GROUP BY "book_id"
HAVING "average rating" > 4.17;
""",
    con=conn,
)

Unnamed: 0,book_id,average rating,"COUNT(""rating"")"
0,22,4.501,479
1,42,4.511,16350
2,45,4.191,1245
3,65,4.175,16888
