# From SQL to pandas challenge 11

In [1]:
# import libraries
import pandas as pd

# load data
# This code is made to load our data stored on Google Drive
def gd_path(file_id):
    """Generate a shareable link from Google Drive file id."""
    return f"https://drive.google.com/uc?export=download&id={file_id}"

# Google Drive file ids
files_id = {
    "titleauthor": "1F1JOiYXStWacOBca6coNVfyVtoST7ZgD",
    "titles": "1PLdn50N9GRa53ZbuVWo0l47F_IXdvlEm",
    "stores": "1f-GCgip7O93CpbAkYvOsc21eKnSOSHsQ", 
    "sales": "1fzFc9rwYmVIPaGOFmhLVxCi3kg19vNU2", 
    "roysched": "1zPRZPoFPEMKyrNR5VSENeYFHGCBZmxbs", 
    "publishers": "1s9E8_AVOziTrowb3wyh2jg3PV763VOyq",
    "pub_info": "1OEgogcGKy--EpuVj0kqq7lyBZNGW6YSv", 
    "jobs": "1V1Za8hUdXD-vJOyRdX4aQV5wanIff2eM", 
    "employee": "1h9mUjsVqpP74b1w0x7KOw37n_n9Ulkt5", 
    "discounts": "111dvSxMcCsTgOuV1wDSKFJxO1Xcxd9VS", 
    "authors": "1fEF89Nhe61EebAljKlwFwfEuokK0o6aJ"
}

# Read data from Google Drive
sales = pd.read_csv(gd_path(files_id["sales"]), sep=";")
titles = pd.read_csv(gd_path(files_id["titles"]), sep=";")
publishers = pd.read_csv(gd_path(files_id["publishers"]), sep=";")
employee = pd.read_csv(gd_path(files_id["employee"]), sep=";")
authors = pd.read_csv(gd_path(files_id["authors"]), sep=";")
titleauthor = pd.read_csv(gd_path(files_id["titleauthor"]), sep=";")
roysched = pd.read_csv(gd_path(files_id["roysched"]), sep=";")

## 1. Using LEFT JOIN: in which cities has "Is Anger the Enemy?" been sold?

In [4]:
# publishers.merge(titles,on='pub_id').loc[titles['title']=='Is Anger the Enemy?']
publishers.merge(titles,on='pub_id').query("title =='Is Anger the Enemy?'")['city']

1    Boston
Name: city, dtype: object

### Hint:

In SQL the syntax is:

```sql
SELECT p.city
FROM publishers AS p
LEFT JOIN titles AS t
ON p.pub_id = t.pub_id
WHERE t.title = 'Is Anger the Enemy?';
```

## 2. Select all the book titles that have a link to the employee Howard Snyder 
    (he works for the publisher that has published those books).

In [8]:
# employee.merge(titles,on='pub_id').loc[employee['fname']=='Howard']
employee.merge(titles,on='pub_id').query("fname =='Howard'&lname=='Snyder'")['title']

135                  You Can Combat Computer Stress!
136                              Is Anger the Enemy?
137                                Life Without Fear
138    Prolonged Data Deprivation: Four Case Studies
139              Emotional Security: A New Algorithm
Name: title, dtype: object

### Hint:

In SQL the syntax is:

```sql
SELECT t.title
FROM employee e
JOIN titles t
ON e.pub_id = t.pub_id
WHERE e.fname = 'Howard'
AND e.lname = 'Snyder';
```

## 3. Using the `merge` of your choice: Select the book title with highest number of sales (qty)

In [10]:
sales.merge(titles,on='title_id').groupby('title_id').agg({'qty':sum}).sort_values(by='qty',ascending=False).head(1)

Unnamed: 0_level_0,qty
title_id,Unnamed: 1_level_1
PS2091,108


### Hint:

In SQL the syntax is:

```sql
SELECT t.title, SUM(qty)
FROM sales AS s 
JOIN titles t
ON s.title_id = t.title_id
GROUP BY t.title_id
ORDER BY SUM(qty) desc
LIMIT 1;
```

# 4. Select all book titles and the full name of their author(s).
      
      - If a book has multiple authors, all authors must be displayed (in 
      multiple rows).
      
      - Books with no authors and authors with no books should not be displayed.

In [11]:
titles.merge(titleauthor,on='title_id').merge(authors,on='au_id')[['title','au_fname','au_lname']]

Unnamed: 0,title,au_fname,au_lname
0,The Busy Executive's Database Guide,Marjorie,Green
1,You Can Combat Computer Stress!,Marjorie,Green
2,The Busy Executive's Database Guide,Abraham,Bennet
3,Cooking with Computers: Surreptitious Balance ...,Michael,O'Leary
4,"Sushi, Anyone?",Michael,O'Leary
5,Cooking with Computers: Surreptitious Balance ...,Stearns,MacFeather
6,Computer Phobic AND Non-Phobic Individuals: Be...,Stearns,MacFeather
7,Straight Talk About Computers,Dean,Straight
8,Silicon Valley Gastronomic Treats,Innes,del Castillo
9,The Gourmet Microwave,Michel,DeFrance


### Hint:

In SQL the syntax is:

```sql
SELECT
    t.title,
    a.au_fname,
    a.au_lname
FROM titles t
INNER JOIN titleauthor ta 
ON t.title_id = ta.title_id
INNER JOIN authors a 
ON ta.au_id = a.au_id;
```

## 5. Select the full name of authors of Psychology books

   Bonus hint: if you want to prevent duplicates but allow authors with shared
   last names to be displayed, you can concatenate the first and last names
   with CONCAT(), and use the DISTINCT clause on the concatenated names.

In [34]:
# [["au_fname", "au_lname"]].apply("-".join, axis=1)
authors.merge(titleauthor,on='au_id').merge(titles,on='title_id').query("type=='psychology'")[["au_fname", "au_lname"]].apply("-".join, axis=1).unique()


array(['Johnson-White', 'Charlene-Locksley', 'Stearns-MacFeather',
       'Livia-Karsen', 'Anne-Ringer', 'Albert-Ringer'], dtype=object)

### Hint:

In SQL the syntax is:

```sql
SELECT DISTINCT CONCAT(a.au_fname, " ", a.au_lname) AS full_name
FROM authors a
INNER JOIN titleauthor ta ON a.au_id = ta.au_id
INNER JOIN titles t ON ta.title_id = t.title_id
WHERE t.type = "Psychology";
```

## 6. Explore the table roysched and try to grasp the meaning of each column. 
   The notes below will help:
   
   - "Royalty" means the percentage of the sale price paid to the author(s).
   
   - Sometimes, the royalty may be smaller for the first few sales (which have
     to cover the publishing costs to the publisher) but higher for the sales 
     above a certain threshold.
     
   - In the "roysched" table each title_id can appear multiple times, with
     different royalty values for each range of sales.
     
   - Select all rows for particular title_id, for example "BU1111", and explore
	 the data.

In [13]:
roysched[roysched['title_id']=='BU1111']

Unnamed: 0,title_id,lorange,hirange,royalty
49,BU1111,0,4000,10
50,BU1111,4001,8000,12
51,BU1111,8001,10000,14
52,BU1111,12001,16000,16
53,BU1111,16001,20000,18
54,BU1111,20001,24000,20
55,BU1111,24001,28000,22
56,BU1111,28001,50000,24


### Hint:

In SQL the syntax is:

```sql
SELECT * FROM roysched WHERE title_id = "BU1111";
```

## 7. Select all the book titles and the maximum royalty they can reach.
    Display only titles that are present in the roysched table.

In [25]:
# titles.merge(roysched,on='title_id').groupby('title').agg({'royalty':max}).sort_values(by='royalty',ascending=False)[['title','royalty']]
# assuming a match of royalty in both tables -> absent in roysched table
titles.merge(roysched,on='title_id').query("royalty_x == royalty_y").groupby('title').agg({'royalty_x':max}).sort_values(by='royalty_x',ascending=False)


Unnamed: 0_level_0,royalty_x
title,Unnamed: 1_level_1
The Gourmet Microwave,24
You Can Combat Computer Stress!,24
But Is It User Friendly?,16
Fifty Years in Buckingham Palace Kitchens,14
Is Anger the Enemy?,12
Silicon Valley Gastronomic Treats,12
Computer Phobic AND Non-Phobic Individuals: Behavior Variations,10
Cooking with Computers: Surreptitious Balance Sheets,10
Emotional Security: A New Algorithm,10
Life Without Fear,10


### Hint:

In SQL the syntax is:

```sql
SELECT t.title, MAX(r.royalty) max_royalty
FROM titles t
INNER JOIN roysched r 
ON t.title_id = r.title_id
GROUP BY t.title
ORDER BY max_royalty DESC;
```