# From SQL to pandas challenge 10

In [27]:

# import libraries
import pandas as pd
import numpy as np

# load data
# This code is made to load our data stored on Google Drive
def gd_path(file_id):
    """Generate a shareable link from Google Drive file id."""
    return f"https://drive.google.com/uc?export=download&id={file_id}"

# Google Drive file ids
files_id = {
    "titleauthor": "1F1JOiYXStWacOBca6coNVfyVtoST7ZgD",
    "titles": "1PLdn50N9GRa53ZbuVWo0l47F_IXdvlEm",
    "stores": "1f-GCgip7O93CpbAkYvOsc21eKnSOSHsQ", 
    "sales": "1fzFc9rwYmVIPaGOFmhLVxCi3kg19vNU2", 
    "roysched": "1zPRZPoFPEMKyrNR5VSENeYFHGCBZmxbs", 
    "publishers": "1s9E8_AVOziTrowb3wyh2jg3PV763VOyq",
    "pub_info": "1OEgogcGKy--EpuVj0kqq7lyBZNGW6YSv", 
    "jobs": "1V1Za8hUdXD-vJOyRdX4aQV5wanIff2eM", 
    "employee": "1h9mUjsVqpP74b1w0x7KOw37n_n9Ulkt5", 
    "discounts": "111dvSxMcCsTgOuV1wDSKFJxO1Xcxd9VS", 
    "authors": "1fEF89Nhe61EebAljKlwFwfEuokK0o6aJ"
}

# Read data from Google Drive
sales = pd.read_csv(gd_path(files_id["sales"]), sep=";")
titles = pd.read_csv(gd_path(files_id["titles"]), sep=";")
publishers = pd.read_csv(gd_path(files_id["publishers"]), sep=";")



## 1. Select everything from the sales table and create a new column called "sales_category" to categorise qty:
   
		qty >= 50 high sales
		20 <= qty < 50 medium sales
		qty < 20 low sales

In [2]:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html
# https://stackoverflow.com/questions/32633977/how-to-create-categorical-variable-based-on-a-numerical-variable

sales.loc[(sales['qty'] >=50),'sales_category']='high_sales'
sales.loc[(sales['qty'] <50)&(sales['qty'] >=20),'sales_category'] = 'medium_sales'
sales.loc[(sales['qty'] <20),'sales_category']='low_sales'

sales.head(3)

Unnamed: 0,stor_id,ord_num,ord_date,qty,payterms,title_id,sales_category
0,6380,6871,1994-09-14 00:00:00,5,Net 60,BU1032,low_sales
1,6380,722a,1994-09-13 00:00:00,3,Net 60,PS2091,low_sales
2,7066,A2976,1993-05-24 00:00:00,50,Net 30,PC8888,high_sales


### Hint:

In SQL the syntax is:

```sql
SELECT *,
CASE
    WHEN qty >= 50 THEN "high sales"
    WHEN qty >= 20 THEN "medium sales"
    ELSE "low sales"
END AS sales_category
FROM sales;
```

## 2. Adding to your answer from the previous question. Find out the total amount of books sold (qty) in each sales category 
    i.e. How many books had high sales, how many had medium sales, and how many had low sales

In [7]:
print(sales[sales['sales_category']=='high_sales']['qty'].sum())
print(sales[sales['sales_category']=='medium_sales']['qty'].sum())
print(sales[sales['sales_category']=='low_sales']['qty'].sum())

125
285
83


### Hint:

In SQL the syntax is:

```sql
SELECT sum(qty),
CASE
	WHEN qty>=50 THEN 'high sales'
    WHEN (qty>=20 AND qty<50) THEN 'medium sales'
    ELSE 'low sales'
END AS sales_category
FROM sales
GROUP BY sales_category;
```

## 3. Adding to your answer from the previous questions: output only those sales categories that have a SUM(qty) greater than 100, and order them in descending order

In [24]:
sales.groupby('sales_category').sum().query('qty > 100').sort_values(by='qty',ascending=False)

Unnamed: 0_level_0,stor_id,qty
sales_category,Unnamed: 1_level_1,Unnamed: 2_level_1
medium_sales,80836,285
high_sales,14132,125


### Hint:

In SQL the syntax is:

```sql
SELECT sum(qty),
CASE
    WHEN qty>=50 THEN 'high sales'
    WHEN (qty>=20 AND qty<50) THEN 'medium sales'
    ELSE 'low sales'
END AS sales_category
FROM sales
GROUP BY sales_category
HAVING sum(qty)>100
ORDER BY sum(qty) DESC;
```

## 4. Find out the average book price, per publisher, for the following book types and price categories:
		book types: business, traditional cook and psychology
		price categories: <= 5 super low, <= 10 low, <= 15 medium, > 15 high
        
        - When displaying the average prices, use ROUND() to hide decimals.

In [46]:
# titles.head(3)

titles.loc[(titles['price'] <=5),'price_category'] = 'super_low'
titles.loc[(titles['price'] <=10)&(titles['price'] >5),'price_category'] = 'low'
titles.loc[(titles['price'] >10)&(titles['price'] <=15),'price_category'] = 'medium'
titles.loc[(titles['price'] >15),'price_category'] = 'high'
df = titles.merge(publishers,on='pub_id').loc[titles['type'].isin(['business', 'trad_cook', 'psychology'])]
# df['price'].apply(np.mean).apply(np.floor)
df.groupby(['pub_name','type','price_category'])['price'].apply(np.mean).apply(np.floor)


pub_name              type          price_category
Algodata Infosystems  business      high              19.0
                                    medium            11.0
                      popular_comp  high              22.0
Binnet & Hardley      UNDECIDED     super_low          0.0
                      mod_cook      high              19.0
                                    super_low          2.0
                      psychology    high              21.0
                      trad_cook     high              20.0
                                    medium            13.0
New Moon Books        psychology    low                7.0
Name: price, dtype: float64

### Hint:

In SQL the syntax is:

```sql
SELECT
    ROUND(AVG(price)),
    type,
    pub_name,
CASE
    WHEN price <= 5 THEN 'super low'
    WHEN (price > 5 AND price <= 10) THEN 'low'
    WHEN (price > 10 AND price <= 15) THEN 'medium'
    ELSE 'high'
END AS price_category
FROM titles
LEFT JOIN publishers
ON titles.pub_id=publishers.pub_id
GROUP BY
    pub_name,
    type,
    price_category
HAVING
    type IN ('business', 'trad_cook', 'psychology');
```