# Project: Answering Business Questions using SQL

# Exploring contents of chinook.db

In [1]:
import sqlite3 as sq3;
import pandas as pd;

def run_query(q):
    with sq3.connect('chinook.db') as conn:
        return pd.read_sql(q,conn)

def run_command(q):
    with sq3.connect('chinook.db') as conn:
        return conn.execute(q)

def show_tables():
        q = "SELECT name, type \
        FROM sqlite_master \
        WHERE type IN (\"table\",\"view\");" ;
    
        print(run_query(q));
    


In [2]:
show_tables()

              name   type
0            album  table
1           artist  table
2         customer  table
3         employee  table
4            genre  table
5          invoice  table
6     invoice_line  table
7       media_type  table
8         playlist  table
9   playlist_track  table
10           track  table


# Exploring which genres sell the most tracks in the USA:

In [3]:
q1 ='''
    WITH 
    usa_cust AS
    (
        SELECT * FROM customer
        WHERE country = "USA"
    ),
    
    genre_qy AS
    (
        SELECT 
            g.name genre, 
            SUM(il.quantity) ct_gen,
            uc.country
        FROM usa_cust uc
        INNER JOIN invoice i ON i.customer_id = uc.customer_id
        INNER JOIN invoice_line il ON il.invoice_id = i.invoice_id
        INNER JOIN track t ON t.track_id = il.track_id
        INNER JOIN genre g ON g.genre_id = t.genre_id
        GROUP BY g.name
        ORDER BY SUM(il.quantity) DESC
    ),
    
    sum_ct AS
    (
        SELECT
            gqy.*,
            (SELECT SUM(gqy.ct_gen) FROM genre_qy gqy) ct_sum
        FROM genre_qy gqy
    )
    
    SELECT
        gqy.genre,
        gqy.ct_gen,
        CAST(gqy.ct_gen AS FLOAT)/CAST(sc.ct_sum AS FLOAT) ct_gen_pct
    FROM genre_qy gqy
    LEFT JOIN sum_ct sc ON sc.genre = gqy.genre
    LIMIT 10;
        
    '''




In [4]:
run_query(q1)

Unnamed: 0,genre,ct_gen,ct_gen_pct
0,Rock,561,0.533777
1,Alternative & Punk,130,0.123692
2,Metal,124,0.117983
3,R&B/Soul,53,0.050428
4,Blues,36,0.034253
5,Alternative,35,0.033302
6,Latin,22,0.020932
7,Pop,22,0.020932
8,Hip Hop/Rap,20,0.019029
9,Jazz,14,0.013321


> 
Based on the above data the following artists are selected based on the general demand for tracks in the artist's genre:
> 
| Artist Name | Genre |
|-------------|-------|
| Red Tone | Punk |
| Slim Jim Bites | Blues |
| Meteor and the Girls | Pop |

# Exploring sales' amounts assigned to each sales support agent:

In [5]:
q2 ='''
WITH
sales_agent AS
(
    SELECT 
    (e.first_name || ' ' || e.last_name) employee_name,
    e.title employee_title,
    e.hire_date employee_startdate,
    SUM(i.total) employee_sales_figures
    FROM employee e
    LEFT JOIN customer c ON c.support_rep_id = e.employee_id
    LEFT JOIN invoice i ON i.customer_id = c.customer_id
    GROUP BY 1
    ORDER BY 2 DESC
)

SELECT * FROM sales_agent

'''



In [6]:
run_query(q2)

Unnamed: 0,employee_name,employee_title,employee_startdate,employee_sales_figures
0,Jane Peacock,Sales Support Agent,2017-04-01 00:00:00,1731.51
1,Margaret Park,Sales Support Agent,2017-05-03 00:00:00,1584.0
2,Steve Johnson,Sales Support Agent,2017-10-17 00:00:00,1393.92
3,Nancy Edwards,Sales Manager,2016-05-01 00:00:00,
4,Laura Callahan,IT Staff,2017-03-04 00:00:00,
5,Robert King,IT Staff,2017-01-02 00:00:00,
6,Michael Mitchell,IT Manager,2016-10-17 00:00:00,
7,Andrew Adams,General Manager,2016-08-14 00:00:00,


>**As expected only Sales Support Agents have sales figures associated with them and the earlier an Agent started work the higher the person's sales figures**

# Exploring data on purchases from different countries:

In [7]:
q3 = '''
WITH
fq AS
(
    SELECT 
        c.country nation,
        COUNT(distinct(c.customer_id)) cust_count,
        SUM(i.total) total_sales,
        CAST(SUM(i.total) AS FLOAT)/CAST(COUNT(distinct(c.customer_id)) AS FLOAT) avg_customer_sale_value,
        CAST(SUM(i.total) AS FLOAT)/CAST(COUNT(distinct(i.invoice_id)) AS FLOAT)  avg_order_value
    FROM customer c
    INNER JOIN invoice i ON i.customer_id = c.customer_id
    GROUP BY c.country
    ORDER BY SUM(i.total) DESC
),

sq AS
(
    SELECT
        CASE
            WHEN cust_count = 1 THEN "Other"
            ELSE nation
        END AS country,
        SUM(cust_count) cust_count,
        SUM(total_sales) total_sales,
        AVG(avg_customer_sale_value) avg_customer_sale_value,
        AVG(avg_order_value) avg_order_value
    FROM fq
    GROUP BY country
    ORDER BY SUM(total_sales) DESC
),

tq AS
(
    SELECT
        country,
        cust_count,
        total_sales,
        avg_customer_sale_value,
        avg_order_value
    FROM
        (
        SELECT 
            s.*,
            CASE 
                WHEN s.country = "Other" THEN 1
                ELSE 0
            END AS flag
        FROM sq s
        ORDER BY flag ASC
        )
    
)

SELECT * FROM tq

'''

In [8]:
run_query(q3)

Unnamed: 0,country,cust_count,total_sales,avg_customer_sale_value,avg_order_value
0,USA,13,1040.49,80.037692,7.942672
1,Canada,8,535.59,66.94875,7.047237
2,Brazil,5,427.68,85.536,7.011148
3,France,5,389.07,77.814,7.7814
4,Germany,4,334.62,83.655,8.161463
5,Czech Republic,2,273.24,136.62,9.108
6,United Kingdom,3,245.52,81.84,8.768571
7,Portugal,2,185.13,92.565,6.383793
8,India,2,183.15,91.575,8.721429
9,Other,15,1094.94,72.996,7.445071


> Based on avg_order_value the following countries are recommended as countries likely to have the most potential for growth:

>    |country|avg_order_value (USD)|
>    |-------|---------------------|
>    |Czech Republic| 9.108|
>    |United Kingdom| 8.768|
>    |India| 8.721|
    


# Exploring whether invoices are album purchases or not

In [9]:
q4 = '''
WITH
    fq AS
    (
        SELECT 
            il.invoice_id,
            MIN(il.track_Id) album_first_track
        FROM invoice_line il
        GROUP BY il.invoice_id
    ),
    
    sq AS
    (
        SELECT 
            COUNT(f.invoice_id) Invoice_Tally,
            CASE
                WHEN
                    (
                    SELECT il.track_id from invoice_line il
                    WHERE il.invoice_id = f.invoice_id
                    
                    EXCEPT 
                    
                    SELECT t.track_id FROM track t
                    WHERE t.album_id = (
                                        SELECT album_id FROM track
                                        WHERE track_id = f.album_first_track
                                        )
                    
                    )IS NULL
                    
                    AND
        
                    (
                    SELECT t.track_id FROM track t
                    WHERE t.album_id = (
                                        SELECT album_id FROM track
                                        WHERE track_id = f.album_first_track
                                        )
                    
                    EXCEPT
                    
                    SELECT il.track_id from invoice_line il
                    WHERE il.invoice_id = f.invoice_id
                    
                    )IS NULL
                
                THEN "Yes"
                ELSE "No"
            END AS Album_Purchase
        FROM fq f
        GROUP BY Album_Purchase
                
    ),

    tq AS
    (
        SELECT
            s.Album_Purchase,
            s.Invoice_Tally,
            CAST(s.Invoice_Tally AS FLOAT)/
            (SELECT 
                SUM(s.Invoice_Tally)
            FROM sq s) Invoice_Percent
        FROM sq s
    )
    

SELECT * FROM tq
'''

In [10]:
run_query(q4)

Unnamed: 0,Album_Purchase,Invoice_Tally,Invoice_Percent
0,No,500,0.814332
1,Yes,114,0.185668


> If the demand trends indicated in the data presented above were to remain the same, it would benefit the Chinook store from a fiscal point of view to minimize the purchasing of entire albums.

# Exploring which artist is in most playlists:


In [11]:
q5 = '''

    WITH
    fq AS
    (
        SELECT 
            ar.name Artist_Name,
            COUNT(pl.playlist_id) Artist_Playlist_Tally
        FROM artist ar
        LEFT JOIN album al ON al.artist_id = ar.artist_id
        LEFT JOIN track t ON t.album_id = al.album_id
        LEFT JOIN playlist_track plt ON plt.track_id = t.track_id
        LEFT JOIN playlist pl ON pl.playlist_id = plt.playlist_id
        GROUP BY ar.name
        ORDER BY COUNT(pl.playlist_id) DESC
        LIMIT 10
    
    ),
    
    sq AS
    (
        SELECT 
            f.Artist_Name,
            MAX(f.Artist_Playlist_Tally) Playlist_Tally
        FROM fq f
    )
    
    SELECT * FROM sq
    
'''

In [12]:
run_query(q5)

Unnamed: 0,Artist_Name,Playlist_Tally
0,Iron Maiden,516


>Iron Maiden is the artist name listed most frequently in playlists

# Exploring how many tracks have been purchased vs not purchased

In [13]:
q6 ='''

    WITH
    fq AS
    (
        SELECT 
            t.track_id
        FROM track t
        
    ),
    
    sq AS
    (
        SELECT
            COUNT(DISTINCT(f.track_id)) Track_Tally,
            CASE
                WHEN
                    (
                    SELECT 
                        t.track_id FROM track t
                    WHERE t.track_id = f.track_id
                    
                    EXCEPT
                    
                    SELECT
                        il.track_id FROM invoice_line il
                    WHERE il.invoice_line_id=(
                                            SELECT 
                                                invoice_line_id 
                                            FROM invoice_line
                                            WHERE track_id=f.track_id
                                            )
                    )IS NULL
                    
                    AND
                    
                    (
                    SELECT
                        il.track_id FROM invoice_line il
                    WHERE il.invoice_line_id=(
                                            SELECT 
                                                invoice_line_id 
                                            FROM invoice_line
                                            WHERE track_id=f.track_id
                                            )
                    EXCEPT
                    
                    SELECT 
                        t.track_id FROM track t
                    WHERE t.track_id = f.track_id
                    )IS NULL
                
                THEN "YES"
                ELSE "NO"
            END AS Track_Purchase
         FROM fq f
         GROUP BY Track_Purchase 

    ),
    
    tq AS
    (
        SELECT
            COUNT(DISTINCT(il.track_id))
        FROM invoice_line il
    ),
    ftq AS
    (
        SELECT
            COUNT(DISTINCT(t.track_id))
        FROM track t
    ),
    fvq AS
    (
        SELECT
            s.Track_Purchase,
            s.Track_Tally,
            CAST(s.Track_Tally AS FLOAT)/
            (SELECT SUM(s.Track_Tally) FROM sq s) Track_Tally_Percent
        FROM sq s
    )
    
    
SELECT * FROM fvq
'''

In [14]:
run_query(q6)

Unnamed: 0,Track_Purchase,Track_Tally,Track_Tally_Percent
0,NO,1697,0.484442
1,YES,1806,0.515558


>More tracks purchased vs. not purchased (1806:1697)

# Exploring if the range of tracks in the store is reflective of the tracks' sales popularity

In [15]:
q7 ='''

    WITH
    fq AS
    (
        SELECT 
            t.track_id,
            g.name genre_name
        FROM track t
        INNER JOIN genre g ON g.genre_id = t.genre_id
        
    ),
    
    sq AS
    (
        SELECT
            COUNT(DISTINCT(f.track_id)) Track_Tally,
            f.genre_name,
            CASE
                WHEN
                    (
                    SELECT 
                        t.track_id FROM track t
                    WHERE t.track_id = f.track_id
                    
                    EXCEPT
                    
                    SELECT
                        il.track_id FROM invoice_line il
                    WHERE il.invoice_line_id=(
                                            SELECT 
                                                invoice_line_id 
                                            FROM invoice_line
                                            WHERE track_id=f.track_id
                                            )
                    )IS NULL
                    
                    AND
                    
                    (
                    SELECT
                        il.track_id FROM invoice_line il
                    WHERE il.invoice_line_id=(
                                            SELECT 
                                                invoice_line_id 
                                            FROM invoice_line
                                            WHERE track_id=f.track_id
                                            )
                    EXCEPT
                    
                    SELECT 
                        t.track_id FROM track t
                    WHERE t.track_id = f.track_id
                    )IS NULL
                
                THEN "YES"
                ELSE "NO"
            END AS Track_Purchase
         FROM fq f
         GROUP BY Track_Purchase,f.genre_name 
         ORDER BY f.genre_name
    ),
    
    tq AS
    (
        SELECT
            COUNT(DISTINCT(il.track_id))
        FROM invoice_line il
    ),
    ftq AS
    (
        SELECT
            COUNT(DISTINCT(t.track_id))
        FROM track t
    )
    
SELECT * FROM sq
'''

In [16]:
run_query(q7)

Unnamed: 0,Track_Tally,genre_name,Track_Purchase
0,6,Alternative,NO
1,34,Alternative,YES
2,156,Alternative & Punk,NO
3,176,Alternative & Punk,YES
4,25,Blues,NO
5,56,Blues,YES
6,15,Bossa Nova,NO
7,58,Classical,NO
8,16,Classical,YES
9,17,Comedy,NO


>When tracks from the store are examined for their sales popularity based on their genres, it is clear that there are more distinct genres that have none or fewer number of tracks sold. Therefore, in general the range of tracks in the store is not reflective of their sales popularity.

# Exploring if protected vs. non-protected media types have an effect on popularity

In [17]:
q8 ='''

    WITH
    fq AS
    (
        SELECT 
            t.track_id,
            m.name media_type_name
        FROM track t
        INNER JOIN media_type m ON m.media_type_id = t.media_type_id
        
    ),
    
    sq AS
    (
        SELECT
            COUNT(DISTINCT(f.track_id)) Track_Tally,
            f.media_type_name,
            CASE
                WHEN
                    (
                    SELECT 
                        t.track_id FROM track t
                    WHERE t.track_id = f.track_id
                    
                    EXCEPT
                    
                    SELECT
                        il.track_id FROM invoice_line il
                    WHERE il.invoice_line_id=(
                                            SELECT 
                                                invoice_line_id 
                                            FROM invoice_line
                                            WHERE track_id=f.track_id
                                            )
                    )IS NULL
                    
                    AND
                    
                    (
                    SELECT
                        il.track_id FROM invoice_line il
                    WHERE il.invoice_line_id=(
                                            SELECT 
                                                invoice_line_id 
                                            FROM invoice_line
                                            WHERE track_id=f.track_id
                                            )
                    EXCEPT
                    
                    SELECT 
                        t.track_id FROM track t
                    WHERE t.track_id = f.track_id
                    )IS NULL
                
                THEN "YES"
                ELSE "NO"
            END AS Track_Purchase
         FROM fq f
         GROUP BY Track_Purchase,f.media_type_name 
         ORDER BY f.media_type_name
    ),
    
    tq AS
    (
        SELECT
            COUNT(DISTINCT(il.track_id))
        FROM invoice_line il
    ),
    ftq AS
    (
        SELECT
            COUNT(DISTINCT(t.track_id))
        FROM track t
    )
    
SELECT * FROM sq
'''

In [18]:
run_query(q8)

Unnamed: 0,Track_Tally,media_type_name,Track_Purchase
0,3,AAC audio file,NO
1,8,AAC audio file,YES
2,1393,MPEG audio file,NO
3,1641,MPEG audio file,YES
4,86,Protected AAC audio file,NO
5,151,Protected AAC audio file,YES
6,211,Protected MPEG-4 video file,NO
7,3,Protected MPEG-4 video file,YES
8,4,Purchased AAC audio file,NO
9,3,Purchased AAC audio file,YES


> Non-protected media types have a higher number of sales (higher sales popularity--1652:154)