# Lab | PostgreSQL queries

- Open the PgAdmin server from PostgreSQL.
- Create a database called **applestore**  and a table called **apple_table**. Use code from `applesotre_DATABASE.sql` to create table and insert data. 

Here is the description of columns for your knowledge:
- `id` : App ID
- `track_name`: App Name
- `size_bytes`: Size (in Bytes)
- `currency`: Currency Type
- `price`: Price amount
- `rating_count_tot`: User Rating counts (for all version)
- `rating_count_ver`: User Rating counts (for current version)
- `user_rating`: Average User Rating value (for all version)
- `user_rating_ver`: Average User Rating value (for current version)
- `ver`: Latest version code
- `cont_rating`: Content Rating
- `prime_genre`: Primary Genre
- `sup_devicesnum`: Number of supporting devices
- `ipadSc_urlsnum`: Number of screenshots showed for display
- `langnum`: Number of supported languages
- `vpp_lic`: Vpp Device Based Licensing Enabled

Answer the following questions using the **apple_table**:

In [1]:
import pandas as pd

In [2]:
import sqlalchemy as db

In [3]:
# for python-dotenv method

from dotenv import load_dotenv
load_dotenv()
import os

In [4]:
# PostgreSQL

db_server = "postgresql"
db_user = "postgres"
db_password = os.environ.get('PASSWORD')
db_host = "localhost"
db_database = "w09_05_lab"
db_port = 5432

# create the engine
engine = db.create_engine(
    f"{db_server}://{db_user}:{db_password}@{db_host}:{db_port}/{db_database}"
)

# open the connection
conn = engine.connect()

# Close the connection
# conn.close()

In [5]:
conn

<sqlalchemy.engine.base.Connection at 0x7fbad130cdc0>

In [43]:
query = '''
SELECT
    *
FROM
    apple_table
LIMIT 5
'''
pd.read_sql(query, conn)

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devicesnum,ipadsc_urlsnum,langnum,vpp_lic
0,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
1,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
2,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
3,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
4,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1


**1. What are the different genres?**  
- Use `prime_genre` column.

In [32]:
query = '''
SELECT
    distinct prime_genre
FROM
    apple_table;
'''
query_1 = pd.read_sql(query, conn)

print('')
print(f'\033[1;41m The different genres are: ')
print('')
print(f'\033[0;43m{query_1.prime_genre.to_list()}')
print('')


[1;41m The different genres are: 

[0;43m['Shopping', 'Games', 'Education', 'Reference', 'Business', 'Social Networking', 'Food & Drink', 'Sports', 'Catalogs', 'Weather', 'Book', 'Music', 'Entertainment', 'Medical', 'Utilities', 'Travel', 'Navigation', 'Photo & Video', 'Finance', 'Health & Fitness', 'News', 'Productivity', 'Lifestyle']



**2. Which is the genre with the highest number of ratings?**
- To sum the rating use `rating_count_tot` column.

In [49]:
query = '''
SELECT
    prime_genre,
    SUM(rating_count_tot) AS total_rating
FROM
    apple_table
GROUP BY
    prime_genre
ORDER BY
    total_rating DESC
LIMIT 5
'''
query_2 = pd.read_sql(query, conn)
query_2

Unnamed: 0,prime_genre,total_rating
0,Games,52878491
1,Social Networking,7598316
2,Photo & Video,5008946
3,Entertainment,4030518
4,Music,3980199


In [50]:
print('')
print(f'\033[1;43m Genre with the highest number of ratings: {query_2.prime_genre[0]} ')
print('')


[1;43m Genre with the highest number of ratings: Games 



<div>
    <img src="./images/w09_05_PostgreSQL_queries_LAB_01.png" width="1000"/>
</div>

**3. Which is the genre with most apps?**
- Use `prime_genre` column.

In [52]:
query = '''
SELECT
    prime_genre,
    COUNT(id)
FROM
    apple_table
GROUP BY
    prime_genre
ORDER BY
    COUNT(id) DESC
LIMIT 5
'''
query_3 = pd.read_sql(query, conn)
query_3

Unnamed: 0,prime_genre,count
0,Games,3862
1,Entertainment,535
2,Education,453
3,Photo & Video,349
4,Utilities,248


In [95]:
print('')
print(f'\033[1;43m Genre with the most apps: {query_3.prime_genre[0]} ')
print('')


[1;43m Genre with the most apps: Games 



**4. Which is the genre with the fewest apps?**
- Use `prime_genre` column.

In [54]:
query = '''
SELECT
    prime_genre,
    COUNT(id)
FROM
    apple_table
GROUP BY
    prime_genre
ORDER BY
    COUNT(id)
LIMIT 5
'''
query_4 = pd.read_sql(query, conn)
query_4

Unnamed: 0,prime_genre,count
0,Catalogs,10
1,Medical,23
2,Navigation,46
3,Business,57
4,Food & Drink,63


In [96]:
print('')
print(f'\033[1;43m Genre with the fewest apps: {query_4.prime_genre[0]} ')
print('')


[1;43m Genre with the fewest apps: Catalogs 



**5. Find the top 10 apps most rated.**
- Use `track_name` and `rating_count_tot` column.

In [97]:
query = '''
SELECT
    track_name
FROM
    apple_table
ORDER BY
    rating_count_tot DESC
LIMIT 10
'''
query_5 = pd.read_sql(query, conn)
query_5

Unnamed: 0,track_name
0,Facebook
1,Instagram
2,Clash of Clans
3,Temple Run
4,Pandora - Music & Radio
5,Pinterest
6,Bible
7,Candy Crush Saga
8,Spotify Music
9,Angry Birds


In [99]:
print('')
print(f'\033[1;43m Top 10 apps most rated: {query_5.track_name.to_list()} ')
print('')


[1;43m Top 10 apps most rated: ['Facebook', 'Instagram', 'Clash of Clans', 'Temple Run', 'Pandora - Music & Radio', 'Pinterest', 'Bible', 'Candy Crush Saga', 'Spotify Music', 'Angry Birds'] 



**6. Find the top 10 apps best rated by users.**
- Use `track_name` and `user_rating` column.

In [102]:
query = '''
SELECT
    track_name,
    user_rating
FROM
    apple_table
ORDER BY
    user_rating DESC
LIMIT 10
'''
query_6 = pd.read_sql(query, conn)
query_6

Unnamed: 0,track_name,user_rating
0,Plants vs. Zombies HD,5.0
1,Flashlight Òã,5.0
2,TurboScanã¢ Pro - document & receipt scanner:...,5.0
3,Learn to Speak Spanish Fast With MosaLingua,5.0
4,The Photographer's Ephemeris,5.0
5,ÐÈSudoku +,5.0
6,:) Sudoku +,5.0
7,King of Dragon Pass,5.0
8,Plants vs. Zombies,5.0
9,Infinity Blade,5.0


In [101]:
print('')
print(f'\033[1;43m Top 10 apps best rated by users: {query_6.track_name.to_list()} ')
print('')


[1;43m Top 10 apps best rated by users: ['Plants vs. Zombies HD', 'Flashlight \x89Òã', 'TurboScan\x89ã¢ Pro - document & receipt scanner: scan multiple pages and photos to PDF', 'Learn to Speak Spanish Fast With MosaLingua', "The Photographer's Ephemeris", '\x89ÐÈSudoku +', ':) Sudoku +', 'King of Dragon Pass', 'Plants vs. Zombies', 'Infinity Blade'] 



**7. Using the same query from the previous exercise add the column `rating_count_tot`.**
- You'll notice that some of the top-rated don't have many reviews.
- Use `track_name`, `user_rating` and `rating_count_tot`.

In [104]:
query = '''
SELECT
    track_name,
    user_rating,
    rating_count_tot
FROM
    apple_table
ORDER BY
    user_rating DESC
LIMIT 10
'''
query_7 = pd.read_sql(query, conn)
query_7

Unnamed: 0,track_name,user_rating,rating_count_tot
0,Plants vs. Zombies HD,5.0,163598
1,Flashlight Òã,5.0,130450
2,TurboScanã¢ Pro - document & receipt scanner:...,5.0,28388
3,Learn to Speak Spanish Fast With MosaLingua,5.0,9
4,The Photographer's Ephemeris,5.0,663
5,ÐÈSudoku +,5.0,5397
6,:) Sudoku +,5.0,11447
7,King of Dragon Pass,5.0,882
8,Plants vs. Zombies,5.0,426463
9,Infinity Blade,5.0,326482


**8. Now, find the top 5 ordering by ratings and number of votes.**
- Use `track_name`, `user_rating` and `rating_count_tot` columns.

In [106]:
query = '''
SELECT
    track_name,
    user_rating,
    rating_count_tot
FROM
    apple_table
ORDER BY
    user_rating DESC,
    rating_count_tot DESC
LIMIT 5
'''
query_8 = pd.read_sql(query, conn)
query_8

Unnamed: 0,track_name,user_rating,rating_count_tot
0,Head Soccer,5.0,481564
1,Plants vs. Zombies,5.0,426463
2,Sniper 3D Assassin: Shoot to Kill Gun Game,5.0,386521
3,Geometry Dash Lite,5.0,370370
4,Infinity Blade,5.0,326482


**9. Find the total number of games available in more than 1 language.**
- Use `track_name` and `langnum` columns.

In [108]:
query = '''
SELECT
    COUNT(track_name)
FROM
    apple_table
WHERE
    prime_genre = 'Games' AND langnum > 1
'''
query_9 = pd.read_sql(query, conn)
query_9

Unnamed: 0,count
0,1660


In [116]:
print('')
print(f'\033[1;43m Total number of games available in more than 1 language: {query_9.values} ')
print('')


[1;43m Total number of games available in more than 1 language: [[1660]] 



**10. Find the number of free vs paid apps.**
- Use `price` column.
- You can use `CASE WHEN` to filter free or paid apps.

In [81]:
query = '''
SELECT
    CASE WHEN price != 0 THEN 'paid' ELSE 'free' END AS is_paid,
    COUNT(id)
FROM
    apple_table
GROUP BY
    is_paid
'''
query_10 = pd.read_sql(query, conn)
query_10

Unnamed: 0,is_paid,count
0,paid,3141
1,free,4056


In [129]:
paid_apps = query_10.iloc[0]['count']
free_apps = query_10.iloc[0]['count']

In [133]:
print('')
print(f'\033[0;41m Paid apps: {paid_apps} ')
print('')
print(f'\033[0;42m Free apps: {free_apps} ')
print('')


[0;41m Paid apps: 3141 

[0;42m Free apps: 3141 



**11. Find the number of free vs paid apps for each genre.**
- Use `price` and `prime_genre` column.
- You can use `CASE WHEN` to filter free or paid apps.

In [86]:
query = '''
SELECT
    prime_genre,
    COUNT(CASE WHEN price != 0 THEN 'id' ELSE null END) AS paid_apps,
    COUNT(CASE WHEN price = 0 THEN 'id' ELSE null END) AS free_apps,
    COUNT(id) AS total
FROM
    apple_table
GROUP BY
    prime_genre
ORDER BY
    total DESC
'''
query_11 = pd.read_sql(query, conn)
query_11

Unnamed: 0,prime_genre,paid_apps,free_apps,total
0,Games,1605,2257,3862
1,Entertainment,201,334,535
2,Education,321,132,453
3,Photo & Video,182,167,349
4,Utilities,139,109,248
5,Health & Fitness,104,76,180
6,Productivity,116,62,178
7,Social Networking,24,143,167
8,Lifestyle,50,94,144
9,Music,71,67,138


In [134]:
query = '''
WITH temporaryTable AS (
    SELECT
        prime_genre,
        COUNT(CASE WHEN price != 0 THEN 'id' ELSE null END) AS paid_apps,
        COUNT(CASE WHEN price = 0 THEN 'id' ELSE null END) AS free_apps,
        COUNT(id) AS total
    FROM
        apple_table
    GROUP BY prime_genre
    ORDER BY total DESC
)
        SELECT 
            free_apps / total AS free_apps_perc
        FROM
            temporaryTable;
'''
query_12 = pd.read_sql(query, conn)
query_12

Unnamed: 0,free_apps_perc
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [137]:
query = '''
WITH temporaryTable(free_apps_perc) as (
    SELECT
        COUNT(CASE WHEN price = 0 THEN 'id' ELSE null END) / COUNT(id)
)
    SELECT
        prime_genre,
        COUNT(CASE WHEN price != 0 THEN 'id' ELSE null END) AS paid_apps,
        COUNT(CASE WHEN price = 0 THEN 'id' ELSE null END) AS free_apps,
        COUNT(id) AS total
    FROM
        apple_table
    GROUP BY prime_genre
    ORDER BY total DESC
'''
query_12 = pd.read_sql(query, conn)
query_12

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "price" does not exist
LINE 4:         COUNT(CASE WHEN price = 0 THEN 'id' ELSE null END) /...
                                ^

[SQL: 
WITH temporaryTable(free_apps_perc) as (
    SELECT
        COUNT(CASE WHEN price = 0 THEN 'id' ELSE null END) / COUNT(id)
)
    SELECT
        prime_genre,
        COUNT(CASE WHEN price != 0 THEN 'id' ELSE null END) AS paid_apps,
        COUNT(CASE WHEN price = 0 THEN 'id' ELSE null END) AS free_apps,
        COUNT(id) AS total
    FROM
        apple_table
    GROUP BY prime_genre
    ORDER BY total DESC
]
(Background on this error at: http://sqlalche.me/e/14/f405)

In [94]:
# Another option

query = '''
SELECT
    prime_genre,
    CASE WHEN price != 0 THEN 'paid' ELSE 'free' END AS is_paid,
    COUNT(id)
FROM
    apple_table
GROUP BY prime_genre, is_paid
ORDER BY 1, 2, 3 DESC
'''
query_13 = pd.read_sql(query, conn)
query_13

Unnamed: 0,prime_genre,is_paid,count
0,Book,free,66
1,Book,paid,46
2,Business,free,20
3,Business,paid,37
4,Catalogs,free,9
5,Catalogs,paid,1
6,Education,free,132
7,Education,paid,321
8,Entertainment,free,334
9,Entertainment,paid,201
