# 9.7 Mini Project: SQLite DB Creation
- Member: 
    - Congxin (David) Xu - cx2rx
    - Diyu Zheng - dz2fc

### Database ER Diagram

<img src="Mini Project ER Diagram.jpg" width="1000">|

In [1]:
import sqlite3
import pandas as pd

In [2]:
# Create a connection to the database.
conn = sqlite3.connect('apps_DB.db')

# Create a cursor. A cursor is used to move around the database
cursor = conn.cursor()

#### Creating Tables

In [3]:
# SQLite does not have a seperate Boolean storage class
sql_create_apps = """
    CREATE TABLE apps(
        App_Id INTEGER,
        App_Name TEXT,
        Category_Id INTEGER,
        Login_Method TEXT,
        File_Size INTEGER,
        Photo_Access TEXT, 
        Location_Access TEXT,
        Allow_Notification TEXT,
        PRIMARY KEY(App_Id),
        FOREIGN KEY(Category_Id) REFERENCES categories(Category_Id)
    );
"""

In [4]:
# Use the cursor to execute the statement
cursor.execute(sql_create_apps)

<sqlite3.Cursor at 0x2aabd144f10>

In [5]:
sql_create_categories = """
    CREATE TABLE categories(
        Category_Id INTEGER,
        Category_Name TEXT,
        PRIMARY KEY(Category_Id)
    );
"""

In [6]:
# Use the cursor to execute the statement
cursor.execute(sql_create_categories)

<sqlite3.Cursor at 0x2aabd144f10>

In [7]:
sql_create_users = """
    CREATE TABLE users(
        User_Id INTEGER,
        User_Name TEXT,
        App_Id INTEGER,
        User_Rating INTEGER,
        PRIMARY KEY(User_Id, App_Id)
    );
"""

In [8]:
# Use the cursor to execute the statement
cursor.execute(sql_create_users)

<sqlite3.Cursor at 0x2aabd144f10>

In [9]:
app = pd.read_excel("Mini SQL DB Design.xlsx", sheet_name="App")

In [10]:
# change boolean to integer 
# boolean_map = {'Yes':1,'No':0}
# app = app.replace({'Photo_Access':boolean_map,'Location_Access':boolean_map,'Allow_Notification':boolean_map})

In [10]:
app.to_sql('apps', conn, index = False, if_exists = 'append')
app.head()

Unnamed: 0,App_Id,App_Name,Category_Id,Login_Method,File_Size,Photo_Access,Location_Access,Allow_Notification
0,1001,Aetna Health,11,Face ID,93,No,Yes,No
1,1002,Airbnb,12,Password,283,No,Yes,Yes
2,1003,Amazon,13,Password,160,No,Yes,Yes
3,1004,BofA,14,Face ID,173,No,Yes,Yes
4,1005,Booking.com,12,Password,164,No,Yes,No


In [11]:
category=pd.read_excel("Mini SQL DB Design.xlsx", sheet_name="Category")
category.to_sql('categories',conn,index = False,if_exists = 'append')
category.head()

Unnamed: 0,Category_Id,Category_Name
0,11,Insurance
1,12,Travel
2,13,Shopping
3,14,Financial
4,15,Game


In [12]:
user=pd.read_excel("Mini SQL DB Design.xlsx", sheet_name = "User")
user.to_sql('users', conn, index = False, if_exists = 'append')
user.head()

Unnamed: 0,User_Id,User_Name,App_Id,User_Rating
0,6001,Roy,1003,4
1,6001,Roy,1010,2
2,6001,Roy,1009,5
3,6001,Roy,1017,2
4,6001,Roy,1001,3


In [222]:
# Query 1: Find the app name and file size in each category with the largest file size
query1 = """
    SELECT App_Name, MAX(File_Size) as max_file_size FROM apps
    GROUP BY Category_Id;
"""
result1 = cursor.execute(query1)
pd.DataFrame(result1.fetchall())

Unnamed: 0,0,1
0,GEICO,199
1,Airbnb,283
2,Amazon,160
3,Chase,259
4,Fruit Ninja,268
5,Instagram,254


In [236]:
# Query 2: Find the average rating for apps for each login method and each category.
query2 = """
SELECT Category_Name, AVG(User_Rating),Login_Method FROM users
LEFT JOIN apps t
    ON users.App_Id = t.App_Id 
LEFT JOIN categories l
    ON t.Category_Id = l.Category_Id
GROUP BY Login_Method, l.Category_Id
"""
result2 = cursor.execute(query2)
pd.DataFrame(result2.fetchall())

Unnamed: 0,0,1,2
0,Insurance,3.4,Face ID
1,Financial,2.5,Face ID
2,Insurance,3.5,Password
3,Travel,3.428571,Password
4,Shopping,2.5,Password
5,Financial,2.0,Password
6,Game,2.0,Password
7,Social,2.857143,Password


In [226]:
# Query 3: Find the total file size of the apps that require location access but not photo access for each person
query3="""
SELECT User_Name,SUM(File_Size) FROM users 
LEFT JOIN apps t
    ON users.App_Id = t.App_Id 
LEFT JOIN categories l
    ON t.Category_Id = l.Category_Id
WHERE Photo_Access = 0 AND Location_Access = 1
GROUP BY User_Id
"""

result3 = cursor.execute(query3)
pd.DataFrame(result3.fetchall())

Unnamed: 0,0,1
0,Roy,1059
1,David,1297
2,Diyu,56
3,Taylor,834


In [157]:
# Query 4: Order apps by popularity and if there is tie, sort by average rating in descending order.
query4 = """
SELECT App_Name, AVG(User_Rating) as avg_rating, COUNT(App_Name) as count_app FROM users
LEFT JOIN apps
    ON users.App_Id = apps.App_Id 
GROUP BY App_Name
ORDER BY avg_rating DESC, count_app DESC
"""
result4 = cursor.execute(query4)
pd.DataFrame(result4.fetchall())

Unnamed: 0,0,1,2
0,Instagram,4.5,2
1,Booking.com,4.0,2
2,Fly Delta,4.0,2
3,BofA,4.0,1
4,Aetna Health,3.5,2
5,StateFarm,3.5,2
6,GEICO,3.333333,3
7,Airbnb,3.0,2
8,Twitter,3.0,1
9,Amazon,2.5,2


In [13]:
# Use the cursor to close the connection to the database, now that we're done.
cursor.close()

# When you run this file, if everything went well, look in the directory
# where you saved the script (create_db.py). You should see a new file there:
# mytest.db. 
# If you do not, double-check what script you ran and where it is saved. 