In [2]:
import os

import haversine as hs
import mysql.connector as mysql
from tabulate import tabulate

In [4]:
# Open the MySQL connection and cursor that the query cells below use.
# Settings come from GEOLIFE_DB_* environment variables so real credentials do
# not have to be committed with the notebook; the fallbacks are the previous
# hard-coded local-development values.
db_config = {
    "host": os.environ.get("GEOLIFE_DB_HOST", "localhost"),
    "database": os.environ.get("GEOLIFE_DB_NAME", "geolife"),
    "user": os.environ.get("GEOLIFE_DB_USER", "root"),
    "password": os.environ.get("GEOLIFE_DB_PASSWORD", "123"),
    "port": int(os.environ.get("GEOLIFE_DB_PORT", "3306")),
}

try:
    connection = mysql.connect(**db_config)
    cursor = connection.cursor()
except Exception as e:
    print("ERROR: Failed to connect to db:", e)
    # Re-raise instead of continuing: without `cursor`, the following cells
    # would fail with a NameError that hides the real connection problem.
    raise

#### 1. How many users, activities, and trackpoints are there in the dataset (after it is inserted into the database)?

In [52]:
# Question 1: row counts for the User, Activity and TrackPoint tables.
# Each SELECT produces one (label, count) row; UNION ALL stacks the three rows
# so a single execute() returns all of them.
table_counts_sql = """
        SELECT 'User', COUNT(*) FROM User
        UNION ALL
        SELECT 'Activity', COUNT(*) FROM Activity
        UNION ALL
        SELECT 'TrackPoint', COUNT(*) FROM TrackPoint
    """
cursor.execute(table_counts_sql)
rows = cursor.fetchall()
table_counts_report = tabulate(
    rows,
    headers=['Table', 'Count'],
    tablefmt="pretty",
)
print(table_counts_report)

+------------+---------+
|   Table    |  Count  |
+------------+---------+
|    User    |   182   |
|  Activity  |  16033  |
| TrackPoint | 9681069 |
+------------+---------+


#### 2. Find the average number of activities per user.

In [53]:
# Question 2: average number of activities per user.
# The inner query counts activities per user_id; the outer query averages those
# counts. Because the grouping runs over Activity, only user_ids that appear in
# Activity contribute to the average.
average_activities_sql = """
    SELECT AVG(activity_count)
    FROM (
        SELECT COUNT(*) AS activity_count
        FROM Activity
        GROUP BY user_id
    ) AS activity_counts
"""
cursor.execute(average_activities_sql)
rows = cursor.fetchall()
average_activities_report = tabulate(
    rows,
    headers=['Average Activities Per User'],
    tablefmt="pretty",
)
print(average_activities_report)

+-----------------------------+
| Average Activities Per User |
+-----------------------------+
|           93.2151           |
+-----------------------------+


#### 3. Find the top 20 users with the highest number of activities.

In [59]:
# Question 3: the 20 users with the most activities.
# Activities are counted per user_id, sorted by that count in descending order,
# and the result is cut off after 20 rows.
top_users_sql = """
    SELECT user_id, COUNT(*) AS activity_count
    FROM Activity
    GROUP BY user_id
    ORDER BY activity_count DESC
    LIMIT 20
"""
cursor.execute(top_users_sql)
rows = cursor.fetchall()
top_users_report = tabulate(
    rows,
    headers=['User ID', 'Activity Count'],
    tablefmt="pretty",
)
print(top_users_report)

+---------+----------------+
| User ID | Activity Count |
+---------+----------------+
|   128   |      2102      |
|   153   |      1793      |
|   025   |      715       |
|   163   |      704       |
|   062   |      691       |
|   144   |      563       |
|   041   |      399       |
|   085   |      364       |
|   004   |      346       |
|   140   |      345       |
|   167   |      320       |
|   068   |      280       |
|   017   |      265       |
|   003   |      261       |
|   014   |      236       |
|   126   |      215       |
|   030   |      210       |
|   112   |      208       |
|   011   |      201       |
|   039   |      198       |
+---------+----------------+


#### 4. Find all users who have taken a taxi.

In [47]:
# Question 4: users with at least one activity labelled 'taxi'.
# DISTINCT collapses repeated taxi activities so each user_id is listed once.
taxi_users_sql = """
    SELECT DISTINCT user_id
    FROM Activity
    WHERE transportation_mode = 'taxi'
"""
cursor.execute(taxi_users_sql)
rows = cursor.fetchall()
taxi_users_report = tabulate(rows, headers=['User ID'], tablefmt="pretty")
print(taxi_users_report)


+---------+
| User ID |
+---------+
|   010   |
|   058   |
|   062   |
|   078   |
|   080   |
|   085   |
|   098   |
|   111   |
|   128   |
|   163   |
+---------+


#### 5. Find all types of transportation modes and count how many activities are tagged with each transportation mode label. Do not count the rows where the mode is null.

In [64]:
# Question 5: number of activities per transportation mode.
# Rows whose transportation_mode is NULL are filtered out before grouping; the
# result is sorted with the most frequent mode first.
mode_counts_sql = """
    SELECT transportation_mode, COUNT(*) AS activity_count
    FROM Activity
    WHERE transportation_mode IS NOT NULL
    GROUP BY transportation_mode
    ORDER BY activity_count DESC
"""
cursor.execute(mode_counts_sql)
rows = cursor.fetchall()
mode_counts_report = tabulate(
    rows,
    headers=['Transportation Mode', 'Activity Count'],
    tablefmt='pretty',
)
print(mode_counts_report)


+---------------------+----------------+
| Transportation Mode | Activity Count |
+---------------------+----------------+
|        walk         |      480       |
|         car         |      419       |
|        bike         |      263       |
|         bus         |      199       |
|       subway        |      133       |
|        taxi         |       37       |
|      airplane       |       3        |
|        train        |       2        |
|         run         |       1        |
|        boat         |       1        |
+---------------------+----------------+


#### 6. a) Find the year with the most activities.

In [None]:
# Question 6a: the year with the most activities.
# Activities are counted per calendar year, sorted by that count in descending
# order, and only the top row is kept.
# NOTE(review): assumes Activity has a DATETIME column named `start_date_time`
# and that an activity belongs to the year in which it starts -- TODO confirm
# against the table definition used when the data was inserted.
# TODO: clean up unused functions I created.
busiest_year_sql = """
    SELECT YEAR(start_date_time) AS activity_year, COUNT(*) AS activity_count
    FROM Activity
    GROUP BY activity_year
    ORDER BY activity_count DESC
    LIMIT 1
"""
cursor.execute(busiest_year_sql)
rows = cursor.fetchall()
print(tabulate(rows, headers=['Year', 'Activity Count'], tablefmt='pretty'))
