# RESULTS

In [44]:
from pprint import pprint 
from DbConnector import DbConnector
from haversine import haversine, Unit
from tabulate import tabulate

In [45]:
# Open the database connection used by every cell below.
# DbConnector is a project-local helper; this cell relies only on it exposing
# `.client` and `.db`.
try:
    connection = DbConnector()
    client = connection.client
    db = connection.db
except Exception as e:
    print("ERROR: Failed to connect to db:", e)
    # Re-raise so the notebook stops here instead of failing in a later cell
    # with a confusing NameError on `db`.
    raise

You are connected to the database: geolife
-----------------------------------------------



# PART 1

#### Showing the first 10 documents of each collection in the database, after the data is inserted

In [46]:
# Preview the first ten documents of the `user` collection.
for user_doc in db.user.find().limit(10):
    pprint(user_doc)

{'_id': '000', 'has_labels': False}
{'_id': '001', 'has_labels': False}
{'_id': '002', 'has_labels': False}
{'_id': '003', 'has_labels': False}
{'_id': '004', 'has_labels': False}
{'_id': '005', 'has_labels': False}
{'_id': '006', 'has_labels': False}
{'_id': '007', 'has_labels': False}
{'_id': '008', 'has_labels': False}
{'_id': '009', 'has_labels': False}


In [47]:
# Preview the first ten activities. The `trackpoints` array of each document
# is cut down to its first five entries before printing to keep the output short.
for activity_doc in db.activity.find().limit(10):
    if 'trackpoints' in activity_doc:
        first_five_refs = activity_doc['trackpoints'][:5]
        activity_doc['trackpoints'] = first_five_refs
    pprint(activity_doc)

{'_id': ObjectId('671566f2cbbb30e475822a64'),
 'end_time': datetime.datetime(2008, 10, 23, 11, 11, 12),
 'start_time': datetime.datetime(2008, 10, 23, 2, 53, 4),
 'trackpoints': [ObjectId('671566f2cbbb30e4758226d8'),
                 ObjectId('671566f2cbbb30e4758226d9'),
                 ObjectId('671566f2cbbb30e4758226da'),
                 ObjectId('671566f2cbbb30e4758226db'),
                 ObjectId('671566f2cbbb30e4758226dc')],
 'transportation_mode': None,
 'user_id': '000'}
{'_id': ObjectId('671566f2cbbb30e475822b59'),
 'end_time': datetime.datetime(2008, 10, 24, 2, 47, 6),
 'start_time': datetime.datetime(2008, 10, 24, 2, 9, 59),
 'trackpoints': [ObjectId('671566f2cbbb30e475822a65'),
                 ObjectId('671566f2cbbb30e475822a66'),
                 ObjectId('671566f2cbbb30e475822a67'),
                 ObjectId('671566f2cbbb30e475822a68'),
                 ObjectId('671566f2cbbb30e475822a69')],
 'transportation_mode': None,
 'user_id': '000'}
{'_id': ObjectId('671566f2cb

In [48]:
# Preview the first ten documents of the `trackpoint` collection.
for trackpoint_doc in db.trackpoint.find().limit(10):
    pprint(trackpoint_doc)

{'_id': ObjectId('671566f2cbbb30e4758226d8'),
 'altitude': '492',
 'date_days': '39744.1201851852',
 'date_time': datetime.datetime(2008, 10, 23, 2, 53, 4),
 'lat': '39.984702',
 'lon': '116.318417'}
{'_id': ObjectId('671566f2cbbb30e4758226d9'),
 'altitude': '492',
 'date_days': '39744.1202546296',
 'date_time': datetime.datetime(2008, 10, 23, 2, 53, 10),
 'lat': '39.984683',
 'lon': '116.31845'}
{'_id': ObjectId('671566f2cbbb30e4758226da'),
 'altitude': '492',
 'date_days': '39744.1203125',
 'date_time': datetime.datetime(2008, 10, 23, 2, 53, 15),
 'lat': '39.984686',
 'lon': '116.318417'}
{'_id': ObjectId('671566f2cbbb30e4758226db'),
 'altitude': '492',
 'date_days': '39744.1203703704',
 'date_time': datetime.datetime(2008, 10, 23, 2, 53, 20),
 'lat': '39.984688',
 'lon': '116.318385'}
{'_id': ObjectId('671566f2cbbb30e4758226dc'),
 'altitude': '492',
 'date_days': '39744.1204282407',
 'date_time': datetime.datetime(2008, 10, 23, 2, 53, 25),
 'lat': '39.984655',
 'lon': '116.318263'}


# PART 2

#### 1. How many users, activities, and trackpoints are there in the dataset (after it is inserted into the database).

In [49]:
# Count every document in each of the three collections and show the totals
# as one table.
collections_to_count = [
    ("User", db.user),
    ("Activity", db.activity),
    ("Trackpoint", db.trackpoint),
]
rows = [
    [label, collection.count_documents({})]
    for label, collection in collections_to_count
]
print(tabulate(rows, headers=["Collection", "Count"], tablefmt="pretty"))


+------------+---------+
| Collection |  Count  |
+------------+---------+
|    User    |   182   |
|  Activity  |  16050  |
| Trackpoint | 9686758 |
+------------+---------+


#### 2. Find the average number of activities per user.

In [50]:
# Stage 1: one group per user_id found in `activity`, counting its activities.
count_per_user = {
    "$group": {"_id": "$user_id", "activity_count": {"$sum": 1}}
}
# Stage 2: collapse all per-user counts into a single average.
# Note: only users that appear in the activity collection form a group, so
# users without any activity do not contribute to this average.
average_over_users = {
    "$group": {
        "_id": None,
        "average_activities_per_user": {"$avg": "$activity_count"},
    }
}
for summary in db.activity.aggregate([count_per_user, average_over_users]):
    pprint(summary)

{'_id': None, 'average_activities_per_user': 92.77456647398844}


#### 3. Find the top 20 users with the highest number of activities.

In [51]:
# Count activities per user, order by that count (highest first) and keep
# the first 20 entries.
top_users_pipeline = [
    {"$group": {"_id": "$user_id", "activity_count": {"$sum": 1}}},
    {"$sort": {'activity_count': -1}},
    {"$limit": 20},
]
for user_total in db.activity.aggregate(top_users_pipeline):
    pprint(user_total)

{'_id': '128', 'activity_count': 2102}
{'_id': '153', 'activity_count': 1794}
{'_id': '025', 'activity_count': 715}
{'_id': '163', 'activity_count': 704}
{'_id': '062', 'activity_count': 691}
{'_id': '144', 'activity_count': 563}
{'_id': '041', 'activity_count': 399}
{'_id': '085', 'activity_count': 364}
{'_id': '004', 'activity_count': 346}
{'_id': '140', 'activity_count': 345}
{'_id': '167', 'activity_count': 320}
{'_id': '068', 'activity_count': 280}
{'_id': '017', 'activity_count': 265}
{'_id': '003', 'activity_count': 261}
{'_id': '014', 'activity_count': 236}
{'_id': '126', 'activity_count': 215}
{'_id': '030', 'activity_count': 210}
{'_id': '112', 'activity_count': 208}
{'_id': '011', 'activity_count': 201}
{'_id': '039', 'activity_count': 198}


#### 4. Find all users who have taken a taxi.

In [52]:
# Keep only taxi activities, reduce them to one entry per user_id, and list
# the user ids in ascending order.
taxi_users_pipeline = [
    {"$match": {"transportation_mode": "taxi"}},
    {"$group": {"_id": "$user_id"}},
    {"$sort": {"_id": 1}},
]
for taxi_user in db.activity.aggregate(taxi_users_pipeline):
    pprint(taxi_user)


{'_id': '010'}
{'_id': '058'}
{'_id': '062'}
{'_id': '078'}
{'_id': '080'}
{'_id': '085'}
{'_id': '098'}
{'_id': '111'}
{'_id': '128'}
{'_id': '163'}


#### 5. Find all types of transportation modes and count how many activities are tagged with each of these transportation mode labels. Do not count the rows where the mode is null.

In [53]:
# Drop activities whose transportation_mode is null, count the remaining
# activities per mode, and list the modes from most to least used.
mode_counts_pipeline = [
    {"$match": {"transportation_mode": {"$ne": None}}},
    {"$group": {"_id": "$transportation_mode", "activity_count": {"$sum": 1}}},
    {"$sort": {"activity_count": -1}},
]
for mode_total in db.activity.aggregate(mode_counts_pipeline):
    pprint(mode_total)

{'_id': 'walk', 'activity_count': 480}
{'_id': 'car', 'activity_count': 419}
{'_id': 'bike', 'activity_count': 263}
{'_id': 'bus', 'activity_count': 199}
{'_id': 'subway', 'activity_count': 133}
{'_id': 'taxi', 'activity_count': 37}
{'_id': 'airplane', 'activity_count': 3}
{'_id': 'train', 'activity_count': 2}
{'_id': 'boat', 'activity_count': 1}
{'_id': 'run', 'activity_count': 1}


#### 6. a) Find the year with the most activities.

In [62]:
# Derive the calendar year from each activity's start_time, count activities
# per year, and sort so the busiest year comes first.
activities_per_year = [
    {"$project": {"activity_year": {"$year": "$start_time"}}},
    {"$group": {"_id": "$activity_year", "activity_count": {"$sum": 1}}},
    {"$sort": {"activity_count": -1}},
]
# Only the first (highest-count) result is shown.
busiest_year = next(db.activity.aggregate(activities_per_year))
pprint(busiest_year)

{'_id': 2008, 'activity_count': 5895}


#### 6. b) Is this also the year with most recorded hours? 

In [69]:
# Duration of one activity in whole seconds, from start_time to end_time.
activity_seconds = {
    "$dateDiff": {
        "startDate": "$start_time",
        "endDate": "$end_time",
        "unit": "second",
    }
}
hours_per_year = [
    # Year of the activity's start plus its duration converted to hours.
    {
        "$project": {
            "activity_year": {"$year": "$start_time"},
            "hours_diff": {"$divide": [activity_seconds, 3600]},
        }
    },
    # Sum the hours of all activities that started in the same year.
    {"$group": {"_id": "$activity_year", "total_hours": {"$sum": "$hours_diff"}}},
    # Round the yearly total to zero decimals.
    {"$project": {"total_hours": {"$round": ["$total_hours", 0]}}},
    # Largest total first.
    {"$sort": {"total_hours": -1}},
]

# Only the first (largest) result is shown.
year_with_most_hours = next(db.activity.aggregate(hours_per_year))
pprint(year_with_most_hours)

{'_id': 2009, 'total_hours': 11616.0}


No. 2008 is the year with the most activities (5895), but 2009 is the year with the most recorded hours (about 11,616 hours).

#### 7. Find the total distance (in km) walked in 2008, by user with id=112.

In [None]:
# Total distance (km) walked in 2008 by user 112.
#
# This notebook is connected to MongoDB through `db`; there is no SQL cursor.
# Each activity document lists the _ids of its trackpoints in its `trackpoints`
# array, so the points are fetched per activity. Distances are summed only
# between consecutive points of the same activity, so the jump from the end
# of one walk to the start of the next is not counted as walked distance.
walk_activities_2008 = db.activity.find(
    {
        "user_id": "112",
        "transportation_mode": "walk",
        # Match on the calendar year of the activity's start time.
        "$expr": {"$eq": [{"$year": "$start_time"}, 2008]},
    },
    {"trackpoints": 1},
)

total_distance_km = 0.0
for activity in walk_activities_2008:
    ordered_points = db.trackpoint.find(
        {"_id": {"$in": activity.get("trackpoints", [])}},
        {"lat": 1, "lon": 1},
    ).sort([("date_time", 1), ("_id", 1)])

    # lat/lon are stored as strings in the trackpoint collection; haversine
    # needs numeric (lat, lon) pairs.
    coordinates = [(float(tp["lat"]), float(tp["lon"])) for tp in ordered_points]

    for prev_point, curr_point in zip(coordinates, coordinates[1:]):
        total_distance_km += haversine(prev_point, curr_point, unit=Unit.KILOMETERS)

print(f"Total distance walked in 2008 by user 112: {total_distance_km:.2f} km")

#### 8. Find the top 20 users who have gained the most altitude meters.

In [None]:
# Top 20 users by accumulated altitude gain.
#
# This notebook is connected to MongoDB through `db`; there is no SQL cursor.
# For every activity its trackpoints are read in time order, and each positive
# altitude difference between two consecutive valid points is added to the
# owning user's total. A missing or -777 altitude breaks the chain: no
# difference is computed into or out of an invalid point.
INVALID_ALTITUDE = -777

gain_by_user = {}
for activity in db.activity.find({}, {"user_id": 1, "trackpoints": 1}):
    ordered_points = db.trackpoint.find(
        {"_id": {"$in": activity.get("trackpoints", [])}},
        {"altitude": 1},
    ).sort([("date_time", 1), ("_id", 1)])

    activity_gain = 0.0
    previous_altitude = None
    for tp in ordered_points:
        raw_altitude = tp.get("altitude")
        # Altitude is stored as a string in the trackpoint collection.
        altitude = None if raw_altitude is None else float(raw_altitude)
        if altitude is None or altitude == INVALID_ALTITUDE:
            previous_altitude = None
            continue
        if previous_altitude is not None and altitude > previous_altitude:
            activity_gain += altitude - previous_altitude
        previous_altitude = altitude

    user_id = activity["user_id"]
    gain_by_user[user_id] = gain_by_user.get(user_id, 0.0) + activity_gain

rows = sorted(gain_by_user.items(), key=lambda item: item[1], reverse=True)[:20]
print(tabulate(rows, headers=['User ID', 'Total Elevation Gain (feet)'], tablefmt='pretty'))
print("""\n I only remove altitude values which we KNOW are invalid, 
      ie. they are -777 as specified it the assignment or NULL.""")

#### 9. Find all users who have invalid activities, and the number of invalid activities per user

In [None]:
# Users with invalid activities, and how many each user has.
#
# This notebook is connected to MongoDB through `db`; there is no SQL cursor.
# An activity counts as invalid when two consecutive trackpoints (in time
# order) are at least five minutes apart.
MAX_GAP_SECONDS = 5 * 60

invalid_count_by_user = {}
for activity in db.activity.find({}, {"user_id": 1, "trackpoints": 1}):
    ordered_points = db.trackpoint.find(
        {"_id": {"$in": activity.get("trackpoints", [])}},
        {"date_time": 1},
    ).sort([("date_time", 1), ("_id", 1)])
    timestamps = [tp["date_time"] for tp in ordered_points]

    has_long_gap = any(
        (later - earlier).total_seconds() >= MAX_GAP_SECONDS
        for earlier, later in zip(timestamps, timestamps[1:])
    )
    if has_long_gap:
        user_id = activity["user_id"]
        invalid_count_by_user[user_id] = invalid_count_by_user.get(user_id, 0) + 1

rows = sorted(invalid_count_by_user.items())
print(tabulate(rows, headers=['User ID', 'Invalid Activity Count'], tablefmt='pretty'))

#### 10. Find the users who have tracked an activity in the Forbidden City of Beijing.  


In [None]:
# Users with at least one trackpoint inside a small box around the Forbidden City.
#
# This notebook is connected to MongoDB through `db`; there is no SQL cursor.
# lat/lon are stored as strings, so they are converted to doubles inside the
# query before the range comparison. Values that cannot be converted become
# null and never satisfy the lower-bound checks.
def _as_double(field):
    """Build a $convert expression that reads `field` as a double (null on failure)."""
    return {"$convert": {"input": field, "to": "double", "onError": None, "onNull": None}}


inside_forbidden_city = {
    "$expr": {
        "$and": [
            {"$gte": [_as_double("$lat"), 39.915]},
            {"$lte": [_as_double("$lat"), 39.917]},
            {"$gte": [_as_double("$lon"), 116.396]},
            {"$lte": [_as_double("$lon"), 116.398]},
        ]
    }
}
matching_trackpoint_ids = [
    tp["_id"] for tp in db.trackpoint.find(inside_forbidden_city, {"_id": 1})
]

# Trackpoints carry no user reference; the owning activity lists their _ids in
# its `trackpoints` array. The ids are looked up in batches so a single query
# document stays small.
BATCH_SIZE = 10_000
user_ids = set()
for start in range(0, len(matching_trackpoint_ids), BATCH_SIZE):
    id_batch = matching_trackpoint_ids[start:start + BATCH_SIZE]
    user_ids.update(db.activity.distinct("user_id", {"trackpoints": {"$in": id_batch}}))

rows = [[user_id] for user_id in sorted(user_ids)]
print(tabulate(rows, headers=['User ID'], tablefmt='pretty'))
print("\n I provided a tiny range around the given coordinates to simplify the query.")

#### 11. Find all users who have registered transportation_mode and their most used transportation_mode. 

In [None]:

# Every user with at least one labelled activity, with their most used
# transportation mode.
#
# This notebook is connected to MongoDB through `db`; there is no SQL cursor.
most_used_mode_pipeline = [
    # Only activities that carry a transportation mode.
    {"$match": {"transportation_mode": {"$ne": None}}},
    # Number of activities per (user, mode) pair.
    {
        "$group": {
            "_id": {"user_id": "$user_id", "mode": "$transportation_mode"},
            "activity_count": {"$sum": 1},
        }
    },
    # Within each user: highest count first, ties ordered alphabetically by mode.
    {"$sort": {"_id.user_id": 1, "activity_count": -1, "_id.mode": 1}},
    # The first entry per user is therefore the most used mode.
    {"$group": {"_id": "$_id.user_id", "most_used_mode": {"$first": "$_id.mode"}}},
    {"$sort": {"_id": 1}},
]
rows = [
    [doc["_id"], doc["most_used_mode"]]
    for doc in db.activity.aggregate(most_used_mode_pipeline)
]
print(tabulate(rows, headers=['User ID',' Most Used Transportation Mode'], tablefmt='pretty'))
print("""\n The first $group counts the activities per user and transportation mode. 
      The $sort and second $group pick each user's most used mode, breaking ties alphabetically.""")