# RESULTS

In [2]:
from pprint import pprint 
from DbConnector import DbConnector
from haversine import haversine, Unit
from tabulate import tabulate
from datetime import timedelta

In [3]:
# Open the database connection once; every query cell below relies on `db`.
try:
    connection = DbConnector()
    client = connection.client
    db = connection.db
except Exception as e:
    print("ERROR: Failed to connect to db:", e)
    # Re-raise so execution stops here. Swallowing the error would leave `db`
    # undefined and make every later cell fail with an unrelated NameError.
    raise

You are connected to the database: geolife
-----------------------------------------------



# PART 1

### Showing the first 10 documents of each collection in the database, after the data has been inserted

#### User:

In [3]:
# Peek at the first ten documents of the user collection.
documents = db.user.find(limit=10)
for user_doc in documents:
    pprint(user_doc)

{'_id': '000', 'has_labels': False}
{'_id': '001', 'has_labels': False}
{'_id': '002', 'has_labels': False}
{'_id': '003', 'has_labels': False}
{'_id': '004', 'has_labels': False}
{'_id': '005', 'has_labels': False}
{'_id': '006', 'has_labels': False}
{'_id': '007', 'has_labels': False}
{'_id': '008', 'has_labels': False}
{'_id': '009', 'has_labels': False}


#### Activity:

Since each activity stores a reference to the ObjectId of every one of its trackpoints (up to 2500 per activity), I only print the first five trackpoint references of each activity to save space:

In [4]:
# Let the server truncate each activity's `trackpoints` array to its first
# five entries with a $slice projection, so the full id lists are never sent
# to the client. A projection that contains only $slice still returns every
# other field of the document.
documents = db.activity.find({}, {"trackpoints": {"$slice": 5}}).limit(10)
for doc in documents:
    pprint(doc)

{'_id': 1,
 'end_time': datetime.datetime(2008, 10, 23, 11, 11, 12),
 'start_time': datetime.datetime(2008, 10, 23, 2, 53, 4),
 'trackpoints': [ObjectId('67177386bbd1c9b22735a590'),
                 ObjectId('67177386bbd1c9b22735a591'),
                 ObjectId('67177386bbd1c9b22735a592'),
                 ObjectId('67177386bbd1c9b22735a593'),
                 ObjectId('67177386bbd1c9b22735a594')],
 'transportation_mode': None,
 'user_id': '000'}
{'_id': 2,
 'end_time': datetime.datetime(2008, 10, 24, 2, 47, 6),
 'start_time': datetime.datetime(2008, 10, 24, 2, 9, 59),
 'trackpoints': [ObjectId('67177387bbd1c9b22735a91c'),
                 ObjectId('67177387bbd1c9b22735a91d'),
                 ObjectId('67177387bbd1c9b22735a91e'),
                 ObjectId('67177387bbd1c9b22735a91f'),
                 ObjectId('67177387bbd1c9b22735a920')],
 'transportation_mode': None,
 'user_id': '000'}
{'_id': 3,
 'end_time': datetime.datetime(2008, 10, 26, 15, 4, 7),
 'start_time': datetime.datetim

#### Trackpoint

In [5]:
# Peek at the first ten documents of the trackpoint collection.
documents = db.trackpoint.find(limit=10)
for trackpoint_doc in documents:
    pprint(trackpoint_doc)

{'_id': ObjectId('67177386bbd1c9b22735a590'),
 'activity_id': 1,
 'altitude': 492,
 'date_days': 39744.1201851852,
 'date_time': datetime.datetime(2008, 10, 23, 2, 53, 4),
 'lat': 39.984702,
 'lon': 116.318417}
{'_id': ObjectId('67177386bbd1c9b22735a591'),
 'activity_id': 1,
 'altitude': 492,
 'date_days': 39744.1202546296,
 'date_time': datetime.datetime(2008, 10, 23, 2, 53, 10),
 'lat': 39.984683,
 'lon': 116.31845}
{'_id': ObjectId('67177386bbd1c9b22735a592'),
 'activity_id': 1,
 'altitude': 492,
 'date_days': 39744.1203125,
 'date_time': datetime.datetime(2008, 10, 23, 2, 53, 15),
 'lat': 39.984686,
 'lon': 116.318417}
{'_id': ObjectId('67177386bbd1c9b22735a593'),
 'activity_id': 1,
 'altitude': 492,
 'date_days': 39744.1203703704,
 'date_time': datetime.datetime(2008, 10, 23, 2, 53, 20),
 'lat': 39.984688,
 'lon': 116.318385}
{'_id': ObjectId('67177386bbd1c9b22735a594'),
 'activity_id': 1,
 'altitude': 492,
 'date_days': 39744.1204282407,
 'date_time': datetime.datetime(2008, 10, 

# PART 2

#### 1. How many users, activities, and trackpoints are there in the dataset (after it is inserted into the database).

In [6]:
# Count every document in each of the three collections (empty filter = all).
user_count, activity_count, tp_count = (
    collection.count_documents({})
    for collection in (db.user, db.activity, db.trackpoint)
)

# One table row per collection: [label, count].
labels = ["User", "Activity", "Trackpoint"]
counts = [user_count, activity_count, tp_count]
rows = [list(pair) for pair in zip(labels, counts)]
print(tabulate(rows, headers=["Collection", "Count"], tablefmt="pretty"))

+------------+---------+
| Collection |  Count  |
+------------+---------+
|    User    |   182   |
|  Activity  |  16050  |
| Trackpoint | 9686758 |
+------------+---------+


#### 2. Find the average number of activities per user.

In [7]:
# Stage 1: one document per user_id holding that user's activity count.
count_per_user = {
    "$group": {"_id": "$user_id", "activity_count": {"$sum": 1}}
}

# Stage 2: collapse the per-user counts into a single document with their mean.
# The aggregation runs on the activity collection, so only users that own at
# least one activity contribute to this average.
mean_over_users = {
    "$group": {
        "_id": None,
        "average_activities_per_user": {"$avg": "$activity_count"},
    }
}

pipeline = [count_per_user, mean_over_users]
result = db.activity.aggregate(pipeline)
for summary_doc in result:
    pprint(summary_doc)

{'_id': None, 'average_activities_per_user': 92.77456647398844}


#### 3. Find the top 20 users with the highest number of activities.

In [8]:
# Count activities per user, rank the users by that count and keep the 20
# most active ones.
pipeline = [
    {
        "$group": {
            "_id": "$user_id",
            "activity_count": {"$sum": 1}
        }
    },
    {
        # `_id` acts as a tiebreaker: without it, users with the same
        # activity_count come back in an unspecified order, so a tie at the
        # cut-off could change which users make the top 20 between runs.
        "$sort": {
            "activity_count": -1,
            "_id": 1
        }
    },
    {
        "$limit": 20
    }
]
result = db.activity.aggregate(pipeline)
for doc in result:
    pprint(doc)

{'_id': '128', 'activity_count': 2102}
{'_id': '153', 'activity_count': 1794}
{'_id': '025', 'activity_count': 715}
{'_id': '163', 'activity_count': 704}
{'_id': '062', 'activity_count': 691}
{'_id': '144', 'activity_count': 563}
{'_id': '041', 'activity_count': 399}
{'_id': '085', 'activity_count': 364}
{'_id': '004', 'activity_count': 346}
{'_id': '140', 'activity_count': 345}
{'_id': '167', 'activity_count': 320}
{'_id': '068', 'activity_count': 280}
{'_id': '017', 'activity_count': 265}
{'_id': '003', 'activity_count': 261}
{'_id': '014', 'activity_count': 236}
{'_id': '126', 'activity_count': 215}
{'_id': '030', 'activity_count': 210}
{'_id': '112', 'activity_count': 208}
{'_id': '011', 'activity_count': 201}
{'_id': '039', 'activity_count': 198}


#### 4. Find all users who have taken a taxi.

In [9]:
# Users with at least one activity labelled "taxi", each listed once and
# ordered by ascending user id.
taxi_only = {"$match": {"transportation_mode": "taxi"}}
one_row_per_user = {"$group": {"_id": "$user_id"}}
by_user_id = {"$sort": {"_id": 1}}

pipeline = [taxi_only, one_row_per_user, by_user_id]
result = db.activity.aggregate(pipeline)
for taxi_user in result:
    pprint(taxi_user)


{'_id': '010'}
{'_id': '058'}
{'_id': '062'}
{'_id': '078'}
{'_id': '080'}
{'_id': '085'}
{'_id': '098'}
{'_id': '111'}
{'_id': '128'}
{'_id': '163'}


#### 5. Find all types of transportation modes and count how many activities that are tagged with these transportation mode labels. Do not count the rows where the mode is null.

In [10]:
# Keep only the labelled activities (transportation_mode is not null).
labelled_only = {"$match": {"transportation_mode": {"$ne": None}}}

# One document per transportation mode with the number of activities using it.
count_per_mode = {
    "$group": {"_id": "$transportation_mode", "activity_count": {"$sum": 1}}
}

# Most frequent mode first.
most_common_first = {"$sort": {"activity_count": -1}}

pipeline = [labelled_only, count_per_mode, most_common_first]
results = db.activity.aggregate(pipeline)
for mode_doc in results:
    pprint(mode_doc)

{'_id': 'walk', 'activity_count': 480}
{'_id': 'car', 'activity_count': 419}
{'_id': 'bike', 'activity_count': 263}
{'_id': 'bus', 'activity_count': 199}
{'_id': 'subway', 'activity_count': 133}
{'_id': 'taxi', 'activity_count': 37}
{'_id': 'airplane', 'activity_count': 3}
{'_id': 'train', 'activity_count': 2}
{'_id': 'boat', 'activity_count': 1}
{'_id': 'run', 'activity_count': 1}


#### 6. a) Find the year with the most activities.

In [11]:
# Derive the calendar year of each activity from its start time.
extract_year = {"$project": {"activity_year": {"$year": "$start_time"}}}

# Number of activities that started in each year.
count_per_year = {
    "$group": {"_id": "$activity_year", "activity_count": {"$sum": 1}}
}

# Busiest year first, so the first document of the cursor is the answer.
busiest_first = {"$sort": {"activity_count": -1}}

pipeline = [extract_year, count_per_year, busiest_first]
results = db.activity.aggregate(pipeline)
busiest_year = results.next()
pprint(busiest_year)

{'_id': 2008, 'activity_count': 5895}


2008 is the year with the most activities.

#### 6. b) Is this also the year with most recorded hours? 

In [12]:
# Find the year with the largest total recorded duration, in hours.
pipeline = [
    {
        # Per activity: the year it started in and its duration in hours.
        "$project": {
            "activity_year": {"$year": "$start_time"},
            "hours_diff": {
                "$divide": [
                    {
                        "$dateDiff": {
                            "startDate": "$start_time",
                            "endDate": "$end_time",
                            "unit": "second"
                        }
                    },
                    3600  # Convert seconds to hours
                ]
            }
        }
    },
    {
        # Total recorded hours per year.
        "$group": {
            "_id": "$activity_year",
            "total_hours": {"$sum": "$hours_diff"}
        }
    },
    {
        # Rank on the exact totals. Sorting after rounding could make two
        # years that differ by a fraction of an hour tie and swap places.
        "$sort": {
            "total_hours": -1
        }
    },
    {
        # Only the top year is reported, so keep just that document.
        "$limit": 1
    },
    {
        # Round for display only, after the ranking has been decided.
        "$project": {
            "total_hours": {"$round": ["$total_hours", 0]}
        }
    }
]

results = db.activity.aggregate(pipeline)
pprint(results.next())

{'_id': 2009, 'total_hours': 11616.0}


No, 2009 is the year with the most recorded hours.

#### 7. Find the total distance (in km) walked in 2008, by user with id=112.

In [13]:
# Keep the fields needed for filtering plus the trackpoint references, and
# derive the year each activity started in.
keep_needed_fields = {
    "$project": {
        "trackpoints": 1,
        "activity_year": {"$year": "$start_time"},
        "transportation_mode": 1,
        "user_id": 1,
    }
}
# Walking activities that user 112 started in 2008.
walks_2008_user_112 = {
    "$match": {
        "transportation_mode": "walk",
        "activity_year": 2008,
        "user_id": "112",
    }
}
pipeline = [keep_needed_fields, walks_2008_user_112]
results = db.activity.aggregate(pipeline)
# One entry per matching activity: the list of that activity's trackpoint ids.
activities = [doc["trackpoints"] for doc in results]

total_distance_km = 0
for trackpoint_ids in activities:
    # Load this activity's trackpoints in chronological order. The explicit
    # sort is needed because MongoDB does not guarantee insertion order.
    ordered_points = db.trackpoint.aggregate([
        {"$match": {"_id": {"$in": trackpoint_ids}}},
        {"$sort": {"date_days": 1}},
        {"$project": {"lat": 1, "lon": 1}},
    ]).to_list()

    # Add up the great-circle distance between each pair of consecutive points.
    coordinates = [(point["lat"], point["lon"]) for point in ordered_points]
    for prev_point, curr_point in zip(coordinates, coordinates[1:]):
        total_distance_km += haversine(prev_point, curr_point, unit=Unit.KILOMETERS)

print(f"Total distance walked in 2008 by user 112: {total_distance_km:.2f} km")

Total distance walked in 2008 by user 112: 115.47 km


#### 8. Find the top 20 users who have gained the most altitude meters.

In [14]:
# Fetch only what is needed from each activity: its owner and trackpoint ids.
activities = db.activity.aggregate([
    {"$project": {"user_id": 1, "trackpoints": 1}}
])

user_altitude_gain = {}

for activity in activities:
    owner = activity["user_id"]

    # This activity's trackpoints in chronological order, restricted to
    # altitude >= 0 so negative and null altitudes are left out.
    valid_points = list(db.trackpoint.find(
        {
            "_id": {"$in": activity["trackpoints"]},
            "altitude": {"$gte": 0},
        },
        sort=[("date_days", 1)],
    ))

    # Gain of one activity = sum of the positive altitude steps between
    # consecutive valid trackpoints; descents contribute nothing.
    # NOTE(review): values are summed in the stored altitude unit without any
    # conversion - confirm that unit is meters, as the question asks.
    steps = (
        later["altitude"] - earlier["altitude"]
        for earlier, later in zip(valid_points, valid_points[1:])
    )
    altitude_gain = sum(max(step, 0) for step in steps)

    # Add this activity's gain to the running total of its owner.
    user_altitude_gain.setdefault(owner, 0)
    user_altitude_gain[owner] += altitude_gain

# Rank users by total gain, largest first, and show the first 20.
ranking = sorted(user_altitude_gain.items(), key=lambda entry: entry[1], reverse=True)
top_users = ranking[:20]
table = [list(entry) for entry in top_users]
print(tabulate(table, headers=["User ID", "Total Altitude Gain"], tablefmt="pretty"))

+---------+---------------------+
| User ID | Total Altitude Gain |
+---------+---------------------+
|   128   |       2027587       |
|   153   |       1618425       |
|   004   |       966470        |
|   003   |       669881        |
|   163   |       613040        |
|   085   |       609489        |
|   144   |       575894        |
|   041   |       558839        |
|   030   |       486584        |
|   062   |       465815        |
|   039   |       408601        |
|   084   |       372020        |
|   167   |       344060        |
|   000   |       338965        |
|   002   |       333699        |
|   025   |       305129        |
|   140   |       254320        |
|   126   |       245897        |
|   017   |       178717        |
|   022   |       171412        |
+---------+---------------------+


For this query, we know from the task description that altitude values of -777 are invalid. I chose to simply exclude all negative values, since Beijing lies entirely above sea level.

#### 9. Find all users who have invalid activities, and the number of invalid activities per user

In [25]:
# user_id -> number of that user's activities that are invalid
user_invalid_activities = {}

# An activity is invalid when two consecutive trackpoints are at least this
# far apart in time.
max_gap = timedelta(minutes=5)

# Only the owner and the trackpoint references are needed from each activity.
activities = db.activity.find({}, {"user_id": 1, "trackpoints": 1})

for activity in activities:
    owner = activity["user_id"]

    # Timestamps of this activity's trackpoints in chronological order.
    cursor = db.trackpoint.find(
        {"_id": {"$in": activity["trackpoints"]}},
        {"date_time": 1}
    ).sort("date_time", 1)
    timestamps = [trackpoint["date_time"] for trackpoint in cursor]

    # any() stops at the first gap that is too large.
    invalid_activity = any(
        later - earlier >= max_gap
        for earlier, later in zip(timestamps, timestamps[1:])
    )

    if invalid_activity:
        user_invalid_activities[owner] = user_invalid_activities.get(owner, 0) + 1

# Users with the most invalid activities first.
sorted_users = sorted(
    user_invalid_activities.items(),
    key=lambda entry: entry[1],
    reverse=True,
)
rows = []
for user, count in sorted_users:
    rows.append({"User ID": user, "Invalid Activity Count": count})
print(tabulate(rows, headers="keys", tablefmt="pretty"))

+---------+------------------------+
| User ID | Invalid Activity Count |
+---------+------------------------+
|   128   |          720           |
|   153   |          558           |
|   025   |          263           |
|   062   |          249           |
|   163   |          233           |
|   004   |          219           |
|   041   |          201           |
|   085   |          184           |
|   003   |          179           |
|   144   |          157           |
|   039   |          147           |
|   068   |          139           |
|   167   |          134           |
|   017   |          129           |
|   014   |          118           |
|   030   |          112           |
|   126   |          105           |
|   000   |          101           |
|   092   |          101           |
|   037   |          100           |
|   084   |           99           |
|   002   |           98           |
|   104   |           97           |
|   034   |           88           |
|

#### 10. Find the users who have tracked an activity in the Forbidden City of Beijing.  


In [20]:
# Attach the referenced trackpoint documents to every activity.
join_trackpoints = {
    "$lookup": {
        "from": "trackpoint",
        "localField": "trackpoints",
        "foreignField": "_id",
        "as": "trackpoint_data",
    }
}

# One output document per (activity, trackpoint) pair.
one_row_per_trackpoint = {"$unwind": "$trackpoint_data"}

# Keep the trackpoints that fall inside the bounding box used for the
# Forbidden City.
inside_bounding_box = {
    "$match": {
        "trackpoint_data.lat": {"$gte": 39.915, "$lte": 39.917},
        "trackpoint_data.lon": {"$gte": 116.396, "$lte": 116.398},
    }
}

# Reduce to the distinct users owning at least one matching trackpoint.
distinct_users = {"$group": {"_id": "$user_id"}}

pipeline = [
    join_trackpoints,
    one_row_per_trackpoint,
    inside_bounding_box,
    distinct_users,
]

results = db.activity.aggregate(pipeline)

# One table row per user.
rows = []
for result in results:
    rows.append({"User ID": result["_id"]})
print(tabulate(rows, headers="keys", tablefmt="pretty"))

+---------+
| User ID |
+---------+
|   004   |
|   131   |
|   018   |
|   019   |
+---------+


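To keep the query simple, I matched trackpoints inside a small bounding box around the given coordinates (latitude 39.915–39.917, longitude 116.396–116.398) instead of requiring an exact coordinate match.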

#### 11. Find all users who have registered transportation_mode and their most used transportation_mode. 

In [4]:
# Ignore activities that carry no transportation_mode label.
labelled_only = {"$match": {"transportation_mode": {"$ne": None}}}

# Number of activities for every (user, mode) combination.
count_per_user_and_mode = {
    "$group": {
        "_id": {"user_id": "$user_id", "transportation_mode": "$transportation_mode"},
        "mode_count": {"$sum": 1},
    }
}

# Highest count first; equal counts are ordered alphabetically by mode so the
# choice below is the same on every run.
most_used_first = {"$sort": {"mode_count": -1, "_id.transportation_mode": 1}}

# Per user, take the first (= most used) mode of the sorted stream.
pick_top_mode = {
    "$group": {
        "_id": "$_id.user_id",
        "most_used_mode": {"$first": "$_id.transportation_mode"},
        "usage_count": {"$first": "$mode_count"},
    }
}

# Present the users in ascending id order.
by_user_id = {"$sort": {"_id": 1}}

pipeline = [
    labelled_only,
    count_per_user_and_mode,
    most_used_first,
    pick_top_mode,
    by_user_id,
]

results = db.activity.aggregate(pipeline)

# One table row per user.
rows = []
for result in results:
    rows.append({
        "User ID": result["_id"],
        "Most Used Mode": result["most_used_mode"],
        "Usage Count": result["usage_count"],
    })
print(tabulate(rows, headers="keys", tablefmt="pretty"))

+---------+----------------+-------------+
| User ID | Most Used Mode | Usage Count |
+---------+----------------+-------------+
|   010   |      taxi      |      3      |
|   020   |      bike      |     81      |
|   021   |      walk      |      1      |
|   052   |      bus       |      1      |
|   056   |      bike      |     15      |
|   058   |      car       |      2      |
|   060   |      walk      |      1      |
|   062   |      bus       |     173     |
|   064   |      bike      |      1      |
|   065   |      bike      |     10      |
|   067   |      walk      |      1      |
|   069   |      bike      |      1      |
|   073   |      walk      |     52      |
|   075   |      walk      |      1      |
|   076   |      car       |      3      |
|   078   |      walk      |     37      |
|   080   |      bike      |      1      |
|   081   |      bike      |      4      |
|   082   |      walk      |      2      |
|   084   |      walk      |      9      |
|   085   |