## Google data preparation

In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd 
import os
import json
from datetime import datetime
from dateutil import parser


# Move the working directory one level up, but only once per kernel session,
# so that re-running this cell does not keep climbing the directory tree.
# `run_only_once` is the sentinel: it is undefined on a fresh kernel.
try:
    # Raises NameError on the first run; on later runs it just echoes the sentinel.
    print(run_only_once)
except NameError:
    # Only the "sentinel not defined yet" case is handled here; any other
    # error is left to surface instead of being silently swallowed.
    print(os.getcwd())
    os.chdir("./../")
    print(os.getcwd())
    run_only_once = "Dir has already been changed"

C:\Users\Zan\Desktop\start_23\dataholics\notebooks
C:\Users\Zan\Desktop\start_23\dataholics


## Data format description
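Each parsed trip is a dictionary that should look approximately like the example below. `duration` is in seconds, and locations are `[latitude, longitude]` pairs in Google's E7 format (degrees multiplied by 10^7):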

```python
{'start_time': '2023-02-01T05:52:30.326Z',
 'end_time': '2023-02-01T05:55:57.768Z',
 'activity_type': 'WALKING',
 'duration': 207.442,
 'start_location': [481080467, 116006127],
 'end_location': [481071768, 115985278],
 'distance': 182,
 'waypoints_list': [{'latitudeE7': 481079750, 'longitudeE7': 116005859},
  {'latitudeE7': 481075172, 'longitudeE7': 115995626},
  {'latitudeE7': 481072921, 'longitudeE7': 115984754}]}
```

In [2]:
# main key - timelineObjects
# Waypoint entries use abbreviated coordinate keys; rename them to the long
# key names used everywhere else in the prepared trip data.
_WAYPOINT_KEY_MAP = {"latE7": "latitudeE7", "lngE7": "longitudeE7"}


def _e7_pair(location):
    """Return ``[latitudeE7, longitudeE7]`` from a location dict, with ``None`` for missing values."""
    location = location or {}
    return [location.get("latitudeE7"), location.get("longitudeE7")]


def _extract_waypoints(segment):
    """Return the waypoint list of an activity segment, or ``[]`` if it has no path."""
    if "waypointPath" in segment:
        return [
            {_WAYPOINT_KEY_MAP[key]: value for key, value in point.items()}
            for point in segment["waypointPath"]["waypoints"]
        ]
    if "transitPath" in segment:
        # Transit stops are passed through unchanged (they already use the long key names).
        return segment["transitPath"]["transitStops"]
    return []


def parse_trip_file(file):
    """Extract the trips (activity segments) from one Google timeline JSON file.

    The file must contain a top-level ``timelineObjects`` list. Every entry
    that carries an ``activitySegment`` is converted into a flat trip
    dictionary; all other entries (e.g. place visits) are skipped.

    Args:
        file: Path of the JSON file to read (opened as UTF-8).

    Returns:
        A list with one dict per activity segment, holding the keys
        ``start_time`` and ``end_time`` (timestamp strings as found in the
        file), ``activity_type``, ``duration`` (seconds between the two
        timestamps), ``start_location`` and ``end_location``
        (``[latitudeE7, longitudeE7]``), ``distance`` and ``waypoints_list``.
        ``activity_type``, ``distance`` and individual coordinates are
        ``None`` when the segment does not provide them, so one incomplete
        segment no longer aborts parsing of the whole file.

    Raises:
        KeyError: If ``timelineObjects`` or a segment's ``duration``
            timestamps are missing.
    """
    with open(file, 'r', encoding="utf8") as f:
        data = json.load(f)

    list_of_trips = data['timelineObjects']
    # Note: this counts every timeline object, including the non-trip
    # entries that are skipped below.
    print(f"There is {len(list_of_trips)} trips found in total!")

    prepared_data_list = []
    for trip in list_of_trips:
        if "activitySegment" not in trip:
            continue

        trip_a = trip["activitySegment"]
        start_time = trip_a["duration"]["startTimestamp"]
        end_time = trip_a["duration"]["endTimestamp"]
        # dateutil's parser accepts timestamps with or without fractional
        # seconds, which a single fixed strptime format would not.
        duration = (parser.parse(end_time) - parser.parse(start_time)).total_seconds()

        prepared_data_list.append({
            "start_time": start_time,
            "end_time": end_time,
            "activity_type": trip_a.get("activityType"),
            "duration": duration,
            "start_location": _e7_pair(trip_a.get("startLocation")),
            "end_location": _e7_pair(trip_a.get("endLocation")),
            "distance": trip_a.get("distance"),
            "waypoints_list": _extract_waypoints(trip_a),
        })

    return prepared_data_list

In [3]:
# Parse every trip file in the input folder. The part of the file name before
# the first "_" is taken as the user name (e.g. "gordan_2023_FEBRUARY.json").
data_path = "./data/google_trips/"
# os.listdir returns entries in arbitrary order, so sort them to keep the
# order of the combined (and later saved) data reproducible across machines.
# Hidden files (names starting with ".") are ignored.
all_trip_files = sorted(file for file in os.listdir(data_path) if not file.startswith("."))

# One single-key dict {user_name: [trips]} per input file, in file-name order.
combined_month_data = []
for file in all_trip_files:
    user_name = file.split("_")[0]
    # os.path.join keeps the path valid whether or not data_path ends with a separator.
    full_file_path = os.path.join(data_path, file)
    print(f"Reading file {full_file_path} for user '{user_name}' ...")

    ith_file = {user_name: parse_trip_file(full_file_path)}
    combined_month_data.append(ith_file)

Reading file ./data/google_trips/gordan_2023_FEBRUARY.json for user 'gordan' ...
There is 372 trips found in total!
Reading file ./data/google_trips/gordan_2023_JANUARY.json for user 'gordan' ...
There is 368 trips found in total!
Reading file ./data/google_trips/max_2023_FEBRUARY.json for user 'max' ...
There is 380 trips found in total!
Reading file ./data/google_trips/max_2023_JANUARY.json for user 'max' ...
There is 443 trips found in total!
Reading file ./data/google_trips/max_2023_MARCH.json for user 'max' ...
There is 380 trips found in total!
Reading file ./data/google_trips/zan_2023_FEBRUARY.json for user 'zan' ...
There is 98 trips found in total!
Reading file ./data/google_trips/zan_2023_JANUARY.json for user 'zan' ...
There is 241 trips found in total!


In [4]:
# Persist the combined per-file trip data as a single JSON document.
prepared_trips_path = "./data/prepared_trips/trip_data.json"
# Create the output folder if needed so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(prepared_trips_path), exist_ok=True)

print(f"In total there are {len(combined_month_data)} user files with trips")
# Explicit encoding, matching how the input files are read.
with open(prepared_trips_path, 'w', encoding="utf8") as outfile:
    json.dump(combined_month_data, outfile)

print("Saved to file")

In total there are 7 user files with trips
Saved to file
