# Process Google Timeline Data

In [1]:
import os
from dotenv import load_dotenv

# CONFIGURATION
# `folder` is the ONLY value that needs to change; every path below is derived from it.
folder = "JG_data"

# Input: the exported timeline JSON.
json_file = f"{folder}/location-history.json"
# Outputs written by later cells.
cluster_counts_file_path = f"{folder}/cluster_counts.csv"
cluster_data_file_path = f"{folder}/cluster_means.csv"
places_data_file_path = f"{folder}/places_info.csv"

# Pull settings from a local .env file before the key is read from the environment below.
load_dotenv()
# Make sure to create a .env file and store your API key in it as GOOGLE_API_KEY.
# NOTE: os.getenv returns None when the variable is not set, so a missing key
# is not reported here.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [2]:
import json
import pandas as pd
import numpy as np
import requests
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from datetime import datetime
import matplotlib.pyplot as plt

### Load Data

In [7]:
def load_google_timeline(json_file):
    """Read a Google Timeline export and return its visits as a DataFrame.

    Only entries that have a "visit" whose "topCandidate" carries a
    "placeLocation" string (``geo:<lat>,<lon>``) are kept; all other
    entries are ignored.

    Parameters
    ----------
    json_file : str
        Path to the UTF-8 encoded JSON export (an iterable of entry dicts).

    Returns
    -------
    pandas.DataFrame
        Columns ``latitude``, ``longitude``, ``start_time``, ``end_time``.
        The time columns hold the raw "startTime"/"endTime" values of the
        entry, or "" when an entry does not have them.
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    def _visit_rows():
        # Yield one (lat, lon, start, end) tuple per usable visit entry.
        for item in entries:
            if "visit" not in item:
                continue
            candidate = item["visit"].get("topCandidate", {})
            if "placeLocation" not in candidate:
                continue
            # "geo:38.03,-78.51" -> 38.03, -78.51
            geo = candidate["placeLocation"].replace("geo:", "")
            lat, lon = (float(part) for part in geo.split(","))
            yield lat, lon, item.get("startTime", ""), item.get("endTime", "")

    return pd.DataFrame(
        list(_visit_rows()),
        columns=["latitude", "longitude", "start_time", "end_time"],
    )


# Parse the export at `json_file` into one row per visit and preview the first rows.
df = load_google_timeline(json_file)
df.head()

Unnamed: 0,latitude,longitude,start_time,end_time
0,38.032001,-78.510506,2025-01-14T13:33:50.659-05:00,2025-01-14T13:54:32.999-05:00
1,38.033553,-78.507977,2025-01-14T14:00:49.000-05:00,2025-01-14T15:20:06.394-05:00
2,38.030154,-78.503874,2025-01-14T14:00:49.000-05:00,2025-01-14T15:20:06.394-05:00
3,38.034093,-78.508209,2025-01-14T16:43:27.066-05:00,2025-01-14T18:52:58.904-05:00
4,38.033553,-78.507977,2025-01-14T19:00:02.783-05:00,2025-01-14T21:13:26.999-05:00


In [8]:
# Convert the timestamp strings to datetimes and derive how long each visit lasted.
# NOTE(review): the previewed rows all carry a -05:00 offset; if the export mixes
# UTC offsets (e.g. across a DST change) pd.to_datetime may need utc=True — confirm.
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
df['duration_time'] = df['end_time'] - df['start_time']
df.head()

Unnamed: 0,latitude,longitude,start_time,end_time,duration_time
0,38.032001,-78.510506,2025-01-14 13:33:50.659000-05:00,2025-01-14 13:54:32.999000-05:00,0 days 00:20:42.340000
1,38.033553,-78.507977,2025-01-14 14:00:49-05:00,2025-01-14 15:20:06.394000-05:00,0 days 01:19:17.394000
2,38.030154,-78.503874,2025-01-14 14:00:49-05:00,2025-01-14 15:20:06.394000-05:00,0 days 01:19:17.394000
3,38.034093,-78.508209,2025-01-14 16:43:27.066000-05:00,2025-01-14 18:52:58.904000-05:00,0 days 02:09:31.838000
4,38.033553,-78.507977,2025-01-14 19:00:02.783000-05:00,2025-01-14 21:13:26.999000-05:00,0 days 02:13:24.216000


### Find Clusters based on Longitude and Latitude

In [None]:
def cluster_locations(df, eps=50, min_samples=2):
    """Label every row with a DBSCAN cluster id computed from its coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns in degrees
        (they are converted to radians before clustering).
    eps : float, default 50
        Neighbourhood radius in metres. It is divided by the Earth radius
        (6,371,000 m) because the haversine metric works on radians.
    min_samples : int, default 2
        Forwarded to ``DBSCAN``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added integer ``cluster`` column holding
        ``DBSCAN.labels_`` (-1 marks noise). The input frame is left
        untouched so re-running the cell never sees a pre-labelled frame.
    """
    earth_radius_m = 6371000
    clustered = df.copy()

    if clustered.empty:
        # DBSCAN refuses to fit zero samples; an empty input simply has no labels.
        clustered["cluster"] = np.empty(0, dtype=np.int64)
        return clustered

    coords_rad = np.radians(clustered[["latitude", "longitude"]].to_numpy(dtype=float))
    db = DBSCAN(
        eps=eps / earth_radius_m,
        min_samples=min_samples,
        metric="haversine",
    ).fit(coords_rad)

    clustered["cluster"] = db.labels_
    return clustered

# Label every visit with its cluster id, using the function defaults (eps=50, min_samples=2).
df_clusters = cluster_locations(df)
# Coerce duration_time to timedelta dtype. It was computed earlier as
# end_time - start_time, so this is presumably a safeguard — TODO confirm it is needed.
df_clusters["duration_time"] = pd.to_timedelta(df_clusters["duration_time"])

In [None]:
# Persist the labelled visits. No index argument is passed, so the DataFrame
# index is written as the first (unnamed) CSV column.
df_clusters.to_csv(cluster_counts_file_path)

In [12]:
import folium

def plot_clusters_folium(df, center=(38.028028, -78.508516), zoom_start=14):
    """Draw every row of ``df`` as a circle marker on a folium map, colored by cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude``, ``longitude`` and ``cluster`` columns.
    center : sequence of two floats, default (38.028028, -78.508516)
        ``[lat, lon]`` the map is initially centered on.
    zoom_start : int, default 14
        Initial zoom level passed to ``folium.Map``.

    Returns
    -------
    folium.Map
        The map with one ``CircleMarker`` per row of ``df``.
    """
    m = folium.Map(location=list(center), zoom_start=zoom_start)

    # Noise (label -1) is always gray; every other cluster gets a random hex color.
    # No seed is set here, so colors differ between calls.
    cluster_colors = {
        -1: "gray"  # Noise
    }
    hex_digits = list('0123456789ABCDEF')
    for cluster in df["cluster"].unique():
        if cluster not in cluster_colors:
            cluster_colors[cluster] = f"#{''.join(np.random.choice(hex_digits, 6))}"

    # adding points to a map
    for _, row in df.iterrows():
        color = cluster_colors[row["cluster"]]
        folium.CircleMarker(
            location=[row["latitude"], row["longitude"]],
            radius=5,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6,
        ).add_to(m)

    return m

# Show every visit on the map, colored by its cluster label (gray = noise).
plot_clusters_folium(df_clusters)


  cmap = plt.cm.get_cmap("tab10", len(unique_clusters))


In [13]:
# Number of distinct values in the cluster column; the noise label -1 counts
# as one of them when any visit was left unclustered.
print(df_clusters['cluster'].nunique())

34


In [23]:
# Drop noise (label -1), then summarise each cluster by its mean position
# and the total time spent there; the summary is saved without the index.
is_noise = df_clusters["cluster"] == -1
df_filtered = df_clusters[~is_noise]
by_cluster = df_filtered.groupby("cluster")
cluster_durations = by_cluster["duration_time"].sum().reset_index()
cluster_means = (
    by_cluster[["latitude", "longitude"]]
    .mean()
    .reset_index()
    .merge(cluster_durations, on="cluster")
)
cluster_means.to_csv(cluster_data_file_path, index=False)

In [24]:
# plot ONLY the cluster means on the map: cluster_means holds one row per
# cluster label, so this draws a single marker per cluster
plot_clusters_folium(cluster_means)

  cmap = plt.cm.get_cmap("tab10", len(unique_clusters))


### Use Google Places API to find Location Names

In [27]:
import requests

# One-off check of the searchNearby endpoint with a single fixed coordinate.
# Define the API endpoint
url = "https://places.googleapis.com/v1/places:searchNearby"

# Define the headers
headers = {
    "Content-Type": "application/json",
    "X-Goog-Api-Key": GOOGLE_API_KEY,  # read from the environment in the first cell
    "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.types,places.id"
}

# Define the payload: at most one place inside a circle of radius 20.0 around the point
data = {
    "maxResultCount": 1,
    "locationRestriction": {
        "circle": {
            "center": {
                "latitude": 38.904423,
                "longitude": -77.2036175000000
            },
            "radius": 20.0
        }
    }
}
# Make the POST request; the timeout keeps a stalled connection from hanging the kernel
response = requests.post(url, headers=headers, json=data, timeout=10)
print(response.json())

{'places': [{'id': 'ChIJHX0M4tdKtokRu7PPetQjRTY', 'types': ['grocery_store', 'market', 'liquor_store', 'supermarket', 'food_store', 'store', 'food', 'point_of_interest', 'health', 'establishment'], 'formattedAddress': '7514 Leesburg Pike, Falls Church, VA 22043, USA', 'displayName': {'text': "Trader Joe's", 'languageCode': 'en'}}]}


In [16]:
def get_place_info(lat, lon, radius=20.0, max_results=1, timeout=10):
    """Query the Places searchNearby endpoint for places around a coordinate.

    Parameters
    ----------
    lat, lon : float
        Center of the search circle.
    radius : float, default 20.0
        Radius of the search circle sent as ``locationRestriction.circle.radius``.
    max_results : int, default 1
        Sent as ``maxResultCount``.
    timeout : float, default 10
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    dict
        The decoded JSON response body. Callers read the ``places`` key from it.

    Raises
    ------
    RuntimeError
        If ``GOOGLE_API_KEY`` is empty or ``None``.
    requests.HTTPError
        If the API answers with an error status. Without this check an
        auth or quota error would look like "no places found" to the caller.
    """
    if not GOOGLE_API_KEY:
        raise RuntimeError("GOOGLE_API_KEY is not set; add it to your .env file")

    url = "https://places.googleapis.com/v1/places:searchNearby"

    # Define the headers
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_API_KEY,
        "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.types,places.id"
    }

    # Define the payload; float() keeps non-native numeric scalars JSON-serializable
    data = {
        "maxResultCount": max_results,
        "locationRestriction": {
            "circle": {
                "center": {
                    "latitude": float(lat),
                    "longitude": float(lon)
                },
                "radius": radius
            }
        }
    }
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    response.raise_for_status()
    return response.json()


In [25]:
# Reload the saved cluster summary and look up the places around each cluster
# center; one record is collected per place the API returns, so a cluster
# with no returned places contributes no rows.
cluster_means = pd.read_csv(cluster_data_file_path)
results = []

for _, center in cluster_means.iterrows():
    nearby = get_place_info(center['latitude'], center['longitude'])

    # Store the relevant info for every returned place
    results.extend(
        {
            'cluster_id': center['cluster'],
            'latitude': center['latitude'],
            'longitude': center['longitude'],
            'total_duration_time': center['duration_time'],
            'place_id': place.get('id', ''),
            'place_types': place.get('types', []),
            'place_address': place.get('formattedAddress', ''),
            'place_name': place.get('displayName', {}).get('text', ''),
        }
        for place in nearby.get('places', [])
    )


place_info_df = pd.DataFrame(results)
place_info_df.head()

Unnamed: 0,cluster_id,latitude,longitude,total_duration_time,place_id,place_types,place_address,place_name
0,0,38.032009,-78.510529,2 days 05:55:57.386000,ChIJp622Q1uGs4kRB5yTaU44YRg,"[point_of_interest, establishment]","151 Engineer's Way, Charlottesville, VA 22904,...",Olsson Hall
1,1,38.033553,-78.507977,2 days 01:42:09.514000,ChIJ3cRJt0iGs4kRRliLzMIIUUA,"[university, point_of_interest, establishment]","Charlottesville, VA, USA",University of Virginia
2,2,38.030154,-78.503874,0 days 05:01:15.533000,ChIJgyKJHESGs4kRGrGJEdGoZuk,"[university, point_of_interest, establishment]","550 Brandon Ave, Charlottesville, VA 22903, USA",UVA Department of Student Health and Wellness
3,4,38.032882,-78.513558,0 days 09:33:52.165000,ChIJpyGb7luGs4kRIMwkg1whbHk,"[gym, event_venue, athletic_field, swimming_po...","450 Whitehead Rd, Charlottesville, VA 22904, USA",Aquatic & Fitness Center
4,5,38.033871,-78.498767,0 days 01:09:18.596000,ChIJ-ZvRdziGs4kRqL3TPH5X5ZM,"[pizza_restaurant, sandwich_shop, catering_ser...","1321 W Main St, Charlottesville, VA 22903, USA",Mellow Mushroom Charlottesville


In [26]:
# Save the place lookups. No index argument is passed, so the DataFrame index
# is written as the first (unnamed) CSV column.
place_info_df.to_csv(places_data_file_path)