# Process Google Timeline Data

In [1]:
import os
from dotenv import load_dotenv

# VARIABLES
folder = "CB_data" # ONLY need to change this value

json_file = folder + "/location-history.json"
cluster_counts_file_path = folder + "/cluster_counts.csv"
cluster_data_file_path = folder + "/cluster_means.csv"
places_data_file_path = folder + "/places_info.csv"

load_dotenv()
# Make sure to create a .env file and store your API KEY
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [2]:
import json
import pandas as pd
import numpy as np
import requests
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from datetime import datetime
import matplotlib.pyplot as plt

### Load Data

In [4]:
def load_google_timeline(json_file):
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)
    
    locations = []
    for entry in data:
        if "visit" in entry:
            visit_info = entry["visit"]
            top_candidate = visit_info.get("topCandidate", {})

            if "placeLocation" in top_candidate:
                latlon_str = top_candidate["placeLocation"].replace("geo:", "")
                lat, lon = map(float, latlon_str.split(",")) 
                
                start_time = entry.get("startTime", "")
                end_time = entry.get("endTime", "")

                locations.append((lat, lon, start_time, end_time))

    return pd.DataFrame(locations, columns=["latitude", "longitude", "start_time", "end_time"])


df = load_google_timeline(json_file)
df.head()

Unnamed: 0,latitude,longitude,start_time,end_time
0,38.033553,-78.507977,2025-01-14T16:50:33.999-05:00,2025-01-15T09:24:39.627-05:00
1,38.031619,-78.510846,2025-01-15T09:25:47.488-05:00,2025-01-15T10:53:54.217-05:00
2,38.029297,-78.510103,2025-01-15T10:53:54.217-05:00,2025-01-15T12:13:04.349-05:00
3,38.031619,-78.510846,2025-01-15T12:13:04.349-05:00,2025-01-15T15:18:36.354-05:00
4,38.033553,-78.507977,2025-01-15T20:19:08.968-05:00,2025-01-15T21:12:17.457-05:00


### Find Clusters based on Longitude and Latitude

In [5]:
def cluster_locations(df, eps=50, min_samples=2):
    coords = df[["latitude", "longitude"]].values
    db = DBSCAN(eps=eps / 6371000, min_samples=min_samples, metric="haversine").fit(np.radians(coords))
    
    df["cluster"] = db.labels_
    return df

df_clusters = cluster_locations(df)

In [6]:
df_clusters.to_csv(cluster_counts_file_path)

In [8]:
import folium

def plot_clusters_folium(df):
    # map_center = [df["latitude"].mean(), df["longitude"].mean()]
    m = folium.Map(location=[38.028028, -78.508516], zoom_start=14)

    # Define colors for clusters
    cluster_colors = {
        -1: "gray"  # Noise
    }
    unique_clusters = df["cluster"].unique()
    cmap = plt.cm.get_cmap("tab10", len(unique_clusters))

    for i, cluster in enumerate(unique_clusters):
        if cluster not in cluster_colors:
            cluster_colors[cluster] = f"#{''.join(np.random.choice(list('0123456789ABCDEF'), 6))}" 
    
    # adding points to a map
    for _, row in df.iterrows():
        folium.CircleMarker(
            location=[row["latitude"], row["longitude"]],
            radius=5,
            color=cluster_colors[row["cluster"]],
            fill=True,
            fill_color=cluster_colors[row["cluster"]],
            fill_opacity=0.6,
        ).add_to(m)

    return m

plot_clusters_folium(df_clusters)


  cmap = plt.cm.get_cmap("tab10", len(unique_clusters))


In [9]:
print(df_clusters['cluster'].nunique())

43


In [10]:
df_filtered = df_clusters[df_clusters["cluster"] != -1]
cluster_means = df_filtered.groupby("cluster")[["latitude", "longitude"]].mean().reset_index()
cluster_means.to_csv(cluster_data_file_path, index=False)

In [11]:
# plot ONLY the cluster means on the map
plot_clusters_folium(cluster_means)

  cmap = plt.cm.get_cmap("tab10", len(unique_clusters))


### Use Google Places API to find Location Names

In [13]:
import requests

# Define the API endpoint
url = "https://places.googleapis.com/v1/places:searchNearby"

# Define the headers
headers = {
    "Content-Type": "application/json",
    "X-Goog-Api-Key": GOOGLE_API_KEY,  # Replace with your actual API key
    "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.types,places.id"
}

# Define the payload
data = {
    "maxResultCount": 1,
    "locationRestriction": {
        "circle": {
            "center": {
                "latitude": 38.904423,
                "longitude": -77.2036175000000
            },
            "radius": 20.0
        }
    }
}
# Make the POST request
response = requests.post(url, headers=headers, json=data)
print(response.json())

{'places': [{'id': 'ChIJHX0M4tdKtokRu7PPetQjRTY', 'types': ['grocery_store', 'supermarket', 'liquor_store', 'food_store', 'food', 'market', 'health', 'point_of_interest', 'store', 'establishment'], 'formattedAddress': '7514 Leesburg Pike, Falls Church, VA 22043, USA', 'displayName': {'text': "Trader Joe's", 'languageCode': 'en'}}]}


In [14]:
def get_place_info(lat, lon):
    url = "https://places.googleapis.com/v1/places:searchNearby"

    # Define the headers
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": GOOGLE_API_KEY,
        "X-Goog-FieldMask": "places.displayName,places.formattedAddress,places.types,places.id"
    }

    # Define the payload
    data = {
        "maxResultCount": 1,
        "locationRestriction": {
            "circle": {
                "center": {
                    "latitude": lat,
                    "longitude": lon
                },
                "radius": 20.0
            }
        }
    }
    response = requests.post(url, headers=headers, json=data)
    return response.json()


In [15]:
cluster_means = pd.read_csv(cluster_data_file_path)
results = []

for index, row in cluster_means.iterrows():
    lat = row['latitude']
    lon = row['longitude']
    cluster_id = row['cluster']
    
    place_data = get_place_info(lat, lon)
    
    places_info = place_data.get('places', [])
    
    # Loop through the places and store the relevant info
    for place in places_info:
        results.append({
            'cluster_id': cluster_id,
            'latitude': lat,
            'longitude': lon,
            'place_id': place.get('id', ''),
            'place_types': place.get('types', []),
            'place_address': place.get('formattedAddress', ''),
            'place_name': place.get('displayName', {}).get('text', '')
        })


place_info_df = pd.DataFrame(results)
place_info_df.head()

Unnamed: 0,cluster_id,latitude,longitude,place_id,place_types,place_address,place_name
0,1.0,38.031619,-78.510846,ChIJB1s6kd6Hs4kRJhuNmO73ZLk,"[point_of_interest, establishment]","85 Engineer's Way, Charlottesville, VA 22903, USA",Rice Hall
1,2.0,38.029297,-78.510098,ChIJjR--lIeHs4kRWTwypENqnhk,"[apartment_building, point_of_interest, establ...","112 Montebello Cir, Charlottesville, VA 22903,...",112 Montebello Circle Apartments
2,3.0,38.031103,-78.513688,ChIJ83CLDFyGs4kRlpbwbea2AWc,"[stadium, sports_complex, event_venue, sports_...","1815 Stadium Rd, Charlottesville, VA 22903, USA",Scott Stadium
3,4.0,38.052777,-78.501302,ChIJI5T2nbeHs4kRuLewu_1gO8g,"[shopping_mall, point_of_interest, establishment]","1117 Emmet St N, Charlottesville, VA 22903, USA",Barracks Road Shopping Center
4,5.0,38.069505,-78.48403,ChIJkchPWniHs4kR80VQjgFxijU,"[fast_food_restaurant, gas_station, sandwich_s...","1215 Seminole Trail, Charlottesville, VA 22901...",Wawa


In [16]:
place_info_df.to_csv(places_data_file_path)