# Process Google Timeline Data

In [11]:
import os
from dotenv import load_dotenv

# VARIABLES
# Input: the Google Timeline export that load_google_timeline() parses.
json_file = "EZ_data/location-history.json"
# Output: CSV that receives the per-cluster mean latitude/longitude.
cluster_data_file_path = "EZ_data/cluster_means.csv"

# Presumably loads variables from a local .env file into the process
# environment before the lookup below (python-dotenv) — confirm a .env exists.
load_dotenv()
# os.getenv returns None when GOOGLE_API_KEY is not set in the environment.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [2]:
import json
import pandas as pd
import numpy as np
import requests
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from datetime import datetime
import matplotlib.pyplot as plt

### Load Data

In [3]:
def load_google_timeline(json_file):
    """Read a Google Timeline JSON export and tabulate its visit entries.

    Only entries that carry a "visit" key whose "topCandidate" has a
    "placeLocation" are kept. The location string has its "geo:" prefix
    removed and is split on "," into latitude and longitude floats.

    Args:
        json_file: Path of the UTF-8 encoded JSON export.

    Returns:
        pandas.DataFrame with the columns "latitude", "longitude",
        "start_time" and "end_time"; the two time columns hold the entry's
        "startTime" / "endTime" strings, or "" when the key is absent.
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    def visit_rows():
        # Yield one (lat, lon, start, end) tuple per usable visit entry.
        for record in entries:
            if "visit" not in record:
                continue
            candidate = record["visit"].get("topCandidate", {})
            if "placeLocation" not in candidate:
                continue

            coord_text = candidate["placeLocation"].replace("geo:", "")
            lat, lon = (float(part) for part in coord_text.split(","))
            yield lat, lon, record.get("startTime", ""), record.get("endTime", "")

    column_names = ["latitude", "longitude", "start_time", "end_time"]
    return pd.DataFrame(list(visit_rows()), columns=column_names)


# Parse the export configured in `json_file` into one row per usable visit.
df = load_google_timeline(json_file)
# Last expression of the cell: shows the first rows as the cell output.
df.head()

Unnamed: 0,latitude,longitude,start_time,end_time
0,38.028028,-78.508516,2025-02-12T17:56:54.000-05:00,2025-02-13T10:44:48.307-05:00
1,38.031619,-78.510846,2025-02-13T10:51:39.463-05:00,2025-02-13T12:31:04.514-05:00
2,38.031619,-78.510846,2025-02-13T10:51:39.463-05:00,2025-02-13T11:03:44.537-05:00
3,38.028028,-78.508516,2025-02-13T12:36:02.657-05:00,2025-02-13T16:04:27.150-05:00
4,38.052815,-78.501257,2025-02-13T16:15:59.999-05:00,2025-02-13T17:02:51.018-05:00


### Find Clusters based on Longitude and Latitude

In [4]:
def cluster_locations(df, eps=50, min_samples=2):
    """Label each row of ``df`` with a DBSCAN cluster id based on its position.

    Coordinates are converted to radians and clustered with the haversine
    metric, so ``eps`` is given in metres and divided by the Earth radius to
    obtain the angular distance DBSCAN expects.

    Args:
        df: DataFrame with "latitude" and "longitude" columns in degrees.
        eps: Neighbourhood radius in metres.
        min_samples: Minimum number of points DBSCAN needs to form a cluster.

    Returns:
        The same ``df`` object, with a "cluster" column added in place
        (-1 is DBSCAN's label for noise). An empty ``df`` is returned with an
        empty "cluster" column instead of raising inside DBSCAN.
    """
    earth_radius_m = 6371000  # mean Earth radius, metres

    coords = df[["latitude", "longitude"]].values
    if len(coords) == 0:
        # DBSCAN.fit rejects an input with zero samples; there is nothing to
        # cluster, so just add the (empty) label column.
        df["cluster"] = np.empty(0, dtype=np.intp)
        return df

    db = DBSCAN(
        eps=eps / earth_radius_m,
        min_samples=min_samples,
        metric="haversine",
    ).fit(np.radians(coords))

    df["cluster"] = db.labels_
    return df

# cluster_locations() writes the "cluster" column onto `df` itself and returns
# that same object, so `df_clusters` and `df` refer to one DataFrame.
df_clusters = cluster_locations(df)

In [14]:
import folium

def plot_clusters_folium(df, center=None, zoom_start=14):
    """Draw every row of ``df`` as a circle marker on a folium map, coloured by cluster.

    Args:
        df: DataFrame with "latitude", "longitude" and "cluster" columns.
        center: Optional ``[lat, lon]`` for the initial map centre. When
            omitted, the fixed point this notebook has always used is kept.
        zoom_start: Initial zoom level handed to ``folium.Map``.

    Returns:
        The ``folium.Map`` with one ``CircleMarker`` per row.

    Notes:
        Label -1 is drawn in gray (noise). Every other label gets a random
        hex colour drawn from NumPy's global random state, so colours change
        between runs unless that state is seeded by the caller.
    """
    if center is None:
        center = [38.028028, -78.508516]
    m = folium.Map(location=center, zoom_start=zoom_start)

    # Define colors for clusters
    cluster_colors = {
        -1: "gray"  # Noise
    }
    hex_digits = list("0123456789ABCDEF")
    # No matplotlib colormap is built here: it was never used for colouring,
    # and plt.cm.get_cmap is deprecated and absent from newer Matplotlib.
    for cluster in df["cluster"].unique():
        if cluster not in cluster_colors:
            cluster_colors[cluster] = f"#{''.join(np.random.choice(hex_digits, 6))}"

    # adding points to a map
    for _, row in df.iterrows():
        color = cluster_colors[row["cluster"]]
        folium.CircleMarker(
            location=[row["latitude"], row["longitude"]],
            radius=5,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6,
        ).add_to(m)

    return m

# Show every visit on the map, coloured by its cluster label.
plot_clusters_folium(df_clusters)


  cmap = plt.cm.get_cmap("tab10", len(unique_clusters))


In [9]:
# Number of distinct values in the "cluster" column; the noise label (-1),
# when present, is counted as one of them.
print(df_clusters['cluster'].nunique())

34


In [13]:
# Keep only the rows assigned to a real cluster (label -1 marks noise).
df_filtered = df_clusters.loc[df_clusters["cluster"].ne(-1)]

# One row per cluster: mean latitude/longitude of its member rows.
cluster_means = (
    df_filtered
    .groupby("cluster")[["latitude", "longitude"]]
    .mean()
    .reset_index()
)

# Persist the cluster centres without the positional index.
cluster_means.to_csv(cluster_data_file_path, index=False)

In [None]:
# plot ONLY the cluster means on the map
# (cluster_means holds one row per non-noise cluster, so each marker is a cluster centre)
plot_clusters_folium(cluster_means)

  cmap = plt.cm.get_cmap("tab10", len(unique_clusters))


### Use Google Places API to find Location Names

In [None]:
import requests

# Define the API endpoint (Nearby Search on places.googleapis.com)
url = "https://places.googleapis.com/v1/places:searchNearby"

# Define the headers
headers = {
    "Content-Type": "application/json",
    # Use the key read from the environment in the configuration cell rather
    # than a literal placeholder, so no credential is ever typed into the notebook.
    "X-Goog-Api-Key": GOOGLE_API_KEY,
    # Restrict the response to the display name of each place.
    "X-Goog-FieldMask": "places.displayName"
}

# Define the payload: up to 10 restaurants within 500 m of the sample point
data = {
    "includedTypes": ["restaurant"],
    "maxResultCount": 10,
    "locationRestriction": {
        "circle": {
            "center": {
                "latitude": 37.7937,
                "longitude": -122.3965
            },
            "radius": 500.0
        }
    }
}

# Make the POST request; the timeout keeps the cell from hanging indefinitely
# when the service cannot be reached.
response = requests.post(url, headers=headers, json=data, timeout=10)

In [None]:
# GOOGLE_API_KEY
def get_place_info(lat, lon):
    """Query the Google Places Nearby Search endpoint for the place at a point.

    Args:
        lat: Latitude in degrees.
        lon: Longitude in degrees.

    Returns:
        ``(name, place_type)`` taken from the first search result within 50 m,
        where ``place_type`` is the first entry of that result's "types" list.
        ``("Unknown", "Unknown")`` when the response has no results; a missing
        name or an empty/missing type list yields "Unknown" for that element.

    Raises:
        RuntimeError: if ``GOOGLE_API_KEY`` is empty or unset.
        requests.RequestException: on network failure or timeout.
    """
    # The notebook's configuration cell stores the key as GOOGLE_API_KEY.
    if not GOOGLE_API_KEY:
        raise RuntimeError("GOOGLE_API_KEY is not set; cannot query the Places API")

    response = requests.get(
        "https://maps.googleapis.com/maps/api/place/nearbysearch/json",
        # Let requests build and encode the query string.
        params={"location": f"{lat},{lon}", "radius": 50, "key": GOOGLE_API_KEY},
        timeout=10,
    )
    results = response.json().get("results") or []
    if not results:
        return "Unknown", "Unknown"

    place = results[0]
    # `or` also covers a present-but-empty "types" list.
    place_types = place.get("types") or ["Unknown"]
    return place.get("name", "Unknown"), place_types[0]


def label_significant_places(df):
    """Attach a place name and place type to every row, looked up once per cluster.

    For each cluster label other than -1, the mean latitude/longitude of that
    cluster's rows is passed to ``get_place_info`` and the returned pair is
    remembered for the label. Rows whose label has no remembered pair
    (including -1) receive "Unknown" in both columns.

    The "place_name" and "place_type" columns are written onto ``df`` itself,
    which is also the return value.
    """
    fallback = ("Unknown", "Unknown")
    labels_by_cluster = {}

    is_clustered = df["cluster"] != -1
    for cluster_id in df.loc[is_clustered, "cluster"].unique():
        members = df.loc[df["cluster"] == cluster_id, ["latitude", "longitude"]]
        centre_lat, centre_lon = members.mean()
        place_name, place_kind = get_place_info(centre_lat, centre_lon)
        labels_by_cluster[cluster_id] = (place_name, place_kind)

    def pick(position):
        # Build a mapper that returns one element of a cluster's (name, type) pair.
        return lambda cluster_id: labels_by_cluster.get(cluster_id, fallback)[position]

    df["place_name"] = df["cluster"].map(pick(0))
    df["place_type"] = df["cluster"].map(pick(1))

    return df

In [None]:
# === Step 4: Evaluate Accuracy ===
def evaluate_accuracy(df, ground_truth):
    """Compare each cluster's most frequent place type with a known label.

    Args:
        df: DataFrame with "cluster" and "place_type" columns.
        ground_truth: Mapping ``{cluster_id: expected_place_type}``.

    Returns:
        Fraction of ground-truth entries whose expected label equals the most
        frequent "place_type" of that cluster's rows; 0 when ``ground_truth``
        is empty. The value is also printed as a percentage.

    A ground-truth cluster that has no rows in ``df`` (or only missing place
    types) counts as a miss instead of raising.
    """
    total = len(ground_truth)
    correct = 0

    for cluster, actual_label in ground_truth.items():
        predicted = df.loc[df["cluster"] == cluster, "place_type"].mode()
        # mode() is empty when the selection is empty, so guard before
        # reading its first element.
        if not predicted.empty and predicted.iloc[0] == actual_label:
            correct += 1

    accuracy = correct / total if total > 0 else 0
    print(f"Accuracy: {accuracy:.2%}")
    return accuracy

In [None]:
# Adds "place_name" / "place_type" columns to `df`; get_place_info is called
# once per non-noise cluster, so `df` must already carry a "cluster" column.
df = label_significant_places(df)

# NOTE(review): this prints the whole frame rather than a preview.
print(df[["latitude", "longitude", "place_name", "place_type"]])

# If you have known places for accuracy testing, format them as {cluster_id: "expected_place_type"}
ground_truth = {0: "home", 1: "work", 2: "gym"}  # Example manual labels
# NOTE(review): these labels are compared for equality against the
# "place_type" values returned by get_place_info — confirm both use the same vocabulary.
evaluate_accuracy(df, ground_truth)