## [Description](#Description_)
## [Research](#Research_)
## [Todo](#Todo_)
## [Setup](#Setup_)
## [Imports](#Imports_)
## [Config](#Config_)
## [Utils](#Utils_)
- ### [Markdown](#Markdown_)
## [Data](#Data_)
- ### [Data exploration](#Data_exploration_)
## [Metrics](#Metrics_)
## [Model](#Model_)
## [Training](#Training_)
## [Results](#Results_)

## Description <span id=Description_></span>

### [Kaggle contest](https://www.kaggle.com/competitions/smartphone-decimeter-2023)

## Research <span id=Research_></span>

## Todo <span id=Todo_></span>

## Setup <span id=Setup_></span>

In [None]:
#!pip install -qqq

## Imports <span id=Imports_></span>

In [2]:
import collections
import dataclasses
import functools
import glob
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sys
import torch
from torch import Tensor
from typing import Callable, Literal, TypeVar
import tqdm

## Config <span id=Config_></span>

In [197]:
@dataclasses.dataclass(frozen=True, kw_only=True)
class Config:
    competition = "smartphone-decimeter-2023"
    data_directory = "data"

## Utils <span id=Utils_></span>

### Markdown <span id=Markdown_></span>

In [1]:
def make_new_markdown_section_with_link(
    section: str, header_size: int = 2
) -> tuple[str, str]:
    header = "#" * header_size
    section_id = section.replace(" ", "_") + "_"
    section_link = f"{header} [{section}](#{section_id})"
    section_header = f"{header} {section} <span id={section_id}></span>"
    return section_link, section_header

In [196]:
def save_kaggle_api_key(api_token: dict, kaggle_directory: str = "~/.kaggle"):
    """
    Given the api token downloaded from https://www.kaggle.com/settings/account,
    saves it, allowing to use the kaggle cli.
    """
    
    kaggle_directory = os.path.expanduser(kaggle_directory)
    os.makedirs(kaggle_directory, exist_ok=True)
    api_token_json = os.path.join(kaggle_directory, "kaggle.json")
    with open(api_token_json, "w") as file:
        json.dump(api_token, file)
    
    os.chmod(api_token_json, 0o600)
    
def kaggle_competitions_search(search_term):
    !kaggle competitions list -s {search_term}


def kaggle_competitions_files(competition):
    !kaggle competitions files {competition}


def kaggle_competitions_download(competition, save_path="data", filename=None):
    os.mkdir(save_path)
    !kaggle competitions download -p {save_path} {"-f " + filename if filename else ""} {competition}

    
def kaggle_competitions_download_file(competition:str, filename:str, save_path:str):
    relative_filename = os.path.join(save_path, filename)
    save_path = os.path.join(save_path, os.path.split(filename)[0])
    if os.path.exists(relative_filename):
        print(f"File `{relative_filename}` already exists.")
    else:
        !kaggle competitions download {competition} -f {filename} -p {save_path}
        zip_relative_filename = relative_filename + ".zip"
        if os.path.exists(zip_relative_filename):
            unzip(zip_relative_filename, save_path=save_path, delete_zip=True)
            
            
def kaggle_competitions_submit(competition, filename, message="submit"):
    !kaggle competitions submit -f {filename} -m {message} {competition}


def kaggle_competitions_submissions(competition):
    !kaggle competitions submissions {competition}

## Data <span id=Data_></span>

https://www.kaggle.com/competitions/smartphone-decimeter-2023/data

Data gathering procedure: https://www.kaggle.com/datasets/google/android-smartphones-high-accuracy-datasets

GNSS to dataframe: https://www.kaggle.com/code/sohier/loading-gnss-logs/notebook

In [53]:
if not os.path.exists(Config.data_directory):
    kaggle_competitions_download(Config.competition)

### Data structure

In [226]:
def create_data_files_dataframe(path):
    data = []
    for split in ["train", "test"]:
        split_path = pathlib.Path(path) / "sdc2023" / split
        for drive_id in os.listdir(split_path):
            drive_path = split_path / drive_id
            for phone_name in os.listdir(drive_path):
                datum_path = drive_path / phone_name
                device_gnss = str(datum_path / "device_gnss.csv")
                device_imu = str(datum_path / "device_imu.csv")
                ground_truth = str(datum_path / "ground_truth.csv") if split == "train" else None
                data.append([split, drive_id, phone_name, device_gnss, device_imu, ground_truth])

    df = pd.DataFrame(
        data=data,
        columns=[
            "split",
            "drive_id",
            "phone_name",
            "device_gnss",
            "device_imu",
            "ground_truth",
        ]
    )
    df.name = path
    return df

In [227]:
df = create_data_files_dataframe(Config.data_directory)

In [247]:
pd.read_csv(df[df["split"] == "train"].iloc[10]["ground_truth"])

Unnamed: 0,MessageType,Provider,LatitudeDegrees,LongitudeDegrees,AltitudeMeters,SpeedMps,AccuracyMeters,BearingDegrees,UnixTimeMillis,SpeedAccuracyMps,BearingAccuracyDegrees,elapsedRealtimeNanos,VerticalAccuracyMeters
0,Fix,GT,37.317900,-121.948512,11.530272,0.001000,0.1,0.882980,1607640762435,,,,
1,Fix,GT,37.317900,-121.948512,11.529630,0.000908,0.1,0.882782,1607640763435,,,,
2,Fix,GT,37.317900,-121.948512,11.529271,0.003716,0.1,0.882991,1607640764435,,,,
3,Fix,GT,37.317900,-121.948512,11.528630,0.001148,0.1,0.882425,1607640765435,,,,
4,Fix,GT,37.317900,-121.948512,11.528630,0.002821,0.1,0.883073,1607640766435,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402,Fix,GT,37.317901,-121.948373,10.931630,0.000908,0.1,357.867900,1607642164435,,,,
1403,Fix,GT,37.317901,-121.948373,10.931630,0.001000,0.1,357.867740,1607642165435,,,,
1404,Fix,GT,37.317901,-121.948373,10.931630,0.001000,0.1,357.867950,1607642166435,,,,
1405,Fix,GT,37.317901,-121.948373,10.931630,0.000642,0.1,357.868070,1607642167435,,,,


In [None]:
# This problem can be classified as a time series one.
# Not exactly forecasting, but rather as finding a (reversible) transform between two time series
# of different fidelities, with missing values and some known fixed context. Forward transform exists
# and may be accurately formulated analyticaly, given enough domain knowledge. A machine learning approach
# shouldn't depend on this knowledge much, but rather on having enough data. Reverse transform may even exist,
# but we are tasked with creating an approximation of it.

# model: object in space with limited acceleration forces
# (different for vertical, sideways, parallel expressed via limits on second derivative)
# in spherical-like coordinates: LatitudeDegrees, LongitudeDegrees, AltitudeMeters
# ignore accuracyMeters for now
# + context: phone model, time, device imus type

# car -> path
# path, context -> satellite data, device imus
# predict: satellite data, device imus, context -> path
# where context is constant, rest is time-dynamic

# How to deal with missing data? Impute or use a robust model, such as recursive one.

In [248]:
# Todo:
# Do not spend time on domain-specific stuff, focus on methods that can be used in other problems
# Read about Kalman filter
# Read kaggle discussions
# Copy and adapt baselines from kaggle
# Come up with own model

1

In [237]:
pd.read_csv("data/sdc2023/sample_submission.csv")

Unnamed: 0,tripId,UnixTimeMillis,LatitudeDegrees,LongitudeDegrees
0,2020-12-11-19-30-us-ca-mtv-e/pixel4xl,1607715055442,34.640195,-120.589642
1,2020-12-11-19-30-us-ca-mtv-e/pixel4xl,1607715056442,34.640195,-120.589642
2,2020-12-11-19-30-us-ca-mtv-e/pixel4xl,1607715057442,34.640195,-120.589642
3,2020-12-11-19-30-us-ca-mtv-e/pixel4xl,1607715058442,34.640195,-120.589642
4,2020-12-11-19-30-us-ca-mtv-e/pixel4xl,1607715059442,34.640195,-120.589642
...,...,...,...,...
71931,2023-06-15-18-49-us-ca-sjc-ce1/pixel7pro,1686856468000,34.640195,-120.589642
71932,2023-06-15-18-49-us-ca-sjc-ce1/pixel7pro,1686856469000,34.640195,-120.589642
71933,2023-06-15-18-49-us-ca-sjc-ce1/pixel7pro,1686856470000,34.640195,-120.589642
71934,2023-06-15-18-49-us-ca-sjc-ce1/pixel7pro,1686856471000,34.640195,-120.589642


In [242]:
pd.read_csv(df[df["split"] == "test"].iloc[0]["device_imu"])

Unnamed: 0,MessageType,utcTimeMillis,elapsedRealtimeNanos,MeasurementX,MeasurementY,MeasurementZ,BiasX,BiasY,BiasZ
0,UncalGyro,1632164592921,,-0.004887,0.000611,-0.000764,0.000000,0.000000,0.00000
1,UncalAccel,1632164592921,,0.088534,9.617353,-1.295713,0.000000,0.000000,0.00000
2,UncalAccel,1632164592931,,0.120838,9.718450,-1.478166,0.000000,0.000000,0.00000
3,UncalMag,1632164592932,,28.893750,2.625000,-67.481250,11.622952,51.149902,-68.71373
4,UncalGyro,1632164592940,,0.004276,0.000153,-0.000611,0.000000,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...
395529,UncalMag,1632166389688,,56.625000,4.031250,-41.512500,11.622952,51.149902,-68.71373
395530,UncalAccel,1632166389694,,-0.005384,9.649655,-1.421337,0.000000,0.000000,0.00000
395531,UncalMag,1632166389698,,57.018750,4.275000,-41.643750,11.622952,51.149902,-68.71373
395532,UncalGyro,1632166389704,,0.009774,-0.000458,-0.001222,0.000000,0.000000,0.00000


In [241]:
pd.read_csv(df[df["split"] == "test"].iloc[0]["device_gnss"])

Unnamed: 0,MessageType,utcTimeMillis,TimeNanos,LeapSecond,TimeUncertaintyNanos,FullBiasNanos,BiasNanos,BiasUncertaintyNanos,DriftNanosPerSecond,DriftUncertaintyNanosPerSecond,...,SvVelocityYEcefMetersPerSecond,SvVelocityZEcefMetersPerSecond,SvClockBiasMeters,SvClockDriftMetersPerSecond,IsrbMeters,IonosphericDelayMeters,TroposphericDelayMeters,WlsPositionXEcefMeters,WlsPositionYEcefMeters,WlsPositionZEcefMeters
0,Raw,1632164592441,1882759758000000,18,,-1314317050683219873,-0.368740,22.748321,-4.009122,9.740156,...,-402.570571,1289.790984,-62163.775486,-0.001779,0.000000,1.564167,2.654405,-2.694416e+06,-4.296518e+06,3.854900e+06
1,Raw,1632164592441,1882759758000000,18,,-1314317050683219873,-0.368740,22.748321,-4.009122,9.740156,...,1996.006028,-686.953098,-35885.790030,-0.000183,0.000000,4.130041,15.063130,-2.694416e+06,-4.296518e+06,3.854900e+06
2,Raw,1632164592441,1882759758000000,18,,-1314317050683219873,-0.368740,22.748321,-4.009122,9.740156,...,-1022.630436,-2873.800076,95961.928724,-0.001519,0.000000,2.160899,3.803842,-2.694416e+06,-4.296518e+06,3.854900e+06
3,Raw,1632164592441,1882759758000000,18,,-1314317050683219873,-0.368740,22.748321,-4.009122,9.740156,...,-382.467600,-342.195334,16492.925319,-0.001556,0.000000,1.818090,3.137681,-2.694416e+06,-4.296518e+06,3.854900e+06
4,Raw,1632164592441,1882759758000000,18,,-1314317050683219873,-0.368740,22.748321,-4.009122,9.740156,...,1463.046760,2708.244277,71595.108446,0.001242,0.000000,3.542701,8.865548,-2.694416e+06,-4.296518e+06,3.854900e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64680,Raw,1632166388441,1884555758000000,18,,-1314317050683225554,-0.376841,28.853065,-3.676322,12.042134,...,-1273.375245,-2602.532889,-58655.026387,-0.001271,-2348.292803,3.301920,3.180648,-2.694431e+06,-4.296528e+06,3.854914e+06
64681,Raw,1632166388441,1884555758000000,18,,-1314317050683225554,-0.376841,28.853065,-3.676322,12.042134,...,319.566239,3044.324356,-696.293994,-0.000610,-2348.292803,3.089078,2.961294,-2.694431e+06,-4.296528e+06,3.854914e+06
64682,Raw,1632166388441,1884555758000000,18,,-1314317050683225554,-0.376841,28.853065,-3.676322,12.042134,...,751.178460,2581.285685,-142917.252427,-0.001250,-2343.734218,2.767264,2.608380,-2.694431e+06,-4.296528e+06,3.854914e+06
64683,Raw,1632166388441,1884555758000000,18,,-1314317050683225554,-0.376841,28.853065,-3.676322,12.042134,...,1386.499799,728.023782,-74301.154322,0.000867,-2343.734218,3.504022,3.392411,-2.694431e+06,-4.296528e+06,3.854914e+06


In [233]:
pd.read_csv(df.iloc[0]["ground_truth"])

Unnamed: 0,MessageType,Provider,LatitudeDegrees,LongitudeDegrees,AltitudeMeters,SpeedMps,AccuracyMeters,BearingDegrees,UnixTimeMillis,SpeedAccuracyMps,BearingAccuracyDegrees,elapsedRealtimeNanos,VerticalAccuracyMeters
0,Fix,GT,37.336727,-122.066896,52.988949,0.001002,0.1,134.55893,1652475421999,,,,
1,Fix,GT,37.336727,-122.066896,52.988949,0.001414,0.1,134.55862,1652475422999,,,,
2,Fix,GT,37.336727,-122.066896,52.988949,0.001996,0.1,134.55855,1652475423999,,,,
3,Fix,GT,37.336727,-122.066896,52.988949,0.003997,0.1,134.55846,1652475424999,,,,
4,Fix,GT,37.336727,-122.066896,52.988949,0.005087,0.1,134.56021,1652475425999,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2159,Fix,GT,37.336606,-122.066737,53.083950,0.001414,0.1,134.01593,1652477580999,,,,
2160,Fix,GT,37.336606,-122.066737,53.083950,0.001414,0.1,134.01694,1652477581999,,,,
2161,Fix,GT,37.336606,-122.066737,53.085948,0.000000,0.1,134.01639,1652477582999,,,,
2162,Fix,GT,37.336606,-122.066737,53.086948,0.000999,0.1,134.01643,1652477583999,,,,


In [213]:
df.describe()

Unnamed: 0,split,drive_id,phone_name,device_gnss,device_imu,ground_truth
count,196,196,196,196,196,196.0
unique,2,105,23,196,196,157.0
top,train,2021-07-19-20-49-us-ca-mtv-a,pixel5,data/sdc2023/train/2022-05-13-20-57-us-ca-mtv-...,data/sdc2023/train/2022-05-13-20-57-us-ca-mtv-...,
freq,156,4,56,1,1,40.0


In [None]:
Model ???
Loss ???

### Data exploration <span id=Data_exploration_></span>

## Metrics <span id=Metrics_></span>

## Model <span id=Model_></span>

## Training <span id=Training_></span>

## Results <span id=Results_></span>