# Running a 1 Dimensional Kalman Filter using std::vector and CUDA

## System model:

| 1D LTI Projectile Motion | Position Graph with time|
| ---------------------- | ---------------------- |
| ![motion](assets/motion_illustration.png) | ![position graph](assets/x_graph.png) |


## Goal: Estimate the value of gravitational acceleration *(g)* from position measurements of a 1D LTI projectile

The Kalman filter maintains a running estimate of the state of a system and refines that estimate at every time step using a prediction model and the newest measurement.

<img src = 'assets/kf_diagram.png'>

In [1]:
#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

### Set up the `KalmanFilter` class and constructor

  Here we create a Kalman filter with the specified matrices:
  
  - A - System dynamics matrix, here we model 1D motion with a *(Position x, Velocity v, Acceleration a)* model
  - C - Output matrix
  - Q - Covariance matrix of the process noise random variable *p(w) ~ N(0, Q)*
  - R - Covariance matrix of the measurement noise random variable *p(v) ~ N(0, R)*
  - P - Estimate error covariance
 

In [2]:
/**
 * Discrete-time linear Kalman filter operating on std::vector based matrices.
 *
 * Typical usage: construct with the model matrices, call one of the init()
 * overloads, then call update() once per measurement.
 */
class KalmanFilter {

public:

  /**
  * Create a Kalman filter with the specified matrices.
  *   dt - Discrete time step
  *   A - System dynamics matrix
  *   C - Output matrix
  *   Q - Process noise covariance
  *   R - Measurement noise covariance
  *   P - Estimate error covariance
  */
  KalmanFilter(
      double dt,
      const std::vector<std::vector<double>>& A,
      const std::vector<std::vector<double>>& C,
      const std::vector<std::vector<double>>& Q,
      const std::vector<std::vector<double>>& R,
      const std::vector<std::vector<double>>& P
  );

  /**
  * Create a blank estimator. Thanks to the default member initializers
  * below, a blank estimator reports itself as uninitialized instead of
  * holding indeterminate values.
  */
  KalmanFilter();

  /**
  * Initialize the filter with initial states as zero.
  */
  void init();

  /**
  * Initialize the filter with a guess for initial states.
  */
  void init(double t0, const std::vector<double>& x0);

  /**
  * Update the estimated state based on measured values. The
  * time step is assumed to remain constant.
  * Returns the first row of the Kalman gain matrix.
  */
  std::vector<double> update(const std::vector<double>& y);

  /**
  * Update the estimated state based on measured values,
  * using the given time step and dynamics matrix.
  */
  void update(const std::vector<double>& y, double dt, const std::vector<std::vector<double>>& A);

  /**
  * Return the current state and time. Both are read-only accessors, so
  * they are callable on a const filter.
  */
  std::vector<double> state() const { return x_hat; }
  double time() const { return t; }

private:

  // Matrices for computation
  std::vector<std::vector<double>> A, C, Q, R, P, K, P0;

  // System dimensions (m outputs, n states)
  int m = 0, n = 0;

  // Initial and current time
  double t0 = 0.0, t = 0.0;

  // Discrete time step
  double dt = 0.0;

  // Is the filter initialized?
  bool initialized = false;

  // n-size identity matrix
  std::vector<std::vector<double>> I;

  // State estimates
  std::vector<double> x_hat, x_hat_new;
};


### Defining CUDA Kernel functions for parallel matrix and vector operations

The algorithm operates in two stages of the ongoing discrete Kalman filter cycle:

 - Discrete Kalman filter time update equations:
 
     $\begin{aligned} & \hat{x}_k^{-}=A \hat{x}_{k-1} \\ & P_k^{-}=A P_{k-1} A^T+Q\end{aligned}$
    
    They project the state and covariance estimates forward from time step *k* – 1 to step *k*.
    
 - Discrete Kalman filter measurement update equations: 

    $\begin{gathered}K_k=P_k^{-} C^T\left(C P_k^{-} C^T+R\right)^{-1} \\ \hat{x}_k=\hat{x}_k^{-}+K_k\left(y_k-C \hat{x}_k^{-}\right) \\ P_k=\left(I-K_k C\right) P_k^{-}\end{gathered}$
    
    The Kalman Gain is computed in this step

This requires us to define various CUDA kernels that handle matrix-matrix, matrix-vector, and vector-vector operations in parallel:

In [3]:
/**
 * Element-wise matrix addition on the device: c = a + b.
 *
 * a, b and c are device pointers to row-major rows x cols matrices; a and b
 * are only read. Expects a 2D launch where x indexes columns and y indexes
 * rows. Threads outside the matrix do nothing, so the grid may overshoot.
 */
__global__ void matAddKernel(const double* a, const double* b, double* c, int rows, int cols) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Discard threads that fall past the matrix edge.
    if (row >= rows || col >= cols) {
        return;
    }

    const int idx = row * cols + col;
    c[idx] = a[idx] + b[idx];
}

In [4]:
/**
 * Element-wise matrix subtraction on the device: c = a - b.
 *
 * a, b and c are device pointers to row-major rows x cols matrices; a and b
 * are only read. Expects a 2D launch where x indexes columns and y indexes
 * rows. Threads outside the matrix do nothing, so the grid may overshoot.
 */
__global__ void matSubKernel(const double* a, const double* b, double* c, int rows, int cols) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Discard threads that fall past the matrix edge.
    if (row >= rows || col >= cols) {
        return;
    }

    const int idx = row * cols + col;
    c[idx] = a[idx] - b[idx];
}

In [5]:
/**
 * Matrix transpose on the device: c = a^T.
 *
 * a is a row-major rows x cols matrix (read only); c receives the row-major
 * cols x rows transpose. Expects a 2D launch where x indexes the columns of
 * a and y indexes its rows. Threads outside the matrix do nothing.
 */
__global__ void matTransposeKernel(const double* a, double* c, int rows, int cols) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Discard threads that fall past the matrix edge.
    if (row >= rows || col >= cols) {
        return;
    }

    // Element (row, col) of the input lands at (col, row) of the output.
    const int idx_in = row * cols + col;
    const int idx_out = col * rows + row;
    c[idx_out] = a[idx_in];
}

In [6]:
/**
 * Matrix-vector product on the device: d_result = d_mat * d_vec.
 *
 * d_mat is a row-major rows x cols matrix, d_vec has cols entries and
 * d_result receives rows entries; the two inputs are only read. Expects a 1D
 * launch with one thread per output row. Surplus threads do nothing.
 */
__global__ void matvecmulKernel(const double* d_mat, const double* d_vec, double* d_result, int rows, int cols) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Discard threads beyond the last row.
    if (tid >= rows) {
        return;
    }

    // Each thread accumulates the dot product of its own row with the vector.
    double sum = 0.0;
    for (int j = 0; j < cols; j++) {
        sum += d_mat[tid * cols + j] * d_vec[j];
    }
    d_result[tid] = sum;
}

In [7]:
/**
 * Matrix-matrix product on the device: d_result = d_a * d_b.
 *
 * d_a is rowsA x colsA, d_b is colsA x colsB and d_result is rowsA x colsB,
 * all row-major; the two inputs are only read. Expects a 2D launch where x
 * indexes output columns and y indexes output rows. Threads outside the
 * output matrix do nothing.
 */
__global__ void matmulKernel(const double* d_a, const double* d_b, double* d_result, int rowsA, int colsA, int colsB) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Discard threads that fall past the output edge.
    if (row >= rowsA || col >= colsB) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    double value = 0.0;
    for (int k = 0; k < colsA; k++) {
        value += d_a[row * colsA + k] * d_b[k * colsB + col];
    }
    d_result[row * colsB + col] = value;
}

In [8]:
/**
 * Element-wise vector subtraction on the device: result = a - b.
 *
 * All pointers refer to device arrays of len doubles. Expects a 1D launch
 * with one thread per element; surplus threads do nothing.
 */
__global__ void vecsubKernel(const double* a, const double* b, double* result, int len) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the vectors have no element to process.
    if (i >= len) {
        return;
    }

    result[i] = a[i] - b[i];
}

### Defining C++ abstraction functions to run CUDA kernels and obtain the results:
This allows us to run calls to all matrix-matrix, matrix-vector and vector-vector CUDA operations without dealing with CUDA memory and syntax in the actual algorithm code

In [9]:
/**
 * Element-wise sum of two equally shaped matrices, computed on the GPU.
 *
 * Flattens both operands to row-major host buffers, copies them to the
 * device, runs matAddKernel and rebuilds a nested vector from the result.
 *
 * @param a  Left operand (rows x cols).
 * @param b  Right operand; must have the same shape as a.
 * @return   a + b. When a holds no elements, an equally shaped empty result
 *           is returned without touching the device.
 * @throws std::runtime_error if the shapes differ, a row is ragged, or any
 *         CUDA call fails.
 */
std::vector<std::vector<double>> mataddCUDA(const std::vector<std::vector<double>>& a, const std::vector<std::vector<double>>& b) {
    const size_t rowCount = a.size();
    const size_t colCount = a.empty() ? 0 : a[0].size();

    if (b.size() != rowCount) {
        throw std::runtime_error("Matrix dims do not match for addition.");
    }

    // Flatten to row-major while verifying every row has the expected width.
    const size_t count = rowCount * colCount;
    std::vector<double> h_a(count), h_b(count), h_c(count);
    for (size_t i = 0; i < rowCount; i++) {
        if (a[i].size() != colCount || b[i].size() != colCount) {
            throw std::runtime_error("Matrix dims do not match for addition.");
        }
        for (size_t j = 0; j < colCount; j++) {
            h_a[i * colCount + j] = a[i][j];
            h_b[i * colCount + j] = b[i][j];
        }
    }

    std::vector<std::vector<double>> result(rowCount, std::vector<double>(colCount));
    if (count == 0) {
        return result;  // nothing to compute; a zero-sized grid cannot be launched
    }

    const int rows = static_cast<int>(rowCount);
    const int cols = static_cast<int>(colCount);
    const size_t bytes = count * sizeof(double);

    // Owns one device allocation so every exit path, including throws, frees it.
    struct DeviceBuffer {
        double* ptr = nullptr;
        ~DeviceBuffer() { cudaFree(ptr); }
    };
    const auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
        }
    };

    DeviceBuffer d_a, d_b, d_c;
    check(cudaMalloc((void**)&d_a.ptr, bytes), "mataddCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_b.ptr, bytes), "mataddCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_c.ptr, bytes), "mataddCUDA: cudaMalloc");

    check(cudaMemcpy(d_a.ptr, h_a.data(), bytes, cudaMemcpyHostToDevice), "mataddCUDA: host-to-device copy");
    check(cudaMemcpy(d_b.ptr, h_b.data(), bytes, cudaMemcpyHostToDevice), "mataddCUDA: host-to-device copy");

    dim3 dimBlock(16, 16);
    dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);
    matAddKernel<<<dimGrid, dimBlock>>>(d_a.ptr, d_b.ptr, d_c.ptr, rows, cols);
    check(cudaGetLastError(), "mataddCUDA: kernel launch");

    // The blocking copy also waits for the kernel and reports execution errors.
    check(cudaMemcpy(h_c.data(), d_c.ptr, bytes, cudaMemcpyDeviceToHost), "mataddCUDA: device-to-host copy");

    for (size_t i = 0; i < rowCount; i++) {
        for (size_t j = 0; j < colCount; j++) {
            result[i][j] = h_c[i * colCount + j];
        }
    }

    return result;
}

In [10]:
/**
 * Element-wise difference of two equally shaped matrices, computed on the GPU.
 *
 * Flattens both operands to row-major host buffers, copies them to the
 * device, runs matSubKernel and rebuilds a nested vector from the result.
 *
 * @param a  Minuend (rows x cols).
 * @param b  Subtrahend; must have the same shape as a.
 * @return   a - b. When a holds no elements, an equally shaped empty result
 *           is returned without touching the device.
 * @throws std::runtime_error if the shapes differ, a row is ragged, or any
 *         CUDA call fails.
 */
std::vector<std::vector<double>> matsubCUDA(const std::vector<std::vector<double>>& a, const std::vector<std::vector<double>>& b) {
    const size_t rowCount = a.size();
    const size_t colCount = a.empty() ? 0 : a[0].size();

    if (b.size() != rowCount) {
        throw std::runtime_error("Matrix dims do not match for subtraction.");
    }

    // Flatten to row-major while verifying every row has the expected width.
    const size_t count = rowCount * colCount;
    std::vector<double> h_a(count), h_b(count), h_c(count);
    for (size_t i = 0; i < rowCount; i++) {
        if (a[i].size() != colCount || b[i].size() != colCount) {
            throw std::runtime_error("Matrix dims do not match for subtraction.");
        }
        for (size_t j = 0; j < colCount; j++) {
            h_a[i * colCount + j] = a[i][j];
            h_b[i * colCount + j] = b[i][j];
        }
    }

    std::vector<std::vector<double>> result(rowCount, std::vector<double>(colCount));
    if (count == 0) {
        return result;  // nothing to compute; a zero-sized grid cannot be launched
    }

    const int rows = static_cast<int>(rowCount);
    const int cols = static_cast<int>(colCount);
    const size_t bytes = count * sizeof(double);

    // Owns one device allocation so every exit path, including throws, frees it.
    struct DeviceBuffer {
        double* ptr = nullptr;
        ~DeviceBuffer() { cudaFree(ptr); }
    };
    const auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
        }
    };

    DeviceBuffer d_a, d_b, d_c;
    check(cudaMalloc((void**)&d_a.ptr, bytes), "matsubCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_b.ptr, bytes), "matsubCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_c.ptr, bytes), "matsubCUDA: cudaMalloc");

    check(cudaMemcpy(d_a.ptr, h_a.data(), bytes, cudaMemcpyHostToDevice), "matsubCUDA: host-to-device copy");
    check(cudaMemcpy(d_b.ptr, h_b.data(), bytes, cudaMemcpyHostToDevice), "matsubCUDA: host-to-device copy");

    dim3 dimBlock(16, 16);
    dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);
    matSubKernel<<<dimGrid, dimBlock>>>(d_a.ptr, d_b.ptr, d_c.ptr, rows, cols);
    check(cudaGetLastError(), "matsubCUDA: kernel launch");

    // The blocking copy also waits for the kernel and reports execution errors.
    check(cudaMemcpy(h_c.data(), d_c.ptr, bytes, cudaMemcpyDeviceToHost), "matsubCUDA: device-to-host copy");

    for (size_t i = 0; i < rowCount; i++) {
        for (size_t j = 0; j < colCount; j++) {
            result[i][j] = h_c[i * colCount + j];
        }
    }

    return result;
}

In [11]:
/**
 * Transpose of a matrix, computed on the GPU.
 *
 * Flattens the operand to a row-major host buffer, copies it to the device,
 * runs matTransposeKernel and rebuilds a cols x rows nested vector.
 *
 * @param a  Matrix to transpose (rows x cols).
 * @return   The cols x rows transpose. When a holds no elements, a
 *           correspondingly shaped empty result is returned without touching
 *           the device.
 * @throws std::runtime_error if a row is ragged or any CUDA call fails.
 */
std::vector<std::vector<double>> mattransposeCUDA(const std::vector<std::vector<double>>& a) {
    const size_t rowCount = a.size();
    const size_t colCount = a.empty() ? 0 : a[0].size();

    // Flatten to row-major while verifying every row has the expected width.
    const size_t count = rowCount * colCount;
    std::vector<double> h_a(count), h_c(count);
    for (size_t i = 0; i < rowCount; i++) {
        if (a[i].size() != colCount) {
            throw std::runtime_error("Matrix rows must all have the same length for transposition.");
        }
        for (size_t j = 0; j < colCount; j++) {
            h_a[i * colCount + j] = a[i][j];
        }
    }

    std::vector<std::vector<double>> result(colCount, std::vector<double>(rowCount));
    if (count == 0) {
        return result;  // nothing to compute; a zero-sized grid cannot be launched
    }

    const int rows = static_cast<int>(rowCount);
    const int cols = static_cast<int>(colCount);
    const size_t bytes = count * sizeof(double);

    // Owns one device allocation so every exit path, including throws, frees it.
    struct DeviceBuffer {
        double* ptr = nullptr;
        ~DeviceBuffer() { cudaFree(ptr); }
    };
    const auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
        }
    };

    DeviceBuffer d_a, d_c;
    check(cudaMalloc((void**)&d_a.ptr, bytes), "mattransposeCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_c.ptr, bytes), "mattransposeCUDA: cudaMalloc");

    check(cudaMemcpy(d_a.ptr, h_a.data(), bytes, cudaMemcpyHostToDevice), "mattransposeCUDA: host-to-device copy");

    dim3 dimBlock(16, 16);
    dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);
    matTransposeKernel<<<dimGrid, dimBlock>>>(d_a.ptr, d_c.ptr, rows, cols);
    check(cudaGetLastError(), "mattransposeCUDA: kernel launch");

    // The blocking copy also waits for the kernel and reports execution errors.
    check(cudaMemcpy(h_c.data(), d_c.ptr, bytes, cudaMemcpyDeviceToHost), "mattransposeCUDA: device-to-host copy");

    // The device buffer is the row-major cols x rows transpose.
    for (size_t i = 0; i < colCount; i++) {
        for (size_t j = 0; j < rowCount; j++) {
            result[i][j] = h_c[i * rowCount + j];
        }
    }

    return result;
}

In [12]:
/**
 * Matrix-vector product a * b, computed on the GPU.
 *
 * Flattens the matrix to a row-major host buffer, copies both operands to
 * the device, runs matvecmulKernel and copies the result back.
 *
 * @param a  Matrix (rows x cols).
 * @param b  Vector with cols entries.
 * @return   Vector with rows entries. An empty matrix yields an empty
 *           vector and a matrix with zero columns yields all zeros, both
 *           without touching the device.
 * @throws std::runtime_error if the dimensions do not match, a row is
 *         ragged, or any CUDA call fails.
 */
std::vector<double> matvecmulCUDA(const std::vector<std::vector<double>>& a, const std::vector<double>& b) {
    const size_t rowCount = a.size();
    if (rowCount == 0) {
        return std::vector<double>();
    }
    const size_t colCount = a[0].size();

    if (colCount != b.size()) {
        throw std::runtime_error("Matrix dims do not match for multiplication.");
    }

    // Flatten to row-major while verifying every row has the expected width.
    std::vector<double> h_mat(rowCount * colCount);
    for (size_t i = 0; i < rowCount; i++) {
        if (a[i].size() != colCount) {
            throw std::runtime_error("Matrix dims do not match for multiplication.");
        }
        for (size_t j = 0; j < colCount; j++) {
            h_mat[i * colCount + j] = a[i][j];
        }
    }

    std::vector<double> result(rowCount, 0.0);
    if (colCount == 0) {
        return result;  // every dot product is an empty sum
    }

    const int rows = static_cast<int>(rowCount);
    const int cols = static_cast<int>(colCount);

    // Owns one device allocation so every exit path, including throws, frees it.
    struct DeviceBuffer {
        double* ptr = nullptr;
        ~DeviceBuffer() { cudaFree(ptr); }
    };
    const auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
        }
    };

    DeviceBuffer d_mat, d_vec, d_result;
    check(cudaMalloc((void**)&d_mat.ptr, rowCount * colCount * sizeof(double)), "matvecmulCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_vec.ptr, colCount * sizeof(double)), "matvecmulCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_result.ptr, rowCount * sizeof(double)), "matvecmulCUDA: cudaMalloc");

    check(cudaMemcpy(d_mat.ptr, h_mat.data(), rowCount * colCount * sizeof(double), cudaMemcpyHostToDevice),
          "matvecmulCUDA: host-to-device copy");
    check(cudaMemcpy(d_vec.ptr, b.data(), colCount * sizeof(double), cudaMemcpyHostToDevice),
          "matvecmulCUDA: host-to-device copy");

    // One thread per output row.
    const int blockSize = 256;
    const int gridSize = (rows + blockSize - 1) / blockSize;

    matvecmulKernel<<<gridSize, blockSize>>>(d_mat.ptr, d_vec.ptr, d_result.ptr, rows, cols);
    check(cudaGetLastError(), "matvecmulCUDA: kernel launch");

    // The blocking copy also waits for the kernel and reports execution errors.
    check(cudaMemcpy(result.data(), d_result.ptr, rowCount * sizeof(double), cudaMemcpyDeviceToHost),
          "matvecmulCUDA: device-to-host copy");

    return result;
}

In [13]:
/**
 * Matrix-matrix product a * b, computed on the GPU.
 *
 * Flattens both operands to row-major host buffers, copies them to the
 * device, runs matmulKernel and rebuilds a rowsA x colsB nested vector.
 *
 * @param a  Left operand (rowsA x colsA).
 * @param b  Right operand (colsA x colsB).
 * @return   The rowsA x colsB product. When the product has no elements,
 *           or the shared dimension is zero, the zero-filled result is
 *           returned without touching the device.
 * @throws std::runtime_error if the inner dimensions do not match, a row is
 *         ragged, or any CUDA call fails.
 */
std::vector<std::vector<double>> matmulCUDA(const std::vector<std::vector<double>>& a, const std::vector<std::vector<double>>& b) {
    const size_t rowCountA = a.size();
    const size_t colCountA = a.empty() ? 0 : a[0].size();
    const size_t rowCountB = b.size();
    const size_t colCountB = b.empty() ? 0 : b[0].size();

    if (colCountA != rowCountB) {
        throw std::runtime_error("Matrix dims do not match for multiplication.");
    }

    // Flatten both operands to row-major while verifying the row widths.
    std::vector<double> h_a(rowCountA * colCountA);
    std::vector<double> h_b(rowCountB * colCountB);

    for (size_t i = 0; i < rowCountA; i++) {
        if (a[i].size() != colCountA) {
            throw std::runtime_error("Matrix dims do not match for multiplication.");
        }
        for (size_t j = 0; j < colCountA; j++) {
            h_a[i * colCountA + j] = a[i][j];
        }
    }

    for (size_t i = 0; i < rowCountB; i++) {
        if (b[i].size() != colCountB) {
            throw std::runtime_error("Matrix dims do not match for multiplication.");
        }
        for (size_t j = 0; j < colCountB; j++) {
            h_b[i * colCountB + j] = b[i][j];
        }
    }

    std::vector<std::vector<double>> result(rowCountA, std::vector<double>(colCountB, 0.0));
    if (rowCountA == 0 || colCountA == 0 || colCountB == 0) {
        return result;  // nothing to compute; a zero-sized grid cannot be launched
    }

    const int rowsA = static_cast<int>(rowCountA);
    const int colsA = static_cast<int>(colCountA);
    const int colsB = static_cast<int>(colCountB);

    // Owns one device allocation so every exit path, including throws, frees it.
    struct DeviceBuffer {
        double* ptr = nullptr;
        ~DeviceBuffer() { cudaFree(ptr); }
    };
    const auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
        }
    };

    DeviceBuffer d_a, d_b, d_result;
    check(cudaMalloc((void**)&d_a.ptr, h_a.size() * sizeof(double)), "matmulCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_b.ptr, h_b.size() * sizeof(double)), "matmulCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_result.ptr, rowCountA * colCountB * sizeof(double)), "matmulCUDA: cudaMalloc");

    check(cudaMemcpy(d_a.ptr, h_a.data(), h_a.size() * sizeof(double), cudaMemcpyHostToDevice),
          "matmulCUDA: host-to-device copy");
    check(cudaMemcpy(d_b.ptr, h_b.data(), h_b.size() * sizeof(double), cudaMemcpyHostToDevice),
          "matmulCUDA: host-to-device copy");

    dim3 blockSize(16, 16);
    dim3 gridSize((colsB + blockSize.x - 1) / blockSize.x, (rowsA + blockSize.y - 1) / blockSize.y);

    matmulKernel<<<gridSize, blockSize>>>(d_a.ptr, d_b.ptr, d_result.ptr, rowsA, colsA, colsB);
    check(cudaGetLastError(), "matmulCUDA: kernel launch");

    // The blocking copy also waits for the kernel and reports execution errors.
    std::vector<double> h_result(rowCountA * colCountB);
    check(cudaMemcpy(h_result.data(), d_result.ptr, h_result.size() * sizeof(double), cudaMemcpyDeviceToHost),
          "matmulCUDA: device-to-host copy");

    for (size_t i = 0; i < rowCountA; i++) {
        for (size_t j = 0; j < colCountB; j++) {
            result[i][j] = h_result[i * colCountB + j];
        }
    }

    return result;
}

In [14]:
/**
 * Element-wise difference of two equally long vectors, computed on the GPU.
 *
 * Copies both operands to the device, runs vecsubKernel and copies the
 * result back.
 *
 * @param a  Minuend.
 * @param b  Subtrahend; must have the same length as a.
 * @return   a - b. Empty inputs yield an empty vector without touching the
 *           device.
 * @throws std::runtime_error if the lengths differ or any CUDA call fails.
 */
std::vector<double> vecsubCUDA(const std::vector<double>& a, const std::vector<double>& b) {
    // A shorter b would otherwise be over-read by the host-to-device copy.
    if (a.size() != b.size()) {
        throw std::runtime_error("Vector dims do not match for subtraction.");
    }

    std::vector<double> result(a.size());
    if (a.empty()) {
        return result;  // nothing to compute; a zero-sized grid cannot be launched
    }

    const int len = static_cast<int>(a.size());
    const size_t bytes = a.size() * sizeof(double);

    // Owns one device allocation so every exit path, including throws, frees it.
    struct DeviceBuffer {
        double* ptr = nullptr;
        ~DeviceBuffer() { cudaFree(ptr); }
    };
    const auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
        }
    };

    DeviceBuffer d_a, d_b, d_result;
    check(cudaMalloc((void**)&d_a.ptr, bytes), "vecsubCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_b.ptr, bytes), "vecsubCUDA: cudaMalloc");
    check(cudaMalloc((void**)&d_result.ptr, bytes), "vecsubCUDA: cudaMalloc");

    check(cudaMemcpy(d_a.ptr, a.data(), bytes, cudaMemcpyHostToDevice), "vecsubCUDA: host-to-device copy");
    check(cudaMemcpy(d_b.ptr, b.data(), bytes, cudaMemcpyHostToDevice), "vecsubCUDA: host-to-device copy");

    // One thread per element.
    const int blockSize = 256;
    const int gridSize = (len + blockSize - 1) / blockSize;

    vecsubKernel<<<gridSize, blockSize>>>(d_a.ptr, d_b.ptr, d_result.ptr, len);
    check(cudaGetLastError(), "vecsubCUDA: kernel launch");

    // The blocking copy also waits for the kernel and reports execution errors.
    check(cudaMemcpy(result.data(), d_result.ptr, bytes, cudaMemcpyDeviceToHost), "vecsubCUDA: device-to-host copy");

    return result;
}


### Function to obtain the inverse of a 1x1, 2x2 or 3x3 matrix using the adjugate:

In [15]:
/**
 * Invert a small square matrix in closed form (adjugate divided by the
 * determinant).
 *
 * @param a  Square matrix of size 1x1, 2x2 or 3x3.
 * @return   The inverse of a.
 * @throws std::runtime_error if a is empty, not square, singular
 *         (determinant exactly zero) or larger than 3x3.
 */
std::vector<std::vector<double>> matinverse(const std::vector<std::vector<double>>& a) {
    const size_t n = a.size();

    // Without this guard the shape checks below would index a[0] of an empty matrix.
    if (n == 0) {
        throw std::runtime_error("Cannot invert an empty matrix.");
    }

    // Every row must have n entries, not just the first one.
    for (const std::vector<double>& row : a) {
        if (row.size() != n) {
            std::cout<<" Shape of a : "<<a.size()<<","<<row.size()<<"\n";
            throw std::runtime_error("Only square matrices are supported for inversion.");
        }
    }

    // Handle 1x1 matrix
    if (n == 1) {
        if (a[0][0] == 0) {
            throw std::runtime_error("singular matrix");
        }
        return {{1.0 / a[0][0]}};
    }

    if (n == 2) {
        const double determinant = a[0][0] * a[1][1] - a[0][1] * a[1][0];
        if (determinant == 0) {
            throw std::runtime_error("singular matrix");
        }

        std::vector<std::vector<double>> result(2, std::vector<double>(2));
        result[0][0] = a[1][1] / determinant;
        result[0][1] = -a[0][1] / determinant;
        result[1][0] = -a[1][0] / determinant;
        result[1][1] = a[0][0] / determinant;

        return result;
    }

    if (n == 3) {
        // Cofactor expansion along the first row.
        const double determinant = a[0][0]*(a[1][1]*a[2][2]-a[2][1]*a[1][2])
                                   - a[0][1]*(a[1][0]*a[2][2]-a[1][2]*a[2][0])
                                   + a[0][2]*(a[1][0]*a[2][1]-a[1][1]*a[2][0]);

        if (determinant == 0) {
            throw std::runtime_error("singular matrix");
        }

        std::vector<std::vector<double>> result(3, std::vector<double>(3));

        // result[i][j] is the cofactor of a[j][i] divided by the determinant.
        result[0][0] = (a[1][1] * a[2][2] - a[2][1] * a[1][2]) / determinant;
        result[0][1] = (a[0][2] * a[2][1] - a[0][1] * a[2][2]) / determinant;
        result[0][2] = (a[0][1] * a[1][2] - a[0][2] * a[1][1]) / determinant;
        result[1][0] = (a[1][2] * a[2][0] - a[1][0] * a[2][2]) / determinant;
        result[1][1] = (a[0][0] * a[2][2] - a[0][2] * a[2][0]) / determinant;
        result[1][2] = (a[1][0] * a[0][2] - a[0][0] * a[1][2]) / determinant;
        result[2][0] = (a[1][0] * a[2][1] - a[2][0] * a[1][1]) / determinant;
        result[2][1] = (a[2][0] * a[0][1] - a[0][0] * a[2][1]) / determinant;
        result[2][2] = (a[0][0] * a[1][1] - a[1][0] * a[0][1]) / determinant;

        return result;
    }

    throw std::runtime_error("Only 1x1, 2x2 and 3x3 matrices supported for inversion");
}


### Overloaded functions to print both matrices and vectors

In [16]:
/**
 * Print a matrix to std::cout, one row per line.
 *
 * Every entry is followed by a single space and every row ends with
 * std::endl, so the stream is flushed once per row.
 */
void printMatrix(const std::vector<std::vector<double>>& matrix) {
    for (const std::vector<double>& row : matrix) {
        for (const double& entry : row) {
            std::cout << entry << " ";
        }
        std::cout << std::endl;
    }
}

In [17]:
/**
 * Print a vector to std::cout on a single line.
 *
 * Every entry is followed by a single space; the line ends with std::endl,
 * so an empty vector prints just a newline.
 */
void printMatrix(const std::vector<double>& vec) {
    for (const double& entry : vec) {
        std::cout << entry << " ";
    }
    std::cout << std::endl;
}


### Test for matrix-matrix multiplication using CUDA

In [18]:
// 3x2 left operand for the multiplication smoke test.
std::vector<std::vector<double>> matrixA = {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}};

// 2x4 right operand.
std::vector<std::vector<double>> matrixB = {{1.0, 2.0, 3.0, 4.0}, {5.0, 6.0, 7.0, 8.0}};

// The 3x4 product is computed on the GPU.
std::vector<std::vector<double>> result = matmulCUDA(matrixA, matrixB);

In [19]:
// Show the 3x4 product of matrixA and matrixB computed in the previous cell.
std::cout << "Result matrix:" << std::endl;
printMatrix(result);

Result matrix:
11 14 17 20 
23 30 37 44 
35 46 57 68 


## Constructor to initialise an object of the Kalman Filter class

In [20]:
/**
 * Build a filter from the model matrices.
 *
 * The state dimension n is taken from the number of rows of A and the output
 * dimension m from the number of rows of C. P is stored as the initial
 * covariance P0 and only copied into the working covariance by init().
 * The filter stays uninitialized until one of the init() overloads runs.
 */
KalmanFilter::KalmanFilter(
    double dt,
    const std::vector<std::vector<double>>& A,
    const std::vector<std::vector<double>>& C,
    const std::vector<std::vector<double>>& Q,
    const std::vector<std::vector<double>>& R,
    const std::vector<std::vector<double>>& P)
  : A(A), C(C), Q(Q), R(R), P0(P),
    m(static_cast<int>(C.size())), n(static_cast<int>(A.size())),
    t0(0.0), t(0.0), dt(dt), initialized(false),
    I(n, std::vector<double>(n)), x_hat(n), x_hat_new(n)
{
    // I starts out all zeros; set the diagonal to get the identity matrix.
    for (int i = 0; i < n; i++) {
        I[i][i] = 1.0;
    }
}

/**
 * Build a blank estimator.
 *
 * Every scalar member gets a defined value so that update() on a blank
 * filter reliably reports "not initialized" instead of reading an
 * indeterminate flag.
 */
KalmanFilter::KalmanFilter()
  : m(0), n(0), t0(0.0), t(0.0), dt(0.0), initialized(false) {}

### Describe the `KalmanFilter::init` function to initialize the filter with initial states as zero.

In [21]:
/**
 * Start the filter from an all-zero state at time zero.
 *
 * Restores the working covariance from P0 and marks the filter as ready for
 * update().
 */
void KalmanFilter::init() {
    // Zero every state component while keeping the state dimension.
    x_hat.assign(x_hat.size(), 0.0);

    t0 = 0;
    t = t0;
    P = P0;

    initialized = true;
}

### Describe the `KalmanFilter::init` function to initialize the filter with a guess for initial states.

In [22]:
/**
 * Start the filter from a caller-supplied state guess.
 *
 * @param t0  Start time; also becomes the current time.
 * @param x0  Initial state estimate, copied into x_hat.
 *
 * Restores the working covariance from P0 and marks the filter as ready for
 * update().
 */
void KalmanFilter::init(double t0, const std::vector<double>& x0) {
    // The parameter t0 shadows the member, hence the explicit this->.
    this->t0 = t0;
    this->t = t0;

    this->x_hat = x0;
    this->P = this->P0;

    this->initialized = true;
}

### Defining a function that executes the two stages of the ongoing discrete Kalman filter cycle

In [23]:
std::vector<double> KalmanFilter::update(const std::vector<double>& y) {
    if (!initialized)
        throw std::runtime_error("Filter is not initialized!");
    
    // Discrete Kalman filter time update 
    x_hat_new = matvecmulCUDA(A, x_hat);
    P = mataddCUDA(matmulCUDA(matmulCUDA(A, P), mattransposeCUDA(A)), Q);
    
    // Discrete Kalman filter measurement update
    std::vector<std::vector<double>> inv = matinverse(mataddCUDA(matmulCUDA(matmulCUDA(C, P), mattransposeCUDA(C)), R));
    K = matmulCUDA(matmulCUDA(P, mattransposeCUDA(C)), inv);
    std::vector<double> temp = matvecmulCUDA(C, x_hat_new);
    std::vector<double> difference = vecsubCUDA(y, temp);
    std::vector<double> gain = K[0];
    for (size_t i = 0; i < x_hat_new.size(); i++) {
        x_hat_new[i] += matvecmulCUDA(K, difference)[i];
    }

    P = matmulCUDA(matsubCUDA(I, matmulCUDA(K, C)), P);

    x_hat = x_hat_new;
    t += dt;
    
    return gain;
}


### Update the estimated state based on measured values, using the given time step and dynamics matrix:

In [24]:
/**
 * Run one filter cycle after replacing the time step and dynamics matrix.
 *
 * @param y   Measurement vector forwarded to update(y).
 * @param dt  New discrete time step, stored in the filter.
 * @param A   New system dynamics matrix, stored in the filter.
 *
 * The gain returned by update(y) is discarded.
 */
void KalmanFilter::update(const std::vector<double>& y, double dt, const std::vector<std::vector<double>>& A) {
    // The parameters shadow the members of the same name.
    this->dt = dt;
    this->A = A;

    (void)this->update(y);
}


In [25]:
// Position measurements read by run_kf; filled from the Python side via setData.
std::vector<double> measurements;

### The cross language support allows us to read the measurements from the python side and initialise the std::vector to be passed to the Kalman Filter

#### setData here receives the Python list as a `cppyy.gbl.std.vector` and copies it into `std::vector<double> measurements`

In [26]:
/**
 * Replace the global measurements with a copy of input.
 */
void setData(const std::vector<double>& input) {
    // Copy first, then swap in, so passing measurements itself stays safe.
    std::vector<double> replacement(input);
    measurements.swap(replacement);
}

In [27]:
!pip install matplotlib
!pip install PyYaml



In [28]:
%%python

import yaml
import cppyy

# Parse the YAML file, then release the file handle before converting.
with open('data/measurements.yml', 'r') as file:
    data_dict = yaml.safe_load(file)

# Entries under the 'data' key are coerced to float so they fit a vector<double>.
data_list = [float(x) for x in data_dict['data']]

measurements_vector = cppyy.gbl.std.vector['double'](data_list)

### Let's verify the data integrity by plotting a position vs. time graph

In [29]:
%%python

import matplotlib.pyplot as plt

# Sampling interval between consecutive measurements.
dt = 1/30
    
# Generating time points for the x-axis based on dt
time_points = [i*dt for i in range(len(data_list))]

# Plotting
plt.plot(time_points, data_list, '-o', label='Position')
plt.xlabel('Time (s)')
plt.ylabel('Position (x)')
plt.title('Position vs. Time')
plt.legend()
plt.grid(True)
# Save before show(): the Markdown cell below embeds the saved PNG.
plt.savefig("position_graph.png")
plt.show()

<img src="position_graph.png">

### Now we pass it to the C++ side by calling `setData`

In [30]:
%%python

# Hand the measurements to C++; setData copies them into the global `measurements`.
cppyy.gbl.setData(measurements_vector)

### Putting it all together:
We define the `run_kf` function that initialises the following:
- Number of states
- Dimensions of each measurement
- Time steps used $\begin{aligned} & {dt}\end{aligned}$
- Covariance matrices
- Initial state guess $\begin{aligned} & \hat{x}_0\end{aligned}$, which uses the first measurement as the initial position

The measurements are then fed into the filter one at a time, and the filter outputs the estimated states.

The algorithm then runs by looping over the measurements:

$\begin{gathered} & for\:y\:in\:measurements\::\end{gathered}$
$\begin{gathered} & kf.update(y)\end{gathered}$

In [31]:
std::vector<std::vector<double>> run_kf(bool verbose) {
    
    int n = 3; 
    int m = 1; 

    double dt = 1.0 / 30; // Time step
    
    std::vector<double> g_preds;
    
    std::vector<std::vector<double>> A(n, std::vector<double>(n));
    std::vector<std::vector<double>> C(m, std::vector<double>(n));
    std::vector<std::vector<double>> Q(n, std::vector<double>(n));
    std::vector<std::vector<double>> R(m, std::vector<double>(m));
    std::vector<std::vector<double>> P(n, std::vector<double>(n));
    
    A = {{1, dt, 0}, {0, 1, dt}, {0, 0, 1}};
    C = {{1, 0, 0}};
    Q = {{.05, .05, .0}, {.05, .05, .0}, {.0, .0, .0}};
    R = {{5}};
    P = {{.1, .1, .1}, {.1, 10000, 10}, {.1, 10, 100}};
    
    KalmanFilter kf(dt, A, C, Q, R, P);
    
    std::vector<double> x0 = {measurements[0], 0, -15};
    std::vector<std::vector<double>> gain;
    kf.init(0, x0);

    
    std::vector<double> y(m);
    if(verbose) {
        std::cout << "t = " << 0 << ", " << "x_hat[0]: ";
        for (auto& val : kf.state()){
            std::cout << val << " ";
        }
        std::cout << std::endl;
    }
    
    int i;
    for (i = 0; i < measurements.size(); i++) {
        y[0] = measurements[i];
        gain.push_back(kf.update(y));
        if(verbose) {
            std::cout << "t = " << (i + 1) * dt << ", y[" << i << "] = " << y[0] << ", x_hat[" << i << "] = ";
            for (auto& val : kf.state()) {
                std::cout << val << " ";
            }
            g_preds.push_back(kf.state()[2]);
            std::cout << std::endl;
        }
    }
    std::cout << std::endl;
    std::cout<<"Exec Success, Final kf states:";
    for (auto& val : kf.state()) std::cout << val << " ";
    std::cout << std::endl;

    std::vector<std::vector<double>> g_res;
    for (size_t i = 0; i < g_preds.size(); ++i) {
        std::vector<double> pair = {g_preds[i], gain[i][0]};
        g_res.push_back(pair);
    }
    
    return g_res;
}

In [32]:
// Run the filter over the loaded measurements with per-step printing enabled.
std::vector<std::vector<double>> g_res = run_kf(true);

t = 0, x_hat[0]: 1.04203 0 -15 
t = 0.0333333, y[0] = 1.04203, x_hat[0] = 1.04203 -0.5 -15 
t = 0.0666667, y[1] = 1.10727, x_hat[1] = 1.08556 -0.0966619 -14.9988 
t = 0.1, y[2] = 1.29135, x_hat[2] = 1.21317 0.720024 -14.9952 
t = 0.133333, y[3] = 1.48485, x_hat[3] = 1.36865 1.21707 -14.9881 
t = 0.166667, y[4] = 1.72826, x_hat[4] = 1.55548 1.60875 -14.9732 
t = 0.2, y[5] = 1.74216, x_hat[5] = 1.66278 1.38374 -14.9637 
t = 0.233333, y[6] = 2.11672, x_hat[6] = 1.85606 1.53382 -14.9229 
t = 0.266667, y[7] = 2.14529, x_hat[7] = 1.98512 1.34018 -14.8908 
t = 0.3, y[8] = 2.1603, x_hat[8] = 2.06901 0.981678 -14.8682 
t = 0.333333, y[9] = 2.21269, x_hat[9] = 2.13267 0.58593 -14.8441 
t = 0.366667, y[10] = 2.57709, x_hat[10] = 2.26319 0.425535 -14.7321 
t = 0.4, y[11] = 2.66822, x_hat[11] = 2.37389 0.210228 -14.6096 
t = 0.433333, y[12] = 2.51642, x_hat[12] = 2.41279 -0.189016 -14.56 
t = 0.466667, y[13] = 2.76034, x_hat[13] = 2.4865 -0.459522 -14.4115 
t = 0.5, y[14] = 2.88132, x_hat[14] = 2.5

### Defining a C++ function to resolve the 2D vector result *{Kalman Filter Estimates, Kalman Gain}*

- The function returns the KF estimates if the provided axis is 0 and Kalman Gains if axis = 1

In [33]:
/**
 * Extract one column of a 2D vector.
 *
 * @param res   Rows to read from; taken by const reference to avoid copying.
 * @param axis  Column index to extract from every row.
 * @return      Vector holding res[i][axis] for every row i, in order.
 * @throws std::out_of_range if axis is negative or not a valid index for
 *         some row.
 */
std::vector<double> ret_1d_vector(const std::vector<std::vector<double>>& res, int axis) {
    std::vector<double> ret;
    ret.reserve(res.size());
    for (const std::vector<double>& row : res) {
        // at() bounds-checks; a negative axis wraps to a huge index and is rejected too.
        ret.push_back(row.at(static_cast<size_t>(axis)));
    }
    return ret;
}
    

#### We obtain the final results as two std::vectors that can be accessed in Python:

In [34]:
// Column 0 of g_res: acceleration estimates. Column 1: Kalman gains.
// Both are read from the Python cells below through cppyy.gbl.
std::vector<double> py_g_pred = ret_1d_vector(g_res, 0);
std::vector<double> py_kf_gains = ret_1d_vector(g_res, 1);

### Let's plot the trend of the Kalman gain across the time steps to verify that the filter is working

If the algorithm converged properly, we should see an exponential decrease in the gain  with time

In [35]:
%%python

import matplotlib.pyplot as plt

# Copy the C++ gain vector into a plain Python list, one entry per time step.
kalman_gains = [gain for gain in cppyy.gbl.py_kf_gains]
x = range(len(kalman_gains))

In [36]:
%%python

# Plot the Kalman gain against the time-step index.
plt.figure()
plt.plot(x, kalman_gains, color='blue', marker='v')
    
plt.xlabel('Time Steps')
plt.ylabel('Kalman Gain')
plt.title('Kalman Gain Plot')
# NOTE(review): the figure is saved before the y-scale is switched to symlog,
# so the saved PNG uses a linear axis while the shown figure uses symlog.
# Confirm this difference is intended.
plt.savefig("kalman_gain_plot.png")
    
plt.yscale('symlog')
    
plt.show()


<img src="kalman_gain_plot.png">

### Now we can plot the values of Gravitational Acceleration *(g)* obtained as Kalman Estimates :

In [37]:
%%python

# Reference value drawn as a horizontal line in the plot below.
true_val = 9.81

# The filter's acceleration estimates are negated before plotting against
# the positive reference value.
g_pred = [-estimate for estimate in cppyy.gbl.py_g_pred]

x = range(len(g_pred))

In [38]:
%%python

# Plot the negated acceleration estimates against the reference value.
plt.figure()
plt.axhline(y=true_val, color='green', linestyle='-')
plt.plot(x, g_pred, color='orange', marker='o', label='KF Estimates')
# Label the reference line with its numeric value at the left edge.
plt.annotate(f'{true_val}', xy=(-0.5, true_val), color='green',
             verticalalignment='center', horizontalalignment = 'left')
    
plt.xlabel('Index')
# Acceleration is in metres per second squared: superscript 2, not subscript.
plt.ylabel('Acceleration (m/s²)')
plt.title('True Value vs. g_pred')
plt.legend()
# NOTE(review): the figure is saved before the y-scale is switched to symlog,
# so the saved PNG uses a linear axis while the shown figure uses symlog.
# Confirm this difference is intended.
plt.savefig("1D_KF_plot.png")
    
plt.yscale('symlog')
    
plt.show()


### Result : The Kalman Filter begins to converge on the value of 9.81

<img src="1D_KF_plot.png">