## Compile all the processed data for upstream analysis

In [None]:
# import libraries
import numpy as np
import polars as pl
import polars.selectors as cs


In [2]:
# load the data

surface = np.load('../data/processed/surface_coords_scaled.npy')
dist = np.load('../data/processed/x_y_z_dist_surf.npy')
gc = np.load('../data/processed/x_y_z_bin_gc.npy', allow_pickle=True)
com = np.load('../data/processed/x_y_z_dist_com_dist_rm.npy')



In [12]:
dist_schema = pl.from_numpy(
    dist, 
    schema={
        'x': pl.Float32, 
        'y': pl.Float32, 
        'z': pl.Float32, 
        'dist_surf': pl.Float32
    }
)
gc_schema = pl.from_numpy(
    np.float32(gc[:,-1]), 
    schema={'gc_content': pl.Float32}
)
com_schema = pl.from_numpy(
    com[:, -2:], 
    schema={
        'dist_com': pl.Float32, 
        'dist_rm': pl.Float32
    }
)

combined_df = pl.concat(
    [dist_schema, gc_schema, com_schema], 
    how="horizontal"
)

combined_df = combined_df.with_columns(
    [
        pl.col("dist_surf").round(5),
        pl.col("gc_content").round(5),
        pl.col("dist_com").round(5),
        pl.col("dist_rm").round(5)
    ]
)
combined_df.head

<bound method DataFrame.head of shape: (451, 7)
┌─────────┬────────┬─────────┬───────────┬────────────┬──────────┬─────────┐
│ x       ┆ y      ┆ z       ┆ dist_surf ┆ gc_content ┆ dist_com ┆ dist_rm │
│ ---     ┆ ---    ┆ ---     ┆ ---       ┆ ---        ┆ ---      ┆ ---     │
│ f32     ┆ f32    ┆ f32     ┆ f32       ┆ f32        ┆ f32      ┆ f32     │
╞═════════╪════════╪═════════╪═══════════╪════════════╪══════════╪═════════╡
│ -0.3264 ┆ 0.3988 ┆ -0.265  ┆ 0.09739   ┆ 33.650002  ┆ 0.58143  ┆ 0.23898 │
│ -0.3026 ┆ 0.4367 ┆ -0.2681 ┆ 0.1409    ┆ 43.049999  ┆ 0.59653  ┆ 0.20053 │
│ -0.2171 ┆ 0.5026 ┆ -0.2616 ┆ 0.17768   ┆ 60.5       ┆ 0.60691  ┆ 0.11906 │
│ -0.0999 ┆ 0.6025 ┆ -0.216  ┆ 0.19992   ┆ 54.07      ┆ 0.64595  ┆ 0.12115 │
│ 0.017   ┆ 0.6524 ┆ -0.1841 ┆ 0.18856   ┆ 58.549999  ┆ 0.67488  ┆ 0.22334 │
│ …       ┆ …      ┆ …       ┆ …         ┆ …          ┆ …        ┆ …       │
│ -0.3537 ┆ 0.2988 ┆ 0.4387  ┆ 0.25912   ┆ 39.82      ┆ 0.63502  ┆ 0.13989 │
│ -0.3457 ┆ 0.2375 ┆ 0.4932 

In [21]:
surface_df = pl.from_numpy(
    surface,
    schema = {
        "x": pl.Float32,
        "y": pl.Float32,
        "z": pl.Float32
    }
).with_columns(
    cs.float().round(7)
)

surface_df.head

<bound method DataFrame.head of shape: (8_325, 3)
┌───────────┬───────────┬───────────┐
│ x         ┆ y         ┆ z         │
│ ---       ┆ ---       ┆ ---       │
│ f32       ┆ f32       ┆ f32       │
╞═══════════╪═══════════╪═══════════╡
│ -0.839851 ┆ 0.286359  ┆ -0.546057 │
│ -0.839851 ┆ 0.384291  ┆ -0.546057 │
│ -0.790886 ┆ -0.007435 ┆ 0.629119  │
│ -0.790886 ┆ 0.090497  ┆ 0.629119  │
│ -0.790886 ┆ 0.188428  ┆ -0.546057 │
│ …         ┆ …         ┆ …         │
│ 0.873948  ┆ -0.007435 ┆ -0.448126 │
│ 0.873948  ┆ -0.007435 ┆ -0.39916  │
│ 0.922913  ┆ -0.301229 ┆ -0.692954 │
│ 0.922913  ┆ -0.301229 ┆ -0.643988 │
│ 0.922913  ┆ -0.301229 ┆ -0.546057 │
└───────────┴───────────┴───────────┘>

In [22]:
# creating parquet files

combined_df_output_path = '../data/processed/combined_df.parquet'
surface_df_output_path = '../data/processed/surfacedf.parquet'

combined_df.write_parquet(combined_df_output_path)
surface_df.write_parquet(surface_df_output_path)

