In [12]:
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np

In [9]:
# Sample structures taken from https://gsdb.mu.hekademeia.org/details.php?id=GM12878;
# the actual download URL below was copied from the viewer UI.
base_url = "https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/"

# One chr<N>.pdb URL for each N in 1..22.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace; naming a top-level loop variable `chr` would shadow the builtin
# chr() for every later cell.
urls = [f'{base_url}chr{chrom}.pdb' for chrom in range(1, 23)]

urls

['https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/chr1.pdb',
 'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/chr2.pdb',
 'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/chr3.pdb',
 'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/chr4.pdb',
 'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/chr5.pdb',
 'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/chr6.pdb',
 'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/chr7.pdb',
 'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/chr8.pdb',
 'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/chr9.pdb',
 'https://calla.rnet.missouri.edu/genome3d/GSDB/Database/OO7429SF/GM12878/KR_100kb/LorDG/chr10.pdb',

In [13]:
# Download sample files: fetch every URL in `urls` and keep the decoded text
import urllib.request

files_content = []
for url in urls:
    # read the full response body while the connection is open
    with urllib.request.urlopen(url) as f:
        payload = f.read()
    # the body is decoded as UTF-8 text before being stored
    res = payload.decode('utf-8')
    files_content.append(res)

len(files_content)

22

In [16]:
def parse_pdb(file_content):
    """Extract the 3D coordinates of every ATOM record from PDB-style text.

    Each line that starts with "ATOM" is split on whitespace and its 6th, 7th
    and 8th tokens are read as the x, y and z coordinates. All other lines are
    ignored.

    NOTE(review): these token positions presumably match the layout of the
    downloaded structure files; a strict fixed-width PDB file may tokenize
    differently -- confirm before reusing on other sources.

    Parameters
    ----------
    file_content : str
        Full text of one structure file.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (n_atoms, 3), one row per ATOM line in file
        order; shape (0, 3) when the text contains no ATOM line.

    Raises
    ------
    ValueError
        If a coordinate token is not a valid number.
    IndexError
        If an ATOM line has fewer than eight whitespace-separated tokens.
    """
    coords = []

    for line in file_content.splitlines():
        if not line.startswith("ATOM"):
            continue
        columns = line.split()
        # float() instead of eval(): the text is downloaded from a remote
        # server, and eval() would execute any Python expression embedded in
        # it. float() also keeps the dtype independent of how a number is
        # spelled ("1" vs "1.0").
        coords.append([float(columns[5]), float(columns[6]), float(columns[7])])

    # reshape keeps the result two-dimensional even when no ATOM line was found
    return np.array(coords, dtype=float).reshape(-1, 3)

# Convert each downloaded structure into a Parquet file and an Arrow IPC file
for i, file in enumerate(files_content, start=1):
    pdb = parse_pdb(file)

    # pull the three coordinate columns out of the parsed rows
    x_column = [row[0] for row in pdb]
    y_column = [row[1] for row in pdb]
    z_column = [row[2] for row in pdb]

    # one Arrow array per axis
    x_array, y_array, z_array = pa.array(x_column), pa.array(y_column), pa.array(z_column)
    arrays = [x_array, y_array, z_array]

    # the schema reuses whatever types Arrow inferred for the three arrays
    schema = pa.schema([
        pa.field(name, arr.type) for name, arr in zip(["x", "y", "z"], arrays)
    ])

    # Parquet output (written with compression=None)
    chr_table = pa.Table.from_arrays(arrays, names=["x", "y", "z"])
    pq.write_table(chr_table, f'chr{i}.parquet', compression=None)

    # Arrow IPC file output: a single record batch holding all rows
    with pa.OSFile(f'chr{i}.arrow', 'wb') as sink, pa.ipc.new_file(sink, schema=schema) as writer:
        batch = pa.record_batch(arrays, schema=schema)
        writer.write(batch)
