In [1]:
from sqlalchemy import create_engine, Column, String, Integer, func, event, text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.postgresql import ARRAY
from sqlalchemy.orm import sessionmaker
from geoalchemy2 import Geometry 
from tqdm import tqdm
from shapely.wkt import dumps

import orjson

In [2]:
import ray

In [3]:
# Start a local Ray runtime for this notebook.
# ignore_reinit_error=True keeps the cell safe to re-run: without it, calling
# ray.init() a second time in the same kernel raises a RuntimeError because
# Ray is already initialised.
ray.init(ignore_reinit_error=True)

2024-11-05 18:42:47,037	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.10.12
Ray version:,2.37.0
Dashboard:,http://127.0.0.1:8265




In [4]:
# Show the total resources (CPU, GPU, memory, ...) that Ray reports for this cluster.
ray.cluster_resources()

{'accelerator_type:G': 1.0,
 'node:__internal_head__': 1.0,
 'CPU': 16.0,
 'object_store_memory': 7961106432.0,
 'memory': 15922212864.0,
 'node:172.17.185.211': 1.0,
 'GPU': 1.0}

In [5]:
%%time
with open('13_266069_040_003 L02 PAS.json', 'r') as file:
#with open('/mnt/c/research/kidney/15_26609_024_045 L03 PAS.json', 'r') as file:
    # Load the JSON data into a Python dictionary
    data = orjson.loads(file.read())

import shapely
from shapely.geometry import shape

CPU times: user 12.1 s, sys: 2.77 s, total: 14.9 s
Wall time: 16.4 s


In [6]:
# Peek at the first record to see what a single feature looks like.
data[0]

{'type': 'Feature',
 'geometry': {'type': 'Polygon',
  'coordinates': [[[45837, 20092],
    [45836, 20093],
    [45835, 20094],
    [45834, 20095],
    [45833, 20096],
    [45832, 20096],
    [45831, 20096],
    [45830, 20096],
    [45829, 20096],
    [45828, 20096],
    [45827, 20096],
    [45826, 20096],
    [45825, 20096],
    [45824, 20096],
    [45823, 20096],
    [45822, 20096],
    [45821, 20096],
    [45820, 20097],
    [45819, 20098],
    [45818, 20099],
    [45817, 20100],
    [45816, 20100],
    [45815, 20100],
    [45814, 20100],
    [45813, 20100],
    [45812, 20101],
    [45811, 20102],
    [45810, 20103],
    [45809, 20104],
    [45808, 20104],
    [45807, 20104],
    [45806, 20104],
    [45805, 20104],
    [45804, 20105],
    [45803, 20106],
    [45802, 20107],
    [45801, 20108],
    [45800, 20109],
    [45799, 20110],
    [45798, 20111],
    [45797, 20112],
    [45796, 20113],
    [45795, 20114],
    [45794, 20115],
    [45793, 20116],
    [45792, 20117],
    [45791, 

In [7]:
# Create a base class for our declarative mapping
# NOTE(review): `declarative_base` is imported from `sqlalchemy.ext.declarative`
# at the top of this notebook; newer SQLAlchemy releases expose it from
# `sqlalchemy.orm` instead — confirm against the installed version.
Base = declarative_base()

# Define your SQLAlchemy model
class GeometryModel(Base):
    """ORM mapping for the `geometries` table: a named polygon per row."""
    __tablename__ = 'geometries'
    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Text label attached to the polygon.
    name = Column(String)
    # Polygon geometry, stored through GeoAlchemy2's Geometry column type.
    geom = Column(Geometry('POLYGON'))

  Base = declarative_base()


In [8]:
from sqlalchemy_utils import database_exists, create_database
engine = create_engine('postgresql://postgres@localhost:5333/testpara')#,echo=True)

print(engine.url)
# Create the target database only when it is missing, so re-running this cell
# is a no-op rather than an error. Exception (not a bare except) is caught so
# that KeyboardInterrupt still stops the cell, and the reason for a failure is
# printed instead of being discarded.
try:
    if database_exists(engine.url):
        print("already exists")
    else:
        create_database(engine.url)
        print("created")
except Exception as exc:
    print(f"errored: {exc!r}")

postgresql://postgres@localhost:5333/testpara
created


In [9]:
# Enable the PostGIS extension whenever the engine opens a new DBAPI connection
def connect(dbapi_connection, connection_record):
    """Engine "connect" hook: run CREATE EXTENSION IF NOT EXISTS for postgis."""
    with dbapi_connection.cursor() as cur:
        cur.execute('CREATE EXTENSION IF NOT EXISTS postgis;')

# Register the hook for the engine's "connect" event.
event.listen(engine, "connect", connect)

In [10]:
# Create the table
# Issues the DDL for every model registered on Base.metadata (here the
# `geometries` table) using the engine defined above.
Base.metadata.create_all(engine)

In [11]:
from tqdm import tqdm

In [12]:
@ray.remote
def bulk_insert(geojsons, db_url='postgresql://postgres@localhost:5333/testpara'):
    """Insert one batch of GeoJSON features into the ``geometries`` table.

    Runs as a Ray task. Every call creates its own engine and disposes it
    before returning, so no connection outlives the task.

    Args:
        geojsons: Batch of GeoJSON Feature dicts. Each one must provide
            ``["properties"]["classification"]["name"]`` and a ``"geometry"``
            mapping that ``shapely.geometry.shape`` can parse.
        db_url: SQLAlchemy URL of the target database. Defaults to the
            database used throughout this notebook.

    Returns:
        int: Number of rows committed. This is the batch size on success and
        ``0`` when the batch was empty or failed. A failure is printed rather
        than raised, so one bad batch does not abort the whole run.
    """
    # Import everything the task needs locally so it is self-contained on the
    # worker and does not rely on names captured from the notebook's globals.
    from sqlalchemy import create_engine, Column, String, Integer, event
    from sqlalchemy.orm import declarative_base, sessionmaker
    from geoalchemy2 import Geometry
    from shapely.geometry import shape

    # Nothing to insert: skip creating an engine and opening a connection.
    if not geojsons:
        return 0

    engine = create_engine(db_url)#,echo=True)

    # Make sure the PostGIS extension is enabled on each new connection
    @event.listens_for(engine, "connect")
    def connect(dbapi_connection, connection_record):
        with dbapi_connection.cursor() as cursor:
            cursor.execute('CREATE EXTENSION IF NOT EXISTS postgis;')

    try:
        # Fresh declarative base per call: the mapping is rebuilt inside the
        # task instead of being shipped from the driver.
        Base = declarative_base()

        class GeometryModel(Base):
            __tablename__ = 'geometries'
            id = Column(Integer, primary_key=True)
            name = Column(String)
            geom = Column(Geometry('POLYGON'))

        # Convert each feature to a row: classification name + WKT geometry.
        polygons = [
            GeometryModel(
                name=geojson["properties"]["classification"]["name"],
                geom=shape(geojson["geometry"]).wkt,
            )
            for geojson in geojsons
        ]

        Session = sessionmaker(bind=engine)
        with Session() as session:
            session.bulk_save_objects(polygons)
            # Single commit for the whole batch: it is stored all-or-nothing.
            session.commit()
        return len(polygons)

    except Exception as inst:
        # Report the failure and signal it to the caller through the return
        # value instead of silently returning as if the insert had worked.
        print(f"bulk_insert: batch failed, nothing committed: {inst!r}")
        return 0
    finally:
        engine.dispose() ##might be needed? --- yes needed

In [13]:
%%time
futures = [] 
for _ in range(12):
    batch_size=5_000
    polygons=[]
    
    for geojson in tqdm(data):
        polygons.append(geojson)
    
        if len(polygons) == batch_size:
            futures.append(bulk_insert.remote(polygons))
            polygons=[]
    
    if polygons:
        futures.append(bulk_insert.remote(polygons))
    
for f in tqdm(futures):
    ray.get(f)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88605/88605 [00:05<00:00, 17157.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88605/88605 [00:07<00:00, 11953.21it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88605/88605 [00:07<00:00, 11908.94it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

CPU times: user 1min 24s, sys: 6.7 s, total: 1min 31s
Wall time: 2min 28s





In [14]:
%%time
#lets make sure insert worked as expected
with  engine.connect() as conn:
    res=conn.execute(text("select count(geom) from geometries"))
    nresults=res.fetchall()
    print(nresults)

[(1063260,)]
CPU times: user 6.65 ms, sys: 751 µs, total: 7.4 ms
Wall time: 416 ms


In [17]:
%%time
with  engine.connect() as conn:
    res=conn.execute(text("select ST_AsGeoJSON(ST_centroid(geom))  from geometries limit 1000"))
    centroids=res.fetchall()

CPU times: user 3.88 ms, sys: 437 µs, total: 4.32 ms
Wall time: 16.7 ms


In [18]:
# Display the first 100 centroid rows returned by the query above.
centroids[0:100]

[('{"type":"Point","coordinates":[45862.132927504,20242.072237595]}',),
 ('{"type":"Point","coordinates":[45812.262488647,20306.061459279]}',),
 ('{"type":"Point","coordinates":[45854.852130326,20261.929323308]}',),
 ('{"type":"Point","coordinates":[45972.1875,41083.470833333]}',),
 ('{"type":"Point","coordinates":[41711.703818663,36082.014000816]}',),
 ('{"type":"Point","coordinates":[52616.444076642,32874.580228137]}',),
 ('{"type":"Point","coordinates":[45901.874369386,20236.459702227]}',),
 ('{"type":"Point","coordinates":[46006.169376694,20198.662601626]}',),
 ('{"type":"Point","coordinates":[45858.962783172,20359.420550162]}',),
 ('{"type":"Point","coordinates":[45792.374420146,20346.953611663]}',),
 ('{"type":"Point","coordinates":[45750.401608789,20334.951736315]}',),
 ('{"type":"Point","coordinates":[45900.775072464,20576.216618357]}',),
 ('{"type":"Point","coordinates":[45954.05971937,20293.65349076]}',),
 ('{"type":"Point","coordinates":[45759.335901387,20285.433744222]}',),