In [1]:
import cv2
import math
import os
import pytesseract
import re
import numpy as np
import csv

from PIL import Image, ImageFilter
from osgeo import gdal, osr
from tqdm import tqdm

In [2]:
def crop_map(input_path, output_path):
  # Load the JPEG file
  img = cv2.imread(input_path)

  # Convert the image to grayscale
  gray = cv2.cvtColor(
    img, 
    cv2.COLOR_BGR2GRAY
  )

  # Apply a Canny edge detection filter to find the edges
  edges = cv2.Canny(gray, 50, 150)

  # Find contours in the image
  contours, hierarchy = cv2.findContours(
    edges, 
    cv2.RETR_EXTERNAL, 
    cv2.CHAIN_APPROX_SIMPLE
  )

  # Find the largest contour
  largest_contour = max(
    contours, 
    key=cv2.contourArea
  )

  # Get the bounding box coordinates
  x, y, w, h = cv2.boundingRect(largest_contour)

  # Extract the image inside the bounding box
  img_cropped = img[y:y+h, x:x+w]

  # Save the cropped image as a new JPEG file
  cv2.imwrite(output_path, img_cropped)

In [3]:
def get_coordinates(input_path):
  # Read the PNG image using OpenCV
  image = cv2.imread(input_path)

  # Convert the image to grayscale
  gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

  # Perform OCR to extract text from the image
  text = pytesseract.image_to_string(gray_image)
  
  lines = re.split(r'\s+|\n', text)

  matched_lines = [line for line in lines if line.startswith('X_min')]
  xmin = float(matched_lines[0].split('_')[-1])

  matched_lines = [line for line in lines if line.startswith('X_max')]
  xmax = float(matched_lines[0].split('_')[-1])

  matched_lines = [line for line in lines if line.startswith('Y_min')]
  ymin = float(matched_lines[0].split('_')[-1])

  matched_lines = [line for line in lines if line.startswith('Y_max')]
  ymax = float(matched_lines[0].split('_')[-1])
  
  return xmin, xmax, ymin, ymax

In [4]:
def georeference(input_path, output_path, xmin, xmax, ymin, ymax):
  # Load the input image
  input_image = gdal.Open(input_path)

  # Get the image dimensions
  image_width = input_image.RasterXSize
  image_height = input_image.RasterYSize
  num_bands = input_image.RasterCount

  # Create a spatial reference for the output GeoTIFF
  output_srs = osr.SpatialReference()
  output_srs.ImportFromEPSG(4326)  # Use WGS84 coordinate system

  # Create the output GeoTIFF file
  driver = gdal.GetDriverByName('GTiff')
  output_image = driver.Create(
    output_path, 
    image_width, 
    image_height, 
    num_bands, 
    gdal.GDT_Byte
  )

  # Write the output image data
  for band_index in range(1, num_bands + 1):
      input_band = input_image.GetRasterBand(band_index)
      output_band = output_image.GetRasterBand(band_index)
      output_band.WriteArray(input_band.ReadAsArray())

  # Set the output image geotransform
  geotransform = (
    xmin, 
    (xmax - xmin) / image_width, 
    0, 
    ymax, 
    0, 
    (ymin - ymax) / image_height
  )
  output_image.SetGeoTransform(geotransform)

  # Set the output image projection
  output_image.SetProjection(output_srs.ExportToWkt())

  # Close the input and output images
  input_image = None
  output_image = None

In [5]:
if not os.path.exists('/temp_file'):
  os.makedirs('/temp_file')
  
if not os.path.exists('/temp_file/straighten'):
  os.makedirs('/temp_file/straighten')

if not os.path.exists('/temp_file/crop'):
  os.makedirs('/temp_file/crop')

In [6]:
if not os.path.exists('georeferenced_maps'):
  os.makedirs('georeferenced_maps')

In [7]:
list_peta = os.listdir('scan_peta')

In [8]:
failed = []
for i in tqdm(list_peta):
  name = i.split('.')[0]
  
  try:
    xmin, xmax, ymin, ymax = get_coordinates(
      'scan_peta/{}.jpg'.format(name)
    )
    
    crop_map(
      'scan_peta/{}.jpg'.format(name),
      'temp_file/crop/{}.jpg'.format(name)
    )
    
    georeference(
      'temp_file/crop/{}.jpg'.format(name), 
      'georeferenced_maps/{}.tif'.format(name),
      xmin, xmax, ymin, ymax
    )
    
  except:
    failed.append(name)

with open("failed_georeference.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(failed)

  0%|                                                                                           | 0/22 [00:00<?, ?it/s]

scan_peta/Document_20230714_0010.jpg
118.69395973618504 118.69631715409326 -8.558126424747066 -8.555455249779108


  5%|███▊                                                                               | 1/22 [00:02<00:45,  2.14s/it]

scan_peta/Document_20230714_0011.jpg
118.68285338135313 118.68417081901835 -8.596279877836846 -8.59517388078456


  9%|███████▌                                                                           | 2/22 [00:05<01:00,  3.05s/it]

scan_peta/Document_20230714_0012.jpg
118.69429422806591 118.69642113708908 -8.612419741190436 -8.610634187936425


 14%|███████████▎                                                                       | 3/22 [00:09<00:59,  3.13s/it]

scan_peta/Document_20230714_0013.jpg


 18%|███████████████                                                                    | 4/22 [00:10<00:46,  2.60s/it]

scan_peta/Document_20230714_0014.jpg


 23%|██████████████████▊                                                                | 5/22 [00:12<00:38,  2.27s/it]

scan_peta/Document_20230714_0015.jpg
118.63393142378507 118.63713189027729 -8.596222615437759 -8.593535804061576


 27%|██████████████████████▋                                                            | 6/22 [00:16<00:46,  2.90s/it]

scan_peta/Document_20230714_0016.jpg
118.67924803995487 118.68250842162422 -8.559192294492455 -8.556455183955222


 32%|██████████████████████████▍                                                        | 7/22 [00:20<00:47,  3.16s/it]

scan_peta/Document_20230714_0017.jpg
118.67770749009101 118.680167885643 -8.5611436815306 -8.559078164277077


 36%|██████████████████████████████▏                                                    | 8/22 [00:24<00:49,  3.52s/it]

scan_peta/Document_20230714_0018.jpg


 41%|█████████████████████████████████▉                                                 | 9/22 [00:26<00:37,  2.91s/it]

scan_peta/Document_20230714_0019.jpg
118.69156474485999 118.69545512184098 -8.602588762677039 -8.598180601709396


 45%|█████████████████████████████████████▎                                            | 10/22 [00:29<00:36,  3.06s/it]

scan_peta/Document_20230714_0020.jpg
118.69612932408779 118.69787053237593 -8.592505220338966 -8.591043465232875


 50%|█████████████████████████████████████████                                         | 11/22 [00:32<00:33,  3.08s/it]

scan_peta/Document_20230714_0021.jpg


 55%|████████████████████████████████████████████▋                                     | 12/22 [00:34<00:26,  2.60s/it]

scan_peta/Document_20230714_0022.jpg


 59%|████████████████████████████████████████████████▍                                 | 13/22 [00:36<00:21,  2.44s/it]

scan_peta/Document_20230714_0001.jpg


 64%|████████████████████████████████████████████████████▏                             | 14/22 [00:37<00:17,  2.19s/it]

scan_peta/Document_20230714_0002.jpg


 68%|███████████████████████████████████████████████████████▉                          | 15/22 [00:39<00:13,  1.98s/it]

scan_peta/Document_20230714_0003.jpg


 73%|███████████████████████████████████████████████████████████▋                      | 16/22 [00:40<00:11,  1.84s/it]

scan_peta/Document_20230714_0004.jpg
118.66988984005663 118.6725254874169 -8.57345688668447 -8.57047045172588


 77%|███████████████████████████████████████████████████████████████▎                  | 17/22 [00:44<00:12,  2.50s/it]

scan_peta/Document_20230714_0005.jpg
118.65204726999256 118.65468832636022 -8.57254269037705 -8.57032550725359


 82%|███████████████████████████████████████████████████████████████████               | 18/22 [00:48<00:10,  2.74s/it]

scan_peta/Document_20230714_0006.jpg
118.6360701370677 118.63832671991118 -8.562845909701513 -8.560288990292543


 86%|██████████████████████████████████████████████████████████████████████▊           | 19/22 [00:52<00:09,  3.09s/it]

scan_peta/Document_20230714_0007.jpg


 91%|██████████████████████████████████████████████████████████████████████████▌       | 20/22 [00:53<00:05,  2.59s/it]

scan_peta/Document_20230714_0008.jpg


 95%|██████████████████████████████████████████████████████████████████████████████▎   | 21/22 [00:54<00:02,  2.21s/it]

scan_peta/Document_20230714_0009.jpg


100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:56<00:00,  2.57s/it]


In [28]:
list_peta

['Document_20230714_0010.jpg',
 'Document_20230714_0011.jpg',
 'Document_20230714_0012.jpg',
 'Document_20230714_0013.jpg',
 'Document_20230714_0014.jpg',
 'Document_20230714_0015.jpg',
 'Document_20230714_0016.jpg',
 'Document_20230714_0017.jpg',
 'Document_20230714_0018.jpg',
 'Document_20230714_0019.jpg',
 'Document_20230714_0020.jpg',
 'Document_20230714_0021.jpg',
 'Document_20230714_0022.jpg',
 'Document_20230714_0001.jpg',
 'Document_20230714_0002.jpg',
 'Document_20230714_0003.jpg',
 'Document_20230714_0004.jpg',
 'Document_20230714_0005.jpg',
 'Document_20230714_0006.jpg',
 'Document_20230714_0007.jpg',
 'Document_20230714_0008.jpg',
 'Document_20230714_0009.jpg']