In [1]:
import pandas as pd
from PIL import Image
import os
import numpy as np

In [2]:
images_path = '../images'
population_data_path = '../data/full_data.csv'

In [3]:
def crop_and_resize(image: Image.Image, size: int) -> Image.Image:
    """
    Crop a PIL image to a square by trimming the longer side and centering,
    then resize it to the designated resolution.
    
    :param image: Input PIL Image
    :param size: Target resolution (size x size)
    :return: Processed PIL Image
    """
    width, height = image.size
    min_side = min(width, height)
    
    # Calculate crop box for center cropping
    left = (width - min_side) // 2
    top = (height - min_side) // 2
    right = left + min_side
    bottom = top + min_side
    
    # Crop to square
    cropped_image = image.crop((left, top, right, bottom))
    
    # Resize to designated resolution
    resized_image = cropped_image.resize((size, size), Image.LANCZOS)
    
    return resized_image

In [4]:
images = {}

for dir in os.listdir(images_path):
    if not dir.endswith('.png'):
        continue
    
    full_path = f'{images_path}/{dir}'
    image = Image.open(full_path)
    square_image = crop_and_resize(image, 512)
    images[dir[:-4].lower()] = square_image
    
print(f'Loaded {len(images)} images')
images

Loaded 472 images


{'muzaffarpur': <PIL.Image.Image image mode=RGB size=512x512>,
 'pasig_city': <PIL.Image.Image image mode=RGB size=512x512>,
 'tehran': <PIL.Image.Image image mode=RGB size=512x512>,
 'pontianak': <PIL.Image.Image image mode=RGB size=512x512>,
 'luohe': <PIL.Image.Image image mode=RGB size=512x512>,
 'krasnodar': <PIL.Image.Image image mode=RGB size=512x512>,
 'baltimore': <PIL.Image.Image image mode=RGB size=512x512>,
 'houston': <PIL.Image.Image image mode=RGB size=512x512>,
 'manila': <PIL.Image.Image image mode=RGB size=512x512>,
 'jinzhong': <PIL.Image.Image image mode=RGB size=512x512>,
 'wuxi': <PIL.Image.Image image mode=RGB size=512x512>,
 'meerut': <PIL.Image.Image image mode=RGB size=512x512>,
 'suqian': <PIL.Image.Image image mode=RGB size=512x512>,
 'lampang': <PIL.Image.Image image mode=RGB size=512x512>,
 'saitama': <PIL.Image.Image image mode=RGB size=512x512>,
 'hanoi': <PIL.Image.Image image mode=RGB size=512x512>,
 'tabriz': <PIL.Image.Image image mode=RGB size=512x5

In [5]:
images_df = pd.DataFrame.from_dict(images, orient='index', columns=['image'])
images_df = images_df.reset_index().rename(columns={'index': 'name'})
images_df

Unnamed: 0,name,image
0,muzaffarpur,<PIL.Image.Image image mode=RGB size=512x512 a...
1,pasig_city,<PIL.Image.Image image mode=RGB size=512x512 a...
2,tehran,<PIL.Image.Image image mode=RGB size=512x512 a...
3,pontianak,<PIL.Image.Image image mode=RGB size=512x512 a...
4,luohe,<PIL.Image.Image image mode=RGB size=512x512 a...
...,...,...
467,makassar,<PIL.Image.Image image mode=RGB size=512x512 a...
468,jiangmen,<PIL.Image.Image image mode=RGB size=512x512 a...
469,chennai,<PIL.Image.Image image mode=RGB size=512x512 a...
470,tijuana,<PIL.Image.Image image mode=RGB size=512x512 a...


In [6]:
population = pd.read_csv(population_data_path)
population['name'] = population['name'].str.lower().str.replace(' ', '_')
population

Unnamed: 0,name,lon,lat,population
0,shanghai,121.4670,31.1667,22120000
1,beijing,116.3910,39.9050,19433000
2,shenzhen,114.0540,22.5350,15929000
3,guangzhou,113.2590,23.1288,20902000
4,chengdu,104.0670,30.6636,11309000
...,...,...,...,...
959,zadar,15.2167,44.1167,75082
960,nampula,39.2667,-15.1167,477900
961,slavonski_brod,18.0144,45.1553,56769
962,karlovac,15.5558,45.4931,53134


In [7]:
df = pd.merge(population, images_df, on='name', how='left')
df.dropna(inplace=True)
df

Unnamed: 0,name,lon,lat,population,image
0,shanghai,121.467,31.1667,22120000,<PIL.Image.Image image mode=RGB size=512x512 a...
1,beijing,116.391,39.9050,19433000,<PIL.Image.Image image mode=RGB size=512x512 a...
2,shenzhen,114.054,22.5350,15929000,<PIL.Image.Image image mode=RGB size=512x512 a...
3,guangzhou,113.259,23.1288,20902000,<PIL.Image.Image image mode=RGB size=512x512 a...
4,chengdu,104.067,30.6636,11309000,<PIL.Image.Image image mode=RGB size=512x512 a...
...,...,...,...,...,...
863,leon_de_los_aldama,-101.683,21.1167,1454793,<PIL.Image.Image image mode=RGB size=512x512 a...
875,zapopan,-103.400,20.7167,1155790,<PIL.Image.Image image mode=RGB size=512x512 a...
879,foshan,113.106,23.0292,7194311,<PIL.Image.Image image mode=RGB size=512x512 a...
928,san_jose,-121.849,37.3019,1798103,<PIL.Image.Image image mode=RGB size=512x512 a...


In [8]:
X = df['image'].to_list()
X = np.array(X)
X_flatten = np.array(X).reshape(-1, 512*512*3)
print(X.shape)
print(X_flatten.shape)  # for regression models

(487, 512, 512, 3)
(487, 786432)


In [9]:
y = df['population']
y = np.array(y.to_list()).reshape(-1, 1)
y.shape

(487, 1)