In [25]:
import pandas as pd

In [26]:
df = pd.read_csv('0.merged_df.csv')

In [27]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [28]:
lon_min, lon_max = 4, 5.5
lat_min, lat_max = 52, 53

In [29]:
df = df.loc[(df.lon >= lon_min) & (df.lon <= lon_max)]
df = df.loc[(df.lat >= lat_min) & (df.lat <= lat_max)]
df

Unnamed: 0,osm_id,code,fclass,name,lon,lat,SELECT
3,6320110,7212,nature_reserve,Palazzo,5.483148,52.491385,nature
4,6320117,7209,commercial,,5.474250,52.495943,entertainment
5,6320135,7209,commercial,Noordersluis,5.438832,52.500937,entertainment
6,6320142,7209,commercial,,5.446325,52.507542,entertainment
7,6320167,7212,nature_reserve,,5.495712,52.523269,nature
...,...,...,...,...,...,...,...
2082696,1115993961,8200,water,,4.617390,52.082825,nature
2082697,1115993962,8200,water,,4.617506,52.083592,nature
2082698,1115993963,8200,water,,4.617715,52.082837,nature
2082699,1115993965,8200,water,,4.617114,52.081069,nature


In [30]:
n_rows, n_columns = 200, 200

lat_min, lat_max = df.lat.min(), df.lat.max()  # rows
lon_min, lon_max = df.lon.min(), df.lon.max()  # columns

diff_lat = lat_max - lat_min
diff_lon = lon_max - lon_min

step_lat = diff_lat / n_rows
step_lon = diff_lon / n_columns

In [31]:
categories = df['SELECT'].drop_duplicates().to_list()

In [32]:
starting_point_lat, starting_point_lon = lat_min, lon_min # start from the highest column and the first column

result_log = {}

#starting bbox
bbox_lat_min, bbox_lat_max = starting_point_lat, starting_point_lat + step_lat
bbox_lon_min, bbox_lon_max = starting_point_lon, starting_point_lon + step_lon

for row in range(n_rows):
    # reset starting point of the column axis
    bbox_lon_max = starting_point_lon
    bbox_lon_min = starting_point_lon + step_lon

    row_df = df.loc[(df.lat > bbox_lat_min) & (df.lat <= bbox_lat_max), :]

    for column in range(n_columns):
        cell_df = row_df.loc[(row_df.lon > bbox_lon_min) & (row_df.lon <= bbox_lon_max), :]

        cell_name = 'R' + str(row + 1) + 'C' + str(column + 1)
        cell_values = [row + 1, column +1, bbox_lat_min, bbox_lat_max, bbox_lon_min, bbox_lon_max]

        for category in categories:
            try:
                count = cell_df.SELECT.value_counts()[category]
                cell_values.append(count)

            except:
                cell_values.append(0)

        result_log[cell_name] = cell_values

        #print('LAT:', bbox_lat_min, '   ', bbox_lat_max, '  LON:', bbox_lon_min, '   ', bbox_lon_max, '    COUNT:', cell_df.shape[0])

        bbox_lon_min = bbox_lon_max
        bbox_lon_max += step_lon

    bbox_lat_min = bbox_lat_max
    bbox_lat_max += step_lat

In [33]:
result_df = pd.DataFrame(result_log).T
result_df.columns = ['row', 'column', 'lat_min', 'lat_max', 'lon_min', 'lon_max'] + categories
result_df

Unnamed: 0,row,column,lat_min,lat_max,lon_min,lon_max,nature,entertainment,transports,art,tourism,security,accessibility,sport
R1C1,1.0,1.0,52.000000,52.005000,4.130890,4.124010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R1C2,1.0,2.0,52.000000,52.005000,4.124010,4.130890,1.0,3.0,5.0,0.0,3.0,0.0,5.0,0.0
R1C3,1.0,3.0,52.000000,52.005000,4.130890,4.137770,4.0,1.0,34.0,0.0,2.0,0.0,4.0,0.0
R1C4,1.0,4.0,52.000000,52.005000,4.137770,4.144650,9.0,0.0,25.0,0.0,1.0,0.0,1.0,0.0
R1C5,1.0,5.0,52.000000,52.005000,4.144650,4.151530,6.0,2.0,20.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R200C196,200.0,196.0,52.994975,52.999974,5.458719,5.465599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R200C197,200.0,197.0,52.994975,52.999974,5.465599,5.472479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R200C198,200.0,198.0,52.994975,52.999974,5.472479,5.479359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R200C199,200.0,199.0,52.994975,52.999974,5.479359,5.486238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
result_df.to_csv('1.complete_df.csv')

In [35]:
filtered_df = result_df[result_df[categories].T.sum(0) != 0] # remove the cells that have 0 on every fclass
filtered_df

Unnamed: 0,row,column,lat_min,lat_max,lon_min,lon_max,nature,entertainment,transports,art,tourism,security,accessibility,sport
R1C2,1.0,2.0,52.000000,52.005000,4.124010,4.130890,1.0,3.0,5.0,0.0,3.0,0.0,5.0,0.0
R1C3,1.0,3.0,52.000000,52.005000,4.130890,4.137770,4.0,1.0,34.0,0.0,2.0,0.0,4.0,0.0
R1C4,1.0,4.0,52.000000,52.005000,4.137770,4.144650,9.0,0.0,25.0,0.0,1.0,0.0,1.0,0.0
R1C5,1.0,5.0,52.000000,52.005000,4.144650,4.151530,6.0,2.0,20.0,0.0,0.0,0.0,0.0,0.0
R1C6,1.0,6.0,52.000000,52.005000,4.151530,4.158410,16.0,8.0,112.0,1.0,0.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R200C93,200.0,93.0,52.994975,52.999974,4.750085,4.756965,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
R200C94,200.0,94.0,52.994975,52.999974,4.756965,4.763845,1.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
R200C95,200.0,95.0,52.994975,52.999974,4.763845,4.770724,3.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0
R200C110,200.0,110.0,52.994975,52.999974,4.867044,4.873924,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
filtered_df.to_csv('1.filtered_df.csv')