In [1]:
import os

from tqdm import tqdm
from glob import glob

import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

```
There are 44 duplicated TIFFs (same "godło") that differ in resolution,
and 12 duplicated TIFFs that differ in nothing but the image itself.
```

In [2]:
# Directory that is scanned for metadata CSV files, and the directory figures are saved to.
META_IMAGES = "../../data/meta/images"
PLOT_DIR = "../../plots"

# Make sure the figure directory exists; an already existing one is not an error.
os.makedirs(name=PLOT_DIR, exist_ok=True)

# All CSV files directly inside META_IMAGES, sorted by path.
csv_pattern = os.path.join(META_IMAGES, '*.csv')
images = sorted(glob(csv_pattern))

In [3]:
def merge_dfs(paths):
    """Read every CSV in ``paths`` and stack the rows into one DataFrame.

    Two columns are added to the rows of each file before stacking:

    * ``fname`` - base name of the CSV file the row was read from,
    * ``godlo`` - last ``_``-separated token of that base name, with the
      file extension removed.

    Args:
        paths: Iterable of paths to CSV files readable by ``pd.read_csv``.

    Returns:
        One DataFrame holding the rows of all files in the order of ``paths``,
        with a fresh ``0..n-1`` index so that every row label is unique.

    Raises:
        ValueError: If ``paths`` is empty, i.e. there is nothing to merge.
    """
    paths = list(paths)
    if not paths:
        # pd.concat would fail as well, but with a far less helpful message.
        raise ValueError("merge_dfs: no CSV paths given, nothing to merge")

    frames = []
    for pth in tqdm(paths):
        fname = os.path.basename(pth)
        # splitext removes only the trailing extension, whereas
        # str.replace('.csv', '') would also eat a '.csv' inside the name.
        stem = os.path.splitext(fname)[0]

        frame = pd.read_csv(pth)
        frame["fname"] = fname
        frame["godlo"] = stem.split('_')[-1]
        frames.append(frame)

    # Each per-file frame carries its own 0..k labels; ignore_index replaces
    # them with one continuous range instead of repeating labels.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Load all CSV tables into one frame and round `scale` to two decimals
# (presumably so that almost-equal values end up as the same scale).
df = merge_dfs(images).round({'scale': 2})

100%|██████████| 60774/60774 [03:06<00:00, 325.69it/s]


In [None]:
# Number of different `godlo` values in the merged table.
len(df["godlo"].unique())

In [None]:
# Give every row its own integer label (0..n-1, index named 'index') so that
# a single row can be removed by label further down.
df = df.assign(index=range(0, df.shape[0])).set_index('index')

# `godlo` values that appear on more than one row.
is_repeated = df.groupby('godlo').godlo.transform('size') > 1
doubled = df[is_repeated].godlo.unique()

to_del = []
for gdl in tqdm(doubled):
    pair = df[df.godlo == gdl]
    assert pair.shape[0] == 2  # a repeated godlo is expected to occur exactly twice

    scales = pair['scale']
    # Remove the duplicate with the bigger scale; on a tie the first row is removed.
    loser = 0 if scales.iloc[0] >= scales.iloc[1] else 1
    to_del.append(pair.index[loser])

df = df.drop(to_del)
df

In [None]:
# Distinct values of the `scale` column, ascending.
sorted(df["scale"].unique())

In [None]:
# Number of rows per distinct scale, ordered by ascending scale
# (columns: `scale`, `counter`).
#
# groupby(...).size() gives exactly one number per scale. The earlier
# `df[df.scale == s].count()` returned one count per *column*, so the row-wise
# `apply` produced a whole DataFrame, which cannot be stored in the single
# `counter` column (recent pandas versions reject that assignment with a
# ValueError).
new_df = df.groupby('scale').size().reset_index(name='counter')
new_df

In [None]:
# Sum of m_width * m_height per distinct scale, ordered by ascending scale
# (columns: `scale`, `area`).
#
# A single grouped pass replaces the row-wise `apply` that filtered the whole
# frame twice for every scale value.
df_area = (
    df.assign(area=df.m_width * df.m_height)
      .groupby('scale', as_index=False)['area']
      .sum()
)
df_area

In [None]:
# Rescale the summed areas by 1e-6 (presumably m^2 -> km^2, matching the km^2
# axis label of the "area per scale" figure).
# NOTE: not idempotent - running this cell again applies the factor again.
df_area["area"] = df_area["area"].mul(1e-6)
df_area

In [None]:
# Sum over all scales; for comparison, the area of Poland according to Wikipedia is 312696.
df_area["area"].sum()

### Number of images per scale

In [None]:
# Bar chart of new_df.counter for every scale value, on a logarithmic y axis.
plt.figure(figsize=(16, 10))
sns.set(style='whitegrid', font_scale=1.2)
sns.set_palette(sns.color_palette("Set1", desat=.5))
plt.tight_layout()

# One bar per row of new_df, drawn at the integer positions 0..k-1.
width = 0.5
num = np.arange(new_df.shape[0])

p1 = plt.bar(num, new_df['counter'], width)
# Show the actual scale values as tick labels at those positions.
plt.xticks(num, new_df['scale'])

plt.xlabel('Meters per pixel')
plt.ylabel('Number of images')
plt.yscale('log')

out_path = os.path.join(PLOT_DIR, 'scale_img-number.eps')
plt.savefig(out_path, bbox_inches='tight')

### Area per scale

In [None]:
# Bar chart of df_area.area for every scale value, on a logarithmic y axis.
plt.figure(figsize=(16, 10))
sns.set(style = 'whitegrid', font_scale = 1.2)
sns.set_palette(sns.color_palette("Set1", desat=.5))
plt.tight_layout()

width = 0.5
# Positions, heights and tick labels all come from df_area. This cell used to
# re-plot new_df.counter (image counts) under the km^2 label and saved that
# picture as scale_km2.eps.
num = np.arange(len(df_area.scale))

p1 = plt.bar(num, df_area.area, width)
plt.xticks(num, df_area.scale)

plt.xlabel('Meters per pixel')
plt.ylabel('$km^2$')
plt.yscale('log')
plt.savefig(os.path.join(PLOT_DIR, 'scale_km2.eps'), bbox_inches='tight')