In [None]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image

%matplotlib inline

dataset_dir = 'c:/Users/flori/download'
pd.set_option('display.max_rows', 50)
os.chdir('c:/Users/flori/OneDrive/Documents/Uni/8_Master_thesis/code')
pd.set_option('display.max_columns', None)

# Load files
## occurrence.csv

In [None]:
# Read the occurrence table, then print its shape next to the number of
# distinct `_id` values so the two can be compared.
occurrence = pd.read_csv('datasets/occurrence.csv', sep=',')
print(occurrence.shape)
print(pd.unique(occurrence['_id']).shape)
occurrence.head(1)

## multimedia.csv

In [None]:
# Read the multimedia table, then print its shape next to the number of
# distinct `_id` values so the two can be compared.
multimedia = pd.read_csv('datasets/multimedia.csv', sep=',')
print(multimedia.shape)
print(pd.unique(multimedia['_id']).shape)
multimedia.head(1)

## dataset.csv

In [None]:
# Read the dataset table (decoded as ISO-8859-1), then print its shape next
# to the number of distinct `id` values so the two can be compared.
ds = pd.read_csv('datasets/dataset.csv', sep=',', encoding='ISO-8859-1')
print(ds.shape)
print(pd.unique(ds['id']).shape)
ds.head(1)


# Analyse data
## Create label list

In [None]:
# Re-apply the 50-row display limit (the same value the setup cell at the top already sets).
pd.set_option('display.max_rows', 50)

def extract_tags_count(df_column):
    """Count how often each tag occurs in the string entries of a column.

    Every string entry is split on the separators ``_``, space, ``-``, ``.``,
    ``(``, ``)``, ``#``, ``,`` and ``:``. Empty and purely numeric pieces are
    ignored; in the remaining pieces every run of digits is replaced by ``#``
    and the result is lower-cased. Entries that are not strings are skipped.

    Args:
        df_column: iterable of values (e.g. a DataFrame column).

    Returns:
        dict mapping each normalised tag to its number of occurrences.
    """
    separators = r'\_| |-|\.|\(|\)|\#|\,|\:'
    tags = {}
    for line in df_column:
        if not isinstance(line, str):
            continue
        for piece in re.split(separators, line):
            # Skip empty pieces and pure numbers.
            if piece == '' or piece.isnumeric():
                continue
            # Collapse digit runs to '#' so e.g. 'img1' and 'img22' count as the same tag.
            tag = re.sub(r'[0-9]+', '#', piece).lower()
            tags[tag] = tags.get(tag, 0) + 1
    return tags

In [None]:
# Tag counts of the multimedia titles as a one-column frame (index = tag),
# ordered from most to least frequent.
multimedia_tags = (
    pd.DataFrame.from_dict(
        extract_tags_count(multimedia.title), orient='index', columns=['count']
    )
    .sort_values(by='count', ascending=False)
)
# multimedia_tags.head(10)

In [None]:
# Tag counts of the dataset file descriptions as a one-column frame (index = tag),
# ordered from most to least frequent.
dataset_tags = (
    pd.DataFrame.from_dict(
        extract_tags_count(ds.file_description), orient='index', columns=['count']
    )
    .sort_values(by='count', ascending=False)
)
# dataset_tags.head(10)

In [None]:
# Join the two tag tables on their index (the tag); only tags present in both
# are kept, and the two 'count' columns are told apart by their suffix.
merged = dataset_tags.merge(
    multimedia_tags,
    left_index=True,
    right_index=True,
    suffixes=['_dataset', '_multimedia'],
)
merged.head(10)
# merged.to_csv('datasets/tags.csv')

In [None]:
# Remove three tags by label, then plot the 20 most frequent remaining tags
# (multimedia_tags is already sorted by descending count).
multimedia_tags_subset = multimedia_tags.drop(index=['bmnhe', 'nhmuk', 'bmnh'])
multimedia_tags_head = multimedia_tags_subset.iloc[:20]
multimedia_tags_head.plot.bar(y='count', figsize=(8,3))
plt.tight_layout()
plt.savefig('plots/label_bar_plot.pdf')
plt.show()

## Check the image sizes

In [None]:
# Collect the pixel dimensions of every image listed in ds.filename.
sizes = []                # [width, height] exactly as stored in the file
sizes_all_landscape = []  # same pair with the longer side first
for i, image_filename in enumerate(ds.filename):
    if i % 10000 == 0: print(i)  # progress indicator
    try:
        # The context manager closes the file handle again; without it one
        # handle per image stays open while the loop scans the whole dataset.
        with Image.open(dataset_dir + '/' + image_filename) as im:
            width, height = im.size
    except Exception as e:
        # Best effort: report unreadable/missing files and continue with the next one.
        print(e)
        continue
    sizes.append([width, height])
    sizes_all_landscape.append([max(width, height), min(width, height)])

# NOTE: the column name 'heigth' (sic) is kept as-is because it is written to datasets/sizes.csv.
sizes_df = pd.DataFrame(sizes, columns=['width', 'heigth'])
sizes_df.to_csv('datasets/sizes.csv')
print(sizes_df.value_counts())


In [None]:
# Box plot of the two columns of sizes_df, saved as a PDF.
fig1, ax1 = plt.subplots(figsize=(8, 3))
ax1.boxplot(sizes_df, labels=['width [px]', 'height [px]'])
fig1.tight_layout()
fig1.savefig('plots/boxplot_image_dim.pdf')

In [None]:
# Same box plot, but on the dimensions with the longer side always stored first.
sizes_all_landscape_df = pd.DataFrame(
    sizes_all_landscape,
    columns=['width', 'heigth'],
)
fig1, ax1 = plt.subplots()
ax1.boxplot(sizes_all_landscape_df)

## Check the number of images per specimen

In [None]:
from pprint import pprint

# Lower-cased tokens that never count as a tag.
exclude_tags = ['', 'jpg', 'bmnhe', 'bmnh']

def extract_tags(filenames, subset=None, droplast=False):
    """Group filenames by specimen id and extract the tags of every file.

    Each filename is split on the separators ``_``, space, ``-``, ``.``, ``(``,
    ``)``, ``#``, ``,`` and ``:``. The first token is parsed as the integer
    specimen id. Every token that is not purely numeric and whose lower-cased
    form is not in ``exclude_tags`` becomes a tag: digit runs are replaced by
    ``#`` and the result is lower-cased.

    Args:
        filenames: iterable of filenames. Entries that are not ``str`` are
            skipped, so they can no longer be filed under the previous
            specimen with that specimen's tags.
        subset: optional cut-off; iteration stops at the first position
            greater than ``subset`` (positions ``0..subset`` are processed).
        droplast: if True, remove the group of the last processed id, since
            that group may be incomplete when ``subset`` cuts it off.

    Returns:
        dict mapping id -> ``{'files': [[filename, tags], ...], 'file_count': int}``.

    Raises:
        ValueError: if the first token of a filename is not an integer.
    """
    separators = r'\_| |-|\.|\(|\)|\#|\,|\:'
    dataset = {}
    last_id = None  # id of the most recently processed filename
    for i, filename in enumerate(filenames):
        # Temporarily subset the dataset
        if subset is not None and i > subset:
            break
        # Skip missing / non-string entries instead of reusing stale state.
        if not isinstance(filename, str):
            continue

        tokens = re.split(separators, filename)
        # The first token is the id of the specimen.
        last_id = int(tokens[0])
        tags = [
            re.sub(r'[0-9]+', '#', token).lower()
            for token in tokens
            if not token.isnumeric() and token.lower() not in exclude_tags
        ]

        # Append the filename and its tags to the specimen's group.
        dataset.setdefault(last_id, {'files': []})['files'].append([filename, tags])

    # Exclude the last group so a group cut off by `subset` is not counted as complete.
    if droplast and last_id is not None:
        del dataset[last_id]
    # Add a file count to every id.
    for group in dataset.values():
        group['file_count'] = len(group['files'])
    return dataset

# Build a per-row filename of the form "<_id>_<title>" (both cast to str), so that
# the specimen id is the first token extract_tags sees when it splits the name.
multimedia['filename'] = multimedia['_id'].astype(str) + '_' + multimedia['title'].astype(str)
# Group all rows by specimen id; droplast=False keeps every group, including the last one.
multimedia_dict = extract_tags(multimedia.filename, droplast=False)

In [None]:
## Sanity check
print('Original count:', len(list(multimedia.filename)))
print('Output count:',sum([multimedia_dict[_id]['file_count'] for _id in multimedia_dict]))
print("Counts may differ a bit as the last specimen doesn't get include due to the group being cut off")

In [None]:
# Number of files of every specimen, in the insertion order of multimedia_dict.
pict_per_group = [group['file_count'] for group in multimedia_dict.values()]

In [None]:
# Frequency table of group sizes: index = number of pictures in a group,
# value = number of groups of that size. sort=False: rows are not sorted by frequency.
freq_table = pd.Series(pict_per_group).value_counts(sort=False)
freq_table

In [None]:
# Select by group size (the index label) instead of by row position: the rows
# of an unsorted value_counts are not guaranteed to be ordered by group size,
# and a fixed positional range either raises IndexError (fewer rows) or
# silently leaves out rows (more rows).
print('1', freq_table.get(1, 0))                        # groups with exactly one picture
print('2', freq_table.get(2, 0))                        # groups with exactly two pictures
print('rest', freq_table[freq_table.index > 2].sum())   # groups with three or more pictures
print(freq_table.sum())                                 # total number of groups

## Image dimensions

In [None]:
import numpy as np

# Histogram of the number of pictures per group, one bar per group size.
# Bin edges sit at k - 0.5 and k + 0.5 so every integer size k (1..max) gets its
# own bin. With edges taken from range(max) the largest size fell outside the
# last edge and was dropped, and the two sizes before it shared one bin.
max_group_size = max(pict_per_group)
bin_edges = np.arange(1, max_group_size + 2) - 0.5

fig1, ax1 = plt.subplots(figsize=(15,4))
freq, bins, patches = ax1.hist(pict_per_group, bins=bin_edges, log=True)
ax1.set_xticks(list(range(1, max_group_size + 1)))
# x shows the group size, y the (log-scaled) number of groups of that size.
ax1.set_xlabel('Amount of pictures in group')
ax1.set_ylabel('Amount of groups (log)')
# plt.tight_layout()

# x coordinate for labels
bin_centers = np.diff(bins)*0.5 + bins[:-1]

# Write the count above every bar; empty bins are skipped because a height of
# 0 has no position on a log-scaled axis.
for count, x in zip(freq, bin_centers):
    if count == 0:
        continue
    ax1.annotate("{}".format(int(count)),
                 xy=(x, count),
                 xytext=(0, 0.2),
                 textcoords="offset points",
                 ha='center', rotation=45)

# plt.savefig('plots/boxplot_image_dim.pdf')

# plt.savefig('plots/boxplot_image_dim.pdf')

In [None]:
# pathlib is imported in this cell so that it also runs on a fresh kernel;
# Path was otherwise only imported by a cell further down the notebook.
from pathlib import Path

from PIL import Image

# From here on dataset_dir points at the image subset folder (as a Path, no longer a str).
dataset_dir = Path('c:/Users/flori/download/subset')

def image_sizes(dataset_dir):
    """Read the pixel dimensions of every image file in a directory.

    Prints the number of directory entries and a progress counter every 1000
    entries. Entries that cannot be opened as an image (including
    sub-directories) are reported with ``print`` and skipped.

    Args:
        dataset_dir: directory to scan, as ``str`` or ``pathlib.Path``.

    Returns:
        pandas.DataFrame with one row per readable image and the columns
        ``'width'`` and ``'heigth'`` (sic; spelling kept so existing consumers
        of the column keep working).
    """
    dirlist = os.listdir(dataset_dir)
    print(len(dirlist))
    sizes = []
    for i, image_filename in enumerate(dirlist):
        if i % 1000 == 0: print(i)  # progress indicator
        try:
            # The context manager closes the file handle again; without it
            # one handle per image stays open for the rest of the scan.
            with Image.open(os.path.join(dataset_dir, image_filename)) as im:
                sizes.append(list(im.size))
        except Exception as e:
            # Best effort: report the problem and continue with the next entry.
            print(e)
    sizes_df = pd.DataFrame(sizes, columns=['width', 'heigth'])
    return sizes_df

# Measure the images in dataset_dir. This rebinds sizes_df, replacing the frame
# built by the earlier "Check the image sizes" cell.
sizes_df = image_sizes(dataset_dir)
# sizes_df.to_csv('datasets/sizes.csv')
# Print how often each (width, heigth) combination occurs.
print(sizes_df.value_counts())



## Move images

In [None]:
import os
import shutil
from pathlib import Path

# Move every entry of dir_from (except the one named 'original') into dir_to.
dir_from = Path('c:/Users/flori/download/subset/data')
dir_to = Path('c:/Users/flori/download/subset')
dirlist = os.listdir(dir_from)

# print(dirlist)
print(len(dirlist))
for i, f in enumerate(dirlist):
    if i % 1000 == 0: print(i)  # progress indicator
    if f == 'original':
        continue
    try:
        # Plain strings are passed because not every Python version's
        # shutil.move handles path-like sources correctly.
        shutil.move(str(dir_from / f), str(dir_to / f))
    except OSError as err:
        # shutil.Error derives from OSError, so this covers move failures
        # without also swallowing KeyboardInterrupt. Report what failed and
        # why, then continue with the remaining entries.
        print(f'Could not move {f}: {err}')


## Create a matrix of images from a folder

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
# Show the first rows*columns images of img_dir in a grid and save it as a PDF.
img_dir = 'C:/Users/flori/download/image_ranges'
# os.listdir returns entries in arbitrary order; sorting makes the selection reproducible.
images = sorted(os.listdir(img_dir))

# NOTE(review): w and h are not used in this cell; kept in case another cell reads them.
w=4
h=6
fig=plt.figure(figsize=(15,8))
columns = 4
rows = 5
# Slicing caps the loop at the number of available images, so a folder with
# fewer than rows*columns files no longer raises IndexError.
for i, image_name in enumerate(images[:columns*rows], start=1):
    img = mpimg.imread(img_dir + '/' + image_name)
    ax = fig.add_subplot(rows, columns, i)
    ax.axis('off')
    ax.imshow(img)
# plt.show()
fig.tight_layout()
fig.savefig('plots/some_images.pdf')