In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pandas as pd
import re
import glob
from tqdm.notebook import tqdm
import skimage
import skimage.transform
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [None]:
# natural sorting
def natural_key(string_):
    """Build a sort key that orders embedded numbers by value ("img2" < "img10").

    The string is split into alternating text and digit runs; every digit run
    is converted to ``int`` while text runs stay strings.
    See http://www.codinghorror.com/blog/archives/001018.html

    Args:
        string_: The string (e.g. a file path) to derive the key from.

    Returns:
        list: Alternating ``str`` and ``int`` items, usable as ``key=`` for
        ``sorted``.
    """
    # isdecimal() accepts exactly the character class that \d matches and that
    # int() can parse. isdigit() additionally accepts characters such as
    # superscript digits, on which int() raises ValueError.
    return [int(s) if s.isdecimal() else s for s in re.split(r'(\d+)', string_)]

####  Setup



In [None]:
import os,sys

if "google.colab" in sys.modules:
    %pip install wget
    
import wget,zipfile

if "labsetup_run" not in locals() or labsetup_run:

    print("running setup ...")

    #if "google.colab" in sys.modules:
    #    print("colab")
    #else:
    #    print("local")

    # download data.zip from shared google drive
    if not(os.path.isfile("data.zip")): 
        filename=wget.download("https://drive.google.com/uc?export=download&confirm=yes&id=1dkSV2oL8Ua1SDmzVvtGkyQ0LGQ6VpUIy")
    # unpack it
    if not(os.path.isdir("./data")):
        zf = zipfile.ZipFile(os.path.join(".","data.zip"), "r")
        zf.extractall()
                          
    # allow "hot-reloading" of modules
    %load_ext autoreload
    %autoreload 2
    # needed for inline plots in some contexts
    %matplotlib inline

    print("done.")
    labsetup_run = True  # change to True re-run setup
else:
    print("setup already run.")

#### Open Data

In [None]:
# Collect every entry directly below the training directory and bring the
# list into natural order before turning it into an array.
folder_candidates = glob.glob(r"./data/simpson_data_orig/simpsons_dataset_train/*")
folder_candidates.sort(key=natural_key)
Ordner = np.asarray(folder_candidates)
print(Ordner[:5])


In [None]:
# Gather the paths matching "*jpg" in every folder, naturally sorted within
# each folder and concatenated in folder order.
# The paths are collected in plain lists and converted once at the end:
# np.append copies the whole array on every call, and seeding it with a float
# np.empty(0) relied on implicit float-to-string promotion.
paths_per_folder = [
    sorted(glob.glob(folder + "/*jpg"), key=natural_key) for folder in Ordner
]
Pfade = np.array(
    [path for folder_paths in paths_per_folder for path in folder_paths],
    dtype=str,
)

In [None]:
# Show the collected image paths as this cell's output.
Pfade

In [None]:
# The label of an image is the name of the folder that contains it.
# Backslashes are normalised to "/" before splitting so that paths using
# either separator are handled. Taking the second-to-last component (the
# parent folder) instead of a fixed position keeps the result correct no
# matter how deeply the dataset root is nested.
label = np.array([path.replace("\\", "/").split("/")[-2] for path in Pfade])
label

In [None]:
# Two-column table: one row per image with its file path and its label.
path_label_table = np.column_stack([Pfade, label])
Data = pd.DataFrame(path_label_table, columns=["Bild_Pfad", "label"])
Data

In [None]:
# Number of images per label, in the order of the sorted unique labels.
label_names, label_counts = np.unique(Data["label"], return_counts=True)
label_counts

In [None]:
# The sorted unique labels that the counts refer to.
label_names, label_counts = np.unique(Data["label"], return_counts=True)
label_names

In [None]:
# Keep only the labels that occur more than 500 times.
label_names, label_counts = np.unique(Data["label"], return_counts=True)
idx = label_counts > 500
train_characters = label_names[idx]

In [None]:
# Boolean mask marking the rows whose label is one of the retained classes.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
train_idx = np.isin(Data["label"].values, train_characters)
# Keep only those rows; the original index labels are retained at this point.
Data = Data.iloc[train_idx]

In [None]:
# Encode every label as an integer class id. np.unique sorts the labels, and
# return_inverse gives, for each row, the position of its label in that
# sorted list. These are the same ids the former per-class loop assigned,
# without recomputing np.unique on every iteration.
class_names, class_codes = np.unique(Data["label"], return_inverse=True)
if len(class_names) > 256:
    # uint8 holds ids 0..255 only; fail loudly instead of wrapping around
    raise ValueError(f"{len(class_names)} classes do not fit into uint8 class ids")
klasse = class_codes.astype("uint8")

In [None]:
# Attach the integer class ids as a new column "Klasse" (returns a new frame).
Data = Data.assign(Klasse=klasse)

In [None]:
# Renumber the rows 0..n-1 and drop the old index, which has gaps after the
# row filtering above; later cells look rows up by these labels.
Data=Data.reset_index(drop=True)
Data

In [None]:
# Preview: show the first image as read from disk ...
first_image_path = Data["Bild_Pfad"][0]
img = plt.imread(first_image_path)
plt.imshow(img)
plt.show()

# ... and again after resizing to 80x80 (value range preserved) and casting
# to uint8.
img_scaled = skimage.transform.resize(img, (80, 80), preserve_range=True)
img = img_scaled.astype("uint8")
plt.imshow(img)
plt.show()


In [None]:
size = 80  # edge length in pixels of the square, resized images
# One uint8 image with 3 channels per row of Data.
X_data = np.zeros((len(Data), size, size, 3), dtype="uint8")
print(X_data.shape)
# Read the paths by position rather than by index label, so the loop does not
# depend on Data's index having been reset to 0..n-1 in an earlier cell.
image_paths = Data["Bild_Pfad"].to_numpy()
for i in tqdm(range(0, len(X_data))):
    img = plt.imread(image_paths[i])
    # preserve_range=True keeps the input value range instead of rescaling;
    # the cast to uint8 then drops the fractional part.
    img_resized = np.array(skimage.transform.resize(img, (size, size), preserve_range=True), dtype="uint8")
    X_data[i, :, :, :] = img_resized


In [None]:
# One-hot encode the class ids; the number of classes is the largest id plus one.
n_classes = max(Data["Klasse"]) + 1
Y_data = to_categorical(Data["Klasse"], n_classes)

In [None]:
# First split: hold out 20 % of all samples as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data,
    Y_data,
    test_size=0.2,
    random_state=54,
)
# Second split: take 30 % of the remaining samples as the validation set.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.3,
    random_state=22,
)

In [None]:
# Report the array shape of the train, validation and test inputs, in that order.
for split in (X_train, X_val, X_test):
    print(split.shape)

In [None]:
outdir = r"./data/simpson_data"
# makedirs(exist_ok=True) also creates missing parent folders and does nothing
# when the folder already exists, which replaces the exists()/mkdir() pair and
# its check-then-create gap.
os.makedirs(outdir, exist_ok=True)

# Write every split to <outdir>/<name>.npy, inputs first, then targets.
splits = {
    "X_train": X_train,
    "X_val": X_val,
    "X_test": X_test,
    "Y_train": Y_train,
    "Y_val": Y_val,
    "Y_test": Y_test,
}
for split_name, split_array in splits.items():
    np.save(os.path.join(outdir, split_name + ".npy"), split_array)

# Store the table of paths, labels and class ids next to the arrays.
Data.to_csv(os.path.join(outdir, "Data.csv"), index=False)