In [8]:
import numpy as np
import cv2 as cv
from pathlib import Path

<h3>Datasets to numpy arrays</h3>

This collection of scripts takes several of the datasets we've seen and converts each one to a pair of numpy arrays, one for images and one for labels. The processing is idiosyncratic because the datasets vary in presentation and composition. Each "labels" entry is a Python dict containing all the metadata I think we might want to know when manipulating these sets in the future. The most important keys are 'dataset', 'subgroup', and 'sign'. 'sign' is the classifying label in the ML sense.

In [104]:
# Reference vocabulary: every label key and the values it takes across the
# datasets tagged in this notebook ('subgroup' values are dataset-specific and
# are not enumerated here).
# This isn't used for anything yet (maybe never will be).
keywords = {
    'dataset': ['cap', 'dis', 'grass', 'mav1', 'mav2', 'urgarg',
                'lee', 'arikeri', 'bredun'],  # last three are tagged further down
    'subgroup': ['none'],
    'colour': ['greyscale', 'rgb', 'bgr'],  # 'rgb' was misspelled 'rbg'; the mav2 labels use 'rgb'
    'crop': ['close', 'none'],
    'size': ['small', 'large', '128'],
    'shape': ['square', 'rectangle'],
    'fill': ['gauss', 'uniform', 'int', 'zeros'],
}

<b>Dataset: CAP</b>

In [None]:
# Fresh accumulators for the CAP dataset: one image and one label dict per file.
images, labels = [], []

In [50]:
# Root of the CAP "test" split; the loop in the next cell walks this directory.
d = Path("bak/Dataset (CAP)/test")

In [53]:
# Walk d/<sign folder>/<image file>, loading every image and tagging it.
# NOTE: cv.imread returns None (it does not raise) for a file it cannot decode;
# that case is not checked here.
for letter in d.iterdir():
    for file in letter.iterdir():
        images.append(cv.imread(str(file)))
        # The 5th character of the file name decides the subgroup: a digit
        # from 0 to 5 means "model1"; a larger digit or a non-digit means "model2".
        marker = file.name[4]
        subgrp = "model1" if marker.isdigit() and int(marker) <= 5 else "model2"
        tag = {'dataset': "cap", 'subgroup':subgrp, 'sign': letter.stem,
               'colour':'greyscale', 'crop':'close', 'size':'large', 'shape':'square'}
        labels.append(tag)

In [52]:
# Point d at the CAP "val" split.
d = Path("bak/Dataset (CAP)/val")
# and re-run the last cell
# (i.e. the test-split loop cell directly above, which appends to the same
# images/labels lists; the execution counts show it was run after this cell)

In [54]:
# Root of the CAP "train" split; the train loop in the next cell walks this directory.
d = Path("bak/Dataset (CAP)/train")

In [55]:
# Walk d/<sign folder>/<image file> for the CAP train split.
#
# Subgroup rule: each file name carries a two-character index field, at
# positions 9-10 when the name is longer than 10 characters and at positions
# 3-4 otherwise. A file is tagged "model2" when the second character of that
# field is a digit and (folder size - index) < 30; everything else is "model1".
#
# NOTE: cv.imread returns None (it does not raise) for a file it cannot decode;
# that case is not checked here.
for letter in d.iterdir():
    # List the folder once and reuse the listing for both the size and the
    # loop, so the count and the iteration cannot disagree (the directory used
    # to be scanned twice).
    flist = list(letter.iterdir())
    dirsize = len(flist)
    for file in flist:
        images.append(cv.imread(str(file)))
        name = file.name
        start = 9 if len(name) > 10 else 3
        # Only the second character of the field is tested before int(); this
        # mirrors the original check exactly.
        if name[start + 1].isdigit() and (dirsize - int(name[start:start + 2])) < 30:
            subgrp = "model2"
        else:
            subgrp = "model1"
        labels.append({'dataset': "cap", 'subgroup':subgrp, 'sign': letter.stem,
                       'colour':'greyscale', 'crop':'close', 'size':'large', 'shape':'square'})

In [56]:
# Sanity check: there should be exactly one label per image.
print(len(images), len(labels))

2282 2282


In [57]:
# Persist the CAP images and labels. np.save converts the lists to arrays;
# the labels are dicts, so they are stored as a pickled object array and must
# be reloaded with np.load(..., allow_pickle=True).
np.save("datasets/img_cap.npy", images)
np.save("datasets/lbl_cap.npy", labels)

<b>Dataset: dis</b>

In [61]:
# Fresh accumulators for the dis dataset.
images, labels = [], []

In [62]:
# Root of the dis dataset; the next cell expects one sub-folder per sign.
d = Path("bak/Dataset (dis)/")

In [63]:
# Walk d/<sign folder>/<image file>; the folder name is the sign and the first
# five characters of the file name are stored as the subgroup.
# NOTE: cv.imread returns None for an undecodable file; not checked here.
for sign in d.iterdir():
    for file in sign.iterdir():
        images.append(cv.imread(str(file)))
        labels.append(dict(dataset='dis', subgroup=file.name[0:5], sign=sign.stem,
                           colour='bgr', crop='close', size='large', shape='square'))

In [64]:
# Sanity check: there should be exactly one label per image.
print(len(images), len(labels))

2515 2515


In [66]:
# Persist the dis images and labels (labels are dicts, so NumPy stores them as
# a pickled object array; reload with allow_pickle=True).
np.save("datasets/img_dis.npy", images)
np.save("datasets/lbl_dis.npy", labels)

<b>Dataset: grass</b>

In [2]:
# Fresh accumulators for the grass dataset.
images, labels = [], []

In [3]:
# Root of the grass dataset; the loop below walks its sub-folders.
d=Path("bak/dataset (grass)/")

In [4]:
# Folder names the grass loop skips entirely.
exclude=['K', 'G', 'M', 'N', 'P', 'T', 'nothing']

In [6]:
def _grass_video_source(folder_stem, file_name):
    """Return the 'videoN' subgroup for a grass frame, based on its frame number.

    The frame number is read from the four characters that follow the folder
    prefix in ``file_name`` (prefix length 3 for 'del', 5 for 'space', 1 for
    every other folder). Names shorter than prefix + 8 characters are treated
    as frames below 1000. These offsets are carried over unchanged from the
    original fixed slices; presumably names look like '<stem><frame>.jpg' —
    confirm against the data.

    Boundaries (1000, 2000, 2500, 3000):
        <= 1000 -> 'video1', 1001-2000 -> 'video2',
        2001-2500 -> 'video3', 2501-3000 -> 'video4'.

    Raises:
        ValueError: if the frame number is above 3000 (previously the label
            silently reused the subgroup of the preceding file), or if the
            four-character field is not an integer.
    """
    prefix_len = {'del': 3, 'space': 5}.get(folder_stem, 1)
    if len(file_name) < prefix_len + 8:
        return 'video1'
    frame_no = int(file_name[prefix_len:prefix_len + 4])
    if frame_no <= 1000:
        return 'video1'
    if frame_no <= 2000:
        return 'video2'
    if frame_no <= 2500:
        return 'video3'
    if frame_no <= 3000:
        return 'video4'
    raise ValueError(f"frame number {frame_no} out of range in {file_name!r}")


# Walk d/<sign folder>/<frame>, keeping a subsample of the frames.
# NOTE: cv.imread returns None (it does not raise) for a file it cannot decode;
# that case is not checked here.
for folder in d.iterdir():
    if folder.is_file() or folder.name in exclude:
        continue
    for file in folder.iterdir():
        name = file.name
        # Names shorter than six characters cannot be indexed at [-6]; skip
        # them instead of raising IndexError (same guard as the Arikeri and
        # Bredun loops).
        if len(name) < 6 or not name[-6].isdigit():
            continue
        # Keep frames whose character at [-5] is '0' and whose digit at [-6] is
        # even. With a 4-character extension such as ".jpg" these are the units
        # and tens digits, i.e. frame numbers ending in 00, 20, 40, 60 or 80.
        if name[-5] == '0' and int(name[-6]) % 2 == 0:
            # Work out the label before loading the image so that a failure
            # cannot leave images and labels with different lengths.
            source = _grass_video_source(folder.stem, name)
            images.append(cv.imread(str(file)))
            labels.append({'dataset':'grass', 'subgroup':source, 'sign':folder.stem,
                           'colour':'bgr', 'crop':'close','size': 'large', 'shape':'square'})

In [7]:
# Sanity check: there should be exactly one label per image.
print(len(images), len(labels))

3300 3300


In [8]:
# Persist the grass images and labels. Note the target is datasets/bak/, not
# datasets/ as for the earlier sets. Labels are dicts, so NumPy stores them as
# a pickled object array; reload with allow_pickle=True.
np.save("datasets/bak/img_grass.npy", images)
np.save("datasets/bak/lbl_grass.npy", labels)

<b>Dataset: mav1</b>

In [73]:
# Fresh accumulators for the mav1 dataset.
images, labels = [], []

In [74]:
# Root of the mav1 colour images; the next cell expects one sub-folder per sign.
d = Path("bak/dataset (mav1)/images (colour)/")

In [75]:
# Walk d/<sign folder>/<image file>; the folder name is the sign.
# NOTE: cv.imread returns None for an undecodable file; not checked here.
for numeral in d.iterdir():
    for file in numeral.iterdir():
        images.append(cv.imread(str(file)))
        meta = dict(dataset='mav1', subgroup='none', sign=numeral.stem,
                    colour='bgr', crop='close', size='small', shape='square')
        labels.append(meta)

In [76]:
# Sanity check: there should be exactly one label per image.
print(len(images),len(labels))

2059 2059


In [78]:
# Persist the mav1 images and labels (labels are dicts, so NumPy stores them as
# a pickled object array; reload with allow_pickle=True).
np.save("datasets/img_mav1.npy", images)
np.save("datasets/lbl_mav1.npy", labels)

<b>Dataset: mav2</b>

In [95]:
# mav2 ships as two ready-made arrays; x.npy is bound to images and y.npy to
# labels. Their shapes and dtypes are not visible here — TODO confirm.
images = np.load("bak/dataset (mav2)/x.npy")
labels = np.load("bak/dataset (mav2)/y.npy")

In [97]:
# Convert the pixel data to 8-bit. The x255 scaling implies float data in
# [0, 1] — TODO confirm against x.npy.
# - Round to nearest before casting: a plain np.uint8(...) cast truncates, so
#   e.g. 0.9999 * 255 would become 254 instead of 255.
# - Clip to [0, 255] so out-of-range values cannot wrap around.
# - Skip the conversion when the data is already uint8, so re-running this cell
#   does not rescale it a second time.
if images.dtype != np.uint8:
    images = np.clip(np.rint(images * 255), 0, 255).astype(np.uint8)

In [100]:
# Drop singleton dimensions from the label array before iterating over it
# (presumably y.npy has shape (N, 1), giving one scalar per image — TODO confirm).
labels_temp = labels.squeeze()

In [101]:
# Rebuild labels as a list of metadata dicts, one per entry of labels_temp.
# The raw value from the array is stored under 'sign' as-is.
labels = []
for lbl in labels_temp:
    entry = dict(dataset='mav2', subgroup='none', sign=lbl,
                 colour='rgb', crop='close', size='128', shape='square')
    labels.append(entry)

In [103]:
# Persist the mav2 images (already an array) and labels (a list of dicts, which
# NumPy stores as a pickled object array; reload with allow_pickle=True).
np.save("datasets/img_mav2.npy", images)
np.save("datasets/lbl_mav2.npy", labels)

<b>Dataset: ur-garg</b>

In [88]:
# Root of the ur-garg dataset; the loop below expects one sub-folder per user.
d = Path("bak/dataset (ur-garg)/")

In [91]:
# Fresh accumulators for the ur-garg dataset.
images, labels = [], []

In [92]:
# Walk d/<user folder>/<file>, keeping only files whose suffix is exactly
# ".jpg". The user folder is the subgroup and the first character of the file
# name is the sign.
# NOTE: cv.imread returns None for an undecodable file; not checked here.
for user in d.iterdir():
    for file in user.iterdir():
        if file.suffix == ".jpg":
            images.append(cv.imread(str(file)))
            labels.append(dict(dataset='urgarg', subgroup=user.stem, sign=file.name[0],
                               colour='bgr', crop='none', size='large', shape='rectangle'))

In [93]:
# Sanity check: there should be exactly one label per image.
print(len(images),len(labels))

1680 1680


In [94]:
# Persist the ur-garg images and labels (labels are dicts, so NumPy stores them
# as a pickled object array; reload with allow_pickle=True).
# NOTE(review): these images are tagged 'rectangle' and 'crop':'none'; if their
# sizes differ, the image list cannot be stacked into one regular array — confirm.
np.save("datasets/img_urgarg.npy", images)
np.save("datasets/lbl_urgarg.npy", labels)

<b>Dataset: Lee</b>

In [4]:
# Fresh accumulators for the Lee dataset.
images, labels = [], []

In [9]:
# Lee "train" split; the next cell keeps every third file from this directory.
d=Path("inbox/Lee/train")

In [10]:
# Subsample the flat Lee directory: keep files number 0, 3, 6, ... in
# iteration order. The first character of the file name is the sign.
# NOTE: Path.iterdir() yields entries in arbitrary order, so which files land
# on the kept positions depends on the filesystem.
j = 0
for frame in d.iterdir():
    keep_frame = (j % 3 == 0)
    j += 1
    if not keep_frame:
        continue
    images.append(cv.imread(str(frame)))
    labels.append(dict(dataset='lee', subgroup='none', sign=frame.name[0],
                       colour='bgr', crop='close', size='large', shape='square'))

In [11]:
# Lee "valid" split.
# NOTE(review): no loop cell follows this assignment before d is rebound to the
# test split; presumably one of the Lee loop cells was re-run with this d —
# confirm which one (every third frame, or every frame).
d=Path('inbox/Lee/valid')

In [16]:
# Lee "test" split; the next cell loads every file from this directory.
d=Path('inbox/Lee/test')

In [17]:
# Load every file in the flat directory d (no subsampling here); the first
# character of the file name is the sign.
# NOTE: cv.imread returns None for an undecodable file; not checked here.
for frame in d.iterdir():
    images.append(cv.imread(str(frame)))
    meta = dict(dataset='lee', subgroup='none', sign=frame.name[0],
                colour='bgr', crop='close', size='large', shape='square')
    labels.append(meta)

In [27]:
# Persist the Lee images and labels under datasets/bak/ (labels are dicts, so
# NumPy stores them as a pickled object array; reload with allow_pickle=True).
np.save("datasets/bak/img_lee.npy", images)
np.save("datasets/bak/lbl_lee.npy", labels)

<b>Dataset: Arikeri</b>

https://www.kaggle.com/datasets/prathumarikeri/indian-sign-language-isl

Although I like the idea of expanding our label space (there's an idea that training models to do additional, related tasks can help them with their original goals), as long as the frames can't be split among train/val/hold it sounds like more trouble than it's worth. So I removed these from my working dataset.

In [42]:
# Folder name -> sign label stored for that folder's frames
# (folder '3' is tagged 'w', '6' is tagged 'i', '8' is tagged '3').
replace = {'3':'w', '6':'i', '8':'3'}
# NOTE(review): twohand and exclude are not referenced by any cell visible in
# this notebook; presumably they record folders deliberately left out — confirm.
# This also rebinds the name `exclude` that the grass loop reads.
twohand = ['a', 'b', 'd','e','f','g','h','j','k','m','n','p','q','r','s','t','w','x','y','z']
exclude = ['c','i','o','v']
# Folders whose frames are stored under the folder's own name.
keep = ['l']
# Folders whose frames are stored with an 'Ind_' prefix on the sign label.
Indian = ['7', '9','u']

In [43]:
# Base directory as a plain string (not a Path): the loop below builds each
# folder path by string concatenation, so the trailing slash is required.
d = "inbox/Indian (Arikeri)/"

In [44]:
# Fresh accumulators for the Arikeri dataset.
images, labels = [], []

In [47]:
# (folder name, sign label) pairs, in the original processing order:
# relabelled folders first, then folders kept as-is, then 'Ind_'-prefixed ones.
folder_signs = (
    [(key, new_sign) for key, new_sign in replace.items()]
    + [(letter, letter) for letter in keep]
    + [(letter, 'Ind_'+letter) for letter in Indian]
)

# Subsample each folder: skip names shorter than six characters, then keep
# frames whose character at [-5] is '0' and whose character at [-6] is '0' or
# '5'. With a 4-character extension these are the units and tens digits, i.e.
# frame numbers ending in 00 or 50.
# NOTE: cv.imread returns None for an undecodable file; not checked here.
for folder_name, sign_label in folder_signs:
    for frame in Path(d+folder_name).iterdir():
        fname = frame.name
        if len(fname) < 6:
            continue
        if fname[-5] == '0' and fname[-6] in ('0', '5'):
            images.append(cv.imread(str(frame)))
            labels.append(dict(dataset='arikeri', subgroup='single_model', sign=sign_label,
                               colour='bgr', crop='close', size='128', shape='square'))

In [48]:
# Persist the Arikeri images and labels under datasets/bak/ (labels are dicts,
# so NumPy stores them as a pickled object array; reload with allow_pickle=True).
np.save("datasets/bak/img_ari.npy", images)
np.save("datasets/bak/lbl_ari.npy", labels)

<b>Dataset: Bredun</b>

https://www.kaggle.com/datasets/ruslanbredun/sign-language-eng-alphabet

Since this is another single-model dataset, everything picked out from it will be going into the train split.

In [49]:
# Root of the Bredun dataset; the loop below expects one sub-folder per letter.
d = Path("bak/dataset (bredun)/")

In [50]:
# Fresh accumulators for the Bredun dataset.
images, labels = [], []

In [51]:
# Walk d/<letter folder>/<frame> and subsample: a frame is kept when its name
# has at least six characters, the character at [-5] is '0', and the character
# at [-6] is '0', '3' or '6'. With a 4-character extension these are the units
# and tens digits, i.e. frame numbers ending in 00, 30 or 60.
# NOTE: cv.imread returns None for an undecodable file; not checked here.
for letter in d.iterdir():
    for frame in letter.iterdir():
        fname = frame.name
        if len(fname) >= 6 and fname[-5] == '0' and fname[-6] in ['0', '3', '6']:
            images.append(cv.imread(str(frame)))
            labels.append(dict(dataset='bredun', subgroup='single_model', sign=letter.name,
                               colour='bgr', crop='close', size='large', shape='square'))

In [52]:
# Persist the Bredun images and labels under datasets/bak/ (labels are dicts,
# so NumPy stores them as a pickled object array; reload with allow_pickle=True).
np.save("datasets/bak/img_bredun.npy", images)
np.save("datasets/bak/lbl_bredun.npy", labels)