In [187]:
import os
import glob
import pickle

from PIL import Image
from resizeimage import resizeimage
from lxml.html import fromstring
from sklearn.cluster import KMeans


# Process Images to Manageable Format

In [37]:
def resizeImg(input_folder, output_folder):
    """Write a 400x300 cover-resized copy of every ``*.jpg`` in ``input_folder``.

    Each copy is saved into ``output_folder`` as ``<original stem>_small.jpg``
    using the source image's own format. The glob pattern that is searched is
    printed once.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.jpg`` files.
        output_folder: Directory receiving the resized copies; created when it
            does not exist yet.
    """
    # os.path.join replaces the "\*.jpg" literal, which relied on an invalid
    # escape sequence and only worked with backslash separators.
    pattern = os.path.join(input_folder, "*.jpg")
    print(pattern)

    # Image.save does not create missing directories.
    os.makedirs(output_folder, exist_ok=True)

    for fileName in glob.glob(pattern):
        # splitext keeps dots that are part of the stem ("a.b.jpg" -> "a.b"),
        # matching the "<stem>_small.jpg" names the annotation loaders build.
        imgId = os.path.splitext(os.path.basename(fileName))[0]
        # Image.open on the path opens read-only; write access is not needed.
        with Image.open(fileName) as image:
            cover = resizeimage.resize_cover(image, [400, 300])
            cover.save(os.path.join(output_folder, imgId + "_small.jpg"), image.format)

In [49]:
# Root folder of the Kaufland case data. Set the KAUFLAND_CASE_DIR environment
# variable to run the notebook on another machine; the default is the path the
# notebook was originally written against.
project_root = os.environ.get(
    "KAUFLAND_CASE_DIR",
    r"C:\Users\c10670A\Documents\Python Scripts\03. Projects\Kaufland_Case",
)
participants_dir = os.path.join(project_root, "Kaufland_DataThon+2019_04_participants")
generated_dir = os.path.join(project_root, "datathon-datathon-2019_04_1-computer-vision", "Images")

# Annotation input directories (*.xml files; *.pkl files for gen_xml)
train_xml = os.path.join(participants_dir, "ground_truth_xml")
val_xml = os.path.join(participants_dir, "working_xml")
test_xml = os.path.join(project_root, "Kaufland_DataThon_2019_testset_xml", "test_xml_edited")
gen_xml = generated_dir  # same folder as gen_images

# Image input directories (*.jpg files)
train_images = os.path.join(participants_dir, "ground_truth")
val_images = os.path.join(participants_dir, "working")
test_images = os.path.join(project_root, "Kaufland_DataThon_2019_testset_images", "test")
gen_images = generated_dir


# Output directories for the resized images
processed_data = os.path.join(project_root, "Processed_Data")
train_output = os.path.join(processed_data, "train")
val_output = os.path.join(processed_data, "val")
test_output = os.path.join(processed_data, "test")
gen_output = os.path.join(processed_data, "gen")




In [39]:
# Write "<name>_small.jpg" copies of the ground-truth images into the "train" output folder.
resizeImg(train_images, train_output)

C:\Users\c10670A\Documents\Python Scripts\03. Projects\Kaufland_Case\Kaufland_DataThon+2019_04_participants\ground_truth\*.jpg


In [40]:
# Write "<name>_small.jpg" copies of the "working" images into the "val" output folder.
resizeImg(val_images, val_output)

C:\Users\c10670A\Documents\Python Scripts\03. Projects\Kaufland_Case\Kaufland_DataThon+2019_04_participants\working\*.jpg


In [41]:
# Write "<name>_small.jpg" copies of the test-set images into the "test" output folder.
resizeImg(test_images, test_output)

C:\Users\c10670A\Documents\Python Scripts\03. Projects\Kaufland_Case\Kaufland_DataThon_2019_testset_images\test\*.jpg


In [42]:
# Write "<name>_small.jpg" copies of the images in gen_images into the "gen" output folder.
resizeImg(gen_images, gen_output)

C:\Users\c10670A\Documents\Python Scripts\03. Projects\Kaufland_Case\datathon-datathon-2019_04_1-computer-vision\Images\*.jpg


# Load XML Data

## Load Development Data

In [177]:
def name2cat(x):
    """Translate an annotation name into the category code used in output rows.

    Names beginning with "label" map to "0", purely numeric names map to "1",
    and every other name maps to "other".
    """
    if x.startswith("label"):
        return "0"
    return "1" if x.isnumeric() else "other"

def processFile(filename, img_folder):
    """Convert one XML annotation file into a single annotation row.

    The row has the form ``<image path> xmin,ymin,xmax,ymax,cat ...`` where the
    coordinates are passed through ``scaleSize`` and ``cat`` comes from
    ``name2cat``. Boxes whose category is "other" are dropped.

    Args:
        filename: Path of the XML annotation file.
        img_folder: Folder that is prefixed to the derived image file name.

    Returns:
        The image path, a space, and the space-separated boxes (the box part
        is empty when no box survives the filter).
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, "r") as f:
        text = f.read()

    xml = fromstring(text)
    xmin = map(scaleSize, xml.xpath("//xmin/text()"))
    xmax = map(scaleSize, xml.xpath("//xmax/text()"))
    ymin = map(scaleSize, xml.xpath("//ymin/text()"))
    ymax = map(scaleSize, xml.xpath("//ymax/text()"))
    cat = [name2cat(x) for x in xml.xpath("//name/text()")]

    row = " ".join(
        ",".join(box)
        for box in zip(xmin, ymin, xmax, ymax, cat)
        if box[4] != "other"
    )

    # os.path.basename instead of split("\\") so the name is extracted with
    # whatever separator the running platform uses.
    imgName = os.path.basename(filename).replace(".xml", "_small.jpg").replace("_xml", "")
    return os.path.join(img_folder, imgName) + " " + row


def processFilePickle(filename, img_folder):
    """Convert one pickled annotation file into a single annotation row.

    The pickle is read as an iterable of records, each exposing
    ``['object']['bndbox'][xmin|ymin|xmax|ymax]`` and ``['object']['name']``.
    The row has the form ``<image path> xmin,ymin,xmax,ymax,cat ...``; boxes
    whose category is "other" are dropped.

    Args:
        filename: Path of the ``.pkl`` annotation file.
        img_folder: Folder that is prefixed to the derived image file name.

    Returns:
        The image path, a space, and the space-separated boxes (the box part
        is empty when no box survives the filter).
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The context manager closes the handle the original left open.
    with open(filename, "rb") as fh:
        records = pickle.load(fh)

    xmin = [scaleSize(x['object']['bndbox']['xmin']) for x in records]
    xmax = [scaleSize(x['object']['bndbox']['xmax']) for x in records]
    ymin = [scaleSize(x['object']['bndbox']['ymin']) for x in records]
    ymax = [scaleSize(x['object']['bndbox']['ymax']) for x in records]
    cat = [name2cat(x['object']['name']) for x in records]

    row = " ".join(
        ",".join(box)
        for box in zip(xmin, ymin, xmax, ymax, cat)
        if box[4] != "other"
    )

    # os.path.basename instead of split("\\") so the name is extracted with
    # whatever separator the running platform uses.
    imgName = os.path.basename(filename).replace(".pkl", "_small.jpg")
    return os.path.join(img_folder, imgName) + " " + row

def scaleSize(x, factor=11.52):
    """Scale a pixel coordinate down by ``factor`` and return it as a string.

    Args:
        x: Coordinate as an int or anything ``int()`` accepts (e.g. "1160").
        factor: Divisor applied to the coordinate. Defaults to 11.52.
            NOTE(review): 11.52 is presumably original size / resized size
            (e.g. 4608 / 400) - confirm against the source image dimensions.

    Returns:
        The scaled coordinate, truncated toward zero, as a decimal string.
    """
    return str(int(int(x) / factor))

def loadData(xml_folder, img_folder):
    """Build the annotation text for every ``*.xml`` file in ``xml_folder``.

    Args:
        xml_folder: Directory searched (non-recursively) for ``*.xml`` files.
        img_folder: Image folder forwarded to ``processFile`` for each file.

    Returns:
        One ``processFile`` row per XML file, joined with newlines; an empty
        string when the folder contains no XML files.
    """
    # os.path.join replaces the "\*.xml" literal, which relied on an invalid
    # escape sequence and only worked with backslash separators.
    fileNames = glob.glob(os.path.join(xml_folder, "*.xml"))
    return "\n".join(processFile(fileName, img_folder) for fileName in fileNames)

def loadDataPickle(pkl_folder, img_folder):
    """Build the annotation text for every ``*.pkl`` file in ``pkl_folder``.

    Args:
        pkl_folder: Directory searched (non-recursively) for ``*.pkl`` files.
        img_folder: Image folder forwarded to ``processFilePickle`` for each file.

    Returns:
        One ``processFilePickle`` row per pickle file, joined with newlines;
        an empty string when the folder contains no pickle files.
    """
    # os.path.join replaces the "\*.pkl" literal, which relied on an invalid
    # escape sequence and only worked with backslash separators.
    fileNames = glob.glob(os.path.join(pkl_folder, "*.pkl"))
    return "\n".join(processFilePickle(fileName, img_folder) for fileName in fileNames)

In [140]:
# Annotation rows for the ground-truth XMLs, pointing at the resized images in train_output.
dev_data = loadData(train_xml, train_output)

In [143]:
# Save the development annotation rows. The folder is created on demand and
# the context manager closes the file even if the write fails.
os.makedirs("Processed_XMLs", exist_ok=True)
with open(os.path.join("Processed_XMLs", "dev_data.txt"), "w") as f:
    f.write(dev_data)

In [141]:
# Annotation rows for the "working" XMLs, pointing at the resized images in val_output.
val_data = loadData(val_xml, val_output)

In [144]:
# Save the validation annotation rows. The folder is created on demand and
# the context manager closes the file even if the write fails.
os.makedirs("Processed_XMLs", exist_ok=True)
with open(os.path.join("Processed_XMLs", "val_data.txt"), "w") as f:
    f.write(val_data)

In [142]:
# Annotation rows for the test-set XMLs, pointing at the resized images in test_output.
test_data = loadData(test_xml, test_output)

In [145]:
# Save the test annotation rows. The folder is created on demand and the
# context manager closes the file even if the write fails.
os.makedirs("Processed_XMLs", exist_ok=True)
with open(os.path.join("Processed_XMLs", "test_data.txt"), "w") as f:
    f.write(test_data)

In [178]:
# Annotation rows read from the *.pkl files in gen_xml, pointing at the resized images in gen_output.
gen_data = loadDataPickle(gen_xml, gen_output)

In [179]:
# Save the annotation rows built from the pickle files. The folder is created
# on demand and the context manager closes the file even if the write fails.
os.makedirs("Processed_XMLs", exist_ok=True)
with open(os.path.join("Processed_XMLs", "gen_data.txt"), "w") as f:
    f.write(gen_data)

# Anchors

In [186]:
def calcAnchors(folder):
    """Collect the scaled (width, height) of every box annotated in ``folder``.

    Args:
        folder: Directory searched (non-recursively) for ``*.xml`` files.

    Returns:
        A flat list with the ``processAnchor`` results of all files
        concatenated; empty when the folder contains no XML files.
    """
    result = []
    # os.path.join replaces the "\*.xml" literal, which relied on an invalid
    # escape sequence and only worked with backslash separators.
    for fileName in glob.glob(os.path.join(folder, "*.xml")):
        result.extend(processAnchor(fileName))
    return result

def processAnchorTupple(x):
    """Return the scaled extent between the two coordinates in ``x``.

    ``x[0]`` and ``x[1]`` are each divided by 11.52 and truncated to an int
    before the second is subtracted from the first.
    """
    scaled_first = int(int(x[0]) / 11.52)
    scaled_second = int(int(x[1]) / 11.52)
    return scaled_first - scaled_second

def processAnchor(filename):
    """Return the scaled (width, height) of every box in one XML annotation file.

    Args:
        filename: Path of the XML annotation file.

    Returns:
        A list of ``(width, height)`` tuples, where width comes from the
        xmax/xmin pairs and height from the ymax/ymin pairs, each computed
        by ``processAnchorTupple``.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, "r") as f:
        text = f.read()

    xml = fromstring(text)
    xmin = xml.xpath("//xmin/text()")
    xmax = xml.xpath("//xmax/text()")
    ymin = xml.xpath("//ymin/text()")
    ymax = xml.xpath("//ymax/text()")

    widths = [processAnchorTupple(d) for d in zip(xmax, xmin)]
    heights = [processAnchorTupple(d) for d in zip(ymax, ymin)]
    return list(zip(widths, heights))

In [188]:
# Calculate anchors: cluster the training box sizes into 10 anchor shapes.
anc_train = calcAnchors(train_xml)
# Fit on anc_train. The name `anc` used here before is not defined anywhere in
# the visible notebook, so the cell raised NameError on a fresh kernel.
kmeans = KMeans(n_clusters=10, random_state=0).fit(anc_train)
# Output is a single line of "w,h" pairs separated by ", "; the cluster
# centres are truncated to integers.
with open("ancors.txt", "w") as f:
    f.write(", ".join([",".join(map(str, map(int, x))) for x in kmeans.cluster_centers_]))