In [1]:
# List the contents of the `out` directory (shell command run via IPython `!`).
!ls out

a_example.txt		 c_memorable_moments.txt  e_shiny_selfies.txt
b_lovely_landscapes.txt  d_pet_pictures.txt


In [7]:
from collections import defaultdict, namedtuple
from os import path, linesep
from tqdm.autonotebook import tqdm
import random

class HashCode2019(dict):
    """Slide-show builder and optimizer for the Hash Code 2019 input files.

    The instance is a dict keyed by file name. Each value is a ``record``
    carrying three attributes:

    * ``data``   -- a ``Data`` tuple (per-photo tag sets and orientations),
    * ``show``   -- the current slide show, a list of slides where every
                    slide is a sequence of one or two photo ids,
    * ``points`` -- the score of ``show``.
    """

    class record():
        """Plain attribute bag for the ``data``, ``show`` and ``points`` of one file."""
        pass

    # Parallel lists indexed by photo id.
    Data = namedtuple('Data', ['tags', 'orientation'])

    IN = "in"
    OUT = "out"
    FILES = [ 'a_example.txt',
              'b_lovely_landscapes.txt',
              'c_memorable_moments.txt',
              'd_pet_pictures.txt',
              'e_shiny_selfies.txt']

    def __init__(self):
        """Load every file in ``FILES`` and attach a slide show to it.

        If a show for the file already exists in ``OUT`` it is loaded and
        scored; otherwise a show is built greedily and written to ``OUT``.
        """
        super().__init__()
        total = 0
        pbar = tqdm(self.FILES, 'loading data')
        for file in pbar:
            entry = self.record()
            # Register the record first: load_show/build_* read self[file].data.
            self[file] = entry
            entry.data = self.Data(*self.load_data(file))

            if path.exists(path.join(self.OUT, file)):
                show, points = self.load_show(file)
            else:
                print(f"Creating slides for {file}")
                slides = self.build_slides(file)
                show, points = self.build_show(file, slides)
                self.write_show(show, file)

            entry.show = show
            entry.points = points
            total += points
            pbar.set_postfix({'total': total})

    def __repr__(self):
        """Return the per-file points as the string form of a dict."""
        return str({ file: self[file].points for file in self.FILES })

    def load_data(self, file):
        """Read the photo descriptions of ``file`` from the ``IN`` directory.

        The first line is skipped. Every following line is parsed as
        ``<orientation> <ignored field> <tag> <tag> ...``.

        Returns:
            ``(tags, orientation)``: two lists indexed by photo id, holding
            the tag set and the orientation string of each photo.
        """
        tags = []
        orientation = []

        with open(path.join(self.IN, file), 'r') as handle:
            handle.readline()  # header line, not needed

            for line in tqdm(handle, 'loading input data', postfix=file):
                # split() instead of line[:-1].split(' '): the last line may
                # lack a trailing newline, and slicing would then cut off the
                # final character of the last tag.
                fields = line.split()
                if not fields:
                    continue
                (ori, _, *_tags) = fields
                tags.append(set(_tags))
                orientation.append(ori)

        return (tags, orientation)

    def build_slides(self, file):
        """Build slides for input file.

        Every photo with orientation 'H' becomes a one-photo slide. All other
        photos are shuffled and paired; with an odd count the last one is
        left out.

        Returns:
            List of tuples of photo ids (length 1 or 2).
        """
        (tags, orientation) = self[file].data
        slides = []
        verticals = []

        for id in range(len(tags)):
            if orientation[id] == 'H':
                # create tuple with only one member
                slides.append((id,))
            else:
                verticals.append(id)

        random.shuffle(verticals)
        # Zipping one iterator with itself yields consecutive pairs.
        pairs = iter(verticals)
        slides.extend(zip(pairs, pairs))

        return slides

    def build_show(self, file, slides):
        """Build a slide show from slides.

        Use a simple greedy algorithm to optimize points.

        Returns:
            ``(show, score)`` -- the ordered slides and their total points.
        """
        data = self[file].data

        slides = slides.copy()

        # Files after the third entry of FILES are scored on a random sample
        # (at most 80 tags per slide and 80 candidates per tag); the others
        # are scored exhaustively.
        m = 0
        if self.FILES.index(file) > 2:
            m = 80
        scores = self.get_scores(slides, data, m)

        # We have now a graph with weighted edges and are looking for a
        # maximum weight Hamiltonian path.
        show = self.naive_greedy(slides, scores)

        score = self._score_show(data, show)

        print(f"initial show for {file} has {score} points")
        return (show, score)

    def _score_show(self, data, show):
        """Return the points of ``show``: the sum of ``val`` over adjacent slides."""
        points = 0
        previous = None
        for slide in show:
            current = self.get_tags(data, slide)
            if previous is not None:
                points += self.val(previous, current)
            previous = current
        return points

    def get_scores(self, slides, data, max_tags=0):
        """Compute pairwise transition values between slides sharing a tag.

        Args:
            slides: list of slides (sequences of photo ids).
            data: ``Data`` tuple providing the tag sets.
            max_tags: when positive, only a random sample of at most
                ``max_tags`` tags per slide, and at most ``max_tags``
                candidate slides per tag, is examined.

        Returns:
            ``defaultdict(dict)`` mapping slide index ``i`` to
            ``{j: val(i, j)}`` for the examined pairs with a positive value.
        """
        slides_by_tag = defaultdict(set)
        scores = defaultdict(dict)
        N = len(slides)
        # Tag set of every slide, computed once instead of once per pair.
        slide_tags = [self.get_tags(data, slide) for slide in slides]

        for i in tqdm(range(N), 'populating slides_by_tag'):
            for tag in slide_tags[i]:
                slides_by_tag[tag].add(i)

        for i in tqdm(range(N), 'populating points matrix'):
            # calculate points for all slides with overlapping tags
            tags_i = slide_tags[i]
            if max_tags > 0:
                # random.sample() requires a sequence (sets are rejected
                # since Python 3.11), hence the tuple() conversion.
                tags = random.sample(tuple(tags_i), min(len(tags_i), max_tags))
            else:
                tags = tags_i

            for tag in tags:
                candidates = slides_by_tag[tag]
                if max_tags > 0:
                    candidates = random.sample(tuple(candidates),
                                               min(len(candidates), max_tags))
                for j in candidates:
                    if i == j:
                        continue
                    v = self.val(tags_i, slide_tags[j])
                    if v > 0:
                        scores[i][j] = v
        return scores

    def naive_greedy(self, slides, scores):
        """The following is a rather naive greedy algorithm.

        Starting at slide 0, always move to the unused neighbour with the
        highest score; when no unused neighbour exists, jump to a random
        unused slide.

        One issue is that this approach often leaves nodes which have no more connection.

        Returns:
            The slides in visiting order (empty list for empty input).
        """
        N = len(slides)
        if N == 0:
            return []

        show = []
        fail = 0
        todo = set(range(N))
        slide = 0
        todo.remove(slide)
        show.append(slides[slide])

        pbar = tqdm(desc='running greedy', total=N)
        pbar.update(1)
        while todo:
            pbar.update(1)
            weights = scores[slide]
            reachable = weights.keys() & todo
            if reachable:
                slide = max(reachable, key=lambda j: weights[j])
            else:
                fail += 1
                slide = random.choice(list(todo))
                pbar.set_postfix({'fail': fail})

            todo.remove(slide)
            show.append(slides[slide])
        pbar.close()

        return show

    def greedy_edges(self, slides, scores):
        """Start with using the most precious edges in the graph.

        Each slide contributes its two best-scoring neighbours; the resulting
        edges are taken in descending weight while both endpoints still have
        fewer than two edges, and the show is assembled by walking them.

        NOTE: only slides that ended up in the edge table are visited, so
        slides without any retained edge are missing from the result.
        """
        N = len(slides)
        show = []
        max_scores = []
        edges = defaultdict(list)

        for i in range(N):
            best = sorted(scores[i].keys(), key=lambda j: scores[i][j], reverse=True)[:2]
            for j in best:
                if i < j:
                    max_scores.append((i, j, scores[i][j]))
        max_scores.sort(key=lambda tup: tup[2], reverse=True)

        for (i, j, m) in max_scores:
            if len(edges[i]) < 2:
                edges[i].append((j, m))
            if len(edges[j]) < 2:
                edges[j].append((i, m))

        todo = set(range(N))
        # Snapshot of the keys: the walk below must not iterate a changing dict.
        for start in list(edges.keys()):
            if start not in todo:
                continue
            slide = start
            show.append(slides[slide])
            todo.remove(slide)

            # Follow edges to unused slides until a dead end is reached.
            walking = True
            while walking:
                for (node, w) in edges[slide]:
                    if node in todo:
                        slide = node
                        show.append(slides[slide])
                        todo.remove(slide)
                        break
                else:
                    walking = False

        return show

    def better_greedy(self, slides, scores):
        """Try to avoid generating isolated nodes.

        Slides are processed in order of increasing neighbour count; each one
        claims its best still-unclaimed neighbour as successor. The show is
        then assembled by following successor links, falling back to the
        lowest-numbered unused slide that has a successor, then to the
        lowest-numbered unused slide.
        """
        show = []
        fail = 0
        N = len(slides)
        if N == 0:
            return show

        todo = sorted(list(range(N)), key=lambda i: len(scores[i]))
        done = set()
        successor = [ -1 for i in range(N) ]  # -1 means "no successor"

        pbar = tqdm(todo, 'running greedy')
        for slide in pbar:
            done.add(slide)
            candidates = scores[slide].keys() - done
            if candidates:
                n = max(candidates, key=lambda j: scores[slide][j])
                successor[slide] = n
                done.add(n)
            else:
                fail += 1
                pbar.set_postfix({ 'fail': fail })

        print(f"successor: {successor}")

        slide = todo[0]
        todo = set(range(N))
        show.append(slides[slide])
        todo.remove(slide)
        # ">= 0": slide index 0 is a valid successor, only -1 is the sentinel.
        has_successor = { i for i in range(N) if successor[i] >= 0 }
        while todo:
            nxt = successor[slide]
            # The original condition used the bitwise `&`, which binds tighter
            # than the comparisons, so the successor link was never tested
            # against `todo` correctly.
            if nxt >= 0 and nxt in todo:
                slide = nxt
            elif todo & has_successor:
                slide = min(todo & has_successor)
            else:
                slide = min(todo)
            show.append(slides[slide])
            todo.remove(slide)

        return show

    def write_show(self, show, file):
        """Write a slide show to ``file`` in the ``OUT`` directory.

        The first line holds the number of slides; every following line holds
        the photo ids of one slide, each followed by a single space.
        """
        with open(path.join(self.OUT, file), 'w') as out:
            # The file is opened in text mode, which already translates '\n'
            # to the platform line ending; writing os.linesep here would
            # produce '\r\r\n' on Windows.
            out.write(f"{len(show)}\n")
            for slide in show:
                for i in slide:
                    out.write(f"{i} ")
                out.write("\n")

    def get_tags(self, data, fotos):
        """Get all tags for iter of fotos.

        Returns a new set with the union of the tag sets of the given photo
        ids (an empty set when ``fotos`` is empty).
        """
        return set().union(*(data.tags[foto] for foto in fotos))

    def load_show(self, file):
        """Load a slide show from the ``OUT`` directory and score it.

        The first line is skipped; every other non-blank line is read as the
        whitespace-separated photo ids of one slide.

        Returns:
            ``(show, points)`` -- list of slides (lists of ints) and their score.
        """
        show = []

        with open(path.join(self.OUT, file), 'r') as handle:
            handle.readline()  # slide-count header
            for line in tqdm(handle, 'loading show data', postfix=file):
                # split() copes with a missing trailing newline, where the
                # former line[:-1] dropped the last digit of the last id.
                fotos = [int(foto) for foto in line.split()]
                if fotos:
                    show.append(fotos)

        points = self._score_show(self[file].data, show)

        return (show, points)

    def val(self, s1, s2):
        """Return the transition value of two tag sets.

        This is the minimum of: tags only in ``s1``, tags in both, tags only
        in ``s2``.
        """
        common = len(s1 & s2)
        return min(len(s1) - common, common, len(s2) - common)

    def optimize_show(self, file, iterations=100000):
        """Try to optimize the show for file.

        Repeatedly picks two random interior positions at least two apart and
        swaps the slides when that raises the local score. When the swap does
        not help and both slides hold two photos, a random re-pairing of the
        four photos is tried instead. The show is modified in place, rescored,
        stored, and written to ``OUT`` if the total improved.

        Args:
            file: key of the show to optimize.
            iterations: number of random attempts (default 100000).

        Returns:
            ``(old, new)`` -- the points before and after.
        """
        success = 0
        old = self[file].points
        show = self[file].show
        data = self[file].data
        N = len(show)

        # Both positions come from [1, N-2] and must differ by at least two,
        # which is only possible for N >= 5. Shorter shows are skipped; this
        # also avoids randint() failing on an empty range.
        attempts = iterations if N >= 5 else 0

        for count in tqdm(range(attempts), "optimizing", postfix=file):
            i, j = random.randint(1, N-2), random.randint(1, N-2)
            if abs(i - j) < 2:
                continue

            sim1 = self.get_tags(data, show[i-1])
            si   = self.get_tags(data, show[i])
            sip1 = self.get_tags(data, show[i+1])
            sjm1 = self.get_tags(data, show[j-1])
            sj   = self.get_tags(data, show[j])
            sjp1 = self.get_tags(data, show[j+1])

            v_old = self.val(sim1, si) + self.val(si, sip1) + \
                    self.val(sjm1, sj) + self.val(sj, sjp1)
            v_new = self.val(sim1, sj) + self.val(sj, sip1) + \
                    self.val(sjm1, si) + self.val(si, sjp1)

            if v_new > v_old:
                success += 1
                show[i], show[j] = show[j], show[i]
                continue

            l1 = list(show[i])
            l2 = list(show[j])
            if len(l1) == 2 and len(l2) == 2:
                # Shuffle the lists themselves; shuffling list(l1) only
                # reordered a temporary copy, so the pairing never varied.
                random.shuffle(l1)
                random.shuffle(l2)
                s1 = self.get_tags(data, (l1[0], l2[0]))
                s2 = self.get_tags(data, (l1[1], l2[1]))
                v_new = self.val(sim1, s1) + self.val(s1, sip1) + \
                        self.val(sjm1, s2) + self.val(s2, sjp1)
                if v_new > v_old:
                    show[i] = (l1[0], l2[0])
                    show[j] = (l1[1], l2[1])
                    success += 1

        new = self._score_show(data, show)
        self[file].show = show
        self[file].points = new
        if new > old:
            self.write_show(show, file)
        rate = success / iterations if iterations else 0.0
        print(f"optimize_show: success: {rate}, gain: {new - old}")
        return (old, new)

In [8]:
# Instantiate the solver: its constructor loads every input file and either
# loads the saved show from `out` or builds and writes a new one.
hc = HashCode2019()
# Display the per-file points (HashCode2019.__repr__).
hc

HBox(children=(IntProgress(value=0, description='loading data', max=5, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading input data', max=1, style=ProgressS…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading show data', max=1, style=ProgressSt…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading input data', max=1, style=ProgressS…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading show data', max=1, style=ProgressSt…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading input data', max=1, style=ProgressS…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading show data', max=1, style=ProgressSt…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading input data', max=1, style=ProgressS…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading show data', max=1, style=ProgressSt…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading input data', max=1, style=ProgressS…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading show data', max=1, style=ProgressSt…




{'a_example.txt': 2, 'b_lovely_landscapes.txt': 204651, 'c_memorable_moments.txt': 1548, 'd_pet_pictures.txt': 375253, 'e_shiny_selfies.txt': 340596}

In [3]:
# Long listing of the `out` directory (shell command run via IPython `!`).
!ls -l out

insgesamt 1628
-rw-r--r-- 1 jupyter jupyter     13 Dez 17 11:01 a_example.txt
-rw-r--r-- 1 jupyter jupyter 548896 Dez 17 11:01 b_lovely_landscapes.txt
-rw-r--r-- 1 jupyter jupyter   4644 Jan  6 14:53 c_memorable_moments.txt
-rw-r--r-- 1 jupyter jupyter 588896 Jan  6 14:53 d_pet_pictures.txt
-rw-r--r-- 1 jupyter jupyter 508896 Jan  6 14:54 e_shiny_selfies.txt


In [4]:
# One optimization pass per file to seed the per-file gains and the total.
gain = {}
total = 0
for file in hc.FILES:
    (old, new) = hc.optimize_show(file)
    gain[file] = new - old
    total += new

# Keep re-optimizing the file whose latest pass gained the most. The loop
# used to be `while True` and could only be ended by interrupting the kernel;
# it now stops once no file improved in its latest pass, or after MAX_ROUNDS.
MAX_ROUNDS = 1000
for _ in range(MAX_ROUNDS):
    file = max(gain, key=lambda k: gain[k])
    if gain[file] <= 0:
        break
    (old, new) = hc.optimize_show(file)
    gain[file] = new - old
    total += new - old
    print(f"total: {total}, file: {file}")
    print(gain)
    

HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0, gain: 0


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0, gain: 0


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0, gain: 0


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0002, gain: 2


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0005, gain: 6


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0003, gain: 3
total: 921974, file: e_shiny_selfies.txt
{'a_example.txt': 0, 'b_lovely_landscapes.txt': 0, 'c_memorable_moments.txt': 0, 'd_pet_pictures.txt': 2, 'e_shiny_selfies.txt': 3}


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0007, gain: 9
total: 921983, file: e_shiny_selfies.txt
{'a_example.txt': 0, 'b_lovely_landscapes.txt': 0, 'c_memorable_moments.txt': 0, 'd_pet_pictures.txt': 2, 'e_shiny_selfies.txt': 9}


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0005, gain: 9
total: 921992, file: e_shiny_selfies.txt
{'a_example.txt': 0, 'b_lovely_landscapes.txt': 0, 'c_memorable_moments.txt': 0, 'd_pet_pictures.txt': 2, 'e_shiny_selfies.txt': 9}


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0003, gain: 4
total: 921996, file: e_shiny_selfies.txt
{'a_example.txt': 0, 'b_lovely_landscapes.txt': 0, 'c_memorable_moments.txt': 0, 'd_pet_pictures.txt': 2, 'e_shiny_selfies.txt': 4}


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0005, gain: 7
total: 922003, file: e_shiny_selfies.txt
{'a_example.txt': 0, 'b_lovely_landscapes.txt': 0, 'c_memorable_moments.txt': 0, 'd_pet_pictures.txt': 2, 'e_shiny_selfies.txt': 7}


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…


optimize_show: success: 0.0005, gain: 7
total: 922010, file: e_shiny_selfies.txt
{'a_example.txt': 0, 'b_lovely_landscapes.txt': 0, 'c_memorable_moments.txt': 0, 'd_pet_pictures.txt': 2, 'e_shiny_selfies.txt': 7}


HBox(children=(IntProgress(value=0, description='optimizing', max=10000, style=ProgressStyle(description_width…




KeyboardInterrupt: 

In [20]:
import threading

class Optimizer(threading.Thread):
    """Thread that runs one ``hc.optimize_show`` pass for a single file.

    The result is stored in the module-level ``gain`` dict under the file
    name.
    """

    def __init__(self, file):
        """Remember which file this thread is responsible for."""
        super().__init__()
        self.file = file

    def run(self):
        """Optimize the file, record the gain in ``gain`` and print it."""
        name = self.file
        print("Starting optimizing " + name)
        before, after = hc.optimize_show(name)
        delta = after - before
        gain[name] = delta
        print(f"gain for {name}: {delta}")
        return delta
        
# Results table filled in by the Optimizer threads, keyed by file name.
gain = {}
threads = {}
total = 0

# Start one optimizer thread per input file.
for file in hc.FILES:
    worker = Optimizer(file)
    threads[file] = worker
    worker.start()

# Wait for each thread in turn and add up the gains they reported.
for file in hc.FILES:
    threads[file].join()
    total = total + gain[file]

print(f"total gain: {total}")

Starting optimizing a_example.txt
Starting optimizing b_lovely_landscapes.txt
Starting optimizing c_memorable_moments.txt
Starting optimizing d_pet_pictures.txt
Starting optimizing e_shiny_selfies.txt


HBox(children=(IntProgress(value=0, description='optimizing', max=100000, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='optimizing', max=100000, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='optimizing', max=100000, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='optimizing', max=100000, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='optimizing', max=100000, style=ProgressStyle(description_widt…

optimize_show: success: 0.0, gain: 0
gain for a_example.txt: 0
optimize_show: success: 0.0, gain: 0
gain for c_memorable_moments.txt: 0
optimize_show: success: 0.0, gain: 0
gain for b_lovely_landscapes.txt: 0
optimize_show: success: 0.00038, gain: 44
gain for d_pet_pictures.txt: 44
optimize_show: success: 0.00067, gain: 96
gain for e_shiny_selfies.txt: 96
total gain: 140


In [22]:
from multiprocessing import Process

def optimize(file):
    """Run one ``hc.optimize_show`` pass for ``file`` and print the gain."""
    print("Starting optimizing " + file)
    before, after = hc.optimize_show(file)
    print(f"gain for {file}: {after - before}")

if __name__ == '__main__':
    # Keep a handle on every process: the second loop used to call join()
    # on the loop variable `p`, i.e. on the last process only, so the other
    # processes were never waited for.
    processes = []
    for file in hc.FILES:
        p = Process(target=optimize, args=(file,))
        p.start()
        processes.append(p)

    # Wait for all optimizer processes to finish.
    for p in processes:
        p.join()
        


Starting optimizing a_example.txt
Starting optimizing b_lovely_landscapes.txt
Starting optimizing c_memorable_moments.txt


HBox(children=(IntProgress(value=0, description='optimizing', max=100000, style=ProgressStyle(description_widt…

Starting optimizing d_pet_pictures.txt


HBox(children=(IntProgress(value=0, description='optimizing', max=100000, style=ProgressStyle(description_widt…

Starting optimizing e_shiny_selfies.txt


HBox(children=(IntProgress(value=0, description='optimizing', max=100000, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='optimizing', max=100000, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='optimizing', max=100000, style=ProgressStyle(description_widt…


optimize_show: success: 0.0, gain: 0
gain for a_example.txt: 0

optimize_show: success: 0.0, gain: 0
gain for c_memorable_moments.txt: 0


optimize_show: success: 0.0, gain: 0
gain for b_lovely_landscapes.txt: 0
optimize_show: success: 0.00023, gain: 30
gain for d_pet_pictures.txt: 30

optimize_show: success: 0.00058, gain: 88
gain for e_shiny_selfies.txt: 88
