# Hash Indexes

In-memory hash map

In [56]:
class Database:
    """Append-only key-value log with an in-memory hash index.

    Each ``set`` appends one newline-terminated ``key,value`` record to the
    string ``db`` and remembers the record's starting offset in ``index``, so
    ``get`` can jump directly to the most recent record for a key.

    Records are written verbatim: a key or value that itself contains a
    newline cannot be read back correctly.
    """

    def __init__(self):
        # State lives on the instance. A class-level ``index = {}`` would be
        # one dict shared by every Database, so offsets recorded by one
        # instance would be used against another instance's log.
        self.db = ""
        self.index = {}

    def set(self, key, value):
        """Append a record for ``key`` and point the index at it."""
        self.index[key] = len(self.db)
        self.db += f"{key},{value}\n"

    def get(self, key):
        """Return the latest value stored for ``key``, or None if unknown."""
        if key not in self.index:
            return None

        start = self.index[key]
        end = self.db.index("\n", start)
        # Skip exactly the "key," prefix that ``set`` wrote. Splitting the
        # record on the key text instead would cut the value short whenever
        # the value happens to contain the key.
        return self.db[start + len(f"{key},"):end]

# Create a database and write three records. Key "42" is written twice, so
# the index ends up pointing at its second record.
db = Database()

db.set("123456", '{"name":"London","attractions":["Big Ben","London Eye"]}')
db.set("42", '{"name":"San Francisco","attractions":["Golden Gate Bridge"]}')
db.set("42", '{"name":"San Francisco","attractions":["Exploratorium"]}')

# Display the raw log string; both records written for key "42" are in it.
db.db

'123456,{"name":"London","attractions":["Big Ben","London Eye"]}\n42,{"name":"San Francisco","attractions":["Golden Gate Bridge"]}\n42,{"name":"San Francisco","attractions":["Exploratorium"]}\n'

In [58]:
# Look up a key through the hash index and display the stored value.
db.get("123456")

'{"name":"London","attractions":["Big Ben","London Eye"]}'

# Compaction

In [124]:
class Database:
    """Append-only key-value log with a hash index and compaction.

    Attributes:
        db: the log itself, a string of newline-terminated ``key,value``
            records.
        index: maps each key to the offset in ``db`` of its latest record.
        log: one ``[offset, key, value]`` entry per record currently in
            ``db``, in write order.

    Records are written verbatim: a key or value that itself contains a
    newline cannot be read back correctly.
    """

    def __init__(self):
        self.db = ""
        self.index = {}
        self.log = []

    def set(self, key, value):
        """Append a record for ``key`` and point the index at it."""
        offset = len(self.db)
        self.log.append([offset, key, value])
        self.index[key] = offset
        self.db += f"{key},{value}\n"

    def get(self, key):
        """Return the latest value stored for ``key``, or None if unknown."""
        if key not in self.index:
            return None

        pair = self.get_pair(self.index[key])
        return self.get_value(pair, key)

    def get_pair(self, offset):
        """Return the record starting at ``offset``, without its newline.

        Raises:
            ValueError: if there is no newline at or after ``offset``.
        """
        end = self.db.index("\n", offset)
        return self.db[offset:end]

    def get_value(self, pair, key):
        """Return the value part of a ``key,value`` record for ``key``."""
        # Drop exactly the "key," prefix. Splitting on the key text would
        # fail or truncate whenever the value also contains the key.
        return pair[len(f"{key},"):]

    def overwrite(self, entries):
        """Replace the whole log with the given ``{key: value}`` mapping."""
        # Reset all three structures together: offsets kept in ``index`` and
        # ``log`` refer to the old string and would be stale otherwise, and
        # ``log`` would grow on every compaction.
        self.db = ""
        self.index = {}
        self.log = []
        for key, value in entries.items():
            self.set(key, value)

    def compact(self):
        """Keep only the latest value of each key and return the new log.

        Keys stay in the order in which they were first written.
        """
        # Later log entries overwrite earlier ones for the same key.
        entries = {key: value for _, key, value in self.log}
        self.overwrite(entries)
        return self.db

# Start from an empty database.
db = Database()

# Records in write order; "mew" and "purr" are each written several times.
entries = [
    ("mew", "1078"),
    ("purr", "2103"),
    ("purr", "2104"),
    ("mew", "1079"),
    ("mew", "1080"),
    ("mew", "1081"),
    ("purr", "2105"),
    ("purr", "2106"),
    ("purr", "2107"),
    ("yawn", "511"),
    ("purr", "2108"),
    ("mew", "1082"),
]

# Append every record to the log, in order.
for key, value in entries:
    db.set(key, value)

# Display the raw log string, which still holds every superseded record.
db.db

'mew,1078\npurr,2103\npurr,2104\nmew,1079\nmew,1080\nmew,1081\npurr,2105\npurr,2106\npurr,2107\nyawn,511\npurr,2108\nmew,1082\n'

In [103]:
# Display the [offset, key, value] entries recorded by each set() call.
db.log

[[0, 'mew', '1078'],
 [9, 'purr', '2103'],
 [19, 'purr', '2104'],
 [29, 'mew', '1079'],
 [38, 'mew', '1080'],
 [47, 'mew', '1081'],
 [56, 'purr', '2105'],
 [66, 'purr', '2106'],
 [76, 'purr', '2107'],
 [86, 'yawn', '511'],
 [95, 'purr', '2108'],
 [105, 'mew', '1082']]

In [104]:
# Rewrite the log so that only the latest value of each key remains.
db.compact()

'mew,1082\npurr,2108\nyawn,511\n'

# Segment Files

In [144]:
class Segment:
    """One append-only log segment with its own hash index.

    Attributes:
        contents: the segment data, a string of newline-terminated
            ``key,value`` records.
        index: maps each key to the offset in ``contents`` of its latest
            record.
        log: one ``[offset, key, value]`` entry per record currently in
            ``contents``, in write order.

    Records are written verbatim: a key or value that itself contains a
    newline cannot be read back correctly.
    """

    def __init__(self):
        self.contents = ""
        self.index = {}
        self.log = []

    def __len__(self):
        """Return the number of records in the segment."""
        return len(self.log)

    def set(self, key, value):
        """Append a record for ``key`` and point the index at it."""
        offset = len(self.contents)
        self.log.append([offset, key, value])
        self.index[key] = offset
        self.contents += f"{key},{value}\n"

    def get(self, key):
        """Return the latest value stored for ``key``, or None if unknown."""
        if key not in self.index:
            return None

        pair = self.get_pair(self.index[key])
        return self.get_value(pair, key)

    def get_pair(self, offset):
        """Return the record starting at ``offset``, without its newline.

        Raises:
            ValueError: if there is no newline at or after ``offset``.
        """
        end = self.contents.index("\n", offset)
        return self.contents[offset:end]

    def get_value(self, pair, key):
        """Return the value part of a ``key,value`` record for ``key``."""
        # Drop exactly the "key," prefix. Splitting on the key text would
        # fail or truncate whenever the value also contains the key.
        return pair[len(f"{key},"):]

    def overwrite(self, entries):
        """Replace the segment with the given ``{key: value}`` mapping."""
        # Reset all three structures together: offsets kept in ``index`` and
        # ``log`` refer to the old string and would be stale otherwise, and
        # ``len(self)`` would keep counting records that no longer exist.
        self.contents = ""
        self.index = {}
        self.log = []
        for key, value in entries.items():
            self.set(key, value)

    def compact(self):
        """Keep only the latest value of each key and return the contents.

        Keys stay in the order in which they were first written.
        """
        # Later log entries overwrite earlier ones for the same key.
        entries = {key: value for _, key, value in self.log}
        self.overwrite(entries)
        return self.contents


class Database:
    """Key-value store spread over a list of log segments.

    ``segments`` is ordered newest first: ``segments[0]`` is the segment that
    receives writes, and older segments follow it.

    Args:
        segment_threshold: a new segment is started when the active segment's
            ``len()`` exceeds this number at the time of a write. The default
            of 12 means a segment takes 13 records before rolling over.
    """

    def __init__(self, segment_threshold=12):
        self.segment_threshold = segment_threshold
        self.segments = [Segment()]

    def set(self, key, value):
        """Write ``key`` to the active segment, rolling over first if full."""
        if len(self.segments[0]) > self.segment_threshold:
            # Prepend so the newest segment is always at position 0.
            self.segments = [Segment()] + self.segments

        self.segments[0].set(key, value)

    def get(self, key):
        """Return the latest value for ``key``, or None if no segment has it.

        Segments are searched newest first, so the most recent write wins.
        """
        # Must be self.segments: a bare ``segments`` would raise NameError or
        # pick up an unrelated module-level variable of that name.
        for segment in self.segments:
            if key in segment.index:
                return segment.get(key)

        return None

    def compact(self):
        """Merge all segments into one and return its contents in a list.

        Each segment is compacted on its own first. The segments are then
        replayed from oldest to newest into a fresh segment, so that values
        from newer segments overwrite those from older ones, and the merged
        segment is compacted once more to drop the overwritten records.
        """
        for segment in self.segments:
            segment.compact()

        merged = Segment()
        for segment in reversed(self.segments):
            for key in segment.index:
                merged.set(key, segment.get(key))

        merged.compact()

        self.segments = [merged]

        return [segment.contents for segment in self.segments]

# Start from a database with a single empty segment.
db = Database()

# 24 records in write order, enough to make the database open a second segment.
entries = [
    ("mew", "1078"),
    ("purr", "2103"),
    ("purr", "2104"),
    ("mew", "1079"),
    ("mew", "1080"),
    ("mew", "1081"),
    ("purr", "2105"),
    ("purr", "2106"),
    ("purr", "2107"),
    ("yawn", "511"),
    ("purr", "2108"),
    ("mew", "1082"),
    ("purr", "2109"),
    ("purr", "2110"),
    ("mew", "1083"),
    ("scratch", "252"),
    ("mew", "1084"),
    ("mew", "1085"),
    ("purr", "2111"),
    ("mew", "1086"),
    ("purr", "2112"),
    ("purr", "2113"),
    ("mew", "1087"),
    ("purr", "2114"),
]

# Write every record, in order.
for key, value in entries:
    db.set(key, value)

# Print each segment's raw contents; db.segments is ordered newest first.
for segment in db.segments:
    print(repr(segment.contents))

'purr,2110\nmew,1083\nscratch,252\nmew,1084\nmew,1085\npurr,2111\nmew,1086\npurr,2112\npurr,2113\nmew,1087\npurr,2114\n'
'mew,1078\npurr,2103\npurr,2104\nmew,1079\nmew,1080\nmew,1081\npurr,2105\npurr,2106\npurr,2107\nyawn,511\npurr,2108\nmew,1082\npurr,2109\n'


In [145]:
# Merge all segments into one that keeps only the latest value of each key.
db.compact()

['mew,1087\npurr,2114\nyawn,511\nscratch,252\n']

# SSTable

Merging segments

In [47]:
class Segment:
    """Sorted segment: a list of ``(key, value)`` tuples ordered by key.

    Each key appears at most once; writing an existing key replaces its
    value. Keys must be mutually comparable. ``contents`` is expected to be
    modified only through ``set`` so that it stays sorted.
    """

    def __init__(self):
        self.contents = []

    def _position(self, key):
        """Return the index of ``key``'s entry, or where it would be inserted."""
        from bisect import bisect_left

        # The 1-tuple (key,) sorts immediately before every (key, value)
        # tuple with the same key, so this finds the key without ever
        # comparing values.
        return bisect_left(self.contents, (key,))

    def set(self, key, value):
        """Store ``value`` under ``key``, keeping ``contents`` sorted by key."""
        pos = self._position(key)
        if pos < len(self.contents) and self.contents[pos][0] == key:
            # Overwrite instead of appending a second tuple for the same key.
            self.contents[pos] = (key, value)
        else:
            self.contents.insert(pos, (key, value))

    def __len__(self):
        """Return the number of keys in the segment."""
        return len(self.contents)

    def __getitem__(self, i):
        """Return the ``(key, value)`` tuple at position ``i`` in key order."""
        return self.contents[i]

    def __repr__(self):
        return str(self.contents)

    def get(self, key):
        """Return the value stored under ``key``, or None if it is absent."""
        pos = self._position(key)
        if pos < len(self.contents) and self.contents[pos][0] == key:
            return self.contents[pos][1]
        return None

class Database:
    """Holds a list of key-sorted segments and can merge them into one."""

    def __init__(self, segments):
        self.segments = segments

    def compact(self):
        """Merge all segments into a new sorted segment and return it.

        Walks every segment in parallel with one cursor per segment. On each
        round the smallest key under any cursor is emitted once, and every
        cursor currently sitting on that key is advanced. When several
        segments hold the same key, the value from the segment that comes
        first in ``self.segments`` is the one kept.
        """
        cursors = [0] * len(self.segments)
        merged = Segment()

        while True:
            # Collect the entry under each cursor that has not run off the
            # end, tagged with the segment's position in the list.
            heads = []
            for which, segment in enumerate(self.segments):
                cursor = cursors[which]
                if cursor == len(segment):
                    continue
                head_key, head_value = segment[cursor]
                heads.append((head_key, which, head_value))

            if not heads:
                return merged

            # Tuples order by key first, then by segment position, so ties
            # on the key go to the earliest segment.
            smallest_key, _, chosen_value = min(heads)
            merged.set(smallest_key, chosen_value)

            # Step past this key in every segment that has it.
            for head_key, which, _ in heads:
                if head_key == smallest_key:
                    cursors[which] += 1

# Records for the first segment.
entries1 = [
    ("handful", "44662"),
    ("handicap", "70836"),
    ("handiwork", "45521"),
    ("handlebars", "3869"),
    ("handoff", "5741"),
    ("handprinted", "33632"),
]

segment1 = Segment()
for key, value in entries1:
    segment1.set(key, value)

# Records for the second segment; several keys overlap with the first.
entries2 = [
    ("handcuffs", "2729"),
    ("handful", "42307"),
    ("handicap", "67884"),
    ("handiwork", "16912"),
    ("handkerchief", "20952"),
    ("handprinted", "15725"),
]

segment2 = Segment()
for key, value in entries2:
    segment2.set(key, value)

# Records for the third segment; several keys overlap with the other two.
entries3 = [
    ("handbag", "8786"),
    ("handful", "40308"),
    ("handicap", "65995"),
    ("handkerchief", "16324"),
    ("handlebars", "3869"),
    ("handprinted", "11150")
]

segment3 = Segment()
for key, value in entries3:
    segment3.set(key, value)

# Hand the three segments to the database in this order.
segments = [segment1, segment2, segment3]
db = Database(segments)

# Display the segments before merging.
db.segments

[[('handful', '44662'), ('handicap', '70836'), ('handiwork', '45521'), ('handlebars', '3869'), ('handoff', '5741'), ('handprinted', '33632')],
 [('handcuffs', '2729'), ('handful', '42307'), ('handicap', '67884'), ('handiwork', '16912'), ('handkerchief', '20952'), ('handprinted', '15725')],
 [('handbag', '8786'), ('handful', '40308'), ('handicap', '65995'), ('handkerchief', '16324'), ('handlebars', '3869'), ('handprinted', '11150')]]

In [48]:
# Merge the three sorted segments into a single sorted segment.
db.compact()

[('handbag', '8786'), ('handcuffs', '2729'), ('handful', '44662'), ('handicap', '70836'), ('handiwork', '45521'), ('handkerchief', '20952'), ('handlebars', '3869'), ('handoff', '5741'), ('handprinted', '33632')]