In [1]:
import time

In [2]:
class Person:
    """A person identified by ``name`` and an optional ``age``.

    Two persons are equal when both their age and name are equal, and the
    hash is derived from the same pair so equal persons hash alike.
    """

    def __init__(self, name, age=None):
        self.age = age
        self.name = name

    def __eq__(self, other):
        """Return True when ``other`` is a Person with the same age and name."""
        # Returning NotImplemented lets Python fall back to its default
        # comparison (False) instead of raising AttributeError when a Person
        # is compared with something that has no age/name, e.g. an int.
        if not isinstance(other, Person):
            return NotImplemented
        return self.age == other.age and self.name == other.name

    def __hash__(self):
        """Return a hash built from ``(age, name)``; prints a trace line on every call."""
        print('The hash is:')
        return hash((self.age, self.name))

In [3]:
# Empty list; none of the cells shown here read it.
house = []

# Eight Person objects, one per language name; age keeps its default (None).
p1, p2, p3, p4, p5, p6, p7, p8 = (
    Person(language)
    for language in ('Python', 'Java', 'C', 'JS', 'Go', 'Visual', 'Kotlin', 'R')
)

In [4]:
def search_v1(person, people=None):
    """Linearly search for ``person`` and time the scan.

    Args:
        person: The object to look for (compared with ``==``).
        people: Optional sequence to scan. Defaults to the global ``house1``.

    Returns:
        ``(index, elapsed)`` for the first match, or ``(None, elapsed)`` when
        nothing matches. ``elapsed`` is the scan duration in seconds.
    """
    if people is None:
        people = house1
    start = time.time()
    for index, candidate in enumerate(people):
        if candidate == person:
            return index, time.time() - start
    # Not found: report the elapsed time of the full scan. The original
    # called the ``time`` module here, which raised TypeError.
    return None, time.time() - start

In [5]:
# Flat list that search_v1 scans front to back.
house1 = [
    p1, p2, p3, p4,
    p5, p6, p7, p8,
]

In [6]:
# Linear search for p6: show the index found and the elapsed time in seconds.
pos1, t1 = search_v1(p6)
print(pos1, t1)

5 9.5367431640625e-06


In [7]:
def hash_v1(per):
    """Store ``per`` in the global ``house2`` at slot ``id(per) % 6``.

    Whatever was previously in that slot is overwritten.
    """
    slot = id(per) % 6
    house2[slot] = per
    
def search_v2(person):
    """Probe the single slot ``id(person) % 6`` of ``house2`` and time the probe.

    Returns ``(slot, elapsed)`` when that slot compares equal to ``person``,
    otherwise ``(None, elapsed)``; ``elapsed`` is in seconds.
    """
    began = time.time()
    slot = id(person) % 6
    hit = house2[slot] == person
    finished = time.time()
    if hit:
        return slot, finished - began
    return None, finished - began


In [8]:
# Ten placeholder ints (10..19); a slot still holding an int was never written.
house2 = list(range(10, 20))
# Insert every person from house1 through the id-based hash.
for per in house1:
    hash_v1(per)

In [9]:
# Hashed lookup for p8: show the slot found (or None) and the elapsed time in seconds.
pos2, t2 = search_v2(p8)
print(pos2, t2)

4 2.86102294921875e-06


In [10]:
# Ratio of the two elapsed times measured so far (t1 from search_v1, t2 from search_v2).
t1/t2

3.3333333333333335

In [11]:
# Look up the same object (p8, the last element of house1) with both
# strategies so the two timings are directly comparable.
pos1, t1 = search_v1(p8)
print(pos1, t1)
pos2, t2 = search_v2(p8)
print(pos2, t2)

7 9.5367431640625e-06
4 3.337860107421875e-06


In [12]:
# Linear-search time divided by hashed-lookup time for the same target (p8).
t1/t2

2.857142857142857

search_v1 takes about 2.86 times as long as search_v2 to find the last object.

In [13]:
# Display the table: slots that still hold a placeholder int were never filled.
house2

[<__main__.Person at 0x7fed286dbd30>,
 11,
 <__main__.Person at 0x7fed286dbd68>,
 13,
 <__main__.Person at 0x7fed286dbda0>,
 15,
 16,
 17,
 18,
 19]

We put 8 people into house2, but it looks like we have lost 5 of them: only 3 remain, because several people hashed to the same slot and overwrote each other. --> Collisions

### Collision

There are two main ways to fix a collision

    * The first is to change the value in your hash function, or to change the hash function completely, so you have more than enough slots to store all your potential values. -> Closed Hashing
    
    * You can also keep your original hash function but change the structure of your array. Instead of a single value, each slot can store a list that contains all the values hashed to that slot. These lists are generally called buckets in this context. -> Open Hashing
    
Cons and Pros:

    * By using a bigger number in your hash function, you're going to require a lot more space to store your values. Also, if you do this reactively and change the value in your hash function every time you have a collision, moving all of your data to a new array will definitely increase the complexity in terms of both space and time.
    
    * With the bucket approach, you still need to iterate through a collection, though a shorter one, every time you're looking for something. In the worst case every value ends up in one bucket, and then you're essentially just iterating through a list again.
    Ideally, you would have one to three elements stored in each bucket, so design your hash function with that in mind.
    You can also use a second hash function inside a large bucket to split up its elements even more.
    

Example:
One of your coworkers comes to you with a hash function that divides each value by 100 and uses the remainder as the key. The values are 100 numbers that are all multiples of 5.
He thinks it's a little slow - what number would you recommend his function divide by, rather than 100, to speed it up?

A: 87,  B: 107,  C: 125,  D:1001

125 is a multiple of 5. Dividing a bunch of multiples of 5 by another multiple of 5 will cause a lot of collisions, because the remainders can only be multiples of 5 as well.


87 is better than 125, but because it's less than 100 there are fewer slots than values, so it is guaranteed to still have collisions.


1001 is good, but it'll create a ton of leftover buckets and waste a lot of memory.

In [14]:
from random import randint

In [50]:
from random import sample

# Draw 100 *distinct* multipliers so every value is a unique multiple of 5
# in [5, 12500]. randint could repeat a value, and the later
# "100 - filled slots" collision counts assume all 100 values differ.
l = [5 * k for k in sample(range(1, 2501), 100)]
# Both numbers printed should be 100 (list length and number of distinct values).
print(len(l), len(set(l)))

100 100


We have a list l with 100 multiples of 5

In [51]:
def search_linear(number):
    """Scan the global list ``l`` front to back for ``number`` and time the scan.

    Returns ``(index, elapsed)`` for the first match, or ``(None, elapsed)``
    when ``number`` is absent; ``elapsed`` is in seconds.
    """
    began = time.time()
    for position, value in enumerate(l):
        if value == number:
            return position, time.time() - began
    return None, time.time() - began

In [70]:
# Baseline: linear search for 7795, showing its index in l and the elapsed seconds.
index, time_linear = search_linear(7795)
print(index, time_linear)

71 1.9550323486328125e-05


In [71]:
# 130 zero-filled slots; 0 marks an empty slot.
hashed_l = [0] * 130
# Place each value at slot (value mod 87); a later value overwrites an earlier one.
for numb in l:
    hashed_l[numb % 87] = numb

In [72]:
def search_hash(number):
    """Probe slot ``number % 87`` of ``hashed_l`` and time the probe.

    Returns ``(slot, elapsed)`` when the slot holds ``number``, otherwise
    ``(None, elapsed)``; ``elapsed`` is in seconds.
    """
    began = time.time()
    slot = number % 87
    hit = hashed_l[slot] == number
    finished = time.time()
    return (slot if hit else None), finished - began

In [73]:
# Hashed lookup (modulus 87) for the same value, 7795.
index, time_hash = search_hash(7795)
print(index, time_hash)

52 2.1457672119140625e-06


In [74]:
# Speed-up factor: linear-search time divided by hashed-lookup time (modulus 87).
time_linear/time_hash

9.11111111111111

search_hash is 9.1 times faster than search_linear, but let's check for collisions.

If we use 87 as a factor then our hashed numbers will be put in hashed_l[0:87].

In [75]:
# Count the occupied (non-zero) slots among the 87 that the modulus can reach.
numbers = 0
for i in range(87):
    if hashed_l[i] == 0:
        continue
    numbers += 1
numbers

58

There are 58 numbers in hashed_l, so 42 of the 100 values were overwritten by collisions.

Let's change the factor !

Try 125 first.

In [76]:
# Start again from 130 empty (zero) slots.
hashed_l = [0] * 130
# Place each value at slot (value mod 125); a later value overwrites an earlier one.
for numb in l:
    hashed_l[numb % 125] = numb

In [77]:
def search_hash(number):
    """Probe slot ``number % 125`` of ``hashed_l`` and time the probe.

    Returns ``(slot, elapsed)`` when the slot holds ``number``, otherwise
    ``(None, elapsed)``; ``elapsed`` is in seconds.
    """
    t_begin = time.time()
    bucket = number % 125
    matched = hashed_l[bucket] == number
    t_end = time.time()
    if not matched:
        return None, t_end - t_begin
    return bucket, t_end - t_begin

In [78]:
# Hashed lookup (modulus 125) for 7795; index is None when the slot holds a different value.
index, time_hash = search_hash(7795)
print(index, time_hash)

None 9.5367431640625e-07


In [79]:
# Speed-up factor: linear-search time divided by hashed-lookup time (modulus 125).
time_linear/time_hash

20.5

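search_hash is 20.5 times faster than search_linear, but it returned None: 7795 is no longer in its slot (7795 % 125 = 45), so it must have been overwritten by another value that hashes to the same slot. Let's check the collisions.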

If we use 125 as a factor then our hashed numbers will be put in hashed_l[0:125]. 

In [80]:
# Count the occupied (non-zero) slots among the 125 that the modulus can reach.
numbers = 0
for i in range(125):
    numbers += 1 if hashed_l[i] != 0 else 0
numbers

24

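There are only 24 non-zero slots, so 76 of the 100 values were lost to collisions! Multiples of 5 taken modulo 125 can only land on the 25 slots that are themselves multiples of 5.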

Try 107 !

In [81]:
# Start again from 130 empty (zero) slots.
hashed_l = [0] * 130
# Place each value at slot (value mod 107); a later value overwrites an earlier one.
for numb in l:
    hashed_l[numb % 107] = numb

In [82]:
def search_hash(number):
    """Probe slot ``number % 107`` of ``hashed_l`` and time the probe.

    Returns ``(slot, elapsed)`` when the slot holds ``number``, otherwise
    ``(None, elapsed)``; ``elapsed`` is in seconds.
    """
    clock_in = time.time()
    key = number % 107
    present = hashed_l[key] == number
    clock_out = time.time()
    duration = clock_out - clock_in
    return (key, duration) if present else (None, duration)

In [83]:
# Hashed lookup (modulus 107) for 7795.
index, time_hash = search_hash(7795)
print(index, time_hash)

91 1.9073486328125e-06


In [84]:
# Speed-up factor: linear-search time divided by hashed-lookup time (modulus 107).
time_linear/time_hash

10.25

search_hash is 10.25 times faster than search_linear, but let's check for collisions.

If we use 107 as a factor then our hashed numbers will be put in hashed_l[0:107]. 

In [85]:
# Count the occupied (non-zero) slots. With modulus 107 only slots 0..106 can
# ever be written, so scan exactly that range (the original scanned 125 slots,
# left over from the previous experiment).
numbers = 0
for i in range(107):
    if hashed_l[i] != 0:
        numbers += 1
numbers

65

There are 65 non-zero slots, so 35 of the 100 values were lost to collisions.

Conclusion: 87 is pretty good, but weighing space complexity against collisions, 107 is the better choice!