# Pitfalls with generators

## About generators

* Calling a generator function returns an iterator (a generator object).
* A generator function uses the `yield` keyword instead of `return` to produce its values.

In [None]:
#Example

def example1():
    """Yield the integers 1, 2 and 3, one per step of the generator."""
    for value in (1, 2, 3):
        yield value

# Calling a generator function does not run its body; it returns a generator object.
test = example1()

In [None]:
# example1() is called again here, so next() advances a brand-new generator
# (not `test`) and returns its first value, 1, every time this cell runs.
next(example1())


In [None]:
def infinite_seq():
    """Yield 1, 2, 3, ... without ever finishing."""
    current = 1
    while True:
        yield current
        current += 1

In [None]:
# Create one generator object and keep a reference to it so its position is retained.
seq = infinite_seq()

In [None]:
# Advances the same `seq` generator by one step; re-running this cell gives the next number.
next(seq)

# Requirements
* We have `allowed list` and `block list` URLs for different airlines, stored in files.
* There are 100+ files with thousands of `URLs` and `IPs`.
* We are migrating to a new architecture where the existing file formats are not supported.
* We need to convert those files into files compatible with the new architecture.



* While making this change I felt that generators would be a good fit.
* Generators are really helpful, but my initial understanding of the concept was not correct.
* I had to make some modifications to how the generators were used in the code to get the right output.
* This talk shares a few snippets from that code.



In [None]:
# %load gen-v1.py
"""First version: the id generator is created inside build_config,
so the ids restart from 1 on every call (see Issue 1).
"""

from pathlib import Path
import re

# Four dot-separated groups of one to three digits (an IPv4-looking prefix);
# the digit groups are not range-checked.
pattern_ip = r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}"

# Sample input. Entries look like "host |port|port|"; an entry with no "|"
# carries no port list.
domains1 = ["www.cnn.com |80|443|", "*chennaipy.org", "192.168.1.12 |2000|9000|"]
domains2 = ["www.chennaipy.com |80|443|", "*chennaipy.org/home", "192.168.1.45 |2003|9004|"]


def id_generator():
    """Yield consecutive integer ids starting at 1, without end.

    Every call to this function creates a new, independent generator that
    starts again from 1.
    """
    next_id = 1
    while True:
        yield next_id
        next_id += 1

def check_ip(ip):
    """Return 1 if the entry's host part starts like a dotted-quad address, else 0.

    Only the text before the first space is examined, and the pattern is
    matched from the start of that text (trailing characters are not checked).
    """
    host = ip.partition(" ")[0]
    return 1 if re.match(pattern_ip, host) else 0

def check_port(domain):
    """Return 1 if the entry contains a "|" (i.e. has a port section), else 0."""
    return int("|" in domain)

def split_port(domain):
    """Return the list of port strings in a ``host |p1|p2|`` style entry.

    The text before the first "|" is the host and is ignored; every
    following "|"-separated field is a port.

    Always returns a list. Previously an entry with exactly two pipes
    returned a bare string (so callers looping over the result iterated its
    characters), and the trailing "|" produced an empty-string "port".
    Empty fields are now dropped. An entry without any "|" yields ``[]``.
    """
    fields_after_host = domain.split("|")[1:]
    # The trailing "|" leaves an empty last field; skip empty fields.
    return [field for field in fields_after_host if field]


def initialize_config_template(config):
    """Set the "id" and "description" keys of ``config`` to None, in place."""
    for key in ("id", "description"):
        config[key] = None
    
def build_config(config, domains, domain_string):
    """Build an ``id -> config copy`` mapping for the ports found in ``domains``.

    Parameters:
        config: template dict; its "id" and "description" keys are
            overwritten in place for every stored port.
        domains: iterable of entry strings.
        domain_string: prefix used in each description.

    Returns:
        dict mapping each stored id to a copy of ``config`` with that id and
        a "<domain_string> IP <port>" or "<domain_string> URL <port>"
        description.

    Notes:
        A new id generator is created on every call, so ids restart at 1 for
        each call. An id is drawn for every port, including ports that were
        already seen and are therefore not stored.
    """
    ids = id_generator()
    seen_ports = []
    result = {}

    for entry in domains:
        entry_ports = split_port(entry)
        if not check_port(entry):
            continue

        for port in entry_ports:
            port_id = next(ids)
            kind = " IP " if check_ip(entry) else " URL "
            label = domain_string + kind + port

            if port in seen_ports:
                continue

            seen_ports.append(port)
            config["id"] = port_id
            config["description"] = label
            result[port_id] = config.copy()

    return result



def main():
    """Build the categories for both sample clients, then print them.

    The same template dict is passed to both build_config calls.
    """
    template = {}
    initialize_config_template(template)

    clients = ((domains1, "client 1"), (domains2, "client 2"))
    results = [build_config(template, urls, label) for urls, label in clients]

    for categories in results:
        print(categories)

main()


        


# Issue 1

* The issue is that every time I call the function, the generator is created again, so the ids restart from 1.
* To solve this, we need to initialize the generator outside the function and pass it in.


In [None]:
# %load gen-v2.py
"""Initialize the generator outside the function call
to avoid reset
"""

from pathlib import Path
import re

# Four dot-separated groups of one to three digits (an IPv4-looking prefix);
# the digit groups are not range-checked.
pattern_ip = r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}"

# Sample input. Entries look like "host |port|port|"; an entry with no "|"
# carries no port list. domains1 repeats ports 80 and 443 across two hosts.
domains1 = ["www.cnn.com |80|443|", "www.test.com |80|443|", "*chennaipy.org", "10.90.30.12 |2000|9000|"]
domains2 = ["www.chennaipy.com |80|443|", "*chennaipy.org/home", "10.90.30.45 |2003|9004|"]


def id_generator():
    """Yield consecutive integer ids starting at 1, without end.

    Every call to this function creates a new, independent generator that
    starts again from 1.
    """
    next_id = 1
    while True:
        yield next_id
        next_id += 1

def check_ip(ip):
    """Return 1 if the entry's host part starts like a dotted-quad address, else 0.

    Only the text before the first space is examined, and the pattern is
    matched from the start of that text (trailing characters are not checked).
    """
    host = ip.partition(" ")[0]
    return 1 if re.match(pattern_ip, host) else 0

def check_port(domain):
    """Return 1 if the entry contains a "|" (i.e. has a port section), else 0."""
    return int("|" in domain)

def split_port(domain):
    """Return the fields that sit between the first and the last "|" as a list.

    For "host |80|443|" this is ["80", "443"]. An entry with fewer than two
    "|" characters yields an empty list.
    """
    pieces = domain.split("|")
    # Drop the host (first piece) and whatever follows the final "|".
    return pieces[1:-1]

def initialize_config_template(config):
    """Set the "id" and "description" keys of ``config`` to None, in place."""
    for key in ("id", "description"):
        config[key] = None
    
def build_config(id_gen, config, domains, domain_string):
    """Build an ``id -> config copy`` mapping for one client's entries.

    Parameters:
        id_gen: iterator supplying ids via ``next()``; it is shared by the
            caller, so numbering continues across calls.
        config: template dict; its "id" and "description" keys are
            overwritten in place for every stored item.
        domains: iterable of entry strings.
        domain_string: prefix used in each description.

    Returns:
        dict whose first item is the "<domain_string> default id" entry,
        followed by one item per distinct port.

    Notes:
        An id is drawn for every port, including ports that were already seen
        and are therefore not stored, so the stored ids can have gaps.
    """
    result = {}
    seen_ports = []

    # Default id for IP and URL without port
    default_id = next(id_gen)
    default_label = domain_string + " default id"
    config["id"] = default_id
    config["description"] = default_label
    result[default_id] = config.copy()

    for entry in domains:
        entry_ports = split_port(entry)
        if not check_port(entry):
            continue

        for port in entry_ports:
            # The id is consumed before the duplicate check.
            port_id = next(id_gen)
            kind = " IP " if check_ip(entry) else " URL "
            label = domain_string + kind + port

            if port in seen_ports:
                continue

            seen_ports.append(port)
            config["id"] = port_id
            config["description"] = label
            result[port_id] = config.copy()

    return result

def main():
    """Build the categories for both sample clients, then print them.

    One id generator and one template dict are created here and shared by
    both build_config calls.
    """
    template = {}
    shared_ids = id_generator()
    initialize_config_template(template)

    clients = ((domains1, "client 1"), (domains2, "client 2"))
    results = [
        build_config(shared_ids, template, urls, label)
        for urls, label in clients
    ]

    for categories in results:
        print(categories)

main()


        


# Issue 2

* An id is taken from the generator even when the port is already present and gets skipped, so the stored ids have gaps.
* But we need the ids to stay in sequence even when duplicate ports are skipped.

The fix is to generate a new `id` only if an `id` has not already been generated for that port.


In [None]:
# %load gen-v3.py
"""Initialize the generator outside the function call
to avoid reset, and draw a new id only for ports not seen before
"""

from pathlib import Path
import re

# Four dot-separated groups of one to three digits (an IPv4-looking prefix);
# the digit groups are not range-checked.
pattern_ip = r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}"

# Sample input. Entries look like "host |port|port|"; an entry with no "|"
# carries no port list. domains1 repeats ports 80 and 443 across two hosts.
domains1 = ["www.cnn.com |80|443|", "www.test.com |80|443|", "*chennaipy.org", "10.90.30.12 |2000|9000|"]
domains2 = ["www.chennaipy.com |80|443|", "*chennaipy.org/home", "10.90.30.45 |2003|9004|"]


def id_generator():
    """Yield consecutive integer ids starting at 1, without end.

    Every call to this function creates a new, independent generator that
    starts again from 1.
    """
    next_id = 1
    while True:
        yield next_id
        next_id += 1

def check_ip(ip):
    """Return 1 if the entry's host part starts like a dotted-quad address, else 0.

    Only the text before the first space is examined, and the pattern is
    matched from the start of that text (trailing characters are not checked).
    """
    host = ip.partition(" ")[0]
    return 1 if re.match(pattern_ip, host) else 0

def check_port(domain):
    """Return 1 if the entry contains a "|" (i.e. has a port section), else 0."""
    return int("|" in domain)

def split_port(domain):
    """Return the fields that sit between the first and the last "|" as a list.

    For "host |80|443|" this is ["80", "443"]. An entry with fewer than two
    "|" characters yields an empty list.
    """
    pieces = domain.split("|")
    # Drop the host (first piece) and whatever follows the final "|".
    return pieces[1:-1]

def initialize_config_template(config):
    """Set the "id" and "description" keys of ``config`` to None, in place."""
    for key in ("id", "description"):
        config[key] = None
    
def build_config(id_gen, config, domains, domain_string):
    """Build an ``id -> category`` mapping for one client's entries.

    Parameters:
        id_gen: iterator supplying ids via ``next()``; it is shared by the
            caller, so numbering continues across calls.
        config: template dict. It is only copied, never modified, so the
            same template can safely be reused for several clients.
        domains: iterable of entry strings.
        domain_string: prefix used in each description.

    Returns:
        dict whose first item is the "<domain_string> default id" entry,
        followed by one item per distinct port. Each value is a copy of
        ``config`` with "id" and "description" filled in. An id is drawn only
        when an item is actually stored, so the ids are consecutive.
    """
    categories = {}
    # A set gives constant-time "already seen" checks.
    seen_ports = set()

    def _add(description):
        # Work on a copy so the caller's template is never modified.
        entry = config.copy()
        entry["id"] = next(id_gen)
        entry["description"] = description
        categories[entry["id"]] = entry

    # Default id for IP and URL without port
    _add(domain_string + " default id")

    for value in domains:
        if not check_port(value):
            continue

        # The IP/URL label depends only on the entry, not on the port.
        kind = " IP " if check_ip(value) else " URL "

        for port in split_port(value):
            if port in seen_ports:
                # Duplicate port: no new id, no new entry.
                continue
            seen_ports.add(port)
            _add(domain_string + kind + port)

    return categories

def main():
    """Build the categories for both sample clients, then print them.

    One id generator and one template dict are created here and shared by
    both build_config calls.
    """
    template = {}
    shared_ids = id_generator()
    initialize_config_template(template)

    clients = ((domains1, "client 1"), (domains2, "client 2"))
    results = [
        build_config(shared_ids, template, urls, label)
        for urls, label in clients
    ]

    for categories in results:
        print(categories)

main()


        


# What did I learn?
* We may need to adjust and rethink where in our code the generators are created and used.
* As the requirements changed, I sometimes found it hard to use generators.
* We need to design the code in such a way that new requirements can be adopted easily.
