!pip install mypy

!pip install nb_mypy

In [None]:
%load_ext nb_mypy

In [None]:
from typing import TypeVar

# From Regular Expressions to <span style="font-variant:small-caps;">Fsm</span>s

This notebook shows how a given regular expression $r$ can be transformed into an equivalent finite state machine. 
It implements the theory that is outlined in section 4.4. of the 
lecture notes.

The type `RegExp` describes the *parse tree* of a regular expression.  This will be the *input* of the program we develop in this notebook.  Note that `RegExp` is a *recursive* type.  

The expression `tuple[RegExp, ...]` denotes a tuple of objects of type `RegExp` that has an arbitrary length.

In [None]:
# A character of the alphabet; Python has no dedicated character type, so `str` is used.
Char   = str
# `RegExp` is first bound to a TypeVar, presumably so that the name already exists
# when the recursive alias below refers to itself inside `tuple[RegExp, ...]`.
RegExp = TypeVar('RegExp')
# Parse tree of a regular expression: an int, a character, or a tuple of sub-trees.
RegExp = int | Char | tuple[RegExp, ...]

We will represent the states of the `NFA` as integers.

In [None]:
# States of the NFA are represented as plain integers.
State = int

The type `Delta` denotes the transition relation of a non-deterministic finite state machine.
The type `NFA` denotes a non-deterministic finite state machine.

In [None]:
# Transition relation: maps a (state, character) pair to the set of successor states.
Delta = dict[tuple[State, Char], set[State]]
# An NFA as a 5-tuple: (states, alphabet, transition relation, start state, accepting states).
NFA   = tuple[set[State], set[Char], Delta, State, set[State]]

The class `RegExp2NFA` maintains two member variables:
- `Sigma` is the <em style="color:blue">alphabet</em>, i.e. the set of characters used.
- `StateCount` is a counter that is needed to create <em style="color:blue">unique</em> state names.

The methods given here are just stubs that are needed by the type checker.  The implementation of these stubs is given later.

In [None]:
class RegExp2NFA:
    """Convert the parse tree of a regular expression into an equivalent NFA.

    The methods defined here are only stubs that give the type checker the
    signatures it needs; the real implementations are assigned to the class
    further down in the notebook.

    Attributes:
        Sigma: the alphabet, i.e. the set of characters used.
        StateCount: counter that is needed to create unique state names.
    """

    def __init__(self, Sigma: set[Char]):
        self.Sigma:      set[Char] = Sigma
        self.StateCount: int = 0

    def toNFA(self, r: RegExp) -> NFA:
        return None # type: ignore
    def genEmptyNFA(self) -> NFA:
        return None # type: ignore
    def genEpsilonNFA(self) -> NFA:
        return None # type: ignore
    def genCharNFA(self, c: Char) -> NFA:
        return None # type: ignore
    def catenate(self, f1: NFA, f2: NFA) -> NFA:
        return None # type: ignore
    def disjunction(self, f1: NFA, f2: NFA) -> NFA:
        return None # type: ignore
    def kleene(self, f: NFA) -> NFA:
        return None # type: ignore
    def getNewState(self) -> State:
        return None # type: ignore

The member function `toNFA` takes an object `self` of class `RegExp2NFA` and a regular expression `r` and returns a finite state machine 
that accepts the same language as described by `r`.  The regular expression is represented in `Python` as follows:
- The regular expression $\emptyset$ is represented as the number `0`.
- The regular expression $\varepsilon$ is represented as the string `'𝜀'`.
- The regular expression $c$ that matches the character $c$ is represented by the character $c$.
- The regular expression $r_1 \cdot r_2$  is represented by the triple $\bigl(\texttt{repr}(r_1), \texttt{'⋅'}, \texttt{repr}(r_2)\bigr)$.

  Here, and in the following, for a given regular expression $r$ the expression $\texttt{repr}(r)$ denotes the `Python` representation of the regular 
  expression $r$.
- The regular expression $r_1 + r_2$  is represented by the triple $\bigl(\texttt{repr}(r_1), \texttt{'+'}, \texttt{repr}(r_2)\bigr)$.
- The regular expression $r^*$  is represented by the pair $\bigl(\texttt{repr}(r), \texttt{'*'}\bigr)$.

The annotation `# type: ignore` is needed to silence the type checker.

In [None]:
def toNFA(self: RegExp2NFA, r: RegExp) -> NFA:
    match r:
        case 0: 
            return self.genEmptyNFA()
        case '𝜀': 
            return self.genEpsilonNFA()
        case r if isinstance(r, str) and len(r) == 1: 
            return self.genCharNFA(r)
        case (r1, '⋅', r2):
            return self.catenate(self.toNFA(r1), self.toNFA(r2))
        case (r1, '+', r2):
            return self.disjunction(self.toNFA(r1), self.toNFA(r2))
        case (r1, '*'):
            return self.kleene(self.toNFA(r1))
        case _:
            raise ValueError(f'{r} is not a proper regular expression.') 
    return None # type: ignore
    
# Replace the stub `RegExp2NFA.toNFA` with the function defined above, then
# remove the module-level name so that only the method remains.
RegExp2NFA.toNFA = toNFA # type: ignore
del toNFA

The <span style="font-variant:small-caps;">Fsm</span> `genEmptyNFA()` is defined as
$$\bigl\langle \{ q_0, q_1 \}, \Sigma, \{\}, q_0, \{ q_1 \} \bigr\rangle. $$
Note that this <span style="font-variant:small-caps;">Fsm</span> has no transitions at all.
Graphically, this <span style="font-variant:small-caps;">Fsm</span> looks as follows:

![Fsm recognizing the empty set](./aLeer.jpg)

In [None]:
def genEmptyNFA(self: RegExp2NFA) -> NFA:
    """Build an NFA with two fresh states and no transitions at all.

    Since the accepting state cannot be reached from the start state, the
    automaton accepts no string.
    """
    start = self.getNewState()
    accept = self.getNewState()
    no_transitions: Delta = {}
    return {start, accept}, self.Sigma, no_transitions, start, {accept}

# Replace the stub `RegExp2NFA.genEmptyNFA` with the function defined above,
# then remove the module-level name so that only the method remains.
RegExp2NFA.genEmptyNFA = genEmptyNFA # type: ignore
del genEmptyNFA

The <span style="font-variant:small-caps;">Fsm</span> `genEpsilonNFA` is defined as
$$  \bigl\langle \{ q_0, q_1 \}, \Sigma, 
                          \bigl\{ \langle q_0, \varepsilon\rangle \mapsto \{q_1\} \bigr\}, q_0, \{ q_1 \} \bigr\rangle.
$$
Graphically, this <span style="font-variant:small-caps;">Fsm</span> looks as follows:

![Fsm recognizing the empty string](./aEpsilon.jpg)

In [None]:
def genEpsilonNFA(self: RegExp2NFA) -> NFA:
    """Build an NFA whose only transition is an 𝜀-move from start to accept."""
    start = self.getNewState()
    accept = self.getNewState()
    transitions: Delta = {(start, '𝜀'): {accept}}
    return {start, accept}, self.Sigma, transitions, start, {accept}

# Replace the stub `RegExp2NFA.genEpsilonNFA` with the function defined above,
# then remove the module-level name so that only the method remains.
RegExp2NFA.genEpsilonNFA = genEpsilonNFA # type: ignore
del genEpsilonNFA

For a letter $c \in \Sigma$ the <span style="font-variant:small-caps;">Fsm</span> `genCharNFA`$(c)$ is defined as 
$$ A(c) = 
   \bigl\langle \{ q_0, q_1 \}, \Sigma, 
   \bigl\{ \langle q_0, c \rangle \mapsto \{q_1\}\bigr\}, q_0, \{ q_1 \} \bigr\rangle.
$$
Graphically, this <span style="font-variant:small-caps;">Fsm</span> looks as follows:

![Fsm recognizing the character c](./aChar.jpg)

In [None]:
def genCharNFA(self: RegExp2NFA, c: str) -> NFA:
    """Build an NFA whose only transition leads from start to accept on `c`."""
    start = self.getNewState()
    accept = self.getNewState()
    transitions: Delta = {(start, c): {accept}}
    return {start, accept}, self.Sigma, transitions, start, {accept}

# Replace the stub `RegExp2NFA.genCharNFA` with the function defined above,
# then remove the module-level name so that only the method remains.
RegExp2NFA.genCharNFA = genCharNFA # type: ignore
del genCharNFA

Given two <span style="font-variant:small-caps;">Fsm</span>s `f1` and `f2`, the function `catenate(f1, f2)` 
creates an <span style="font-variant:small-caps;">Fsm</span> that recognizes a string $s$ if it can be written 
in the form
$$ s = s_1s_2 $$
and $s_1$ is recognized by `f1` and $s_2$ is recognized by `f2`. 

Assume that $f_1$ and $f_2$ have the following form:
- $f_1 = \langle Q_1, \Sigma, \delta_1, q_1, \{ q_2 \}\rangle$,
- $f_2 = \langle Q_2, \Sigma, \delta_2, q_3, \{ q_4 \}\rangle$,
- $Q_1 \cap Q_2 = \{\}$.
 
Then $\texttt{catenate}(f_1, f_2)$ is defined as:
$$  \bigl\langle Q_1 \cup Q_2, \Sigma, 
   \bigl\{ \langle q_2,\varepsilon\rangle  \mapsto \{q_3\} \bigr\} 
         \cup \delta_1 \cup \delta_2, q_1, \{ q_4 \} \bigr\rangle.
$$
Graphically, this <span style="font-variant:small-caps;">Fsm</span> looks as follows:

![Fsm recognizing the concatenation of two languages](./aConcat.jpg)

In [None]:
def catenate(self: RegExp2NFA, f1: NFA, f2: NFA) -> NFA:
    """Chain `f1` and `f2` so that the result accepts their concatenation.

    The single accepting state of `f1` is linked to the start state of `f2`
    by an 𝜀-transition.  `f1` must have exactly one accepting state; the
    alphabet and the accepting set of the result are taken from `f2`.
    """
    states1, _, trans1, start1, accepting1 = f1
    states2, alphabet, trans2, start2, accepting2 = f2
    (bridge_from,) = accepting1  # fails unless there is exactly one element
    # A fresh dict: neither input's transition relation is modified.
    combined = {**trans1, **trans2, (bridge_from, '𝜀'): {start2}}
    return states1 | states2, alphabet, combined, start1, accepting2

# Replace the stub `RegExp2NFA.catenate` with the function defined above,
# then remove the module-level name so that only the method remains.
RegExp2NFA.catenate = catenate # type: ignore
del catenate

Given two <span style="font-variant:small-caps;">Fsm</span>s `f1` and `f2`, the function `disjunction(f1, f2)` 
creates an <span style="font-variant:small-caps;">Fsm</span> that recognizes a string $s$ if it 
is recognized either by `f1` or by `f2`. 

Assume again that the states of 
$f_1$ and $f_2$ are different and that $f_1$ and $f_2$ have the following form:
- $f_1 = \langle Q_1, \Sigma, \delta_1, q_1, \{ q_3 \}\rangle$,
- $f_2 = \langle Q_2, \Sigma, \delta_2, q_2, \{ q_4 \}\rangle$,
- $Q_1 \cap Q_2 = \{\}$.

Then $\texttt{disjunction}(f_1, f_2)$ is defined as follows:
$$ \bigl\langle \{ q_0, q_5 \} \cup Q_1 \cup Q_2, \Sigma, 
                \bigl\{ \langle q_0,\varepsilon\rangle \mapsto \{q_1, q_2\},
                   \langle q_3,\varepsilon\rangle \mapsto \{q_5\}, 
                   \langle q_4,\varepsilon\rangle \mapsto \{q_5\} \bigr\} 
                   \cup \delta_1 \cup \delta_2, q_0, \{ q_5 \} \bigr\rangle
$$
Graphically, this <span style="font-variant:small-caps;">Fsm</span> looks as follows:
![Fsm recognizing the disjunction](./aPlus.jpg)

In [None]:
def disjunction(self: RegExp2NFA, f1: NFA, f2: NFA) -> NFA:
    """Combine `f1` and `f2` so that the result accepts what either accepts.

    A fresh start state branches by 𝜀 to both old start states, and both old
    accepting states lead by 𝜀 to a fresh accepting state.  Each input must
    have exactly one accepting state; the alphabet is taken from `f2`.
    """
    states1, _, trans1, start1, accepting1 = f1
    states2, alphabet, trans2, start2, accepting2 = f2
    (accept1,) = accepting1
    (accept2,) = accepting2
    new_start = self.getNewState()
    new_accept = self.getNewState()
    # A fresh dict: neither input's transition relation is modified.
    combined = {
        **trans1,
        **trans2,
        (new_start, '𝜀'): {start1, start2},
        (accept1, '𝜀'): {new_accept},
        (accept2, '𝜀'): {new_accept},
    }
    all_states = {new_start, new_accept} | states1 | states2
    return all_states, alphabet, combined, new_start, {new_accept}
    
# Replace the stub `RegExp2NFA.disjunction` with the function defined above,
# then remove the module-level name so that only the method remains.
RegExp2NFA.disjunction = disjunction # type: ignore
del disjunction

Given an <span style="font-variant:small-caps;">Fsm</span> `f`, the function `kleene(f)` 
creates an <span style="font-variant:small-caps;">Fsm</span> that recognizes a string $s$ if it can be written as
$$ s = s_1 s_2 \cdots s_n $$
and all $s_i$ are recognized by `f`.  Note that $n$ might be $0$. 

If `f` is defined as
$$ f = \langle Q, \Sigma, \delta, q_1, \{ q_2 \} \rangle,
$$
then  `kleene(f)` is defined as follows:
$$ \bigl\langle \{ q_0, q_3 \} \cup Q, \Sigma, 
                \bigl\{ \langle q_0,\varepsilon\rangle \mapsto \{q_1, q_3\},  
                        \langle q_2,\varepsilon\rangle \mapsto \{q_1, q_3\} \bigr\} 
                \cup \delta, q_0, \{ q_3 \} \bigr\rangle.
$$
Graphically, this <span style="font-variant:small-caps;">Fsm</span> looks as follows:
![Fsm recognizing the Kleene star](./aStar.jpg)

In [None]:
def kleene(self: RegExp2NFA, f: NFA) -> NFA:
    """Build an NFA that accepts zero or more repetitions of what `f` accepts.

    A fresh start state `q0` and a fresh accepting state `q3` are added.
    From `q0` and from the accepting state of `f`, 𝜀-transitions lead both
    to the start state of `f` (another round) and to `q3` (stop).
    `f` must have exactly one accepting state and is left unmodified.
    """
    M, Sigma, delta0, q1, A = f
    q2, = A  # fails unless `f` has exactly one accepting state
    q0 = self.getNewState()
    q3 = self.getNewState()
    # Work on a copy: assigning into `delta0` itself would also change the
    # transition relation stored inside the caller's `f`.
    delta = dict(delta0)
    delta[q0, '𝜀'] = { q1, q3 }
    delta[q2, '𝜀'] = { q1, q3 }
    return { q0, q3 } | M, Sigma, delta, q0, { q3 }

# Replace the stub `RegExp2NFA.kleene` with the function defined above,
# then remove the module-level name so that only the method remains.
RegExp2NFA.kleene = kleene # type: ignore
del kleene

The auxiliary function `getNewState` returns a new number that has not yet been used as a state.

In [None]:
def getNewState(self: RegExp2NFA) -> State:
    """Advance the state counter by one and return its new value."""
    fresh = self.StateCount + 1
    self.StateCount = fresh
    return fresh

# Replace the stub `RegExp2NFA.getNewState` with the function defined above,
# then remove the module-level name so that only the method remains.
RegExp2NFA.getNewState = getNewState # type: ignore
del getNewState

The notebook `04-Test-Regexp-2-NFA` can be used to test the functions implemented in this notebook.

In [None]:
// regexp_to_nfa_final.ts
// Standalone NFA-Builder + Parser that accepts both "." and "⋅" as concatenation.
// ---------------------------------------------------------------------------

// A symbol of the alphabet, stored as a string.
type Char = string;
// States are identified by numbers.
type State = number;
// The epsilon marker: used both as the ε regular expression and as the
// label of ε-transitions.
const EPS = "ε" as const;

// ---- Regular Expression AST -------------------------------------------------

// The empty language is written as the number 0.
type Empty = 0;
// The language containing only the empty string.
type Epsilon = typeof EPS;
// A single symbol of the alphabet.
type Sym = string;

// Compound expressions are tuples tagged with their operator in first position.
type Concat = ["⋅", RegExp, RegExp];
type Union  = ["+", RegExp, RegExp];
type Star   = ["*", RegExp];

// NOTE(review): `RegExp` is also the name of the built-in global regular
// expression type — confirm this file is compiled as a module so that this
// alias shadows the global instead of clashing with it.
type RegExp = Empty | Epsilon | Sym | Concat | Union | Star;

// ---- NFA Definition ---------------------------------------------------------

// Transition relation.  Keys are the strings produced by `dkey(state, label)`;
// each value is the set of successor states.
type Delta = Map<string, Set<State>>;

// A non-deterministic finite automaton with ε-transitions.
interface NFA {
  Q: Set<State>;      // all states
  Sigma: Set<Char>;   // alphabet
  delta: Delta;       // transition relation
  q0: State;          // start state
  F: Set<State>;      // accepting states
}

// Encodes a (state, label) pair as the JSON text of the two-element array
// [q, a]; this string is the key format used by `Delta`.
function dkey(q: State, a: Char | typeof EPS): string {
  const pair = [q, a];
  return JSON.stringify(pair);
}

// ---- Class RegExp2NFA -------------------------------------------------------

// Converts a regular-expression AST into an equivalent NFA with ε-transitions.
//
// Every state id is produced by `fresh()`, so two automata built by the same
// converter never share a state.  The combinators (`catenate`, `disjunction`,
// `kleene`) rely on this and join their operands without renumbering them:
// shifting ids by an offset could move a state onto an id that `fresh()` has
// already issued or will issue later, silently merging unrelated states.
class RegExp2NFA {
  private Sigma: Set<Char>;
  private stateCount: number;

  // Sigma: the alphabet.  It is copied, so later changes made by the caller
  // to the passed set do not affect this converter.
  constructor(Sigma: Set<Char>) {
    this.Sigma = new Set(Sigma);
    this.stateCount = 0;
  }

  // Returns a state id that this converter has not handed out before.
  private fresh(): State {
    return this.stateCount++;
  }

  // Builds the NFA for `r` by recursion on the shape of the AST.
  // Throws an Error if `r` has no known form, or (via genCharNFA) if a
  // symbol is not contained in Sigma.
  public toNFA(r: RegExp): NFA {
    if (r === 0) return this.genEmptyNFA();
    if (r === EPS) return this.genEpsilonNFA();
    // Count code points rather than UTF-16 units, so that a symbol outside
    // the Basic Multilingual Plane still counts as one symbol.
    if (typeof r === "string" && [...r].length === 1 && !isOperator(r)) {
      return this.genCharNFA(r);
    }
    if (Array.isArray(r)) {
      const tag = r[0];
      if (tag === "⋅") {
        const [, r1, r2] = r as Concat;
        return this.catenate(this.toNFA(r1), this.toNFA(r2));
      } else if (tag === "+") {
        const [, r1, r2] = r as Union;
        return this.disjunction(this.toNFA(r1), this.toNFA(r2));
      } else if (tag === "*") {
        const [, r1] = r as Star;
        return this.kleene(this.toNFA(r1));
      }
    }
    throw new Error("Invalid RegExp form: " + JSON.stringify(r));
  }

  // NFA for the empty language: one state, no transitions, no accepting state.
  private genEmptyNFA(): NFA {
    const q0 = this.fresh();
    const Q = new Set<State>([q0]);
    return { Q, Sigma: new Set(this.Sigma), delta: new Map(), q0, F: new Set() };
  }

  // NFA for {ε}: a single ε-transition from the start to the accepting state.
  private genEpsilonNFA(): NFA {
    const q0 = this.fresh();
    const qf = this.fresh();
    const Q = new Set<State>([q0, qf]);
    const delta: Delta = new Map();
    addTrans(delta, q0, EPS, qf);
    return { Q, Sigma: new Set(this.Sigma), delta, q0, F: new Set([qf]) };
  }

  // NFA for the single symbol `c`.  Throws an Error if `c` is not in Sigma.
  private genCharNFA(c: Char): NFA {
    if (!this.Sigma.has(c)) {
      throw new Error(`Character '${c}' not in Sigma`);
    }
    const q0 = this.fresh();
    const qf = this.fresh();
    const Q = new Set<State>([q0, qf]);
    const delta: Delta = new Map();
    addTrans(delta, q0, c, qf);
    return { Q, Sigma: new Set(this.Sigma), delta, q0, F: new Set([qf]) };
  }

  // Concatenation: every accepting state of `n1` gets an ε-transition to the
  // start state of `n2`.  Throws an Error if the alphabets differ.
  private catenate(n1: NFA, n2: NFA): NFA {
    assertSameSigma(n1, n2);
    // mergeNFAs builds a fresh transition map, so the bridge transitions
    // added below do not modify `n1` or `n2`.
    const merged = mergeNFAs(n1, n2, n1.q0, new Set(n2.F));
    for (const f of n1.F) addTrans(merged.delta, f, EPS, n2.q0);
    return merged;
  }

  // Union: a fresh start state branches by ε into both operands, and all
  // their accepting states lead by ε to one fresh accepting state.
  // Throws an Error if the alphabets differ.
  private disjunction(n1: NFA, n2: NFA): NFA {
    assertSameSigma(n1, n2);
    const q0 = this.fresh();
    const qf = this.fresh();

    const Q = new Set<State>([q0, qf, ...n1.Q, ...n2.Q]);
    const delta = mergeDelta(n1.delta, n2.delta);

    addTrans(delta, q0, EPS, n1.q0);
    addTrans(delta, q0, EPS, n2.q0);
    for (const f of n1.F) addTrans(delta, f, EPS, qf);
    for (const f of n2.F) addTrans(delta, f, EPS, qf);

    const F = new Set<State>([qf]);
    const Sigma = new Set(n1.Sigma);
    return { Q, Sigma, delta, q0, F };
  }

  // Kleene star: from the fresh start state and from every accepting state of
  // `n`, ε leads both into `n` (one more round) and to the fresh accepting
  // state (stop).
  private kleene(n: NFA): NFA {
    const q0 = this.fresh();
    const qf = this.fresh();

    const Q = new Set<State>([q0, qf, ...n.Q]);
    // Merging with an empty map yields a copy with fresh target sets, so the
    // transitions added below do not modify `n`.
    const delta = mergeDelta(n.delta, new Map());

    addTrans(delta, q0, EPS, n.q0);
    addTrans(delta, q0, EPS, qf);
    for (const f of n.F) {
      addTrans(delta, f, EPS, n.q0);
      addTrans(delta, f, EPS, qf);
    }

    const F = new Set<State>([qf]);
    const Sigma = new Set(n.Sigma);
    return { Q, Sigma, delta, q0, F };
  }
}

// ---- Utilities --------------------------------------------------------------

// True when `s` is one of the reserved markers "⋅", "+", "*" or EPS.
function isOperator(s: string): boolean {
  switch (s) {
    case "⋅":
    case "+":
    case "*":
    case EPS:
      return true;
    default:
      return false;
  }
}

// Adds the transition `from --a--> to` to `delta`, modifying `delta` in place.
function addTrans(delta: Delta, from: State, a: Char | typeof EPS, to: State) {
  const key = dkey(from, a);
  const targets = delta.get(key);
  if (targets !== undefined) {
    // The key already has a target set: extend it.
    targets.add(to);
    return;
  }
  delta.set(key, new Set<State>([to]));
}

// Returns a new transition map holding every transition of `d1` and `d2`.
// Target sets of keys present in both maps are united; the inputs and their
// sets are not modified.
function mergeDelta(d1: Delta, d2: Delta): Delta {
  const merged: Delta = new Map();
  for (const source of [d1, d2]) {
    for (const [key, targets] of source) {
      let bucket = merged.get(key);
      if (bucket === undefined) {
        bucket = new Set<State>();
        merged.set(key, bucket);
      }
      for (const t of targets) bucket.add(t);
    }
  }
  return merged;
}

// Joins two automata into one with start state `q0` and accepting set `F`.
// States and transitions are united; the alphabet is copied from `n1`.
function mergeNFAs(n1: NFA, n2: NFA, q0: State, F: Set<State>): NFA {
  return {
    Q: new Set<State>([...n1.Q, ...n2.Q]),
    Sigma: new Set<Char>(n1.Sigma),
    delta: mergeDelta(n1.delta, n2.delta),
    q0,
    F,
  };
}

// Throws Error("Sigma mismatch") unless both automata have the same alphabet.
function assertSameSigma(a: NFA, b: NFA) {
  const sameSize = a.Sigma.size === b.Sigma.size;
  // With equal sizes, set equality reduces to a subset check.
  const identical = sameSize && [...a.Sigma].every((c) => b.Sigma.has(c));
  if (!identical) throw new Error("Sigma mismatch");
}

// ---- Simulation -------------------------------------------------------------

// Returns `S` together with every state reachable from it using only
// ε-transitions.  `S` itself is not modified.
function epsilonClosure(n: NFA, S: Set<State>): Set<State> {
  const closure = new Set(S);
  const pending = [...S]; // states whose ε-successors are still unexplored
  while (pending.length > 0) {
    const current = pending.pop()!;
    const successors = n.delta.get(dkey(current, EPS));
    if (successors === undefined) continue;
    for (const target of successors) {
      if (closure.has(target)) continue;
      closure.add(target);
      pending.push(target);
    }
  }
  return closure;
}

// Returns the states reachable from some state of `S` by one transition
// labelled `a`.  ε-transitions are not followed here.
function step(n: NFA, S: Set<State>, a: Char): Set<State> {
  const reached = new Set<State>();
  S.forEach((q) => {
    const targets = n.delta.get(dkey(q, a));
    if (targets) targets.forEach((t) => reached.add(t));
  });
  return reached;
}

// Runs `n` on `input` and reports whether the input is accepted.
// A symbol that is not in the alphabet of `n` causes immediate rejection.
function simulateNFA(n: NFA, input: string): boolean {
  let current = epsilonClosure(n, new Set([n.q0]));
  for (const symbol of input) {
    if (!n.Sigma.has(symbol)) return false;
    current = epsilonClosure(n, step(n, current, symbol));
  }
  // Accept when at least one accepting state is among the current states.
  return [...n.F].some((f) => current.has(f));
}

// ---- Robust Regex Parser ----------------------------------------------------
// Accepts both "." and "⋅" for concatenation
// Automatically ignores whitespace and ensures "⋅" is never treated as symbol

// Parses the textual regular expression `src` into a RegExp AST.
//
// Grammar, from lowest to highest precedence:
//   R    := C ('+' C)*              union
//   C    := K (('.' | '⋅')? K)*     concatenation, explicit or by juxtaposition
//   K    := Atom '*'*               Kleene star
//   Atom := '(' R ')' | ε | any symbol contained in `sigma`
//
// Whitespace in `src` is ignored.  Throws an Error on malformed input,
// including input that is left over after a complete expression.
function parseRegex(src: string, sigma: Set<Char>): RegExp {
  const cleaned = src.replace(/\s+/g, "");
  const tokens = [...cleaned];
  let pos = 0;

  // Next token without consuming it, or null at the end of the input.
  function peek(): string | null { return pos < tokens.length ? tokens[pos] : null; }
  // Consumes and returns the next token.
  function get(): string { return tokens[pos++]!; }

  function parseR(): RegExp {
    let node = parseC();
    while (peek() === "+") {
      get();
      node = ["+", node, parseC()];
    }
    return node;
  }

  function parseC(): RegExp {
    let node = parseK();
    while (true) {
      const p = peek();
      // treat both '.' and '⋅' as explicit concat ops
      if (p === "." || p === "⋅") { get(); }
      else if (p === null || p === ")" || p === "+") break;
      // Anything else starts the next factor of an implicit concatenation.
      const rhs = parseK();
      node = ["⋅", node, rhs];
    }
    return node;
  }

  function parseK(): RegExp {
    let node = parseAtom();
    while (peek() === "*") {
      get();
      node = ["*", node];
    }
    return node;
  }

  function parseAtom(): RegExp {
    const t = peek();
    if (t === null) throw new Error("Unexpected end");
    if (t === "(") {
      get();
      const node = parseR();
      if (get() !== ")") throw new Error("Missing ')'");
      return node;
    }
    if (t === EPS) { get(); return EPS; }
    if (sigma.has(t)) { get(); return t; }
    throw new Error(`Unexpected token '${t}'`);
  }

  const result = parseR();
  // parseR stops at the first token it cannot use (e.g. an unmatched ')').
  // Reject such leftovers instead of silently dropping the rest of the input.
  if (pos < tokens.length) {
    throw new Error(`Unexpected token '${tokens[pos]}'`);
  }
  return result;
}

// ---- Demo ------------------------------------------------------------------

// Alphabet for the demo: the two symbols 'a' and 'b'.
const Sigma = new Set<Char>(["a", "b"]);
const conv = new RegExp2NFA(Sigma);

// Parse the textual regular expression into an AST, then turn it into an NFA.
const r = parseRegex("(a+b)⋅a*", Sigma); // also works with "(a+b).a*"
const n = conv.toNFA(r);

// (a+b)⋅a* denotes one 'a' or 'b' followed by zero or more 'a's, so '' and
// 'bb' lie outside the language while 'baaa' lies inside it.
console.log("accepts '':", simulateNFA(n, ""));
console.log("accepts 'baaa':", simulateNFA(n, "baaa"));
console.log("accepts 'bb':", simulateNFA(n, "bb"));
