In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import random
from typing import Union, Optional, Dict

In [2]:
def seed_everything(seed=0):
    """Seed the global RNGs used by this notebook.

    Reseeds Python's ``random`` module and NumPy's legacy global generator
    with ``seed``, then records the seed in the ``PYTHONHASHSEED``
    environment variable.

    NOTE: Python reads ``PYTHONHASHSEED`` at interpreter start-up, so the
    assignment here cannot change hashing in the already-running kernel; it
    is only inherited by processes spawned afterwards.
    """
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [3]:
# Fix the seed of the Python and NumPy global RNGs so the simulated paths
# below are repeatable from a fresh kernel.
seed = 69
seed_everything(seed=seed)

In [4]:
class Node:
    """One location in the walk.

    Attributes are set directly from the constructor arguments:
      reward   -- numeric value carried by this node.
      children -- mapping of successor name -> transition probability, or
                  None when the node has no successors.
      terminal -- True exactly when ``children`` is None.
    """

    def __init__(self, reward:Union[int, float], children:Optional[Dict[str, float]]=None):
        self.reward = reward
        self.children = children
        self.terminal = children is None

    def transition(self):
        """Draw the name of the next node, or None for a terminal node.

        A single uniform number in [0, 1) is drawn and the children are
        scanned in insertion order; the first child whose cumulative
        probability reaches the draw wins. If the probabilities never reach
        the draw (e.g. they sum to less than 1), the last child scanned is
        returned; an empty mapping yields None.
        """
        if self.terminal:
            return None

        draw = random.random()
        cumulative = 0
        chosen = None
        for chosen, probability in self.children.items():
            cumulative += probability
            if cumulative >= draw:
                break
        return chosen

In [5]:
class CavesPaths:
    """Random-walk simulator over a small graph of named nodes.

    ``node_params`` maps each node name to the positional arguments of
    ``Node`` -- by default a ``(reward, children)`` pair, where ``children``
    is a mapping of successor name -> transition probability, or None for a
    node that ends the walk. Every walk begins at the node named 'start'.
    """

    def __init__(self, default_paths:Optional[Dict[str,float]]=None, node_params:Optional[dict]=None):
        # Transition table shared by every branching node of the default graph.
        if default_paths is None:
            default_paths = dict(A=.5, B=.3, C=.2)

        # Default graph: 'start', 'A' and 'B' branch via default_paths,
        # while 'C' has no children and therefore ends the walk.
        if node_params is None:
            branching = (('start', 0), ('A', 2), ('B', 1))
            node_params = {name: (reward, default_paths) for name, reward in branching}
            node_params['C'] = (0, None)

        self.default_paths = default_paths
        self.node_params = node_params

    def build_node(self, name:str) -> Node:
        """Instantiate the node registered under ``name`` (KeyError if unknown)."""
        spec = self.node_params[name]
        return Node(*spec)

    def build_path(self):
        """Simulate one walk from 'start' until a node yields no successor.

        Returns a ``(names, total)`` tuple: the names of the nodes visited
        after 'start', in order, and the sum of the rewards of every node on
        the walk, 'start' included.
        """
        current = self.build_node('start')
        total = current.reward
        visited = []

        upcoming = current.transition()
        while upcoming is not None:
            visited.append(upcoming)
            current = self.build_node(upcoming)
            total += current.reward
            upcoming = current.transition()

        return visited, total

In [6]:
# Simulator built with the default transition probabilities and rewards.
caves_paths = CavesPaths()

In [7]:
# Sample a handful of walks to sanity-check the simulator: each line shows
# the accumulated reward and the sequence of nodes visited.
for _ in range(5):
    path, distance = caves_paths.build_path()
    print(f'distance traveled: {distance}\n\tpath: {path}')

distance traveled: 8
	path: ['B', 'A', 'A', 'B', 'A', 'C']
distance traveled: 1
	path: ['B', 'C']
distance traveled: 2
	path: ['A', 'C']
distance traveled: 1
	path: ['B', 'C']
distance traveled: 21
	path: ['A', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C']


In [8]:
# We want the probability that the triple A, A, B appears in the path
# and that it does so before any occurrence of B, A, A.
def check_aab(path):
    """Return True when A, A, B occurs in ``path`` before any B, A, A.

    The path is scanned left to right over every window of three consecutive
    node names. The first window equal to ['A', 'A', 'B'] gives True; the
    first window equal to ['B', 'A', 'A'] gives False. A path containing
    neither triple (including any path shorter than three names) gives False.

    Parameters:
        path: list of node names, e.g. ['B', 'A', 'A', 'C'].

    Raises:
        AssertionError: if ``path`` is not a list.
    """
    assert isinstance(path, list)
    # len(path) - 2 windows exist; the previous bound of len(path) - 3
    # silently skipped the final one, so ['A', 'A', 'B'] was reported False.
    for start in range(len(path) - 2):
        # Compare element-wise rather than by string concatenation so that
        # multi-character node names cannot accidentally spell a triple.
        window = path[start:start + 3]
        if window == ['A', 'A', 'B']:
            return True
        if window == ['B', 'A', 'A']:
            return False
    return False

In [9]:
def create_data(m:int=1000000, caves:Optional["CavesPaths"]=None) -> pd.DataFrame:
    """Simulate ``m`` walks and collect them in a labeled DataFrame.

    Parameters:
        m: number of paths to simulate.
        caves: object exposing ``build_path()`` that returns a
            ``(path, distance)`` pair. When None, the notebook-level
            ``caves_paths`` instance is used, which is what this function
            always did before the parameter existed.

    Returns:
        DataFrame with one row per simulated walk and the columns
          'paths'  -- the path returned by ``build_path`` (a list of names),
          'days'   -- the distance returned by ``build_path``, stored as int,
          'target' -- 1 when ``check_aab(path)`` is truthy, otherwise 0.
    """
    source = caves_paths if caves is None else caves

    paths = []
    distances = []
    targets = []
    for _ in range(m):
        path, distance = source.build_path()
        paths.append(path)
        distances.append(distance)
        # Label while the plain list is at hand: indexing a pandas Series
        # element by element in a second Python loop is needlessly slow.
        targets.append(1 if check_aab(path) else 0)

    # Wrap the list of lists in a Series so pandas keeps each path as a
    # single object cell instead of trying to interpret it as 2-D data.
    return pd.DataFrame({
        'paths': pd.Series(paths),
        'days': pd.Series(distances, dtype=int),
        'target': pd.Series(targets, dtype=int)
    })

In [10]:
# Simulate the full dataset (create_data defaults to 1,000,000 paths).
df = create_data()

In [11]:
# Peek at the first few simulated paths with their totals and labels.
df.head()

Unnamed: 0,paths,days,target
0,"[A, A, B, A, C]",7,1
1,"[B, A, A, A, A, A, A, A, A, C]",17,0
2,"[B, B, C]",2,0
3,"[B, A, A, A, A, B, C]",10,0
4,"[A, A, B, A, A, A, C]",11,1


In [12]:
# 'target' is 0/1, so its mean is the fraction of simulated paths that
# check_aab labeled as positive.
df['target'].mean()

0.151337

In [13]:
import sklearn
from sklearn.model_selection import train_test_split