In [87]:

import numpy as np
import pandas
from copy import deepcopy
from matplotlib import pyplot as plt
from random import random
from random import randint
import re
from warnings import warn

In [99]:
class Gram:
    def __init__(self, pattern, beforenum, afternum):
        self.pattern = pattern
        self.beforearray = np.array([dict() for _ in range(beforenum)])
        self.afterarray = np.array([dict() for _ in range(afternum)])
        randval = random()
        self.hashval = hash(self.pattern + str(randval))
        self.num_entries = 0

    
    def addGram(self, gram, pos):
        """
        gram        - gram being read from text material
        pos         - position relative to this gram (0 is invalid here)
        """
        if pos > 0:
            if pos < len( self.afterarray ):
                self.afterarray[-pos].setdefault(gram,0)
                self.afterarray[-pos][gram] += 1
                self.num_entries += 1
            else:
                raise ValueError(f"pos ({pos}) is out of range of {self}.afterarray which is of length {len(self.afterarray)}")
        elif pos < 0:
            if pos < len( self.beforearray ):
                self.beforearray[pos].setdefault(gram,0)
                self.beforearray[pos][gram] += 1
                self.num_entries += 1
            else:
                raise ValueError(f"pos ({pos}) is out of range of {self}.beforearray which is of length {len(self.beforearray)}")
        else:
            raise ValueError("pos cannot be 0")
    
    def getStatsForPosition(self, pos):
        
        if pos > 0:
            if pos < len( self.afterarray ):
                return( self.afterarray[-pos] )
            else:
                raise ValueError(f"pos ({-pos}) is out of range of {self}.afterarray which is of length {len(self.afterarray)}")
        elif pos < 0:
            if pos < len( self.beforearray ):
                return( self.beforearray[pos] )
            else:
                raise ValueError(f"pos ({pos}) is out of range of {self}.beforearray which is of length {len(self.beforearray)}")
        else:
            raise ValueError("pos cannot be 0")

    def getPattern(self):
        return(self.pattern)

    def __lt__(self, other):
        return(self.pattern < other.pattern)
    def __gt__(self, other):
        return(self.pattern > other.pattern)
    def __lte__(self, other):
        return(self.pattern <= other.pattern)
    def __gte__(self, other):
        return(self.pattern >= other.pattern)
    def __eq__(self, other):
        return(self.pattern == other.pattern)
    def __ne__(self, other):
        return(self.pattern == other.pattern)

    def __hash__(self):
        return(self.hashval)
    def __str__(self):
        s = '<Gram: "'
        s += self.pattern
        s += '" hash: '
        s += str(self.hashval)
        s += ' entries: '
        s += str(self.num_entries)
        s += ">"
        return(s)



In [104]:
class GramData:
    def __init__(self, beforenum, afternum):
        self.allgrams = dict()
        self.allgrams_by_pattern = dict()
        self.beforenum = beforenum
        self.afternum = afternum
        self.emptygram = Gram("<EMPTY_GRAM>", self.beforenum, self.afternum)
        self.defaultStartGram = Gram("<DEFAULT_START_GRAM>", self.beforenum, self.afternum)
        self.defaultEndGram = Gram("<DEFAULT_END_GRAM>", self.beforenum, self.afternum)
    
    def addGram(self, gram):
        self.allgrams.setdefault(gram, 0)
        self.allgrams[gram] += 1

        self.allgrams_by_pattern.setdefault(gram.getPattern(),set())
        self.allgrams_by_pattern[gram.getPattern()].add(gram)

    def readGramSequence(self, gramSequence, startgram = None, endgram = None):
        if startgram == None:
            startgram = self.defaultStartGram
        if endgram == None:
            endgram = self.defaultEndGram
        
        for i,gram in enumerate(gramSequence):
            self.addGram(gram)
            
            ## before gram
            for pos in range(-1, self.beforenum, -1):
                i_before = i + pos
                if i_before < -1:
                    beforegram = self.emptygram
                elif i_before == -1:
                    beforegram = startgram
                else:
                    beforegram = gramSequence[i_before]
                gram.addGram(beforegram, pos)

            ## after gram
            for pos in range(1, self.afternum, 1):
                i_after = i + pos
                if i_after > len(gramSequence):
                    aftergram = self.emptygram
                elif i_after == len(gramSequence):
                    aftergram = endgram
                else:
                    aftergram = gramSequence[i_after]
                gram.addGram(aftergram, pos)
    
    def tokenizeTextBasic(self, s):
        gramSequence = []

        allchars = set(s)
        local_allgrams_by_pattern = dict()
        for character in allchars:
            if character in local_allgrams_by_pattern:
                continue
            elif character in self.allgrams_by_pattern:
                local_allgrams_by_pattern[character] = list(self.allgrams_by_pattern[character])[0]  ## this won't work with multiple patterns with the same pattern
                if len(self.allgrams_by_pattern[character]) > 1:
                    warn(f"tokenizeTextBasic: there are multiple grams for pattern '{character}'")
            else:
                local_allgrams_by_pattern[character] = Gram(character, self.beforenum, self.afternum)

        for character in s:
            gramSequence.append( local_allgrams_by_pattern[character] )

        return(gramSequence)

            


                






In [105]:
gd = GramData(5,5)
gramSequence = gd.tokenizeTextBasic("here is a cat")
gd.readGramSequence(gramSequence)


In [108]:
for gram, count in gd.allgrams.items():
    print(f"{gram} : {count}")

<Gram: "h" hash: -709373536 entries: 4> : 1
<Gram: "e" hash: -2088609242 entries: 8> : 2
<Gram: "r" hash: 1511071314 entries: 4> : 1
<Gram: " " hash: 1947830918 entries: 12> : 3
<Gram: "i" hash: -1557622774 entries: 4> : 1
<Gram: "s" hash: 1610470904 entries: 4> : 1
<Gram: "a" hash: 214198794 entries: 8> : 2
<Gram: "c" hash: -1339954065 entries: 4> : 1
<Gram: "t" hash: -277469929 entries: 4> : 1


In [73]:
re.match("cat", "catlcatolok")



<re.Match object; span=(0, 3), match='cat'>