In [3]:

import numpy as np
import pandas
from copy import deepcopy
from matplotlib import pyplot as plt
from random import random
from random import randint
import re
from warnings import warn

In [76]:
class Gram:
    def __init__(self, patterns, beforenum, afternum):

        if type(patterns) == str:
            patterns = [patterns]

        self.patterns = patterns
        self.patterns.sort()
        self.beforearray = np.array([dict() for _ in range(beforenum)])
        self.afterarray = np.array([dict() for _ in range(afternum)])
        randval = random()
        self.hashval = hash(str(self.patterns) + str(randval))
        self.num_entries = 0

    
    def addGram(self, gram, pos):
        """
        gram        - gram being read from text material
        pos         - position relative to this gram (0 is invalid here)
        """
        if pos > 0:
            if pos < len( self.afterarray ):
                self.afterarray[-pos].setdefault(gram,0)
                self.afterarray[-pos][gram] += 1
                self.num_entries += 1
            else:
                raise ValueError(f"pos ({pos}) is out of range of {self}.afterarray which is of length {len(self.afterarray)}")
        elif pos < 0:
            if pos < len( self.beforearray ):
                self.beforearray[pos].setdefault(gram,0)
                self.beforearray[pos][gram] += 1
                self.num_entries += 1
            else:
                raise ValueError(f"pos ({pos}) is out of range of {self}.beforearray which is of length {len(self.beforearray)}")
        else:
            raise ValueError("pos cannot be 0")
    
    def getStatsForPosition(self, pos):
        if pos > 0:
            if pos < len( self.afterarray ):
                return( self.afterarray[-pos] )
            else:
                raise ValueError(f"pos ({-pos}) is out of range of {self}.afterarray which is of length {len(self.afterarray)}")
        elif pos < 0:
            if pos < len( self.beforearray ):
                return( self.beforearray[pos] )
            else:
                raise ValueError(f"pos ({pos}) is out of range of {self}.beforearray which is of length {len(self.beforearray)}")
        else:
            raise ValueError("pos cannot be 0")

    def matchPattern(self, s):
        """
        s   - a string
        returns the length of the matching pattern if there is a match
        else returns 0
        """
        longestmatch = 0
        for pattern in self.getPatterns():

            ## truncate string s
            str_to_match = s[:len(pattern)]

            # match truncated string
            if str_to_match == pattern:
                if len(pattern) > longestmatch:
                    longestmatch = len(pattern)
        return(longestmatch)


    def getPatterns(self):
        return(self.patterns)

    def __lt__(self, other):
        return(str(self.patterns) < str(other.patterns))
    def __gt__(self, other):
        return(str(self.patterns) > str(other.patterns))
    def __lte__(self, other):
        return(str(self.patterns) <= str(other.patterns))
    def __gte__(self, other):
        return(str(self.patterns) >= str(other.patterns))
    def __eq__(self, other):
        return(self.patterns == other.patterns)
    def __ne__(self, other):
        return(self.patterns != other.patterns)

    def __hash__(self):
        return(self.hashval)
    def __str__(self):
        s = '<Gram: "'
        s += str(self.patterns)
        s += '" hash: '
        s += str(self.__hash__())
        s += ' entries: '
        s += str(self.num_entries)
        s += ">"
        return(s)



In [104]:
class GramData:
    def __init__(self, beforenum, afternum):
        self.allgrams = dict()
        self.allgrams_by_pattern = dict()
        self.beforenum = beforenum
        self.afternum = afternum
        self.emptygram = Gram("<EMPTY_GRAM>", self.beforenum, self.afternum)
        self.defaultStartGram = Gram("<DEFAULT_START_GRAM>", self.beforenum, self.afternum)
        self.defaultEndGram = Gram("<DEFAULT_END_GRAM>", self.beforenum, self.afternum)
    
    def addGram(self, gram):
        self.allgrams.setdefault(gram, 0)
        #self.allgrams[gram] += 1

        for pattern in gram.getPatterns():
            self.allgrams_by_pattern.setdefault(pattern,set())
            self.allgrams_by_pattern[pattern].add(gram)

    def incGramCount(self, gram):
        self.allgrams.setdefault(gram, 0)
        self.allgrams[gram] += 1

        #for pattern in gram.getPatterns():
        #    self.allgrams_by_pattern.setdefault(pattern,set())
        #    self.allgrams_by_pattern[pattern].add(gram)

    def readGramSequence(self, gramSequence, startgram = None, endgram = None):
        if startgram == None:
            startgram = self.defaultStartGram
        if endgram == None:
            endgram = self.defaultEndGram
        
        for i,gram in enumerate(gramSequence):
            self.incGramCount(gram)
            
            ## before gram
            for pos in range(-1, self.beforenum, -1):
                i_before = i + pos
                if i_before < -1:
                    beforegram = self.emptygram
                elif i_before == -1:
                    beforegram = startgram
                else:
                    beforegram = gramSequence[i_before]
                gram.addGram(beforegram, pos)

            ## after gram
            for pos in range(1, self.afternum, 1):
                i_after = i + pos
                if i_after > len(gramSequence):
                    aftergram = self.emptygram
                elif i_after == len(gramSequence):
                    aftergram = endgram
                else:
                    aftergram = gramSequence[i_after]
                gram.addGram(aftergram, pos)

    def tokenizeString(self, s):
        token_sequence = []
        i=0
        while i < len(s):
            #print(i)
            #print(s[i])
            longestmatch_gram = None
            longestmatch = 0
            for gram in self.allgrams.keys():
                
                result = gram.matchPattern(s[i:])
#                print(gram)
                if result > 0:
                    if result > longestmatch:
                        longestmatch_gram = gram
                        longestmatch = result
            if longestmatch == 0:
                longestmatch = 1
                longestmatch_gram = Gram(s[i], self.beforenum, self.afternum)
                self.addGram(longestmatch_gram)

            i += longestmatch
            token_sequence.append(longestmatch_gram)

        return(token_sequence)

    
    def tokenizeTextBasic(self, s):
        gramSequence = []

        allchars = set(s)
        local_allgrams_by_pattern = dict()
        for character in allchars:
            if character in local_allgrams_by_pattern:
                continue
            elif character in self.allgrams_by_pattern:
                local_allgrams_by_pattern[character] = list(self.allgrams_by_pattern[character])[0]  ## this won't work with multiple patterns with the same pattern
                if len(self.allgrams_by_pattern[character]) > 1:
                    warn(f"tokenizeTextBasic: there are multiple grams for pattern '{character}'")
            else:
                local_allgrams_by_pattern[character] = Gram(character, self.beforenum, self.afternum)

        for character in s:
            gramSequence.append( local_allgrams_by_pattern[character] )

        return(gramSequence)

            

In [105]:
gd = GramData(5,5)
gramSequence = gd.tokenizeString("here is a cat and here it is again")
gd.readGramSequence(gramSequence)



In [106]:
for gram,count in gd.allgrams.items():
    print(f"{gram}:{count}")


<Gram: "['h']" hash: 2020856964 entries: 8>:2
<Gram: "['e']" hash: 513820499 entries: 16>:4
<Gram: "['r']" hash: -506551348 entries: 8>:2
<Gram: "[' ']" hash: 2038929698 entries: 32>:8
<Gram: "['i']" hash: -1494704369 entries: 16>:4
<Gram: "['s']" hash: 2084596253 entries: 8>:2
<Gram: "['a']" hash: 1132832397 entries: 20>:5
<Gram: "['c']" hash: -1899141315 entries: 4>:1
<Gram: "['t']" hash: -638609257 entries: 8>:2
<Gram: "['n']" hash: -312730298 entries: 8>:2
<Gram: "['d']" hash: 1809122505 entries: 4>:1
<Gram: "['g']" hash: -1945577307 entries: 4>:1


In [103]:
for gram in gramSequence:
    print(gram)


<Gram: "['h']" hash: 88354605 entries: 8>
<Gram: "['e']" hash: 1572867462 entries: 16>
<Gram: "['r']" hash: -3166736 entries: 8>
<Gram: "['e']" hash: 1572867462 entries: 16>
<Gram: "[' ']" hash: 1795449775 entries: 32>
<Gram: "['i']" hash: 1212407449 entries: 16>
<Gram: "['s']" hash: 1577366684 entries: 8>
<Gram: "[' ']" hash: 1795449775 entries: 32>
<Gram: "['a']" hash: 666374999 entries: 20>
<Gram: "[' ']" hash: 1795449775 entries: 32>
<Gram: "['c']" hash: -1232861293 entries: 4>
<Gram: "['a']" hash: 666374999 entries: 20>
<Gram: "['t']" hash: 112387569 entries: 8>
<Gram: "[' ']" hash: 1795449775 entries: 32>
<Gram: "['a']" hash: 666374999 entries: 20>
<Gram: "['n']" hash: 1480122275 entries: 8>
<Gram: "['d']" hash: -2071129317 entries: 4>
<Gram: "[' ']" hash: 1795449775 entries: 32>
<Gram: "['h']" hash: 88354605 entries: 8>
<Gram: "['e']" hash: 1572867462 entries: 16>
<Gram: "['r']" hash: -3166736 entries: 8>
<Gram: "['e']" hash: 1572867462 entries: 16>
<Gram: "[' ']" hash: 17954497

In [21]:
str(['cat'])

"['cat']"