In [79]:
class ASTNode:
    pass

class NumberNode(ASTNode):
    def __init__(self, value):
        self.value = value
        
class BinOpNode(ASTNode):
    def __init__(self, left, op, right):
        self.left = left
        self.op = op
        self.right = right

class UnaryOpNode(ASTNode):
    def __init__(self, op, expr):
        self.op = op
        self.expr = expr

In [77]:
class Parser:
    def __init__(self, text):
        self.tokens = self.tokenize(text)
        self.pos = 0
        
    def parse(self):
        return self.expr()
        
    def tokenize(self, text):
        # Tokenize the input string into a list of tokens
        # ...
        
    def consume(self, expected_token_type):
        # Consume the next token if it is of the expected type
        # ...
        
    def expr(self):
        # Parse an expression
        left = self.term()
        
        if self.pos < len(self.tokens) and self.tokens[self.pos].type in ["PLUS", "MINUS"]:
            op_token = self.tokens[self.pos]
            self.pos += 1
            right = self.expr()
            return BinOpNode(left, op_token.value, right)
        else:
            return left
        
    def term(self):
        # Parse a term
        left = self.factor()
        
        if self.pos < len(self.tokens) and self.tokens[self.pos].type in ["MULTIPLY", "DIVIDE"]:
            op_token = self.tokens[self.pos]
            self.pos += 1
            right = self.term()
            return BinOpNode(left, op_token.value, right)
        else:
            return left
        
    def factor(self):
        # Parse a factor
        token = self.tokens[self.pos]
        self.pos += 1
        
        if token.type == "LPAREN":
            expr_node = self.expr()
            self.consume("RPAREN")
            return expr_node
        elif token.type == "NUMBER":
            return NumberNode(token.value)
        elif token.type == "PLUS":
            expr_node = self.expr()
            return UnaryOpNode(token.value, expr_node)
        elif token.type == "MINUS":
            expr_node = self.expr()
            return UnaryOpNode(token.value, expr_node)
        else:
            raise SyntaxError("Unexpected token: {}".format(token))


In [78]:
text = "2 + 3 * (4 - 1)"
parser = Parser(text)
root_node = parser.parse()

['1', '+', '1']


In [65]:
parser = Parser()
parser.parse('Hello World')
print(parser)

{'bob': 'Hello World'}


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
!pip install nltk

In [84]:
import spacy
nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()

In [85]:
doc = nlp("This is a sentence.")
print([(w.text, w.pos_) for w in doc])

[('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


In [87]:
import nltk
import spacy

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

nlp = spacy.load("en_core_web_sm")

text = "This is a sample sentence."

# Tokenize with NLTK
nltk_tokens = nltk.word_tokenize(text)

# Tag with NLTK
nltk_tags = nltk.pos_tag(nltk_tokens)

# Tokenize and tag with SpaCy
spacy_doc = nlp(text)
spacy_tokens = [token.text for token in spacy_doc]
spacy_tags = [(token.text, token.pos_) for token in spacy_doc]

print("NLTK tokens:", nltk_tokens)
print("NLTK tags:", nltk_tags)
print("SpaCy tokens:", spacy_tokens)
print("SpaCy tags:", spacy_tags)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Corbin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Corbin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


NLTK tokens: ['This', 'is', 'a', 'sample', 'sentence', '.']
NLTK tags: [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('sentence', 'NN'), ('.', '.')]
SpaCy tokens: ['This', 'is', 'a', 'sample', 'sentence', '.']
SpaCy tags: [('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sample', 'NOUN'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


In [139]:
class Parser:
    world = {}
    # nlp = SpacyProcessor

    def __init__(self) -> None:
        self.nlp = spacy.load("en_core_web_sm")
        self.world = {}

    def __str__(self) -> str:
        return str(self.world)
    
    def identify(self, doc):
        for token in doc:
            if ('subj' in token.dep_):
                subtree = list(token.subtree)
                start = subtree[0].i
                end = subtree[-1].i + 1
                return doc[start:end]
            
    def objective(self, doc):
        for token in doc:
            if('dobj' in token.dep_):
                subtree = list(token.subtree)
                start = subtree[0].i
                end = subtree[-1].i + 1
                return doc[start:end]

    def parse(self, sentence) -> str:
        doc = self.nlp(sentence)
        sub = self.identify(doc)
        obj = self.objective(doc)
        if obj:
            print(obj)
        self.world[str(sub)] = sentence
        return "Worked."

In [140]:
sents = [
    'The brown fox jumped over the lazy dog.',
    'Jane watched TV with her brother when she was tired.',
    'Steve Martin is a good actor.',
    "The big black cat stared at the small dog.",
    "Jane watched her brother in the evenings."
]

sorting_algorithm = [
    'There is a line of people at Walmart.',
    'The people in this line are Greg, Steve, and Stan.',
    'Steve is three feet tall.',
    'Greg is two feet tall.',
    'Stan is five feet tall.'
    # Query: Who is in line? : Steve, Greg, Stan
    # Query: What is the order by height? : Stan, Steve, Greg
]

In [141]:
parser = Parser()
for sentence in sorting_algorithm:
    parser.parse(sentence)

print(parser)

{'None': 'There is a line of people at Walmart.', 'The people in this line': 'The people in this line are Greg, Steve, and Stan.', 'Steve': 'Steve is three feet tall.', 'Greg': 'Greg is two feet tall.', 'Stan': 'Stan is five feet tall.'}


In [89]:
def get_subject_phrase(doc):
    for token in doc:
        if ('subj' in token.dep_):
            subtree = list(token.subtree)
            start = subtree[0].i
            end = subtree[-1].i + 1
            return doc[start:end]

In [90]:
def get_object_phrase(doc):
    for token in doc:
        subtree = list(token.subtree)
        start = subtree[0].i
        end = subtree[-1].i + 1
        return doc[start:end]

In [98]:
for sentence in sents:
    doc = nlp(sentence)
    sub = get_subject_phrase(doc)
    obj = get_object_phrase(doc)
    print(sub)
    # print(obj)

The brown fox
Jane


In [122]:
sentence = "The cat sat on the mat."

doc = nlp(sentence)

# Find the root verb of the sentence
root = None
for token in doc:
    if token.dep_ == "ROOT" and token.pos_ == "VERB":
        root = token
        break

if root:
    # Print the root verb and its children
    print("Predicate:", root.text)
    print("Children:", [child.text for child in root.children])
else:
    print("No predicate found.")


Predicate: sat
Children: ['cat', 'on', '.']


In [137]:
import nltk

# Download the required NLTK resources (only need to do this once)
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("chunking")

sentence = "What is 2 plus 2?"

# Tokenize the sentence into words
words = nltk.word_tokenize(sentence)

# Perform part-of-speech tagging on the words
tagged_words = nltk.pos_tag(words)

# Define a chunk grammar to extract the math expression
chunk_grammar = r"""
    NP: {<CD>+}            # One or more cardinal numbers
        {<NN.*>+}         # One or more nouns
        {<SYM>}           # A symbol representing the operator
        {<CD>+}          # One or more cardinal numbers
"""

# Create a chunk parser using the chunk grammar
chunk_parser = nltk.RegexpParser(chunk_grammar)

# Parse the tagged words using the chunk parser
parsed_sentence = chunk_parser.parse(tagged_words)

# Find the math expression in the parse tree
math_expr = None
for subtree in parsed_sentence.subtrees():
    if subtree.label() == "NP":
        # Extract the words from the subtree and join them into a string
        math_expr = " ".join([word for word, pos in subtree.leaves()])
        break

if math_expr:
    # Perform the calculation
    result = eval(math_expr)
    print("Result:", result)
else:
    print("No math expression found.")


Result: 2


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Corbin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Corbin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Error loading chunking: Package 'chunking' not found in
[nltk_data]     index


In [153]:
sentence = "The quick brown fox jumped over the lazy dog."
words = nltk.tokenize.word_tokenize(sentence)
pos = nltk.pos_tag(words)
pos

[('The', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumped', 'VBD'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN'),
 ('.', '.')]