In [1]:
!pip install stanza

Collecting stanza
  Using cached stanza-1.8.1-py3-none-any.whl (970 kB)
Installing collected packages: stanza
Successfully installed stanza-1.8.1


In [9]:
import stanza
# stanza.download('en')       # One-time download of the English models; uncomment on a machine that does not have them yet
nlp = stanza.Pipeline('en') # Default English pipeline; later cells use it through the global name `nlp`
doc = nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
# Print (word, head index, dependency relation) for the first sentence only
doc.sentences[0].print_dependencies()

2024-04-17 12:45:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-17 12:45:17 INFO: Downloaded file to /home/zuoyu916/stanza_resources/resources.json
2024-04-17 12:45:18 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2024-04-17 12:45:18 INFO: Using device: cpu
2024-04-17 12:45:18 INFO: Loading: tokenize
2024-04-17 12:45:18 INFO: Loading: mwt
2024-04-17 12:45:18 INFO: Loading: pos
2024-04-17 12:45:19 INFO: Loading: lemma
2024-04-17 12:45:19 INFO: Loading: constituency
2024-04-17 12:45:20 INFO: Loading: depparse
2024-04-17 12:45:20 INFO: Loading: sentiment
2024-04-17 12:45:21 INFO: Loading: ner

('Barack', 4, 'nsubj:pass')
('Obama', 1, 'flat')
('was', 4, 'aux:pass')
('born', 0, 'root')
('in', 6, 'case')
('Hawaii', 4, 'obl')
('.', 4, 'punct')


In [3]:
import stanza
#assuming NLP pipeline for english has already been initialised

def extract_words(input_text):
    """Print every token of ``input_text`` with its tags and collect the nouns.

    Each token is printed as ``text, UPOS, deprel``. Noun tokens are printed
    flush left; every other token is prefixed with a tab so the nouns stand
    out in the listing.

    Uses the module-level ``nlp`` pipeline created in an earlier cell; the
    words it returns must carry ``upos`` and ``deprel``.

    Args:
        input_text: Raw text to analyse.

    Returns:
        list[str]: Text of each NOUN / PROPN token, in document order.
    """
    # Universal POS tags treated as nouns. A bare ``startswith('N')`` check
    # would also match NUM (numerals), which are not nouns.
    noun_tags = ("NOUN", "PROPN")

    # Process the input text
    doc = nlp(input_text)

    nouns = []
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.upos in noun_tags:
                nouns.append(word.text)
                print(f"{word.text}, {word.upos}, {word.deprel}")
            else:
                print(f"\t{word.text}, {word.upos}, {word.deprel}")
    return nouns

# Example usage: print the tagged tokens of a single definition sentence
input_sentence = "The magnetic flux density is defined as the force per unit length per unit current acting on a straight current-carrying conductor placed perpendicular to a magnetic field."
nouns_list = extract_words(input_sentence)

	The, DET, det
	magnetic, ADJ, amod
flux, NOUN, compound
density, NOUN, nsubj:pass
	is, AUX, aux:pass
	defined, VERB, root
	as, ADP, case
	the, DET, det
force, NOUN, obl
	per, ADP, case
unit, NOUN, compound
length, NOUN, nmod
	per, ADP, case
unit, NOUN, compound
current, NOUN, nmod
	acting, VERB, advcl
	on, ADP, case
	a, DET, det
	straight, ADJ, amod
	current, ADJ, amod
	-, PUNCT, punct
	carrying, VERB, amod
conductor, NOUN, obl
	placed, VERB, acl
	perpendicular, ADJ, xcomp
	to, ADP, case
	a, DET, det
	magnetic, ADJ, amod
field, NOUN, obl
	., PUNCT, punct


In [4]:
def extract_subj(input_text):
    """Collect, per sentence, the adjectives and nouns that precede the first auxiliary.

    Every sentence is scanned left to right. ADJ and NOUN words are appended
    to a phrase; the scan of that sentence stops at the first AUX token. A
    sentence without an AUX token contributes nothing.

    Uses the module-level ``nlp`` pipeline created in an earlier cell.

    Args:
        input_text: Raw text to analyse.

    Returns:
        list[str] | None: One phrase per contributing sentence. Each phrase is
        its words joined by single spaces and keeps a trailing space (e.g.
        ``'naturalistic fallacy '``). ``None`` when no sentence contributed.
    """
    # Explicit tags instead of ``startswith("N")``, which also matched NUM.
    phrase_tags = ("ADJ", "NOUN")

    doc = nlp(input_text)
    subjects = []
    for sentence in doc.sentences:
        phrase = ""
        for word in sentence.words:
            if word.upos == "AUX":
                # A sentence that opens with an auxiliary has no subject
                # phrase yet; do not record an empty string for it.
                if phrase:
                    subjects.append(phrase)
                break
            if word.upos in phrase_tags:
                phrase += f"{word.text} "
    # Preserve the original contract: nothing found -> None rather than [].
    return subjects or None

In [5]:
# The leading and trailing line breaks are part of the input being tried
input_sentence = """
The naturalistic fallacy: Just because this is the way things CURRENTLY ARE does not mean this is the way things OUGHT TO BE.
"""
print(extract_subj(input_sentence))

['naturalistic fallacy ']


In [16]:
import re
#Assuming stanza is imported
def extract_subj_v2(input_text):
    """Find a subject phrase and the token that terminates it.

    Runs of line breaks are first replaced by ``': '`` so that a heading on
    its own line is followed by a colon. Every sentence is then scanned left
    to right: ADJ, ADP, NOUN and PROPN words are appended to a phrase, and the
    scan of that sentence stops at the first AUX token or ``':'``.

    Uses the module-level ``nlp`` pipeline created in an earlier cell.

    Args:
        input_text: Raw text, possibly spanning several lines.

    Returns:
        dict: Always contains ``'subj'``: ``''`` when nothing was found,
        otherwise the phrase with a trailing space. When a phrase was found
        the dict has exactly one more key, named after the UPOS tag of the
        terminating token (``'AUX'``, or ``'PUNCT'`` in the recorded run for
        ``':'``), mapped to that token's text. If several sentences yield a
        phrase, the last one wins.
    """
    # Explicit tags instead of ``startswith("N")``, which also matched NUM.
    subject_tags = ("ADJ", "ADP", "NOUN", "PROPN")

    long_linetext = re.sub(r'[\n\r]+', ': ', input_text)
    doc = nlp(long_linetext)
    result = {'subj': ''}
    for sentence in doc.sentences:
        phrase = ""
        for word in sentence.words:
            if word.upos in subject_tags:
                phrase += f"{word.text} "
            elif word.upos == "AUX" or word.text == ":":
                if phrase:
                    # Rebuild instead of updating in place, so a terminator
                    # key stored for an earlier sentence cannot linger next
                    # to this sentence's subject.
                    result = {'subj': phrase, word.upos: word.text}
                break
    return result

In [31]:
# Trial inputs for extract_subj_v2, in the order they are printed.
# Each entry is (text, dump_tags); when dump_tags is True the token listing
# from extract_words is printed before the extraction result.
demo_cases = [
    ("A dog is a mammal.", False),
    ("The magnetic flux density is defined as the force per unit length per unit current acting\n"
     "on a straight current-carrying conductor placed perpendicular to a magnetic field.", False),
    ("Intuitionism: \n"
     "It is the theory that moral truths in metaethics are known directly by intuition.", False),
    ("\n"
     "The naturalistic fallacy: Just because this is the way things CURRENTLY ARE does not mean this is the way things OUGHT TO BE.\n",
     False),
    ("Divine Command Theory (DCT)\n"
     "It holds that all moral beliefs come from God.", False),
    # Two definition sentences in one input; the recorded result names 'Momentum'
    ("Force is the product of mass and acceleration. Momentum is the product of force and velocity.", False),
    ("Momentum is the product of force and velocity. It is the integral of Force w.r.t. time.", True),
    # In the recorded run the first 'is' is tagged VERB, so it does not end the subject phrase
    ("There is a high number of commuters in the evening, which is the cause of the jam.", True),
    ("The equation for Force only due to acceleration is $F=ma$", True),
]

for input_sentence, dump_tags in demo_cases:
    if dump_tags:
        extract_words(input_sentence)
    print(extract_subj_v2(input_sentence))

{'subj': 'dog ', 'AUX': 'is'}
{'subj': 'magnetic flux density ', 'AUX': 'is'}
{'subj': 'Intuitionism ', 'PUNCT': ':'}
{'subj': ''}
{'subj': 'Divine Command Theory DCT ', 'PUNCT': ':'}
{'subj': 'Momentum ', 'AUX': 'is'}
Momentum, NOUN, nsubj
	is, AUX, cop
	the, DET, det
product, NOUN, root
	of, ADP, case
force, NOUN, nmod
	and, CCONJ, cc
velocity, NOUN, conj
	., PUNCT, punct
	It, PRON, nsubj
	is, AUX, cop
	the, DET, det
integral, NOUN, root
	of, ADP, case
Force, NOUN, nmod
	w.r.t., SYM, case
time, NOUN, appos
	., PUNCT, punct
{'subj': 'Momentum ', 'AUX': 'is'}
	There, PRON, expl
	is, VERB, root
	a, DET, det
	high, ADJ, amod
number, NOUN, nsubj
	of, ADP, case
commuters, NOUN, nmod
	in, ADP, case
	the, DET, det
evening, NOUN, obl
	,, PUNCT, punct
	which, PRON, nsubj
	is, AUX, cop
	the, DET, det
cause, NOUN, acl:relcl
	of, ADP, case
	the, DET, det
jam, NOUN, nmod
	., PUNCT, punct
{'subj': 'high number of commuters in evening ', 'AUX': 'is'}
	The, DET, det
equation, NOUN, nsubj
	for, ADP, c

In [32]:
class flashcard():
    """A question/answer pair derived from one piece of definition text.

    The answer is the text itself; the question is produced by ``extract``
    from the result of ``extract_subj_v2``.
    """

    def __init__(self, text):
        """Build the question for ``text`` and keep ``text`` as the answer."""
        self._qns = self.extract(text)
        self._ans = text

    def output(self):
        """Return the question and the answer separated by a newline.

        When no question could be built the first line is the literal
        ``None``.
        """
        return "{}\n{}".format(self._qns, self._ans)

    def extract(self, text):
        """Return ``"What <aux> <subject>?"`` for ``text``, or ``None``.

        ``<aux>`` is the ``'AUX'`` entry of the extraction result and falls
        back to ``"is"`` when that entry is absent. ``None`` is returned when
        the extracted subject is empty.
        """
        parsed = extract_subj_v2(text)
        verb = parsed.get("AUX", "is")
        subject = parsed["subj"]
        if not subject:
            return None
        return "What {} {}?".format(verb, subject.rstrip(' '))

# Quick check on a two-sentence definition; the recorded output asks about "Momentum"
dog = flashcard("Momentum is the product of force and velocity. It is the integral of Force w.r.t. time.")
print(dog.output())

What is Momentum?
Momentum is the product of force and velocity. It is the integral of Force w.r.t. time.


In [1]:
import stanza
# Pipeline requested with the tokenize processor only, kept in `nlp2`
# (the recorded load log also lists mwt).
nlp2 = stanza.Pipeline('en', processors = "tokenize")

2024-04-17 12:43:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-17 12:43:02 INFO: Downloaded file to /home/zuoyu916/stanza_resources/resources.json
2024-04-17 12:43:02 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2024-04-17 12:43:02 INFO: Using device: cpu
2024-04-17 12:43:02 INFO: Loading: tokenize
2024-04-17 12:43:03 INFO: Loading: mwt
2024-04-17 12:43:03 INFO: Done loading processors!


In [2]:
# Tokenise a sample sentence and show its word records, grouped per sentence.
doc = nlp2("Testing out a new sentence.")
words_by_sentence = []
for sentence in doc.sentences:
    words_by_sentence.append(sentence.words)
print(words_by_sentence)

[[{
  "id": 1,
  "text": "Testing",
  "start_char": 0,
  "end_char": 7
}, {
  "id": 2,
  "text": "out",
  "start_char": 8,
  "end_char": 11
}, {
  "id": 3,
  "text": "a",
  "start_char": 12,
  "end_char": 13
}, {
  "id": 4,
  "text": "new",
  "start_char": 14,
  "end_char": 17
}, {
  "id": 5,
  "text": "sentence",
  "start_char": 18,
  "end_char": 26
}, {
  "id": 6,
  "text": ".",
  "start_char": 26,
  "end_char": 27
}]]


In [40]:
import re
#Assuming stanza is imported
def extract_subj_v3(input_text):
    """Find a subject phrase and the token that terminates it (two-pipeline variant).

    Runs of line breaks are first replaced by ``': '``. The text is passed
    through the module-level ``nlp2`` pipeline and the resulting document is
    handed to the module-level ``nlp`` pipeline.
    NOTE(review): whether ``nlp`` reuses the tokenisation from ``nlp2`` or
    tokenises again is not visible in this notebook; confirm against the
    stanza documentation.

    Every sentence is then scanned left to right: ADJ, ADP, NOUN and PROPN
    words are appended to a phrase, and the scan of that sentence stops at
    the first AUX token or ``':'``.

    Args:
        input_text: Raw text, possibly spanning several lines.

    Returns:
        dict: Always contains ``'subj'``: ``''`` when nothing was found,
        otherwise the phrase with a trailing space. When a phrase was found
        the dict has exactly one more key, named after the UPOS tag of the
        terminating token, mapped to that token's text. If several sentences
        yield a phrase, the last one wins.
    """
    # Explicit tags instead of ``startswith("N")``, which also matched NUM.
    subject_tags = ("ADJ", "ADP", "NOUN", "PROPN")

    long_linetext = re.sub(r'[\n\r]+', ': ', input_text)
    doc = nlp(nlp2(long_linetext))
    result = {'subj': ''}
    for sentence in doc.sentences:
        phrase = ""
        for word in sentence.words:
            if word.upos in subject_tags:
                phrase += f"{word.text} "
            elif word.upos == "AUX" or word.text == ":":
                if phrase:
                    # Rebuild instead of updating in place, so a terminator
                    # key stored for an earlier sentence cannot linger next
                    # to this sentence's subject.
                    result = {'subj': phrase, word.upos: word.text}
                break
    return result

class flashcard2():
    """Question/answer pair whose question comes from ``extract_subj_v3``.

    The answer is the text the card was built from.
    """

    def __init__(self, text):
        """Build the question for ``text`` and keep ``text`` as the answer."""
        self._qns = self.extract(text)
        self._ans = text

    def output(self):
        """Return question and answer on two lines.

        The first line is the literal ``None`` when no question was built.
        """
        return "{}\n{}".format(self._qns, self._ans)

    def extract(self, text):
        """Return ``"What <aux> <subject>?"`` for ``text``, or ``None``.

        ``<aux>`` is the ``'AUX'`` entry of the extraction result, defaulting
        to ``"is"``. ``None`` is returned for an empty subject.
        """
        found = extract_subj_v3(text)
        helper_verb = found.get("AUX", "is")
        topic = found["subj"]
        if not topic:
            return None
        return "What {} {}?".format(helper_verb, topic.rstrip(' '))

In [33]:
# Benchmark sentences shared by both flashcard variants
input_sentences = ["The equation for Force only due to acceleration is $F=ma$ which is also the rate of change of momentum over time.",
                   "History is a verbal structure in the form of a narrative prose discourse that purports to be a model or icon of past structures.",
                   "The magnetic flux linkage is defined as the product of the number of turns of the coil and the magnetic flux through each turn.",
                   "A historian cannot access past events directly, they can only access them indirectly, and any account of these events is thus an invention rather than a representation of the past.",
                   "Alternating current occurs when charge carriers periodically reverse their direction of motion.",
                   "The side that the induced current points to has higher potential, because the current pushes electrons away from that side."]
import time
for n in input_sentences:
    # Time construction of a flashcard (v2 extractor) for this sentence
    start = time.process_time()
    v2 = flashcard(n)
    V2 = time.process_time() - start
    # Time construction of a flashcard2 (v3 extractor) for the same sentence
    start = time.process_time()
    v3 = flashcard2(n)
    V3 = time.process_time() - start
    # Report only the variant with the smaller time; a tie reports V2
    print(("V3",V3,v3.output()) if V2 > V3 else ("V2",V2,v2.output()))

('V3', 2.620412598999991, 'What is equation for Force due to acceleration?\nThe equation for Force only due to acceleration is $F=ma$ which is also the rate of change of momentum over time.')
('V2', 2.7066345439999964, 'What is History?\nHistory is a verbal structure in the form of a narrative prose discourse that purports to be a model or icon of past structures.')
('V2', 2.6183042689999922, 'What is magnetic flux linkage?\nThe magnetic flux linkage is defined as the product of the number of turns of the coil and the magnetic flux through each turn.')
('V3', 3.480182291999995, 'What can historian?\nA historian cannot access past events directly, they can only access them indirectly, and any account of these events is thus an invention rather than a representation of the past.')
('V2', 1.8347111659999769, 'None\nAlternating current occurs when charge carriers periodically reverse their direction of motion.')
('V2', 2.439261839999972, 'None\nThe side that the induced current points to h

# Displaying LaTeX

In [124]:
from IPython.display import display, Latex
# Raw string: "\P" and "\p" are not valid Python escape sequences, so a plain
# literal here triggers an invalid-escape warning. The text passed to Latex
# is unchanged.
display(Latex(r"Magnetic flux, $\Phi =B_{\perp}NA$"))
# Assumes the LaTeX source passed in is itself correct.

<IPython.core.display.Latex object>

# WE HAVE BEEN BLESSED BY THE COMPUTING GODS 

In [146]:
import stanza
# Tokenize + POS pipeline only, kept in `nlp4`; used by extract_subj_v4 and classifier
nlp4 = stanza.Pipeline('en', processors = "tokenize,pos")

2024-04-24 04:11:10 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-24 04:11:11 INFO: Downloaded file to /home/zuoyu916/stanza_resources/resources.json
2024-04-24 04:11:14 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

2024-04-24 04:11:14 INFO: Using device: cpu
2024-04-24 04:11:14 INFO: Loading: tokenize
2024-04-24 04:11:14 INFO: Loading: mwt
2024-04-24 04:11:14 INFO: Loading: pos
2024-04-24 04:11:15 INFO: Done loading processors!


In [149]:
import re
#Assuming stanza is imported
def extract_subj_v4(input_text):
    """Find a subject phrase and the token that terminates it, using ``nlp4``.

    Runs of line breaks are first replaced by ``': '``. The text is processed
    with the module-level ``nlp4`` pipeline (created with the tokenize and pos
    processors). Every sentence is then scanned left to right: ADJ, ADP, NOUN
    and PROPN words are appended to a phrase, and the scan of that sentence
    stops at the first AUX token or ``':'``. The stop rule is a hard-coded
    heuristic.

    Args:
        input_text: Raw text, possibly spanning several lines.

    Returns:
        dict: Always contains ``'subj'``: ``''`` when nothing was found,
        otherwise the phrase with a trailing space. When a phrase was found
        the dict has exactly one more key, named after the UPOS tag of the
        terminating token, mapped to that token's text. If several sentences
        yield a phrase, the last one wins.
    """
    # Explicit tags instead of ``startswith("N")``, which also matched NUM.
    subject_tags = ("ADJ", "ADP", "NOUN", "PROPN")

    long_linetext = re.sub(r'[\n\r]+', ': ', input_text)
    doc = nlp4(long_linetext)
    result = {'subj': ''}
    for sentence in doc.sentences:
        phrase = ""
        for word in sentence.words:
            if word.upos in subject_tags:
                phrase += f"{word.text} "
            elif word.upos == "AUX" or word.text == ":":  # hard-coded stop rule
                if phrase:
                    # Rebuild instead of updating in place, so a terminator
                    # key stored for an earlier sentence cannot linger next
                    # to this sentence's subject.
                    result = {'subj': phrase, word.upos: word.text}
                break
    return result

class flashcard4():
    """Question/answer pair whose question comes from ``extract_subj_v4``.

    The answer is the text the card was built from.
    """

    def __init__(self, text):
        """Build the question for ``text`` and keep ``text`` as the answer."""
        self._qns = self.extract(text)
        self._ans = text

    def output(self):
        """Return question and answer on two lines, or ``None`` without a question."""
        if self._qns is None:
            return None
        return "{}\n{}".format(self._qns, self._ans)

    def extract(self, text):
        """Return ``"What <aux> <subject>?"`` for ``text``, or ``None``.

        ``<aux>`` is the ``'AUX'`` entry of the extraction result, defaulting
        to ``"is"`` (hard-coded question template). ``None`` is returned for
        an empty subject.
        """
        found = extract_subj_v4(text)
        helper_verb = found.get("AUX", "is")
        topic = found["subj"]
        if not topic:
            return None
        return "What {} {}?".format(helper_verb, topic.rstrip(' '))

# Same two-sentence check as for the earlier flashcard class, now through flashcard4
dog = flashcard4("Momentum is the product of force and velocity. It is the integral of Force w.r.t. time.")
print(dog.output())

What is Momentum?
Momentum is the product of force and velocity. It is the integral of Force w.r.t. time.


In [61]:
# Benchmark sentences (same list as in the v2/v3 comparison)
input_sentences = ["The equation for Force only due to acceleration is $F=ma$ which is also the rate of change of momentum over time.",
                   "History is a verbal structure in the form of a narrative prose discourse that purports to be a model or icon of past structures.",
                   "The magnetic flux linkage is defined as the product of the number of turns of the coil and the magnetic flux through each turn.",
                   "A historian cannot access past events directly, they can only access them indirectly, and any account of these events is thus an invention rather than a representation of the past.",
                   "Alternating current occurs when charge carriers periodically reverse their direction of motion.",
                   "The side that the induced current points to has higher potential, because the current pushes electrons away from that side."]

import time
for n in input_sentences:
    # Measure flashcard2 on this sentence too. Previously V3 / v3 were whatever
    # the last iteration of an earlier cell had left in the kernel, so every
    # V4 was compared against one stale timing and one stale card.
    start = time.process_time()
    v3 = flashcard2(n)
    V3 = time.process_time() - start
    start = time.process_time()
    v4 = flashcard4(n)
    V4 = time.process_time() - start
    # Report only the variant with the smaller time; a tie reports V4
    print(("V3",V3,v3.output()) if V3<V4 else ("V4",V4,v4.output()) )

('V4', 0.33826237100004164, 'What is equation for Force due to acceleration?\nThe equation for Force only due to acceleration is $F=ma$ which is also the rate of change of momentum over time.')
('V4', 0.3240291899999761, 'What is History?\nHistory is a verbal structure in the form of a narrative prose discourse that purports to be a model or icon of past structures.')
('V4', 0.31145768199996837, 'What is magnetic flux linkage?\nThe magnetic flux linkage is defined as the product of the number of turns of the coil and the magnetic flux through each turn.')
('V4', 0.6002836480000155, 'What can historian?\nA historian cannot access past events directly, they can only access them indirectly, and any account of these events is thus an invention rather than a representation of the past.')
('V4', 0.23678522500000554, None)
('V4', 0.30203014700003905, None)


In [81]:
# Two-sentence input used to see how flashcard4 copes with more than one sentence
text = """Many countries do not engage in free trade to protect their economies from foreign competition. Protectionism is about policies to restrict international trade, preventing the free movement of goods and services between countries"""

# Dump the tagged document produced by the tokenize+pos pipeline
print(nlp4(text))

# `time` is not imported in this cell; it must already be in the kernel from an earlier cell
start = time.process_time()
v4 = flashcard4(text)
V4 = time.process_time() - start
print("V4",V4,v4.output())

# arr = text.split('.')
# parsed = [a+"." for a in [n for n in arr if len(n)!=0]]
# import time
# for n in parsed:
#     start = time.process_time()
#     v4 = flashcard4(n)
#     V4 = time.process_time() - start
#     print("V4",V4,v4.output())

[
  [
    {
      "id": 1,
      "text": "Many",
      "upos": "ADJ",
      "xpos": "JJ",
      "feats": "Degree=Pos",
      "start_char": 0,
      "end_char": 4
    },
    {
      "id": 2,
      "text": "countries",
      "upos": "NOUN",
      "xpos": "NNS",
      "feats": "Number=Plur",
      "start_char": 5,
      "end_char": 14
    },
    {
      "id": 3,
      "text": "do",
      "upos": "AUX",
      "xpos": "VBP",
      "feats": "Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
      "start_char": 15,
      "end_char": 17
    },
    {
      "id": 4,
      "text": "not",
      "upos": "PART",
      "xpos": "RB",
      "start_char": 18,
      "end_char": 21
    },
    {
      "id": 5,
      "text": "engage",
      "upos": "VERB",
      "xpos": "VB",
      "feats": "VerbForm=Inf",
      "start_char": 22,
      "end_char": 28
    },
    {
      "id": 6,
      "text": "in",
      "upos": "ADP",
      "xpos": "IN",
      "start_char": 29,
      "end_char": 31
    },
    {
      

In [144]:
# Sample sentences, one per question pattern being explored.
# Only the one passed to nlp4 below is tagged; swap the variable to try another.
text = "A historian cannot access past events directly, they can only access them indirectly, and any account of these events is thus an invention rather than a representation of the past."
IStext = "Force is the product of mass and acceleration."
DOEStext = "The car drives quickly across the highway."
WHENtext = "There is only induced emf when there is a changing magnetic flux."
# Raw string: "\P" and "\p" are not valid Python escape sequences, so a plain
# literal triggers an invalid-escape warning. The string value is unchanged.
EQNtext = r"The equation for magnetic flux is $\Phi = B_\perp N A$."
WHYtext = "If there is a magnet, it will have a magnetic field."

# Print one CSV-like row (text, UPOS, XPOS) per word, with a header per sentence
doc = nlp4(WHYtext)
for sentence in doc.sentences:
    print("'Word','UPOS','XPOS'")
    for word in sentence.words:
        print(f"{word.text},{word.upos},{word.xpos}")
    print()

'Word','UPOS','XPOS'
If,SCONJ,IN
there,PRON,EX
is,VERB,VBZ
a,DET,DT
magnet,NOUN,NN
,,PUNCT,,
it,PRON,PRP
will,AUX,MD
have,VERB,VB
a,DET,DT
magnetic,ADJ,JJ
field,NOUN,NN
.,PUNCT,.



In [None]:
#OBSERVATIONS MADE

# NOUN + AUX,VBZ => WHAT IS/ARE
# NOUN + VERB => WHAT DO/DOES/DID
# NOUN + AUX,MD => WHAT CAN
# NOUN + ADV,WRB + ___ => WHEN...
# NOUN + SCONJ,IN + ___ => WHY...

# A PART token (e.g. "not") flips the sentence from positive to negative

In [170]:
def classifier(text):
    """Print the ``(text, UPOS, XPOS)`` tuple of every NOUN token in ``text``.

    Runs of line breaks are replaced by ``': '`` and the result is tagged with
    the module-level ``nlp4`` pipeline.

    NOTE(review): the body of the final loop was missing from the notebook
    (the cell did not parse). The NOUN filter and the printed tuple format
    are reconstructed from the recorded output of
    ``classifier("The dog is a mammal.")``; confirm this is the intended
    behaviour.

    Args:
        text: Raw text, possibly spanning several lines.

    Returns:
        None. The tuples are printed, one per line, in document order.
    """
    long_linetext = re.sub(r'[\n\r]+',': ',text)
    doc = nlp4(long_linetext)
    # Keep the tokens in a list rather than a dict keyed by word.id: stanza
    # word ids restart in every sentence, so a dict would overwrite entries
    # as soon as the input has more than one sentence.
    tagged = []
    for sentence in doc.sentences:
        for word in sentence.words:
            tagged.append((word.text, word.upos, word.xpos))
    for entry in tagged:
        if entry[1] == "NOUN":
            print(entry)

# Smoke test; the recorded output for this call lists the tuples for 'dog' and 'mammal'
classifier("The dog is a mammal.")

('dog', 'NOUN', 'NN')
('mammal', 'NOUN', 'NN')
