In [None]:
# each line is one sentence (1,000 sentences total); no empty categories (A-bar dependencies)
# count constituents

#call count nodes & corpus (queries); print results
#focus on learning query language

In [28]:
#!/usr/bin/env python
"""Tree queries for HW4."""

from typing import List

from nltk import tgrep, tree


# Relative path to the parsed-sentence file.
# NOTE(review): main() opens "wsj.parse" directly instead of using this
# constant, so DATA is unused in the code shown here -- confirm which path
# is intended.
DATA = "data/wsj.parse"


def count_nodes(query: str, forest: List[tree.ParentedTree]) -> int:
    """Counts the nodes in a forest that match a TGrep query.

    The query is run over every tree in `forest`, and the per-tree match
    lists are tallied into a single total.

    Args:
        query: a TGrep2 query string.
        forest: a list of one or more `ParentedTree`s to be queried.

    Returns:
        The total number of matching nodes across all trees.
    """
    total = 0
    # `tgrep_nodes` is a generator that yields, for each sentence, a list
    # of the nodes matching the query.
    for matches in tgrep.tgrep_nodes(query, forest):
        total += len(matches)
    return total


def main() -> None:
    """Loads the treebank, runs each TGrep query, and prints the counts.

    Reads one bracketed parse per line from ``wsj.parse`` (relative to the
    current working directory), builds a `ParentedTree` for each non-blank
    line, counts the nodes matching each query with `count_nodes`, and
    prints one numbered, comma-formatted result line per query.
    """
    # Loads collection of trees. The path was changed to the notebook-local
    # file; the module-level DATA constant is not used here.
    forest = []
    with open("wsj.parse", "r", encoding="utf-8") as source:
        for line in source:
            bracketing = line.rstrip()
            # A blank line (e.g. a trailing empty line) is not a tree and
            # cannot be parsed, so it is skipped rather than crashing.
            if not bracketing:
                continue
            forest.append(tree.ParentedTree.fromstring(bracketing))

    # The queries run only after the loading loop has finished, so every
    # query sees the complete forest and each one is executed exactly once.
    # Each entry pairs the label printed in the report with its query.
    queries = (
        ("sentence fragments", "FRAG"),
        ("cardinal numbers", "CD"),
        ("determiner phrases", "NP < DT"),
        ("indefinite determiner phrases", "NP < (DT < a)"),
        # The conjunction is a sister to the coordinated NPs.
        ("coordinated noun", "NP < (NP $ CC $ NP)"),
        ("verb phrases which contain an embedded noun phrase", "VP << NP"),
        ("transitive verb phrases", "VP < NP"),
        ("intransitive verb phrases", "VP !< NP"),
        ("ditransitive verb phrases", "VP < (NP $ NP)"),
        (
            "transitive verb phrases with a low-attached prepositional phrase",
            "VP < (NP < PP)",
        ),
    )

    # All counts are computed before anything is printed, so a failing
    # query raises before any partial report is written.
    results = [
        (description, count_nodes(query, forest))
        for description, query in queries
    ]
    for number, (description, count) in enumerate(results, start=1):
        print(f"{number}. {description} count: {count:,}")



# Run the queries only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

1. sentence fragments count: 18
2. cardinal numbers count: 647
3. determiner phrases count: 1,128
4. indefinite determiner phrases count: 274
5. coordinated noun count: 128
6. verb phrases which contain an embedded noun phrase count: 1,535
7. transitive verb phrases count: 548
8. intransitive verb phrases count: 1,288
9. ditransitive verb phrases count: 7
10. transitive verb phrases with a low-attached prepositional phrase count: 139
