# What's in this notebook?
Here I experiment with context free grammars to see use to evaluate markov model sentences (based solely on grammar) and to generate gramatically correct sentences.

In [90]:
# import speech data
import pickle
import pandas as pd
import numpy as np
from nltk import CFG

with open('labeled_data.pickle', 'rb') as f:
    data = pickle.load(f)

We can only use gramatically correct transcripts (ie, not YouTube autogenerated text). Thus, we'll get only the punctuated texts by only looking at those with periods and commas. 

In [69]:
cfg_data = data[data.transcript.apply(lambda x: x.count(',') > 1 and x.count('.') > 2)]

In [70]:
cfg_data

Unnamed: 0,source,transcript,processed,given_by
6,https://www.youtube.com/watch?v=9ZDuNzhelhQ,Will the graduates to the degree of Bachelor\n...,"[graduate, degree, bachelor, art, please, rise...",celebrity
12,https://www.youtube.com/watch?v=RIS2dlnlsRo,Thank you so much Thank you. [LAUGHTER] Great...,"[thank, much, thank, laughter, great, guy, ins...",celebrity
26,https://www.youtube.com/watch?v=LN2ZlDdaOA8,[Music] [Music] I'm also pleased to welcome dr...,"[music, music, also, please, welcome, dr, elia...",celebrity
28,https://www.youtube.com/watch?v=WiWLAEY4XK0,">>John Hope Franklin: Chancellor Mullen,\nmemb...","[john, hope, franklin, chancellor, mullen, mem...",politician
29,https://www.youtube.com/watch?v=e0SsSr9Trro,>> Clifton Wharton: Chancellor Mullen Chairman...,"[clifton, wharton, chancellor, mullen, chairma...",politician
31,https://www.youtube.com/watch?v=UF8uR6Z6KLc,Thank You. I am honored to be with you today a...,"[thank, honor, today, commencementfrom, one, f...",politician
36,https://www.youtube.com/watch?v=28-jRr9MMcY,well we sent an email to all of our faculty st...,"[well, send, email, faculty, staff, student, g...",celebrity
40,https://www.youtube.com/watch?v=4E6ui2HsjjU,please help me in welcoming to the podium the ...,"[please, help, welcome, podium, man, call, bea...",celebrity
44,https://www.youtube.com/watch?v=1BVmRt3ap0o,[Music] [Music] now it is my pleasure to intro...,"[music, music, pleasure, introduce, commenceme...",academic
45,https://www.youtube.com/watch?v=ZzBCv8VoN3I,">> Martha Nussbaum: Faculty, administrators,\n...","[martha, nussbaum, faculty, administrator, alu...",celebrity


In [71]:
def replace_mult(punc_list, to_replace):
    for punc in punc_list:
        to_replace = to_replace.replace(punc, ' ')
    return to_replace

In [97]:
take_out = """ HOME   |  CONTACT         (adsbygoogle = window.adsbygoogle ||   ).push({});       HOME   |  CONTACT         (adsbygoogle = window.adsbygoogle ||   ).push({});      HOME   |  CONTACT         (adsbygoogle = window.adsbygoogle ||   ).push({});            (adsbygoogle = window.adsbygoogle ||   ).push({});           (adsbygoogle = window.adsbygoogle ||   ).push({});           """
cfg_data.transcript = cfg_data.transcript.apply(lambda x: 
                          replace_mult(['\n', '\x96', '\x92',
                                        '\x98', '[LAUGHTER]',
                                        '[Applause]', '[Music]',
                                       '>>', '[', ']', ':', 
                                        take_out, '\\', '#', '$',
                                       "'", '(', ')'], x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


## Part of Speech Tagging
Need to assign words in the corpora POS tags, so that we can develop a CFG based on that.

In [98]:
import nltk

cfg_data.transcript.apply(lambda x: 
                          nltk.pos_tag(nltk.word_tokenize(x)))

6       [(Will, MD), (the, DT), (graduates, NNS), (to,...
12      [(Thank, NNP), (you, PRP), (so, RB), (much, JJ...
26      [(I, PRP), (m, VBP), (also, RB), (pleased, VBN...
28      [(John, NNP), (Hope, NNP), (Franklin, NNP), (C...
29      [(Clifton, NNP), (Wharton, NNP), (Chancellor, ...
31      [(Thank, NNP), (You, PRP), (., .), (I, PRP), (...
36      [(well, RB), (we, PRP), (sent, VBD), (an, DT),...
40      [(please, VB), (help, VB), (me, PRP), (in, IN)...
44      [(now, RB), (it, PRP), (is, VBZ), (my, PRP$), ...
45      [(Martha, NNP), (Nussbaum, NNP), (Faculty, NNP...
50      [(it, PRP), (almost, RB), (goes, VBZ), (withou...
73      [(Our, PRP$), (next, JJ), (speaker, NN), (,, ,...
85      [(Chancellor, NNP), (Peterson, NNP), (,, ,), (...
93      [(Earnest, JJS), (Gains, NNS), (There, EX), (s...
113     [(President, NNP), (Bush, NNP), (delivered, VB...
118     [(once, RB), (again, RB), (as, IN), (Jayhawks,...
148     [(this, DT), (program, NN), (is, VBZ), (brough...
154     [(so, 

In [99]:
words_with_tags = _

In [388]:
with open('pos_tags_punctuated_speech.pickle', 'wb') as f:
    pickle.dump(words_with_tags, f, pickle.HIGHEST_PROTOCOL)

In [100]:
# get a list of tags for each speech
pos_tag_lists = []
for tup_list in words_with_tags:
    tags = []
    for tup in tup_list:
        tags.append(tup[1])
    pos_tag_lists.append(tags)
pos_tag_lists[55]

['PRP',
 'VBZ',
 'DT',
 'NN',
 'CC',
 'DT',
 'NN',
 'TO',
 'VB',
 'RB',
 ',',
 'PRP',
 'MD',
 'VB',
 'PRP',
 'IN',
 'DT',
 'JJ',
 'NN',
 'NNS',
 'CC',
 'RB',
 'NN',
 'IN',
 'JJ',
 'NN',
 '.',
 'PRP$',
 'NN',
 'IN',
 'JJ',
 'JJ',
 'RB',
 'JJ',
 'NNS',
 'IN',
 'DT',
 'NNP',
 'IN',
 'NNP',
 '.',
 'PRP',
 'VBD',
 'JJ',
 'IN',
 'DT',
 'RB',
 'JJ',
 'NN',
 ',',
 'WRB',
 'VBZ',
 'NN',
 'VB',
 'TO',
 'VB',
 'IN',
 'NNS',
 'IN',
 'NNP',
 'VBP',
 'JJR',
 'IN',
 'PRP',
 'VBP',
 '.',
 'CC',
 'PRP',
 'VBN',
 ',',
 'PRP',
 'VBD',
 'IN',
 ',',
 'CC',
 'RB',
 'PRP',
 'VBZ',
 'JJ',
 ',',
 'NNS',
 'RB',
 'VBG',
 'NNS',
 'IN',
 'NNP',
 ',',
 'VBP',
 'IN',
 'NNS',
 'IN',
 'NNP',
 'VBP',
 'RBR',
 '.',
 'NNS',
 'IN',
 'NNP',
 'VBP',
 'PRP',
 'RB',
 ',',
 'CC',
 'JJ',
 'NNP',
 'NN',
 'JJ',
 ',',
 'PRP',
 'VBP',
 'RB',
 'RB',
 'JJ',
 '.',
 'CC',
 'IN',
 'VBG',
 'TO',
 'VB',
 'WRB',
 'PRP',
 'VBZ',
 'IN',
 'NN',
 'VBZ',
 'IN',
 'NN',
 'NN',
 'WDT',
 'VBZ',
 'JJ',
 'NNS',
 'RB',
 ',',
 'PRP',
 'VBD',
 'DT',
 'N

In [101]:
# break up the list of tags into lists of sentences (sentences end in . or ! or ?)
sentences = []
for tag_list in pos_tag_lists:
    sentence = []
    for tag in tag_list:
        if tag != '.':
            sentence.append(tag)
        else:
            sentence.append(tag)
            sentences.append(sentence)
            sentence = []

## Turn POS Tags into a CFG
First, I'll get an idea of the order of these tags, then I'll use then to determine how the CFG will be written.

In [102]:
# max sentence length
print('max:', max(len(x) for x in sentences))

# min sentence length
print('min:', min(len(x) for x in sentences))

# avg sentence length
print('mean:', np.mean([len(x) for x in sentences]))

# median sentence length
print('median:', np.median([len(x) for x in sentences]))

# avg sentence length
print('1st quartile', np.quantile([len(x) for x in sentences], .25))

# avg sentence length
print('3rd quartile', np.quantile([len(x) for x in sentences], .75))

max: 4706
min: 1
mean: 18.088898802159388
median: 15.0
1st quartile 9.0
3rd quartile 24.0


In [119]:
sentence_set = set()
for sentence in sentences:
    # change to a tuple bc lists are unhashable
    sentence_set.add(tuple(sentence))

# this tells us that there are 226,412 types of sentences
len(sentences) - len(sentence_set)

226412

In [108]:
# look at the first 10 words
print('first word', {sentence[0] for sentence in sentences if len(sentence) > 0}, '\n')
print('second word', {sentence[1] for sentence in sentences if len(sentence) > 1}, '\n')
print('third word', {sentence[2] for sentence in sentences if len(sentence) > 2}, '\n')
print('fourth word', {sentence[3] for sentence in sentences if len(sentence) > 3}, '\n')
print('fifth word', {sentence[4] for sentence in sentences if len(sentence) > 4}, '\n')
print('sixth word', {sentence[5] for sentence in sentences if len(sentence) > 5}, '\n')
print('seventh word', {sentence[6] for sentence in sentences if len(sentence) > 6}, '\n')
print('eighth word', {sentence[7] for sentence in sentences if len(sentence) > 7}, '\n')
print('nineth word', {sentence[8] for sentence in sentences if len(sentence) > 8}, '\n')
print('tenth word', {sentence[9] for sentence in sentences if len(sentence) > 9}, '\n')


first word {'PRP$', 'UH', '$', ',', 'NNS', 'VBZ', 'JJS', 'PDT', 'EX', 'NNPS', 'RBS', '``', 'RBR', 'IN', 'CD', 'VBD', 'VBN', 'RB', 'MD', 'PRP', '.', 'FW', 'RP', ':', 'NNP', 'VBG', 'NN', 'TO', 'VBP', 'WP', 'WDT', "''", 'JJ', 'VB', 'JJR', 'CC', 'DT', 'WRB'} 

second word {'PRP$', 'UH', ',', 'NNS', 'VBZ', 'JJS', 'PDT', 'NNPS', 'EX', 'RBS', '``', 'RBR', 'IN', 'CD', 'VBD', 'VBN', '.', 'PRP', 'RB', 'MD', 'FW', 'RP', ':', 'NNP', 'VBG', 'NN', 'TO', 'VBP', 'WP', 'WDT', "''", 'JJ', 'VB', 'JJR', 'WP$', 'CC', 'DT', 'WRB'} 

third word {'PRP$', 'UH', ',', 'NNS', 'VBZ', 'JJS', 'PDT', 'NNPS', 'EX', 'RBS', '``', 'RBR', 'IN', 'VBD', 'CD', 'VBN', '.', 'PRP', 'RB', 'MD', 'FW', 'RP', ':', 'NNP', 'TO', 'NN', 'VBG', 'VBP', 'WP', 'WDT', "''", 'JJ', 'VB', 'JJR', 'WP$', 'CC', 'DT', 'WRB', 'POS'} 

fourth word {'PRP$', 'UH', '$', ',', 'NNS', 'VBZ', 'JJS', 'PDT', 'NNPS', 'EX', 'RBS', '``', 'RBR', 'IN', 'CD', 'VBD', 'VBN', '.', 'PRP', 'RB', 'MD', 'FW', 'RP', ':', 'NNP', 'TO', 'NN', 'VBG', 'VBP', 'WP', 'WDT', "''",

## Writing a CFG using the NP VP model 
See page 6, Fig 12.3 of http://www.cs.pomona.edu/~kim/CSC181S08/text/12.pdf
Using this basic structure, I'll begin to write a CFG, then improve that CFG based on examples that are deemed "not proper english". There will likely always be room for improvement, but to begin, I will deem the original transcripts as "proper english", and then get an average score (+1 for each sentence that is deemed proper by the CFG, normalized for the number of sentences in the transcript). That will give me a target score for a "good" speech, which will enable me to compare models. 

In [123]:
# getting some ideas for how certain common sentences are tagged
print(nltk.pos_tag(nltk.word_tokenize('Thank you.')))
print(nltk.pos_tag(nltk.word_tokenize('Congratulations class of 2018.')))
print(nltk.pos_tag(nltk.word_tokenize('I am honored to be here with you all')))
print(nltk.pos_tag(nltk.word_tokenize("If I hadn't done this thing, I wouldn't have gotten where I am today")))

[('Thank', 'NNP'), ('you', 'PRP'), ('.', '.')]
[('Congratulations', 'NNS'), ('class', 'NN'), ('of', 'IN'), ('2018', 'CD'), ('.', '.')]
[('I', 'PRP'), ('am', 'VBP'), ('honored', 'VBN'), ('to', 'TO'), ('be', 'VB'), ('here', 'RB'), ('with', 'IN'), ('you', 'PRP'), ('all', 'DT')]
[('If', 'IN'), ('I', 'PRP'), ('had', 'VBD'), ("n't", 'RB'), ('done', 'VBN'), ('this', 'DT'), ('thing', 'NN'), (',', ','), ('I', 'PRP'), ('would', 'MD'), ("n't", 'RB'), ('have', 'VB'), ('gotten', 'VBN'), ('where', 'WRB'), ('I', 'PRP'), ('am', 'VBP'), ('today', 'NN')]


In [209]:
# my starting off CFG, based on the link in the markdown cell
cfg_string = """
S -> NP VP 
NP -> Pronoun | ProperNoun | Det Nominal
Pronoun -> 'WP' | 'WP$' | 'PRP' | 'PRP$'
ProperNoun -> 'NNP' | 'NNPS'
Det -> 'CD' | 'DT' | 'WDT' | 'PDT'
Nominal -> 'NN' | 'NNS'
VP -> Verb | Verb NP | Verb NP PP | Verb PP | Verb VP | toVerb | Verb Adverb| '.'
Verb -> 'MD' 'VB' | 'VBD' | 'VBN' | 'VBP' | 'VBZ' | 'VBG'
Adverb -> 'RB'
toVerb -> 'TO' Verb
PP -> Preposition NP
Preposition -> 'IN'
"""

# ignore this! I am playing with Stanford's CFG developer (https://web.stanford.edu/class/archive/cs/cs103/cs103.1156/tools/cfg/)
# and it did not like things longer than one character
cfg_alias_string = """
S -> A B
A -> C | D | E | F
C -> 'a' | 'b' | 'c' | 'd'
D -> 'e' | 'f'
E -> 'g' | 'h' | 'i' | 'j'
F -> 'k' | 'l'
B -> G | G A | G A H | G H | G B | I | G J
G -> 'm' 'n' | 'o' | 'p' | 'q' | 'r'
J -> 's'
I -> 't' G
H -> K A
k -> 'u'
"""

# to help me interpret POS tags so that it plays nicely with the alias CFG
pos_alias_dict = {'WP':'a', 'WP$':'b', 'PRP':'c', 'PRP$': 'd', 
'NNP':'e', 'NNPS':'f', 'CD':'g', 'DT': 'h', 
'WDT': 'i', 'PDT':'j', 'NN': 'k', 'NNS': 'l',
'MD': 'm', 'VB':'n', 'VBD':'o', 'VBN':'p', 
'VBP': 'q', 'VBZ':'r', 'RB': 's', 'TO':'t', 
'IN':'u'}

In [210]:
grammar = CFG.fromstring(cfg_string)
print(grammar)

Grammar with 34 productions (start state = S)
    S -> NP VP
    NP -> Pronoun
    NP -> ProperNoun
    NP -> Det Nominal
    Pronoun -> 'WP'
    Pronoun -> 'WP$'
    Pronoun -> 'PRP'
    Pronoun -> 'PRP$'
    ProperNoun -> 'NNP'
    ProperNoun -> 'NNPS'
    Det -> 'CD'
    Det -> 'DT'
    Det -> 'WDT'
    Det -> 'PDT'
    Nominal -> 'NN'
    Nominal -> 'NNS'
    VP -> Verb
    VP -> Verb NP
    VP -> Verb NP PP
    VP -> Verb PP
    VP -> Verb VP
    VP -> toVerb
    VP -> Verb Adverb
    VP -> '.'
    Verb -> 'MD' 'VB'
    Verb -> 'VBD'
    Verb -> 'VBN'
    Verb -> 'VBP'
    Verb -> 'VBZ'
    Verb -> 'VBG'
    Adverb -> 'RB'
    toVerb -> 'TO' Verb
    PP -> Preposition NP
    Preposition -> 'IN'


In [302]:
def check_grammar(sentence, grammar):
    rdp = nltk.RecursiveDescentParser(grammar)
    
    just_tags = [word_tup[1] for word_tup in nltk.pos_tag(nltk.word_tokenize(sentence))]
    parsed = rdp.parse(just_tags)
    if len(list(parsed)) > 0:
        print('Correct! Structure:')
        for tree in rdp.parse(just_tags):
            print(tree)
    else:
        print('Not grammatically correct according to this CFG.')
        print(just_tags)
    return 

In [225]:
check_grammar('I will go', grammar)

Correct! Structure:
(S (NP (Pronoun PRP)) (VP (Verb MD VB)))


In [226]:
check_grammar('Mary saw a dog', grammar)

Correct! Structure:
(S (NP (ProperNoun NNP)) (VP (Verb VBD) (NP (Det DT) (Nominal NN))))


In [227]:
check_grammar('Congratulations to the class of 2018', grammar)

Not grammatically correct according to this CFG.
['NNS', 'TO', 'DT', 'NN', 'IN', 'CD']


In [228]:
# let's give it something we know is bad
check_grammar('socks going to mall', grammar)

Not grammatically correct according to this CFG.
['NNS', 'VBG', 'TO', 'VB']


AAAAAaaaaAAhhHH that's so cool! So far it's only working on basic sentences, but that's a start! Now to try and capture more nuances. In order to focus my work on getting better at parsing commencement addresses, specifically, I'll look into the grammar of some random sentences in the corpus. I'm thinking that this time around, I'll have to incorporate tags like "JJ", which is the tag for an adjective.

In [374]:
cfg_data.iloc[3].transcript

' John Hope Franklin  Chancellor Mullen, members of the board of regents, members of the board of trustees, faculty, students, friends of the university, members of the class of 2001, parents and loved ones. This is from this point of a view, a beautiful sight. The most magnificent umbrellas I think I ve ever seen, and I hope that you are comfortable under them. Despite the fact that some of the most cynical are the less sentimental among us, or plan denigrate commencement exercises, I for one, regard them as not only significant, but down right exciting. Anyone who s attended as many commencements as I have must have made peace with them in one way or another. I have made peace with them by falling in love with them. We in front of us these graduates, who have devoted years of arduous work in their courses of study, and the completion of their task is symbolized by the ritual of this commencement. Here are these relatives, and friends, who have given so much of themselves as a token o

In [217]:
# an easier sentence
check_grammar('One hundred and sixty-three classes of Notre Dame graduates have sat where you sit today', grammar)

ValueError: Grammar does not cover some of the input words: "'CC', 'JJ', 'WRB'".

In [221]:
nltk.pos_tag(nltk.word_tokenize('163 classes of Notre Dame graduates have sat where you sit today'))

[('163', 'CD'),
 ('classes', 'NNS'),
 ('of', 'IN'),
 ('Notre', 'NNP'),
 ('Dame', 'NNP'),
 ('graduates', 'NNS'),
 ('have', 'VBP'),
 ('sat', 'VBN'),
 ('where', 'WRB'),
 ('you', 'PRP'),
 ('sit', 'VBP'),
 ('today', 'NN')]

First thing to fix: `Grammar does not cover some of the input words: "'CC', 'JJ', 'WRB'"` This tell us what I expected -- I've got to incorporate more POS tags. Also, spelled out numbers trip up NLTK's POS tagging, so I'll need to convert spelled out numbers to digits before passing it to the POS tagger. 
- 'CC' is a 'coordinating conjunction', which, according to google is "a conjunction placed between words, phrases, clauses, or sentences of equal rank", like 'and', 'but', or 'or'
    - A sentence that uses coordinating conjunctions, such as "I want tacos and pizza" should be broken down into:
    ```(S (NP (PRP I)) (VP (VBP want) (NP (NNS tacos) (CC and) (NN pizza))))```
- 'JJ' is an adjective
- 'WRB' is wh-abverb, such as 'where' or 'when'

In [385]:
# second CFG attempt
cfg_string1 = """
S -> NPS VP | 'IN' NPS VP | 'DT' NPS VP | 'EX' NPS VP
NPS -> NP | NP 'CC' NP | 'WRB' NP
NP -> Pronoun | ProperNoun | Det| Det Nominal | Nominal
Pronoun -> 'WP' | 'WP$' | 'PRP' | 'PRP$'
ProperNoun -> 'NNP' | 'NNPS'
Det -> 'CD' | 'DT' | 'WDT' | 'PDT' | 'TO'
Nominal -> 'NN' | 'NNS'
VP -> Verb | Verb NPS | Verb NPS PP | Verb PP | Verb VP | toVerb Adverb | Verb Adj | Verb NPS 'VBG'
Verb -> 'MD' 'VB' | 'VBD' | 'VBN' | 'VBP' | 'VBZ' | 'VBG'
Adverb -> 'RB'| 'RBS' | 'RBR'
toVerb -> 'TO' Verb
Adj -> 'JJ' | 'JJR' | Det 'JJS'
PP -> Preposition NPS
Preposition -> 'IN'
"""

grammar1 = CFG.fromstring(cfg_string1)

In [279]:
check_grammar('Tacos and pizza are great', grammar1)

Correct! Structure:
(S
  (NPS (NP (Nominal NN)) CC (NP (Nominal NN)))
  (VP (Verb VBP) (Adj JJ)))


In [280]:
check_grammar('Tacos and pizza are the best', grammar1)

Correct! Structure:
(S
  (NPS (NP (Nominal NN)) CC (NP (Nominal NN)))
  (VP (Verb VBP) (Adj (Det DT) JJS)))


In [281]:
check_grammar('Tacos and pizza are best', grammar1)

Not grammatically correct according to this CFG.
['NN', 'CC', 'NN', 'VBP', 'JJS']


In [290]:
check_grammar('You are the greatest class we have ever seen', grammar1)

Not grammatically correct according to this CFG.
['PRP', 'VBP', 'DT', 'JJS', 'NN', 'PRP', 'VBP', 'RB', 'VBN']


In [289]:
check_grammar("I'm honored to be here", grammar1)

Not grammatically correct according to this CFG.
['PRP', 'VBP', 'VBN', 'TO', 'VB', 'RB']


In [360]:
def grammar_score(transcript, grammar):
    rdp = nltk.RecursiveDescentParser(grammar)
    correct = 0
    incorrect = 0
    
    # get the sentences by themselves
    sentences = []
    sentence = []
    for tag in [word_tup[1] for word_tup in nltk.pos_tag(nltk.word_tokenize(transcript))]:
        if tag in [',',"'",'!','?',':', ""'``'"", "\'\'"]:
            pass
        elif tag == 'RP':
            sentence.append('TO')
        elif tag != '.':
            sentence.append(tag)
        else:
            sentences.append(sentence)
            sentence = []
        
    for sentence in sentences:
        parsed = rdp.parse(sentence)
        if len(list(parsed)) > 0:
            correct += 1
            for tree in rdp.parse(sentence):
                print(tree)
        else:
            incorrect += 1
    return correct/(correct + incorrect)

In [361]:
check_grammar("I took the dog for a walk", grammar1)
check_grammar("He was happy", CFG.fromstring(cfg_string1))
print(grammar_score('I took the dog for a walk. He was happy.', CFG.fromstring(cfg_string1)))

Correct! Structure:
(S
  (NPS (NP (Pronoun PRP)))
  (VP
    (Verb VBD)
    (NPS (NP (Det DT) (Nominal NN)))
    (PP (Preposition IN) (NPS (NP (Det DT) (Nominal NN))))))
Correct! Structure:
(S (NPS (NP (Pronoun PRP))) (VP (Verb VBD) (Adj JJ)))
(S
  (NPS (NP (Pronoun PRP)))
  (VP
    (Verb VBD)
    (NPS (NP (Det DT) (Nominal NN)))
    (PP (Preposition IN) (NPS (NP (Det DT) (Nominal NN))))))
(S (NPS (NP (Pronoun PRP))) (VP (Verb VBD) (Adj JJ)))
1.0


In [376]:
print(grammar_score(cfg_data.iloc[3].transcript, grammar1)

0.0


In [378]:
check_grammar('This is from this point of a view a beautiful sight', grammar1)

Not grammatically correct according to this CFG.
['DT', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'DT', 'NN', 'DT', 'NN', 'NN']


NLTK is not doing the best job always of tagging speeches! Instead of using NLTK, we'll try using the slower, but more accurate spaCy library.

In [381]:
import spacy

def check_grammar_spc(sentence, grammar):
    nlp = spacy.load("en_core_web_sm")
    rdp = nltk.RecursiveDescentParser(grammar)
    
    just_tags = [token.tag_ for token in nlp(sentence)]
    parsed = rdp.parse(just_tags)
    if len(list(parsed)) > 0:
        print('Correct! Structure:')
        for tree in rdp.parse(just_tags):
            print(tree)
    else:
        print('Not grammatically correct according to this CFG.')
        print(just_tags)
    return 

def grammar_score_spc(transcript, grammar):
    rdp = nltk.RecursiveDescentParser(grammar)
    correct = 0
    incorrect = 0
    
    # get the sentences by themselves
    sentences = []
    sentence = []
    for tag in [token.tag_ for token in nlp(transcript)]:
        if tag in [',',"'",'!','?',':', ""'``'"", "\'\'"]:
            pass
        elif tag == 'RP':
            sentence.append('TO')
        elif tag != '.':
            sentence.append(tag)
        else:
            sentences.append(sentence)
            sentence = []
        
    for sentence in sentences:
        parsed = rdp.parse(sentence)
        if len(list(parsed)) > 0:
            correct += 1
            for tree in rdp.parse(sentence):
                print(tree)
        else:
            incorrect += 1
    return correct/(correct + incorrect)


In [387]:
check_grammar("This is a beatiful day", grammar1)

Not grammatically correct according to this CFG.
['DT', 'VBZ', 'DT', 'JJ', 'NN']


In [None]:
text_model.make_sentence()