### Importing modules and corpus

In [17]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk import CFG
from nltk.parse import ChartParser

import pandas as pd
import string 

# Initialize corpus: read the whole file, then lowercase it and flatten
# newlines to spaces so the text is one continuous string.
# The context manager closes the file handle as soon as the read finishes.
# NOTE(review): assumes yoda.txt is UTF-8 encoded -- confirm.
with open("yoda.txt", encoding="utf-8") as corpus_file:
    raw = corpus_file.read()
corpus = raw.lower().replace("\n", " ")

### Analyzing corpus

In [20]:
# Split the corpus into sentences and into word-level tokens.
# NOTE(review): `corpus` is lowercased before sentence splitting; if the
# sentence tokenizer relies on capitalization cues the sentence count may be
# affected -- confirm whether `raw` should be used here instead.
sentences = nltk.sent_tokenize(corpus)
words = nltk.word_tokenize(corpus)

# Remove punctuation: keep only purely alphabetic tokens. `corpus` is
# lowercased when it is built, so no further case folding is needed here.
words = [token for token in words if token.isalpha()]

# Calculate corpus data
length_corpus = len(corpus)  # characters, including spaces and punctuation
amount_sentences = len(sentences)
# Despite its name, this is the number of distinct alphabetic tokens, not the
# number of words that occur exactly once.
hapaxes = len(set(words))
# Mean number of characters per sentence (total characters / sentence count).
# Guarded so an empty corpus reports 0.0 instead of raising ZeroDivisionError.
average_sentence_length = (
    length_corpus / amount_sentences if amount_sentences else 0.0
)
# The 20 most common alphabetic tokens as (word, count) pairs.
most_frequent_words = nltk.FreqDist(words).most_common(20)

# Visualize the data
# Single-column table of (word, count) tuples, ranked from 1 instead of 0.
data = {"Most frequent words": most_frequent_words}
df = pd.DataFrame(data)
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)
print(df)
print(f"Length of the text in characters: {length_corpus}")
print(f"Amount of sentences in corpus: {amount_sentences}")
print(f"Number of unique words: {hapaxes}")
print(f"Avarage sentence length: {average_sentence_length}")

   Most frequent words
1            (the, 64)
2            (you, 54)
3             (to, 33)
4             (is, 31)
5              (i, 27)
6              (a, 22)
7           (will, 20)
8             (of, 17)
9           (have, 16)
10            (we, 15)
11           (not, 15)
12            (be, 14)
13          (your, 14)
14          (this, 13)
15          (must, 13)
16          (with, 13)
17         (force, 13)
18           (are, 11)
19            (in, 11)
20           (and, 11)
Length of the text in characters: 6005
Amount of sentences in corpus: 171
Number of unique words: 397
Avarage sentence length: 35.11695906432749


### Constructing CFG
1. Object-Subject-Verb Structure:
    - "Hard to see, the dark side is."
2. Interrogative Sentences:
    - "Afraid are you?"
    - "Master Qui-Gon, more to say have you?"
    - "A vergence, you say?"
3. Complex Sentences:
    - "Fear is the path to the dark side, fear leads to anger, anger leads to hate, hate leads to suffering."
    - "Confer on you, the level of Jedi Knight the Council does. But agree on you taking this boy as your Padawan learner, I do not."
4. Idiomatic Phrases:
    - "May the Force be with you."
5. Special Constructions:
    - "An apprentice, you have, Qui-Gon. Impossible, to take on a second."
    - "Ready so early, are you? What know you of ready?"

In [31]:
# Define CFG rules
# The grammar is purely lexical: every rule maps a part-of-speech category to
# literal words, and there are no phrase-level rules (e.g. S, NP, VP).
# NOTE(review): with no explicit start rule, NLTK takes the left-hand side of
# the first rule (V) as the start symbol, so multi-word sentences cannot be
# derived until phrase rules are added -- confirm the intended start symbol.
# NOTE(review): N/V/Det/Adj/Adv terminals are lowercase while PN and I are
# capitalized, so input tokens have to match that casing exactly.
grammar = CFG.fromstring('''
    V -> 'is' | 'see' | 'discover' | 'must' | 'stay' | 'protect' | 'have' | 'say' | 'do' | 'request' | 'will' | 'feel' | 'are' | 'think' | 'leads' | 'continue' | 'were' | 'masked' | 'take' | 'know' | 'draw' | 'decided' | 'train' | 'confer' | 'agree' | 'fear' | 'sense' | 'need'
    P -> 'to' | 'with' | 'on' | 'in' | 'by' | 'for' | 'of'
    Det -> 'the' | 'this' | 'a' | 'an' | 'our' | 'your' | 'that'
    N -> 'republic' | 'side' | 'assassin' | 'queen' | 'force' | 'master' | 'opinion' | 'jedi' | 'one' | 'path' | 'anger' | 'hate' | 'suffering' | 'commitment' | 'mind' | 'future' | 'youth' | 'apprentice' | 'second' | 'council' | 'attacker' | 'fate' | 'level' | 'knight' | 'boy' | 'padawan' | 'learner' | 'danger' | 'training' | 'defiance'
    PN -> 'Naboo' | 'Qui-Gon' | 'Force' | 'Jedi' | 'Knight' | 'Council' | 'Chosen' | 'One' | 'Skywalker' | 'Sith'
    Adv -> 'not' | 'more' | 'very' | 'hard' | 'always' | 'nevertheless'
    Conj -> ',' | '.'
    Adj -> 'dark' | 'good' | 'young' | 'afraid' | 'deep' | 'serious' | 'ready' | 'grave' | 'early'
    I -> 'May' | 'Then' | 'With' | 'But'
''')

### Parsing sentences

In [32]:
# Hand-picked Yoda lines to run through the grammar.
# Note: this rebinds `sentences`, which earlier held the sentence-tokenized
# corpus.
sentences = [
    "The very Republic is threatened, if involved the Sith are.",
    "Hard to see, the dark side is.",
    "Discover who this assassin is, we must.",
    "With this Naboo queen you must stay, Qui-Gon. Protect her.",
    "May the Force be with you.",
    "Master Qui-Gon more to say have you?",
    "A vergence, you say?",
    "But you do! Revealed your opinion is.",
    "Trained as a Jedi, you request for him?",
    "Tested he will be.",
    "Good, good, young one. How feel you?",
    "Afraid are you?",
    "See through you, we can.",
    "Afraid to lose her, I think.",
    "Everything. Fear is the path to the dark side, fear leads to anger, anger leads to hate, hate leads to suffering.",
    "A Jedi must have the deepest commitment, the most serious mind. I sense much fear in you.",
    "Then continue, we will.",
    "Correct you were, Qui-Gon.",
    "Clouded, this boy's future is. Masked by his youth.",
    "An apprentice, you have, Qui-Gon. Impossible, to take on a second.",
    "Ready so early, are you? What know you of ready?",
    "Our own council we will keep on who is ready. More to learn, he has.",
    "And draw out the Queen's attacker.",
    "Young Skywalker's fate will be decided later.",
    "Train him not. Take him with you, but train him not!",
    "May the Force be with you.",
    "Confer on you, the level of Jedi Knight the Council does. But agree on you taking this boy as your Padawan learner, I do not.",
    "The Chosen One the boy may be, nevertheless, grave danger I fear in his training.",
    "Qui-Gon's defiance I sense in you. Need that, you do not. Agree, the council does. Your apprentice, young Skywalker will be.",
    "Always two there are, no more, no less. A master and an apprentice."
]

# Tokenize with the original casing preserved.
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

# Parse each sentence according to the CFG
parser = ChartParser(grammar)

for tokens in tokenized_sentences:
    try:
        # A ValueError is expected here when the input contains a word the
        # grammar has no rule for; only the parse call sits inside the try.
        trees = list(parser.parse(tokens))
    except ValueError:
        print(f'Error parsing: {tokens}')
        continue

    # Every word was known but no derivation exists: report it explicitly
    # instead of silently printing nothing.
    if not trees:
        print(f'No parse found: {tokens}')
    for tree in trees:
        print(tree)


Error parsing: ['The', 'very', 'Republic', 'is', 'threatened', ',', 'if', 'involved', 'the', 'Sith', 'are', '.']
Error parsing: ['Hard', 'to', 'see', ',', 'the', 'dark', 'side', 'is', '.']
Error parsing: ['Discover', 'who', 'this', 'assassin', 'is', ',', 'we', 'must', '.']
Error parsing: ['With', 'this', 'Naboo', 'queen', 'you', 'must', 'stay', ',', 'Qui-Gon', '.', 'Protect', 'her', '.']
Error parsing: ['May', 'the', 'Force', 'be', 'with', 'you', '.']
Error parsing: ['Master', 'Qui-Gon', 'more', 'to', 'say', 'have', 'you', '?']
Error parsing: ['A', 'vergence', ',', 'you', 'say', '?']
Error parsing: ['But', 'you', 'do', '!', 'Revealed', 'your', 'opinion', 'is', '.']
Error parsing: ['Trained', 'as', 'a', 'Jedi', ',', 'you', 'request', 'for', 'him', '?']
Error parsing: ['Tested', 'he', 'will', 'be', '.']
Error parsing: ['Good', ',', 'good', ',', 'young', 'one', '.', 'How', 'feel', 'you', '?']
Error parsing: ['Afraid', 'are', 'you', '?']
Error parsing: ['See', 'through', 'you', ',', 'we', 