### Importing modules and corpus

In [17]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk import CFG
from nltk.parse import ChartParser

import pandas as pd
import string 

# Initialize corpus
raw = open("yoda.txt").read()
corpus = raw.lower()
corpus = corpus.replace("\n", " ")

### Analyzing corpus

In [20]:
sentences = nltk.sent_tokenize(corpus)
words = nltk.word_tokenize(corpus)

# Remove punctuation
words = [i.lower() for i in words if i.isalpha()]

# Calculate corpus data
length_corpus = len(corpus)
amount_sentences = len(sentences)
hapaxes = len(set(words))
average_sentence_length = length_corpus / amount_sentences
most_frequent_words = nltk.FreqDist(words).most_common(20)

# Visualize the data
data = {"Most frequent words": most_frequent_words}
df = pd.DataFrame(data)
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)
print(df)
print(f"Length of the text in characters: {length_corpus}")
print(f"Amount of sentences in corpus: {amount_sentences}")
print(f"Number of unique words: {hapaxes}")
print(f"Avarage sentence length: {average_sentence_length}")

   Most frequent words
1            (the, 64)
2            (you, 54)
3             (to, 33)
4             (is, 31)
5              (i, 27)
6              (a, 22)
7           (will, 20)
8             (of, 17)
9           (have, 16)
10            (we, 15)
11           (not, 15)
12            (be, 14)
13          (your, 14)
14          (this, 13)
15          (must, 13)
16          (with, 13)
17         (force, 13)
18           (are, 11)
19            (in, 11)
20           (and, 11)
Length of the text in characters: 6005
Amount of sentences in corpus: 171
Number of unique words: 397
Avarage sentence length: 35.11695906432749


### Constructing CFG
1. Object-Subject-Verb Structure:
    - "Hard to see, the dark side is."
2. Interrogative Sentences:
    - "Afraid are you?"
    - "Master Qui-Gon, more to say have you?"
    - "A vergence, you say?"
    - "What know you of ready?"
3. Idiomatic Phrases: 
    - "May the Force be with you."

In [36]:
# Define CFG rules
grammar = CFG.fromstring("""
  S -> NP VP | VP NP 
  NP -> 'you' | 'the Force' | 'we' | 'the dark side' | 'Qui-Gon' | 'this Naboo queen' | 'the very Republic' | 'the Sith' | 'this assassin' | 'him' | 'the Jedi' | 'young one' | 'her' | 'a Jedi' | 'the most serious mind' | 'the deepest commitment' | 'Qui-Gon' | "this boy's future" | 'an apprentice' | 'ready' | 'the Queen\'s attacker' | 'Young Skywalker\'s fate' | 'the level of Jedi Knight' | 'the Council' | 'your apprentice' | 'young Skywalker' | 'two'
  VP -> V NP | NP V | V
  V -> 'is threatened' | 'be with' | 'must stay' | 'to see' | 'hard' | 'is' | 'discover' | 'have' | 'say' | 'request' | 'trained as' | 'tested' | 'will be' | 'feel' | 'afraid' | 'think' | 'to lose' | 'have' | 'continue' | 'were' | 'clouded' | 'masked by' | 'his youth' | 'have' | 'know' | 'learn' | 'draw out' | 'be decided' | 'be with' | 'confer on' | 'does' | 'agree' | 'will be' | 'are'
""")

### Parsing sentences

In [37]:
sentences = [
    "The very Republic is threatened, if involved the Sith are.",
    "Hard to see, the dark side is.",
    "Discover who this assassin is, we must.",
    "With this Naboo queen you must stay, Qui-Gon.",
    "May the Force be with you.",
    "Master Qui-Gon more to say have you?",
    "A vergence, you say?",
    "Revealed your opinion is.",
    "Trained as a Jedi, you request for him?",
    "Tested he will be.",
    "Good, good, young one. How feel you?",
    "Afraid are you?",
    "See through you, we can.",
    "Afraid to lose her, I think.",
    "A Jedi must have the deepest commitment, the most serious mind.",
    "Then continue, we will.",
    "Correct you were, Qui-Gon.",
    "Clouded, this boy's future is.",
    "An apprentice, you have, Qui-Gon.",
    "Ready so early, are you?",
    "What know you of ready?",
    "More to learn, he has.",
    "And draw out the Queen's attacker.",
    "Young Skywalker's fate will be decided later.",
    "May the Force be with you.",
    "Confer on you, the level of Jedi Knight the Council does.",
    "Agree, the council does.",
    "Your apprentice, young Skywalker will be.",
    "Always two there are, no more, no less."
]


tokenized_sentences = [nltk.word_tokenize(i) for i in sentences]

#Parse each sentence according to CFG
parser = ChartParser(grammar)

for i in tokenized_sentences:
    try:
        for tree in parser.parse(i):
            print(tree)
    except ValueError:
        print(f'Error parsing: {i}')


Error parsing: ['The', 'very', 'Republic', 'is', 'threatened', ',', 'if', 'involved', 'the', 'Sith', 'are', '.']
Error parsing: ['Hard', 'to', 'see', ',', 'the', 'dark', 'side', 'is', '.']
Error parsing: ['Discover', 'who', 'this', 'assassin', 'is', ',', 'we', 'must', '.']
Error parsing: ['With', 'this', 'Naboo', 'queen', 'you', 'must', 'stay', ',', 'Qui-Gon', '.']
Error parsing: ['May', 'the', 'Force', 'be', 'with', 'you', '.']
Error parsing: ['Master', 'Qui-Gon', 'more', 'to', 'say', 'have', 'you', '?']
Error parsing: ['A', 'vergence', ',', 'you', 'say', '?']
Error parsing: ['Revealed', 'your', 'opinion', 'is', '.']
Error parsing: ['Trained', 'as', 'a', 'Jedi', ',', 'you', 'request', 'for', 'him', '?']
Error parsing: ['Tested', 'he', 'will', 'be', '.']
Error parsing: ['Good', ',', 'good', ',', 'young', 'one', '.', 'How', 'feel', 'you', '?']
Error parsing: ['Afraid', 'are', 'you', '?']
Error parsing: ['See', 'through', 'you', ',', 'we', 'can', '.']
Error parsing: ['Afraid', 'to', 'los