### Importing modules and corpus

In [48]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk import CFG
from nltk.parse import ChartParser

import pandas as pd
import string 

# Initialize corpus
raw = open("yoda.txt").read()
corpus = raw.lower()
corpus = corpus.replace("\n", " ")

#Download POS tagger
nltk.download('averaged_perceptron_tagger')

### Analyzing corpus

In [49]:
sentences = nltk.sent_tokenize(corpus)
words = nltk.word_tokenize(corpus)

# Remove punctuation
words = [i.lower() for i in words if i.isalpha()]

# Calculate corpus data
length_corpus = len(corpus)
amount_sentences = len(sentences)
hapaxes = len(set(words))
average_sentence_length = length_corpus / amount_sentences
most_frequent_words = nltk.FreqDist(words).most_common(20)

# Visualize the data
data = {"Most frequent words": most_frequent_words}
df = pd.DataFrame(data)
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)
print(df)
print(f"Length of the text in characters: {length_corpus}")
print(f"Amount of sentences in corpus: {amount_sentences}")
print(f"Number of unique words: {hapaxes}")
print(f"Avarage sentence length: {average_sentence_length}")

   Most frequent words
1            (the, 64)
2            (you, 54)
3             (to, 33)
4             (is, 31)
5              (i, 27)
6              (a, 22)
7           (will, 20)
8             (of, 17)
9           (have, 16)
10            (we, 15)
11           (not, 15)
12            (be, 14)
13          (your, 14)
14          (this, 13)
15          (must, 13)
16          (with, 13)
17         (force, 13)
18           (are, 11)
19            (in, 11)
20           (and, 11)
Length of the text in characters: 6005
Amount of sentences in corpus: 171
Number of unique words: 397
Avarage sentence length: 35.11695906432749


### Constructing CFG

1. Object-Subject-Verb Structure:
   - "Done, it is."
   - "A Jedi's strength flows from the Force."
2. Interrogative Sentences:
   - "More to learn, he has?"
   - "What know you of ready?"
3. Idiomatic Phrases:
   - "May the Force be with you."
   - "Fear is the path to the dark side."

In [54]:
sentences = [
    "Master Qui-Gon, more to say have you?",
    "A vergence, you say?",
    "An apprentice, you have, Qui-Gon.",
    "Master Kenobi, dark times are these.",
    "Master Kenobi, our spies contact, you must.",
    "Master Kenobi, my choice is.",
    "Your father he is.",
    "May the Force be with you.",
    "Fear is the path to the dark side.",
    "Death is a natural part of life.",
    "Train yourself to let go.",
    "In a dark place we find ourselves.",
    "Only pain will you find.",
    "At an end your rule is.",
    "Failed to stop the Sith Lord, I have.",
    "To become one with the Force.",
    "To his family, send him.",
    "Done, it is.",
    "Soon will I rest.",
    "No more training do you require.",
    "A Jedi's strength flows from the Force.",
    "Revealed your opinion is.",
    "Good, young one. How feel you?",
    "Afraid are you?",
    "See through you, we can?",
    "Ready are you?",
    "More to learn, he has?",
    "What know you of ready?",
    "Told you, did he?",
    "Unexpected this is."
]

# Tag words in corpus
tagged = []

for i in sentences:
    words = nltk.word_tokenize(i)
    tagged.extend(nltk.pos_tag(words))

# Define CFG rules
# grammar = CFG.fromstring("""""")

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - '/home/jesse/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


### Parsing sentences

In [51]:
tokenized_sentences = [nltk.word_tokenize(i) for i in sentences]

#Parse each sentence according to CFG
parser = ChartParser(grammar)

for i in tokenized_sentences:
    try:
        for tree in parser.parse(i):
            print(tree)
    except ValueError:
        print(f'Error parsing: {i}')

Error parsing: ['Master', 'Qui-Gon', ',', 'more', 'to', 'say', 'have', 'you', '?']
Error parsing: ['A', 'vergence', ',', 'you', 'say', '?']
Error parsing: ['An', 'apprentice', ',', 'you', 'have', ',', 'Qui-Gon', '.']
Error parsing: ['Master', 'Kenobi', ',', 'dark', 'times', 'are', 'these', '.']
Error parsing: ['Master', 'Kenobi', ',', 'our', 'spies', 'contact', ',', 'you', 'must', '.']
Error parsing: ['Master', 'Kenobi', ',', 'my', 'choice', 'is', '.']
Error parsing: ['Your', 'father', 'he', 'is', '.']
Error parsing: ['Fear', 'is', 'the', 'path', 'to', 'the', 'dark', 'side', '.']
Error parsing: ['Death', 'is', 'a', 'natural', 'part', 'of', 'life', '.']
Error parsing: ['Train', 'yourself', 'to', 'let', 'go', '.']
Error parsing: ['In', 'a', 'dark', 'place', 'we', 'find', 'ourselves', '.']
Error parsing: ['Only', 'pain', 'will', 'you', 'find', '.']
Error parsing: ['At', 'an', 'end', 'your', 'rule', 'is', '.']
Error parsing: ['Failed', 'to', 'stop', 'the', 'Sith', 'Lord', ',', 'I', 'have',