# Reading Data

In [1]:
# Read the whole of 'Data_2.txt' into the string `sentence`.
# The encoding is passed explicitly so the text that is read does not depend
# on the platform's default locale encoding.
with open("Data_2.txt", 'r', encoding="utf-8") as file:
    sentence = file.read()

In [2]:
# Display the text that was read from the file; the notebook echoes the
# value of the cell's last expression, so no print() call is needed
sentence

'The big black dog barked at the white cat and chased away.'

# NLTK POS Tagger

In [3]:
# Import necessary NLTK modules
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [4]:
# Gather NLTK's English stopword list into a set
py_sword = {*stopwords.words('english')}

In [5]:
# Split the raw text into a list of sentences with NLTK's sentence tokenizer,
# so that each sentence can be word-tokenized and tagged on its own
py_token = sent_tokenize(sentence)

In [6]:
# Tag every sentence with NLTK's default POS tagger and show the result
print("NLTK POS Tagger:")
for sent in py_token:
    # Break the sentence into word tokens first ...
    py_lword = word_tokenize(sent)
    # ... then attach a part-of-speech tag to each token
    py_tag = nltk.pos_tag(py_lword)
    print(py_tag)

NLTK POS Tagger:
[('The', 'DT'), ('big', 'JJ'), ('black', 'JJ'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('white', 'JJ'), ('cat', 'NN'), ('and', 'CC'), ('chased', 'VBD'), ('away', 'RB'), ('.', '.')]


# TextBlob POS Tagger

In [7]:
# Import the TextBlob module
from textblob import TextBlob

In [8]:
# Wrap the raw text in a TextBlob object; its `tags` attribute is used in the
# next cell to obtain the part-of-speech tags
blob_object = TextBlob(sentence)

In [9]:
# Show a heading followed by the (word, tag) pairs from TextBlob's tagger
print('TextBlob POS Tagger:', blob_object.tags, sep='\n')

TextBlob POS Tagger:
[('The', 'DT'), ('big', 'JJ'), ('black', 'JJ'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('white', 'JJ'), ('cat', 'NN'), ('and', 'CC'), ('chased', 'VBD'), ('away', 'RB')]


# Regular Expression Tagger

In [10]:
# Import necessary NLTK modules
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import RegexpTagger

In [11]:
# Ordered (regex, tag) rules for the regular-expression POS tagger built from
# this list. Rules are tried top to bottom and the first regex that matches a
# token decides its tag, so the catch-all rule must stay last.
patterns = [
    (r'.*ing$', 'VBG'),               # gerund / present participle: "running"
    (r'.*ed$', 'VBD'),                # simple past: "barked"
    (r'.*es$', 'VBZ'),                # 3rd-person singular present: "chases"
    (r'.*ould$', 'MD'),               # modal: "would", "could"
    (r'.*\'s$', 'NN$'),               # possessive noun: "dog's"
    (r'.*s$', 'NNS'),                 # plural noun: "dogs"
    # Cardinal number with an optional fractional / grouped part. The
    # separator is restricted to '.' or ',' (it used to be an unescaped '.',
    # i.e. "any character", which tagged tokens such as "1a5" as numbers).
    (r'^-?[0-9]+([.,][0-9]+)?$', 'CD'),
    # Digit-only tokens. ASCII digits are already caught by the rule above;
    # `\d` additionally matches non-ASCII decimal digits.
    (r'^\d+$', 'CD'),
    (r'.*ment$', 'NN'),               # noun suffix: "payment"
    (r'.*ful$', 'JJ'),                # adjective suffix: "careful"
    (r'\b(?:and)\b', 'CC'),           # coordinating conjunction
    (r'\b(?:away)\b', 'RB'),          # adverb
    (r'(The|the|A|a|An|an)$', 'AT'),  # articles
    (r'\b(?:big|black|white)\b', 'JJ'),  # adjectives used in the example text
    (r'\b(?:at)\b', 'IN'),            # preposition
    (r'\.$', 'PUN'),                  # sentence-final period
    (r'.*', 'NN'),                    # fallback: anything else is a noun
]

In [12]:
# Build the regular-expression tagger from the rule list defined above
tagger = RegexpTagger(patterns)

In [13]:
# Split the original (not lowercased) text into word and punctuation tokens
# with NLTK's word tokenizer; this token list is the input to the regex tagger
tokenized_text = word_tokenize(sentence)

In [14]:
# Run the regex tagger over the tokens and show its (word, tag) pairs
# underneath a heading
print(f"Regex Tagger:\n{tagger.tag(tokenized_text)}")

Regex Tagger:
[('The', 'AT'), ('big', 'JJ'), ('black', 'JJ'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'), ('the', 'AT'), ('white', 'JJ'), ('cat', 'NN'), ('and', 'CC'), ('chased', 'VBD'), ('away', 'RB'), ('.', 'PUN')]


# Parse Trees

In [15]:
# Import necessary NLTK module
from nltk.tokenize import RegexpTokenizer

In [16]:
# Lowercase the text so its words match the all-lowercase terminals of the
# grammar defined further down
sentence_lower = sentence.lower()

In [17]:
# Inspect the lowercased text
sentence_lower

'the big black dog barked at the white cat and chased away.'

In [18]:
# Build a regex tokenizer that keeps only runs of word characters (\w+);
# punctuation such as the final period is therefore dropped from the tokens
tokenizer = RegexpTokenizer(r'\w+')

In [19]:
# Apply the regex tokenizer to the lowercased sentence, giving a list of
# punctuation-free word tokens
tokens = tokenizer.tokenize(sentence_lower)

In [20]:
# Rebuild a single string from the tokens, separated by single spaces; the
# result is the lowercased sentence without punctuation
sentence_cleaned = ' '.join(tokens)

In [21]:
# Define a small context-free grammar (CFG) that covers the example sentence.
#   S   : a noun phrase followed by a verb phrase
#   NP  : a determiner plus either a bare noun or an adjective-modified nominal
#   NOM : one or more adjectives followed by a noun
#   VP  : verb + adverb, verb + prepositional phrase, or two VPs joined by a
#         conjunction
#   PP  : a preposition followed by a noun phrase
# The remaining rules are the lexicon. All terminals are lowercase, matching
# the lowercased, punctuation-free sentence that is parsed later.
# NOTE(review): `NOM -> ADJ ADJ NN` is also derivable as `ADJ NOM` with
# `NOM -> ADJ NN`, so a two-adjective phrase has two derivations -- confirm
# that the resulting duplicate parse trees are intended.
text2 = nltk.CFG.fromstring("""
S -> NP VP
NP -> DT NOM | DT NN
NOM -> ADJ NOM | ADJ NN | ADJ ADJ NN
VP -> VP Conj VP | VB RB | VB PP
PP -> IN NP
DT -> 'the'
Conj -> 'and'
NN -> 'dog' | 'cat'
ADJ -> 'big' | 'black' | 'white'
RB -> 'away'
IN -> 'at'
VB -> 'barked' | 'chased'
""")

In [22]:
# Split the cleaned sentence into the token list that is handed to the parser
text1 = word_tokenize(sentence_cleaned)

In [23]:
# Inspect the tokens that will be passed to the parser
text1

['the',
 'big',
 'black',
 'dog',
 'barked',
 'at',
 'the',
 'white',
 'cat',
 'and',
 'chased',
 'away']

In [24]:
# Build a chart parser driven by the grammar stored in `text2`
parser = nltk.ChartParser(text2)

In [25]:
# Walk through every parse the chart parser produces for the token list and
# show each one twice: in bracketed form, then as an ASCII-art tree
for parse in parser.parse(text1):
    print(str(parse))
    parse.pretty_print()

(S
  (NP (DT the) (NOM (ADJ big) (ADJ black) (NN dog)))
  (VP
    (VP
      (VB barked)
      (PP (IN at) (NP (DT the) (NOM (ADJ white) (NN cat)))))
    (Conj and)
    (VP (VB chased) (RB away))))
                              S                                            
          ____________________|____________                                 
         |                                 VP                              
         |                         ________|________________________        
         |                        VP                    |           |      
         |               _________|___                  |           |       
         |              |             PP                |           |      
         |              |      _______|____             |           |       
         NP             |     |            NP           |           |      
  _______|____          |     |    ________|____        |           |       
 |           NOM        |     |   |   