<a href="https://colab.research.google.com/github/emy05/NLP-Tasks-Data-Analytics/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SENTENCE TOKENIZATION

In [None]:
import nltk
# Fetch the Punkt sentence-tokenizer data used by sent_tokenize/word_tokenize.
# Older NLTK releases read the 'punkt' package, while newer releases look up
# 'punkt_tab' instead, so both are downloaded to keep the notebook runnable
# on either.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

In [None]:
# Sample passage analyzed throughout the notebook, written one sentence per
# literal. Each sentence ends with a period followed by a space: the sentence
# tokenizer does not split at a period that is glued to the next word (e.g.
# "beyond.In"), which previously merged neighbouring sentences.
text = (
    "Space exploration has captivated human curiosity for centuries, beginning with ancient astronomers observing the night sky and progressing to modern-day missions to the International Space Station and beyond. "
    "In recent years, there has been a renewed interest in space exploration, particularly with the rise of private companies like SpaceX and Blue Origin. "
    "These companies have been developing reusable rockets and other technologies to make space travel more affordable and accessible. "
    "The future of space exploration is also shaped by global collaboration, with the International Space Station being a prime example of this. "
    "NASA has plans to return to the moon with the Artemis program, which aims to establish a sustainable presence on the lunar surface by 2024. "
    "There are also plans for crewed missions to Mars in the coming decades, with NASA's Mars 2020 mission already underway."
)
# Tokenize the text into a list of sentence strings
sentences = sent_tokenize(text)
print(sentences)

WORD TOKENIZATION

In [None]:
# Perform word tokenization on each sentence
from nltk.tokenize import word_tokenize

In [None]:
# Tokenize every sentence into words; `words` holds one token list per
# sentence, in the same order as `sentences`.
words = [word_tokenize(sent) for sent in sentences]
print(words)

REMOVING STOP WORDS

In [None]:
# Remove stop words from each sentence
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
# Materialize the English stop-word list as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Keep, per sentence, only the tokens whose lowercase form is not a stop word.
# Kept tokens retain their original casing.
filtered_words = [
    [token for token in sentence_tokens if token.lower() not in stop_words]
    for sentence_tokens in words
]
print(filtered_words)

STEMMING

In [None]:
# Perform stemming on each word
from nltk.stem import PorterStemmer

In [None]:
stemmer = PorterStemmer()
# Stem each stop-word-filtered token, preserving the per-sentence grouping.
stemmed_words = [
    [stemmer.stem(token) for token in sentence_tokens]
    for sentence_tokens in filtered_words
]
print(stemmed_words)

LEMMATIZATION

In [None]:
# Perform lemmatization on each word
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
lemmatizer = WordNetLemmatizer()

# Lemmatize each stop-word-filtered token, preserving the per-sentence grouping.
# NOTE(review): lemmatize() is called without a part-of-speech argument, so the
# lemmatizer's default POS applies (noun, per NLTK's docs) - verbs may come
# back unchanged; confirm this is intended.
lemmatized_words = [
    [lemmatizer.lemmatize(token) for token in sentence_tokens]
    for sentence_tokens in filtered_words
]
print(lemmatized_words)

DEPENDENCY PARSING

In [None]:
import spacy
from spacy import displacy

In [None]:
# Load the pre-trained English model "en_core_web_sm" and bind it to `nlp`,
# which the following cells call to process the text.
# NOTE(review): spacy.load presumably fails if the model package has not been
# installed in the runtime - confirm before running on a fresh environment.
nlp = spacy.load("en_core_web_sm")

In [None]:
sentence = text
# Calling the loaded pipeline yields a document whose tokens expose their
# dependency relation, head and children.
doc = nlp(sentence)

# One shared row template keeps the header and the data rows aligned.
row_format = "{:<15} | {:<8} | {:<15} | {:<20}"
print(row_format.format('Token', 'Relation', 'Head', 'Children'))
print("-" * 70)

for tok in doc:
    # Collect the token's direct dependents so they print as a list.
    dependents = list(tok.children)
    print(row_format.format(tok.text, tok.dep_, tok.head.text, str(dependents)))

# Draw the dependency tree with displacy, passing 'distance': 120 as a
# rendering option.
displacy.render(doc, style='dep', options={'distance': 120})

PERCEPTRON

In [None]:
from nltk.corpus import brown
from nltk.tag import PerceptronTagger
nltk.download('brown')

In [None]:
# Sentence to tag, split into word tokens up front.
sentence = "Space exploration has captivated human curiosity for centuries."
tokens = nltk.word_tokenize(sentence)

# Start from an empty tagger (load=False) and train it on the tagged
# sentences of the Brown corpus "news" category.
corpus = brown.tagged_sents(categories="news")
tagger = PerceptronTagger(load=False)
tagger.train(corpus)

# Tag the tokens with the freshly trained model and show the result.
tags = tagger.tag(tokens)
print(tags)

NER AND CHUNKING

In [None]:
import spacy

# Load the English model and run it over the full passage; the resulting
# document exposes the named entities and noun chunks printed below.
nlp = spacy.load("en_core_web_sm")
sentence = text
doc = nlp(sentence)

# Named entities: one row per entity with its text and label.
entity_row = "{:<15} | {:<10}"
print(entity_row.format('Entity', 'Label'))
print("-" * 25)
for entity in doc.ents:
    print(entity_row.format(entity.text, entity.label_))

# Noun chunks: one chunk per line.
print("\nNoun chunks:")
print("-" * 25)
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text)
