# Introduction to Natural Language Processing with NLTK

This Python script provides an introductory guide to using the Natural Language Toolkit (NLTK),
a popular library for natural language processing (NLP) in Python. It covers basic NLP tasks:
tokenization, part-of-speech (POS) tagging, lemmatization, sentiment analysis,
named entity recognition (NER), and working with NLTK corpora.

In [None]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, ne_chunk
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import wordnet, stopwords
from nltk.chunk import RegexpParser
from nltk.probability import FreqDist
from nltk.util import ngrams

In [None]:
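# Download the necessary NLTK data. This only needs to be done once per environment:
# uncomment the lines below, run this cell, then comment them out again.
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')
#nltk.download('stopwords')          # used by stopwords.words('english')
#nltk.download('maxent_ne_chunker')  # used by ne_chunk
#nltk.download('words')              # used by ne_chunk
# NOTE: resource names can differ between NLTK releases. If a LookupError reports a
# missing resource, download the name given in that error message.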

# --------------------
# Tokenization
# --------------------

Tokenization is the process of breaking the text into words, phrases, symbols, or other meaningful elements called tokens.

In [None]:
# Sample text for the tokenization examples.
text = "Hello! This is an example text. Let's see how NLTK handles this."

# Split the same text at two granularities: word-level tokens and whole
# sentences. `word_tokens` is reused later by the POS-tagging example.
word_tokens = word_tokenize(text)
sentence_tokens = sent_tokenize(text)

# Show the word-level result first, then the sentence-level result.
print('Word Tokens:', word_tokens)
print('Sentence Tokens:', sentence_tokens)

# --------------------
# POS Tagging
# --------------------
Part-of-Speech (POS) tagging assigns a part of speech (such as noun, verb, or adjective) to each word.

In [None]:
# Tag each token with its part of speech. `word_tokens` is the word-level
# token list built in the tokenization example, so that code must run first.
pos_tags = pos_tag(word_tokens)
print('POS Tags:', pos_tags)

# --------------------
# Lemmatization
# --------------------

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item.

In [None]:
lemmatizer = WordNetLemmatizer()

# Lemmatize each sample word as a verb (pos='v') and print one result per
# word, in the order the words are listed.
for inflected_form in ('running', 'went'):
    lemmatized_word = lemmatizer.lemmatize(inflected_form, pos='v')
    print('Lemmatized Word:', lemmatized_word)

# --------------------
# Sentiment Analysis
# --------------------

Sentiment Analysis is the process of determining the emotional tone behind a series of words.

In [None]:
sia = SentimentIntensityAnalyzer()

# Example sentences to score. Spelling matters here: a misspelled word cannot
# match an entry in the analyzer's lexicon, so the inputs are kept typo-free.
sentence_1 = "I love natural language processing!"
sentence_2 = "I hate math! It is the worst subject"
sentence_3 = "The new Samsung is very expensive but the camera is really great!"
sentence_4 = "I bought this phone 3 weeks ago. The camera stopped working last week."

# Score every sentence the same way and print the result under a label that
# matches the variable name (sentence_1 ... sentence_4).
sentences = (sentence_1, sentence_2, sentence_3, sentence_4)
for index, sentence in enumerate(sentences, start=1):
    sentiment_score = sia.polarity_scores(sentence)
    print('Sentiment Score for sentence_{}:'.format(index), sentiment_score)

# --------------------
# NER
# --------------------

Named Entity Recognition (NER) is a process in natural language processing that identifies and classifies named entities within text into predefined categories such as names of persons, organizations, locations, dates, quantities, and monetary values.

In [None]:
# Sample text for analysis. The resulting `tokens` list is also reused by the
# frequency-distribution, stop-word and n-gram examples further down.
text = """New York City is one of the largest cities in the US and worldwide.
          The Knicks is the largest basketball team in the city.
          It won the championship in 1970 and 1973."""

# The chunker is fed POS-tagged tokens, so tokenize and tag the text first.
tokens = word_tokenize(text)
tags = pos_tag(tokens)

# Named Entity Recognition (NER): group the tagged tokens into labelled
# entity chunks and print the result.
ner_result = ne_chunk(tags)
print("Named Entity Recognition:")
print(ner_result)

# --------------------
# Accessing Corpora 
# --------------------

NLTK provides many high-quality corpora that are easy to access and can be used for a wide range of purposes.

In [None]:
# Let's use WordNet as an example to access synsets for a word
synsets = wordnet.synsets("computer")
print("WordNet Synsets for 'computer':", synsets)

# Frequency Distribution
# `tokens` is the token list built from the NER sample text, so that code
# must run first. NOTE(review): the list is counted as-is, so any punctuation
# tokens it contains are counted alongside words.
fdist = FreqDist(tokens)
print("Frequency Distribution for the top 5 words:")
print(fdist.most_common(5))

# Stop Words Removal
# Tokens are lower-cased before the membership test so that capitalised
# tokens such as "The" are compared against the stop-word list in lower case.
stop_words = set(stopwords.words('english'))
filtered_sentence = [w for w in tokens if w.lower() not in stop_words]
print("Filtered Sentence without Stop Words:", filtered_sentence)

# n-grams
# With n=2 this yields every pair of adjacent tokens.
bigrams = list(ngrams(tokens, 2))
print("Bigrams:", bigrams)

<b> What else can you do with NLTK?</b>