<a href="https://colab.research.google.com/github/guilhermelaviola/NaturalLanguageProcessing/blob/main/ClassificationModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Classification Model**

In [1]:
# Importing all the necessary libraries:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, FreqDist
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
import random
import string

In [9]:
# Ensure the NLTK resources used by this notebook are downloaded.
# Each call is a no-op (apart from a log line) when the package is already present.
nltk.download('punkt')  # tokenizer models used by older NLTK releases
# Recent NLTK releases (the same ones that ship 'averaged_perceptron_tagger_eng')
# load the tokenizer from 'punkt_tab'; without it word_tokenize raises LookupError
# on a fresh kernel.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')  # model behind pos_tag
nltk.download('stopwords')  # stopword lists read by stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Tokenization and Grammatical Analysis:
# A single sample sentence is tokenized and tagged; `tokens` and `pos_tags`
# are reused by the following cell.
text = 'NLTK is a powerful library for processing and analyzing text data!'
tokens = word_tokenize(text)  # Tokenization: words and punctuation marks become separate tokens
pos_tags = pos_tag(tokens)  # Part of speech tagging: one (token, tag) pair per token

In [12]:
# Extracting information:
# Count how often each token occurs, then report the tokens, their
# part-of-speech tags and the five most frequent tokens.
word_freq = FreqDist(tokens)
top_words = word_freq.most_common(5)

print(f'Tokens: {tokens}')
print(f'Parts of Speech: {pos_tags}')
print(f'Most Common Words: {top_words}')

Tokens: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'processing', 'and', 'analyzing', 'text', 'data', '!']
Parts of Speech: [('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), ('processing', 'NN'), ('and', 'CC'), ('analyzing', 'VBG'), ('text', 'JJ'), ('data', 'NN'), ('!', '.')]
Most Common Words: [('NLTK', 1), ('is', 1), ('a', 1), ('powerful', 1), ('library', 1)]


In [14]:
# Defining sample dataset:
# Each entry is a (review text, sentiment label) pair; the label is either
# 'pos' or 'neg'. The sample is balanced: two positive and two negative reviews.
dataset = [
    ("I love this movie, it's amazing!", 'pos'),
    ('This film was terrible, I hated it.', 'neg'),
    ('Absolutely fantastic experience, highly recommended!', 'pos'),
    ('Worst movie ever, a total waste of time.', 'neg'),
]


In [15]:
# Preprocessing (tokenization and stopword removal):
# The stopword list is converted to a set once so membership tests are cheap.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Return the meaningful tokens of `text`, in order of appearance.

    The text is lowercased and tokenized; a token is kept only when it is
    purely alphanumeric (which drops punctuation and fragments such as "'s")
    and is not an English stopword.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token.isalnum() and token not in stop_words:
            kept.append(token)
    return kept


In [16]:
# Feature extraction with Bag-of-words:
def extract_features(text):
    """Build presence features for `text`.

    Returns a dict that maps every distinct preprocessed token to True;
    token counts and token order are not represented.
    """
    unique_words = set(preprocess(text))
    return dict.fromkeys(unique_words, True)

# One (features, label) pair per dataset entry, in dataset order.
feature_set = [
    (extract_features(sentence), sentiment)
    for sentence, sentiment in dataset
]

In [17]:
# Training a Naive Bayes Classifier:
SPLIT_SEED = 42        # fixed seed so the train/test split is the same on every run
TRAIN_FRACTION = 0.75  # share of the examples used for training

# Shuffle a copy with a dedicated, seeded generator: `feature_set` itself is left
# untouched, so re-running this cell always produces the same split.
shuffled_set = list(feature_set)
random.Random(SPLIT_SEED).shuffle(shuffled_set)

split_index = int(len(shuffled_set) * TRAIN_FRACTION)
train_set = shuffled_set[:split_index]  # 75% training data
test_set = shuffled_set[split_index:]   # 25% test data

# NOTE: with only four examples the test set holds a single example, so any
# metric computed on it can only be 0.0 or 1.0.
classifier = NaiveBayesClassifier.train(train_set)

In [18]:
# Accuracy calculation:
# Fraction of test examples whose predicted label matches the gold label.
test_accuracy = accuracy(classifier, test_set)
print('Accuracy:', test_accuracy)

Accuracy: 0.0


In [19]:
# Confusion Matrix (this cell does not compute precision or recall):
# The imports stay in this cell, where the notebook originally placed them;
# `defaultdict` is not used by the code below.
from nltk.metrics import ConfusionMatrix
from collections import defaultdict

# Gold labels and the classifier's predictions, in matching order.
actual = [label for (_, label) in test_set]
predicted = [classifier.classify(features) for (features, _) in test_set]

# Rows are the reference (gold) labels, columns are the predicted labels.
conf_matrix = ConfusionMatrix(actual, predicted)

# Print the heading and the matrix separately: passing both to a single print()
# inserts the separator space right after the newline, which shifts the first
# row of the matrix one column to the right.
print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
     | n p |
    | e o |
    | g s |
----+-----+
neg |<.>. |
pos | 1<.>|
----+-----+
(row = reference; col = test)

