In [1]:
from string import punctuation

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spacy.lang.en import English

# Download the NLTK resources this notebook relies on:
#   punkt     - tokenizer model word_tokenize uses to split text into sentences and then words
#   stopwords - word lists read through stopwords.words("english")
#   wordnet   - lexical database WordNetLemmatizer needs (used by preprocess);
#               without it, lemmatize() raises LookupError on a fresh machine
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Blank English spaCy pipeline, used below only for tokenization
nlp = English()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Two sample sentences: one English, one Filipino
en_text = "This is an Introduction to Natural Language Processing"
ph_text = "Mahilig ako sa Math"

# Split each sentence into word tokens with NLTK
en_token, ph_token = (word_tokenize(sentence) for sentence in (en_text, ph_text))

# Show both token lists, one per line
print(f"Tokens: {en_token}", f"Tokens: {ph_token}", sep="\n")

Tokens: ['This', 'is', 'an', 'Introduction', 'to', 'Natural', 'Language', 'Processing']
Tokens: ['Mahilig', 'ako', 'sa', 'Math']


In [10]:
# Tokenize the same English sentence with spaCy, for comparison with the NLTK result
doc = nlp(en_text)
spacy_tokens = [tok.text for tok in doc]  # keep only the raw text of each Token
print(f"Tokens using SpaCy: {spacy_tokens}")

Tokens using SpaCy: ['This', 'is', 'an', 'Introduction', 'to', 'Natural', 'Language', 'Processing']


In [14]:
# CountVectorizer: converts a collection of text documents into a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# MultinomialNB: Naive Bayes classifier for multinomially distributed features (frequencies / counts)
from sklearn.naive_bayes import MultinomialNB

# make_pipeline: chains several processing steps, in order, into a single estimator object
from sklearn.pipeline import make_pipeline

# Small hand-labelled training set, written as (text, sentiment) pairs so that
# every sentence sits right next to its label.
training_examples = [
    ("I love machine learning", "Positive"),
    ("I hate you", "Negative"),
    ("Today is Thursday", "Neutral"),
    ("ChatGPT is a tool", "Neutral"),
    ("NLP is fun", "Positive"),
    ("I cannot stand errors", "Negative"),
    ("Math is interesting", "Positive"),
    ("Bugs are frustrating", "Negative"),
]
texts = [text for text, _ in training_examples]
labels = [label for _, label in training_examples]

# Bag-of-words vectorizer feeding a Naive Bayes classifier, trained in one step
model = make_pipeline(CountVectorizer(), MultinomialNB()).fit(texts, labels)

# Classify one sentence typed by the user (predict expects a list of documents)
user_input = input("Enter a text: ")
prediction = model.predict([user_input])
print(f"Predicted Sentiment: {prediction[0]}")

Predicted Sentiment: Negative


**NLTK** (Natural Language Toolkit)

A comprehensive NLP library that provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with text-processing libraries for:
- Classification 
- Tokenization
- Stemming
- Tagging
- Parsing
- Semantic Reasoning

```python
def preprocess(user_text):
    """Normalize raw text into a cleaned, space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are punctuation or English stopwords are dropped, and every remaining
    token is lemmatized with ``WordNetLemmatizer``.

    Requires the NLTK ``punkt``, ``stopwords`` and ``wordnet`` resources to be
    downloaded beforehand.

    Args:
        user_text: The raw input string to clean.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        every token is filtered out.
    """
    # Build the stopword lookup once as a set: membership tests are O(1)
    # instead of a linear scan of the stopword list for every token.
    stopwords_en = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    clean_words = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(user_text.lower())
        # `punctuation` is string.punctuation, so this is a substring check
        # that filters out the punctuation tokens produced by the tokenizer.
        if word not in punctuation and word not in stopwords_en
    ]
    return " ".join(clean_words)
```