## Mini Activity

In [98]:
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.en import English
from sklearn.metrics import f1_score, recall_score, accuracy_score

# Download NLTK
nltk.download("punkt")
nltk.download("stopwords")
# tokenizer model used for splitting text into sentences to words

nlp = English()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [119]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


def preprocess(user_text):
    text_token = word_tokenize(user_text)
    stopwords_en = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    lemmatized_words = [
        lemmatizer.lemmatize(word.lower())
        for word in text_token
        if word.lower() not in stopwords_en
    ]

    clean_words = [stemmer.stem(word) for word in lemmatized_words]

    return " ".join(clean_words)

In [120]:
texts = [
    "The movie was fantastic, I loved every moment of it",
    "The food was terrible, I would never eat there again",
    "I had a great time at the concert",
    "The service at the restaurant was horrible",
    "I realty enjoyed the book",
    "The hotel room was dirty and uncomfortable",
    "I am very satisfied with my purchase",
    "The delivery was late and the package was damaged",
    "The customer support was very helpful",
    "I am disappointed with the quality of the product",
]

labels = [
    "Positive",
    "Negative",
    "Positive",
    "Negative",
    "Positive",
    "Negative",
    "Positive",
    "Negative",
    "Positive",
    "Negative",
]

In [121]:
new_texts = [
    "The vacation was amazing, I wish I could relive it",
    "The coffee was bitter, I would not order it again",
    "The staff at the hotel were rude and unprofessional",
    "The shoes were uncomfortable, and I regret buying them",
    "I had a wonderful time visiting the museum, it was fascinating",
    "The product arrived damaged, and the return process was frustrating",
    "The waiter was polite and made the dining experience enjoyable",
]

labels_new_texts = [
    "Positive",
    "Negative",
    "Negative",
    "Negative",
    "Positive",
    "Negative",
    "Positive",
]

## Using ``CountVectorizer()``

In [160]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(texts, labels)

predictions = [model.predict([text])[0] for text in texts]
print(predictions)
print(f"Train Accuracy Score: {accuracy_score(labels, predictions)}%")
print(f"Train F1 Score: {f1_score(labels, predictions, pos_label='Positive')}%")
print(
    f"Train Recall Score: {recall_score(labels, predictions, pos_label='Positive')}%\n"
)

['Positive', 'Negative', 'Positive', 'Negative', 'Positive', 'Negative', 'Positive', 'Negative', 'Positive', 'Negative']
Train Accuracy Score: 1.0%
Train F1 Score: 1.0%
Train Recall Score: 1.0%



In [161]:
predictions_new = [model.predict([new_text])[0] for new_text in new_texts]

for i, prediction in enumerate(predictions_new):
    print(f"{new_texts[i]}: \t\t ---> {prediction}")


print(
    f"\nTest Accuracy Score: {accuracy_score(labels_new_texts, predictions_new):.3f}%"
)
print(
    f"Test F1 Score: {f1_score(labels_new_texts, predictions_new, pos_label='Positive')}%"
)
print(
    f"Test Recall Score: {recall_score(labels_new_texts, predictions_new, pos_label='Positive'):.3f}%\n"
)

The vacation was amazing, I wish I could relive it: 		 ---> Negative
The coffee was bitter, I would not order it again: 		 ---> Negative
The staff at the hotel were rude and unprofessional: 		 ---> Negative
The shoes were uncomfortable, and I regret buying them: 		 ---> Negative
I had a wonderful time visiting the museum, it was fascinating: 		 ---> Positive
The product arrived damaged, and the return process was frustrating: 		 ---> Negative
The waiter was polite and made the dining experience enjoyable: 		 ---> Negative

Test Accuracy Score: 0.714%
Test F1 Score: 0.5%
Test Recall Score: 0.333%



## Using ``TfidfVectorizer()``

In [157]:
from sklearn.feature_extraction.text import TfidfVectorizer

preprocessed_texts = [preprocess(text) for text in texts]

model2 = make_pipeline(TfidfVectorizer(), MultinomialNB())
model2.fit(preprocessed_texts, labels)

predictions = [
    model2.predict([preprocessed_text])[0] for preprocessed_text in preprocessed_texts
]
print(predictions)
print(f"Train Accuracy Score: {accuracy_score(labels, predictions)}%")
print(f"Train F1 Score: {f1_score(labels, predictions, pos_label='Positive')}%")
print(
    f"Train Recall Score: {recall_score(labels, predictions, pos_label='Positive')}%\n"
)

['Positive', 'Negative', 'Positive', 'Negative', 'Positive', 'Negative', 'Positive', 'Negative', 'Positive', 'Negative']
Train Accuracy Score: 1.0%
Train F1 Score: 1.0%
Train Recall Score: 1.0%



In [168]:
preprocessed_new_texts = [preprocess(new_text) for new_text in new_texts]

predictions = model2.predict(preprocessed_new_texts)

for i, prediction in enumerate(predictions):
    print(f"{preprocessed_new_texts[i]}: \t\t ---> {prediction}")

print(f"\nTest Accuracy Score: {accuracy_score(labels_new_texts, predictions):.3f}%")
print(
    f"Test F1 Score: {f1_score(labels_new_texts, predictions, pos_label='Positive')}%"
)
print(
    f"Test Recall Score: {recall_score(labels_new_texts, predictions, pos_label='Positive'):.3f}%\n"
)

vacat amaz , wish could reliv: 		 ---> Negative
coffe bitter , would order: 		 ---> Negative
staff hotel rude unprofession: 		 ---> Negative
shoe uncomfort , regret buy: 		 ---> Negative
wonder time visit museum , fascin: 		 ---> Positive
product arriv damag , return process frustrat: 		 ---> Negative
waiter polit made dine experi enjoy: 		 ---> Positive

Test Accuracy Score: 0.857%
Test F1 Score: 0.8%
Test Recall Score: 0.667%

