In [None]:
import pandas as pd
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import  WordNetLemmatizer

# Fetch the NLTK resources used by the cells below.
nltk.download("vader_lexicon")  # lexicon for SentimentIntensityAnalyzer
nltk.download("stopwords")      # word lists behind stopwords.words(...)
nltk.download("punkt")          # Punkt tokenizer models for word_tokenize
# Newer NLTK releases load the Punkt data from "punkt_tab" rather than
# "punkt"; without it word_tokenize raises LookupError on a fresh environment.
nltk.download("punkt_tab")
nltk.download("wordnet")        # WordNet data for WordNetLemmatizer
nltk.download("omw-1.4")        # Open Multilingual WordNet, kept alongside wordnet

In [6]:
# Load the reviews; the path is relative, so it resolves against the
# notebook's working directory.
REVIEWS_CSV = 'amazon.csv'
dataset = pd.read_csv(REVIEWS_CSV)

In [7]:
lemmatizer = WordNetLemmatizer()

# Build the stop-word lookup once. Evaluating stopwords.words("english")
# inside the token loop repeated that call for every token of every review;
# a frozenset gives the same membership answers with constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def clean_preprocess_data(text):
    """Normalise one review ahead of sentiment scoring.

    The text is lower-cased, tokenised with ``word_tokenize``, filtered
    against the English stop-word list and lemmatised with the shared
    ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the NaN that
        pandas produces for an empty CSV cell) is treated as empty text
        instead of failing on ``.lower()``.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces; ``""`` when no token
        survives or the input is not a string.
    """
    if not isinstance(text, str):
        return ""
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token not in _ENGLISH_STOPWORDS
    ]
    return " ".join(kept_lemmas)


dataset["reviewText2"] = dataset["reviewText"].apply(clean_preprocess_data)

In [8]:
# Sentiment labelling with VADER
analysis = SentimentIntensityAnalyzer()


def get_sentiments(text):
    """Return a binary sentiment label for ``text``.

    The label is 1 when the ``"pos"`` entry of the polarity scores is
    greater than zero, and 0 otherwise.
    """
    polarity = analysis.polarity_scores(text)
    # NOTE(review): only the "pos" score is consulted, so any text with a
    # non-zero positive share is labelled 1 regardless of the other scores
    # -- confirm this rule is intended.
    if polarity["pos"] > 0:
        return 1
    return 0


dataset["sentiment"] = dataset["reviewText2"].apply(get_sentiments)

In [9]:
# Display the column labels to confirm that the derived `reviewText2` and
# `sentiment` columns now sit alongside the original ones.
dataset.columns

Index(['reviewText', 'Positive', 'reviewText2', 'sentiment'], dtype='object')

In [13]:
from sklearn.metrics import confusion_matrix, classification_report

# Compare the ground-truth `Positive` column (y_true) with the VADER-derived
# `sentiment` column (y_pred).
cm = confusion_matrix(y_true=dataset["Positive"], y_pred=dataset["sentiment"])
print(f"Confusion Matrix: \n{cm}")

Confusion Matrix: 
[[ 1131  3636]
 [  576 14657]]


In [14]:
# Text summary of precision, recall and F1 for the `sentiment` predictions
# measured against the `Positive` labels.
cr = classification_report(y_true=dataset["Positive"], y_pred=dataset["sentiment"])
print(f"Classification Report: \n {cr}")

Classification Report: 
               precision    recall  f1-score   support

           0       0.66      0.24      0.35      4767
           1       0.80      0.96      0.87     15233

    accuracy                           0.79     20000
   macro avg       0.73      0.60      0.61     20000
weighted avg       0.77      0.79      0.75     20000

