In [1]:
# !pip install pandas nltk scikit-learn

import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK data used by this notebook: tokenizer models, stop-word lists and WordNet.
# 'punkt_tab' is requested alongside 'punkt' because NLTK 3.8.2+ makes
# word_tokenize load the punkt_tab tables and raises LookupError when only the
# legacy 'punkt' package is installed.
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet')

print("Downloading NLTK resources...")
for resource in NLTK_RESOURCES:
    nltk.download(resource)
print("Downloads complete.")

Downloading NLTK resources...
Downloads complete.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# English stop words, held in a set so the membership test in the token filter
# is a hash lookup.
# NOTE(review): the processed output for review 3 shows "not great" reduced to
# "great", i.e. negations are being filtered out as stop words -- confirm that
# is acceptable for sentiment features.
stop_words = {*stopwords.words('english')}

# One lemmatizer instance, reused for every token that gets normalised.
lemmatizer = WordNetLemmatizer()

In [4]:
# Four hand-written sample reviews; each tuple is (review_id, text, sentiment).
review_fields = ('review_id', 'text', 'sentiment')
review_rows = [
    (1, 'The product is amazing! I love it and would recommend it.', 'Positive'),
    (2, 'A terrible experience. The item was broken and customer service was unhelpful.', 'Negative'),
    (3, 'It\'s an okay product, not great but not terrible either.', 'Neutral'),
    (4, 'I will definitely be buying this again. LOVED the quality!', 'Positive'),
]

# `dataset` is a list with one dict per review; pandas turns each dict into a row.
dataset = [dict(zip(review_fields, row)) for row in review_rows]
df = pd.DataFrame(dataset)

print("--- Original Dataset ---")
df

--- Original Dataset ---


Unnamed: 0,review_id,text,sentiment
0,1,The product is amazing! I love it and would re...,Positive
1,2,A terrible experience. The item was broken and...,Negative
2,3,"It's an okay product, not great but not terrib...",Neutral
3,4,I will definitely be buying this again. LOVED ...,Positive


In [5]:
def process_text(text):
    """Normalise one review into a space-joined string of lemmatized words.

    Steps: lowercase the text, split it with ``word_tokenize``, keep only
    purely alphabetic tokens that are not in ``stop_words``, lemmatize each
    remaining token with ``lemmatizer`` (called without a POS tag), and join
    the results with single spaces.

    Args:
        text: Raw review text. Any non-string value (``None``, a ``NaN`` from
            a missing cell, a number) is treated as an empty review.

    Returns:
        str: The cleaned text, or ``""`` when the input is not a string or no
        token survives the filtering.
    """
    # A missing cell reaches Series.apply() as None/NaN; without this guard
    # .lower() raises AttributeError and aborts the whole column transform.
    if not isinstance(text, str):
        return ""

    tokens = word_tokenize(text.lower())

    # isalpha() rejects any token containing a non-letter (punctuation, digits).
    clean_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]

    return " ".join(lemmatized_tokens)

# Clean every review and store the result in a new column beside the raw text.
df['processed_text'] = df['text'].map(process_text)

print("--- Dataset After Text Processing ---")
# Raw and cleaned text side by side.
df.loc[:, ['text', 'processed_text']]

--- Dataset After Text Processing ---


Unnamed: 0,text,processed_text
0,The product is amazing! I love it and would re...,product amazing love would recommend
1,A terrible experience. The item was broken and...,terrible experience item broken customer servi...
2,"It's an okay product, not great but not terrib...",okay product great terrible either
3,I will definitely be buying this again. LOVED ...,definitely buying loved quality


In [6]:
# Fit the vectorizer on the cleaned reviews and transform them in the same call.
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(df['processed_text'])

# Expand the result to a dense array and label each column with its vocabulary
# term so the weights can be read as a table.
feature_names = tfidf_vectorizer.get_feature_names_out()
dense_scores = tfidf_features.toarray()
df_tfidf = pd.DataFrame(dense_scores, columns=feature_names)

print("--- TF-IDF Feature Vectors ---")
df_tfidf

--- TF-IDF Feature Vectors ---


Unnamed: 0,amazing,broken,buying,customer,definitely,either,experience,great,item,love,loved,okay,product,quality,recommend,service,terrible,unhelpful,would
0,0.465162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465162,0.0,0.0,0.366739,0.0,0.465162,0.0,0.0,0.0,0.465162
1,0.0,0.388614,0.0,0.388614,0.0,0.0,0.388614,0.0,0.388614,0.0,0.0,0.0,0.0,0.0,0.0,0.388614,0.306388,0.388614,0.0
2,0.0,0.0,0.0,0.0,0.0,0.485461,0.0,0.485461,0.0,0.0,0.0,0.485461,0.382743,0.0,0.0,0.0,0.382743,0.0,0.0
3,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
