**Bag of Words**

In [28]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re, nltk
from nltk.corpus import stopwords
#Logistic Regression
from sklearn.linear_model import LogisticRegression

In [29]:
# Fetch the NLTK stop-word corpus that clean_text reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
# Toy corpus: ten hand-written product reviews, each paired with its sentiment label.
labelled_reviews = [
    ("I absolutely love this phone! The camera quality is fantastic and battery lasts all day.", "positive"),
    ("Terrible experience. The screen broke within a week, and customer service was useless.", "negative"),
    ("Great value for money. Performance is smooth and build quality feels premium.", "positive"),
    ("The laptop overheats and the battery drains too fast. Very disappointed.", "negative"),
    ("Excellent sound quality and very comfortable to wear. Highly recommend these headphones!", "positive"),
    ("Worst product I’ve ever bought. It stopped working after two days.", "negative"),
    ("Amazing performance and display. Totally worth the price.", "positive"),
    ("Poor design and fragile build. I regret buying this item.", "negative"),
    ("Fast delivery and great packaging. The product works as expected.", "positive"),
    ("Customer support was unhelpful. Will not buy from this brand again.", "negative"),
]

# Unzip the pairs into the column-oriented dict the DataFrame is built from.
review_texts, sentiment_labels = zip(*labelled_reviews)
data = {
    "review": list(review_texts),
    "sentiment": list(sentiment_labels),
}

df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,review,sentiment
0,I absolutely love this phone! The camera quali...,positive
1,Terrible experience. The screen broke within a...,negative
2,Great value for money. Performance is smooth a...,positive
3,The laptop overheats and the battery drains to...,negative
4,Excellent sound quality and very comfortable t...,positive


In [31]:
# Normalise a review for bag-of-words: lowercase it, keep only letters, digits
# and whitespace, and drop stop words (very common, low-information words).
def clean_text(text, stop_words=None):
    """Return ``text`` lowercased, stripped of punctuation and stop words.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : iterable of str, optional
        Words to remove. Defaults to ``stopwords.words('english')`` from NLTK
        (requires the 'stopwords' corpus to be downloaded).

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` when nothing is left.

    Notes
    -----
    A token is dropped when it matches a stop word either with its inner
    apostrophes intact (so a listed contraction such as "don't" is removed
    instead of surviving as "dont") or with all apostrophes removed (which is
    the comparison the function performed before contractions were handled).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: the word list is loop-invariant, and a
    # set gives constant-time membership tests.
    stop_set = {word.lower() for word in stop_words}

    # Map the typographic apostrophe to the ASCII one so "I’ve" and "I've"
    # are treated alike, then keep apostrophes until the stop-word check is done.
    text = text.lower().replace('\u2019', "'")
    text = re.sub(r"[^a-z0-9\s']", '', text)

    kept = []
    for token in text.split():
        bare = token.replace("'", '')
        if not bare:
            # Token consisted solely of apostrophes.
            continue
        if bare in stop_set or token.strip("'") in stop_set:
            continue
        kept.append(bare)
    return ' '.join(kept)

In [32]:
# Run clean_text over every raw review and keep the result alongside it
# in a new 'clean_review' column, then display the whole frame.
df['clean_review'] = df['review'].map(clean_text)
df

Unnamed: 0,review,sentiment,clean_review
0,I absolutely love this phone! The camera quali...,positive,absolutely love phone camera quality fantastic...
1,Terrible experience. The screen broke within a...,negative,terrible experience screen broke within week c...
2,Great value for money. Performance is smooth a...,positive,great value money performance smooth build qua...
3,The laptop overheats and the battery drains to...,negative,laptop overheats battery drains fast disappointed
4,Excellent sound quality and very comfortable t...,positive,excellent sound quality comfortable wear highl...
5,Worst product I’ve ever bought. It stopped wor...,negative,worst product ive ever bought stopped working ...
6,Amazing performance and display. Totally worth...,positive,amazing performance display totally worth price
7,Poor design and fragile build. I regret buying...,negative,poor design fragile build regret buying item
8,Fast delivery and great packaging. The product...,positive,fast delivery great packaging product works ex...
9,Customer support was unhelpful. Will not buy f...,negative,customer support unhelpful buy brand


In [33]:
# Build the bag-of-words matrix: one row per review, one column per vocabulary
# term, each cell holding how often that term occurs in the cleaned review.
# max_features=25 keeps only the 25 most frequent terms across the corpus.
# NOTE(review): the vocabulary is fitted on every review before the train/test
# split further down, so held-out reviews influence the feature set.
cv = CountVectorizer(max_features=25)
X = cv.fit_transform(df['clean_review']).toarray()
y = df['sentiment']

# Fetch the vocabulary once and print it explicitly: a bare expression in the
# middle of a cell is evaluated but never displayed.
feature_names = cv.get_feature_names_out()
print("Top 25 feature names in X")
print(feature_names)

# Label the count matrix with the vocabulary terms for inspection.
bow_df = pd.DataFrame(X, columns=feature_names)
bow_df

Top 25 feature names in X


Unnamed: 0,amazing,battery,bought,brand,build,buy,buying,camera,comfortable,customer,...,drains,ever,excellent,expected,fantastic,fast,great,performance,product,quality
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
3,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
5,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
9,0,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Hold out 30% of the reviews for testing and train on the remaining 70%;
# the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [35]:
# Create a logistic-regression classifier with default settings and fit it
# on the training split of the bag-of-words counts and their sentiment labels.
model=LogisticRegression()
model.fit(X_train,y_train)


In [36]:
# User Prediction
# Read one free-text review from the user and classify it with the trained model.
print("\n Enter your review below: \n")
# The prompt ends with ": " so the typed text is not glued onto the prompt.
user_review=input("Enter your product review: ")

# Apply the same cleaning that was used on the training reviews
user_review=clean_text(user_review)
# Encode with the already-fitted vectorizer; words outside its vocabulary are ignored
user_review_bow=cv.transform([user_review]).toarray()

# An all-zero vector means no word of the review is in the vocabulary (or the
# review was empty), so the prediction carries no information about the text.
if not user_review_bow.any():
    print("\n Warning: no known vocabulary words found in the review; the prediction is not informative.")

# Predict using the model
user_sentiment=model.predict(user_review_bow)

# Output the cleaned text and the predicted sentiment
print("\nCleaned Review", user_review)
print("\n Predicted Sentiment: ", user_sentiment[0])


 Enter your review below: 

Enter your product reviewgood product, good camera, amazing experience

Cleaned Review good product good camera amazing experience

 Predicted Sentiment:  positive
