In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from wordcloud import WordCloud
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import pickle
import re

: 

In [None]:
# Load the Amazon unlocked-mobile reviews CSV from the working directory
# and preview its first five rows.
data = pd.read_csv('Amazon_Unlocked_Mobile.csv')
data.head(n=5)

In [None]:
# Report the frame's (rows, columns) shape and its column labels.
print(
    f"Dataset shape: {data.shape}",
    f"Feature names: {data.columns.values}",
    sep="\n",
)

In [None]:
# Count the missing values in every column.
data.isna().sum()

In [None]:
# Show the records whose 'Reviews' text is missing.
data.loc[data['Reviews'].isna()]

In [None]:
# Drop every row that has a missing value in any column (dropna's default),
# modifying `data` in place.
data.dropna(inplace = True)
# Shape of the dataset after the null rows have been removed
print(f"Dataset shape: {data.shape}")

In [None]:
# Add a 'length' column holding the character count of each 'Reviews' string.
data['length'] = data['Reviews'].map(len)
data.head(n=5)

In [None]:
# Sanity-check the 'length' column against len() for the record at position 10.
print(f"'Reviews' column value: {data['Reviews'].iloc[10]}")  # original review text
print(f"Length of Review: {len(data['Reviews'].iloc[10])}")  # length recomputed with len()
print(f"'length' column value : {data['length'].iloc[10]}")  # value stored in 'length'

In [None]:
# Inspect the dtype pandas assigned to each column.
data.dtypes

ANALYZING THE RATING COLUMN

In [None]:
# Number of records remaining in the dataset.
data.shape[0]

In [None]:
# Distinct 'Rating' values and how many records carry each one.
print("Rating value count: ", data['Rating'].value_counts(), sep="\n")

In [None]:
# Bar plot of the total number of records for each rating.
# `plt` comes from the notebook's import cell, so it is not re-imported here.
data['Rating'].value_counts().plot.bar(color = 'blue')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Total count of each rating')
plt.show()

In [None]:
# Percentage distribution of each rating: the number of records for each
# rating divided by the total number of records.
# The separator is a real newline ("\n"); an escaped "\\n" would print the
# two characters backslash + n in front of the table.
print(f"Rating value count - percentage distribution: \n{round(data['Rating'].value_counts()/data.shape[0]*100,2)}")

In [None]:
# Pie chart: share of records for each distinct 'Review Votes' value.
fig = plt.figure(figsize=(7, 7))

# Slice colours and wedge outline styling.
# NOTE(review): only two colours are listed, which presumably assumes a
# binary 'Review Votes' column - confirm against the data.
colors = ['red', 'green']
wp = dict(linewidth=1, edgecolor='black')

# Fraction of all records that carry each 'Review Votes' value
tags = data['Review Votes'].value_counts() / len(data)

# One offset per slice, so `explode` always matches the number of slices
explode = [0.1 for _ in range(len(tags))]

# Draw the pie
tags.plot(
    kind='pie',
    autopct="%1.1f%%",
    shadow=True,
    colors=colors,
    startangle=90,
    wedgeprops=wp,
    explode=explode,
)

plt.title('Percentage-wise distribution of Review Votes')
plt.ylabel('')  # drop the default y-axis label
plt.show()


ANALYZING THE FEEDBACK ('Review Votes') COLUMN

# Distinct 'Review Votes' values and how many records carry each one.
print("Feedback value count: ", data['Review Votes'].value_counts(), sep="\n")

In [None]:
# Text of the second record (position 1) among those with 'Review Votes' == 0.
# Note: the variable is named review_1 although it holds a votes == 0 review.
review_1 = data.loc[data['Review Votes'] == 0, 'Reviews'].iloc[1]
print(review_1)

In [None]:
# Text of the second record (position 1) among those with 'Review Votes' == 1.
# Note: the variable is named review_0 although it holds a votes == 1 review.
review_0 = data.loc[data['Review Votes'] == 1, 'Reviews'].iloc[1]
print(review_0)

In [None]:
# Bar graph of how many records fall into each 'Review Votes' range.
# pd.cut builds right-closed intervals and, by default, leaves the lowest edge
# open, so with bins=[0, 5, 10, 20] every record with 0 votes and every record
# with more than 20 votes becomes NaN and silently disappears from the counts.
# include_lowest=True keeps the 0-vote records in the first bin and the np.inf
# edge collects everything above 20.
data['Review Votes Binned'] = pd.cut(
    data['Review Votes'],
    bins=[0, 5, 10, 20, np.inf],
    include_lowest=True,
)
data['Review Votes Binned'].value_counts().sort_index().plot.bar(color='red')

plt.xlabel('Review Votes')
plt.ylabel('Count')
plt.title('Total count of each Review Votes')
plt.show()

In [None]:
# Percentage distribution of each 'Review Votes' value: records per value
# divided by the total number of records.
print(
    "Feedback value count - percentage distribution: ",
    (data['Review Votes'].value_counts() / data.shape[0] * 100).round(2),
    sep="\n",
)

In [None]:
# Pie chart of the share of each 'Review Votes' value.
# Reuses `tags`, `colors` and `wp` defined in the earlier pie-chart cell;
# `plt` comes from the notebook's import cell, so it is not re-imported here.
num_tags = len(tags)

# One offset per slice so `explode` matches the number of slices
explode = (0.1,) * num_tags

# Plotting
tags.plot(kind='pie', autopct="%1.1f%%", shadow=True, colors=colors,
          startangle=90, wedgeprops=wp, explode=explode)

plt.title('Percentage-wise Distribution of Feedback')
plt.ylabel('')  # Hides the y-label
plt.show()


In [None]:
# Rating counts among the records with 'Review Votes' == 0
data.loc[data['Review Votes'] == 0, 'Rating'].value_counts()

In [None]:
# Rating counts among the records with 'Review Votes' == 1
data.loc[data['Review Votes'] == 1, 'Rating'].value_counts()

ANALYZING VARIATION ('Brand Name' column)

# Distinct 'Brand Name' values and how many records carry each one.
print("Variation value count: \n " + str(data['Brand Name'].value_counts()))

In [None]:
# Bar chart of the brands with the most records.
# top_n drives both the selection and the title so the two cannot disagree.
# It is set to 5, the number of rows head() returns by default; raise it to
# show more brands.
top_n = 5
data['Brand Name'].value_counts().head(top_n).plot.bar(color='red')
plt.xlabel('Brand Name')
plt.ylabel('Count')
plt.title(f'Top {top_n} Brands by Count')
plt.show()


In [None]:
# Percentage distribution of each brand: records per brand divided by the
# total number of records.
print(
    "Variation value count - percentage distribution: ",
    (data['Brand Name'].value_counts() / data.shape[0] * 100).round(2),
    sep="\n",
)

In [None]:
# Mean rating for every brand.
data.groupby('Brand Name')['Rating'].agg('mean')

In [None]:
# Bar chart of the mean rating per brand, ordered from lowest to highest.
(
    data.groupby('Brand Name')['Rating']
    .mean()
    .sort_values(ascending=True)
    .plot(kind='bar', color='brown', figsize=(11, 6))
)
plt.title("Mean rating according to variation")
plt.xlabel('Brand Name')
plt.ylabel('Mean rating')
plt.show()

In [None]:
# Summary statistics of the review-length column.
data['length'].describe()


In [None]:
# Histogram of review length over all records.
sns.histplot(data=data, x='length', color='blue').set(title='Distribution of length of review ')

In [None]:
# Histogram of review length for the records with 'Review Votes' == 0.
sns.histplot(
    data=data[data['Review Votes'] == 0],
    x='length',
    color='red',
).set(title='Distribution of length of review if feedback = 0')

In [None]:
# Histogram of review length for the records with 'Review Votes' == 1.
sns.histplot(
    data=data[data['Review Votes'] == 1],
    x='length',
    color='green',
).set(title='Distribution of length of review if feedback = 1')

In [None]:
# Histogram of the mean rating computed for each distinct review length.
# The x-axis is the mean rating and the y-axis is how many distinct lengths
# fall into each bin (a frequency), so it must not be labelled 'length'.
data.groupby('length')['Rating'].mean().plot.hist(color = 'blue', figsize=(7, 6), bins = 20)
plt.title(" Review length wise mean ratings")
plt.xlabel('Mean rating')
plt.ylabel('Number of distinct review lengths')
plt.show()

In [None]:
# Fit a bag-of-words vectorizer (English stop words removed) on the raw reviews.
# NOTE(review): `words` is not read anywhere in the visible part of the
# notebook and `cv` is re-created in the modelling section - confirm this cell
# is still needed.
cv = CountVectorizer(stop_words='english')
words = cv.fit_transform(data['Reviews'])

In [None]:
# Concatenate every review into one space-separated string.
all_reviews = ' '.join(data['Reviews'].tolist())

In [None]:
# Word cloud of (at most) the 50 most frequent words across all reviews.
wordcloud = WordCloud(max_words=50, background_color='white')

# Build the cloud exactly once: generate() is the expensive step and returns
# the WordCloud object itself, so calling it again inside imshow() would only
# repeat the work.
wordcloud.generate(all_reviews)

plt.figure(figsize=(10, 10))
plt.title('Wordcloud for all reviews', fontsize=10)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()  # same ending as the other word-cloud cells; hides the axis() repr

Finding unique words in each category

In [None]:
# Join the reviews of each feedback category into one string, then split it
# on whitespace to get the individual words.
positive_reviews = ' '.join(data.loc[data['Review Votes'] == 1, 'Reviews']).split()

negative_reviews = ' '.join(data.loc[data['Review Votes'] == 0, 'Reviews']).split()


In [None]:
# Words that occur in one feedback category only.
# Membership is tested against sets: `word not in <list>` rescans the whole
# list for every word, which is quadratic in the number of words. The
# resulting lists (order and duplicates included) are unchanged.
positive_vocab = set(positive_reviews)
negative_vocab = set(negative_reviews)

unique_positive_words = [word for word in positive_reviews if word not in negative_vocab]
unique_positive = " ".join(unique_positive_words)

unique_negative_words = [word for word in negative_reviews if word not in positive_vocab]
unique_negative = " ".join(unique_negative_words)

In [None]:
# Word cloud (at most 50 words) of the words found only in negative reviews.
wordcloud = WordCloud(max_words=50, background_color='white')
wordcloud.generate(unique_negative)

plt.figure(figsize=(10, 10))
plt.title('Wordcloud for negative reviews', fontsize=10)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

negative words can be found above

In [None]:
# Word cloud (at most 50 words) of the words found only in positive reviews.
wordcloud = WordCloud(max_words=50, background_color='white')
wordcloud.generate(unique_positive)

plt.figure(figsize=(10, 10))
plt.title('Wordcloud for positive reviews', fontsize=10)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Positive words can be found above

PREPROCESSING AND MODELLING
- To build the corpus from the 'Reviews' we perform the following :
 1. Replace any non alphabet characters with a space
 2. Convert to lower case and split into words 
 3. Iterate over the individual words and if it is not a stopword then add the stemmed form of the word to the corpus

# Build the corpus: one cleaned, stemmed, stopword-free string per review.
# Iterating over the 'Reviews' column directly avoids a positional row lookup
# (data.iloc[i]) on every iteration; the resulting corpus is the same.
corpus = []
stemmer = PorterStemmer()
for review in data['Reviews']:
    # 1. Replace every non-alphabetic character with a space
    review = re.sub('[^a-zA-Z]', ' ', review)
    # 2. Lower-case and split into individual words
    review = review.lower().split()
    # 3. Drop stopwords and keep the stemmed form of the remaining words
    review = [stemmer.stem(word) for word in review if word not in STOPWORDS]
    corpus.append(' '.join(review))

Using Count Vectorizer to create bag of words

In [None]:
# Bag-of-words model capped at the 2500 most frequent terms of the corpus.
cv = CountVectorizer(max_features=2500)

# Independent variables: dense term-count matrix; dependent variable: 'Review Votes'
X = cv.fit_transform(corpus).toarray()
y = data['Review Votes'].to_numpy()

Checking X and Y shape

In [None]:
# Save the fitted CountVectorizer.
# The `with` block closes the file handle (and flushes the pickle to disk)
# even if dump() raises; a bare open(...) passed to dump() is never closed
# explicitly. The 'Models' directory must already exist.
with open('Models/countVectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [None]:
# Shapes of the feature matrix and the target vector.
print(f"X shape {X.shape}", f"Y shape {y.shape}", sep="\n")

Splitting data into train and test set

# Hold out 30% of the records for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=15,
)
print(
    f"X train: {X_train.shape}",
    f"y train: {y_train.shape}",
    f"X test: {X_test.shape}",
    f"y test: {y_test.shape}",
    sep="\n",
)

In [None]:
# Largest term count in each split, checked before the features are scaled.
print(f"X train max value: {np.max(X_train)}")
print(f"X test max value: {np.max(X_test)}")

In [None]:
# Min-max scale the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scl = scaler.transform(X_train)
X_test_scl = scaler.transform(X_test)

In [None]:
# Save the fitted scaler.
# The `with` block closes the file handle (and flushes the pickle to disk)
# even if dump() raises; a bare open(...) passed to dump() is never closed
# explicitly. The 'Models' directory must already exist.
with open('Models/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

Random Forest

# Fit a Random Forest classifier on the scaled training features.
# random_state pins the forest's internal randomness so the accuracies
# reported below are reproducible across kernel restarts; 15 matches the
# seed used for the train/test split.
model_rf = RandomForestClassifier(random_state=15)
model_rf.fit(X_train_scl, y_train)

In [None]:
# Mean accuracy of the fitted forest on the training and the test split.
print(f"Training Accuracy : {model_rf.score(X_train_scl, y_train)}")
print(f"Testing Accuracy : {model_rf.score(X_test_scl, y_test)}")

In [None]:
# Predict labels for the scaled test features with the fitted Random Forest.
y_preds = model_rf.predict(X_test_scl)

In [None]:
#Confusion Matrix
cm = confusion_matrix(y_test, y_preds)