In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
# read the data set
review_df = pd.read_csv('review_senti.csv')
review_df.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


In [3]:
# check the info of the data set
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205052 entries, 0 to 205051
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   product_name   205052 non-null  object
 1   product_price  205052 non-null  object
 2   Rate           205052 non-null  object
 3   Review         180388 non-null  object
 4   Summary        205041 non-null  object
 5   Sentiment      205052 non-null  object
dtypes: object(6)
memory usage: 9.4+ MB


In [4]:
review_df.describe()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
count,205052,205052,205052,180388,205041,205052
unique,958,525,8,1324,92923,3
top,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,5,wonderful,good,positive
freq,6005,9150,118765,9016,17430,166581


In [5]:
# find the total number of null  value
review_df.isnull().sum()

product_name         0
product_price        0
Rate                 0
Review           24664
Summary             11
Sentiment            0
dtype: int64

In [6]:
# filter the data and take only the rate , sentiment and summary
filter_df = review_df[['Rate','Summary','Sentiment']]
print(filter_df.head())

  Rate                                            Summary Sentiment
0    5  great cooler excellent air flow and for this p...  positive
1    5              best budget 2 fit cooler nice cooling  positive
2    3  the quality is good but the power of air is de...  positive
3    1                  very bad product its a only a fan  negative
4    3                                      ok ok product   neutral


In [7]:
# find the total number of null  value in filter data set
print(filter_df.isnull().sum())

Rate          0
Summary      11
Sentiment     0
dtype: int64


In [8]:
# drop the null value from the data set
filter_df.dropna(inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df.dropna(inplace= True)


In [9]:
# check if the data set has null value 
print(filter_df.isnull().sum())

Rate         0
Summary      0
Sentiment    0
dtype: int64


In [10]:
# convert the rate column into the numeric from
filter_df['Rate'] = pd.to_numeric(filter_df['Rate'], errors='coerce')
df = filter_df[filter_df['Rate'].isin([1, 2, 3, 4, 5])]
print(df['Rate'].value_counts())

Rate
5.0    118758
4.0     41891
1.0     21300
3.0     16599
2.0      6490
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df['Rate'] = pd.to_numeric(filter_df['Rate'], errors='coerce')


In [11]:
# check the sentiment number
print(filter_df['Sentiment'].value_counts())

Sentiment
positive    166575
negative     28232
neutral      10234
Name: count, dtype: int64


In [12]:
# seperate the sentiment 
positive = filter_df[filter_df['Sentiment']=='positive']
negative = filter_df[filter_df['Sentiment']=='negative']
neutral = filter_df[filter_df['Sentiment']=='neutral']


In [13]:
# check the individual number of sentiment
print(positive['Sentiment'].count(),
negative['Sentiment'].count(),
neutral['Sentiment'].count())

166575 28232 10234


In [14]:
# fix the number of individual sentiment
min_sample = neutral['Sentiment'].count()
print(min_sample)

10234


In [15]:
# concate the all the sentiment
df = pd.concat([positive.sample(min_sample),
                         negative.sample(min_sample), 
                         neutral.sample(min_sample)], axis=0)
df.sample(10)

Unnamed: 0,Rate,Summary,Sentiment
203121,5.0,nyc set,positive
37084,1.0,quality not upto mark,neutral
105172,1.0,outside fridge fully heat and water drops come...,negative
157409,2.0,disappointed bubseeing plastic connecter it wa...,negative
55343,3.0,no bad,neutral
15840,2.0,food over burn at bottom,negative
49920,1.0,quality is not good like as used,negative
987,5.0,best cooler air delivery high other cooler air...,positive
204520,5.0,very nice as aspected,positive
48004,2.0,small projects,neutral


In [16]:
# check if all the sentiment have same number
print(df['Sentiment'].value_counts())

Sentiment
positive    10234
negative    10234
neutral     10234
Name: count, dtype: int64


In [17]:
# set levet to the sentiment 
lvl_encoder = LabelEncoder()
df['Sentiment'] = lvl_encoder.fit_transform(df['Sentiment'])
print(df.head(5))

        Rate                                            Summary  Sentiment
114858   5.0  i am really happy with the performance of this...          2
32324    4.0      no second thoughts its cheap and and it works          2
11198    3.0                                          good work          2
33810    4.0                          worth for the price rs799          2
171684   5.0                                              happy          2


In [18]:
# print the mapping of sentiment labels to their encoded values
for i, class_label in enumerate(lvl_encoder.classes_):
    print(f"{class_label} -> {i}")

negative -> 0
neutral -> 1
positive -> 2


In [19]:
# check the info
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30702 entries, 114858 to 174863
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Rate       30702 non-null  float64
 1   Summary    30702 non-null  object 
 2   Sentiment  30702 non-null  int32  
dtypes: float64(1), int32(1), object(1)
memory usage: 839.5+ KB


In [20]:
# download the stopword
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FAHAD\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
# remove stopwords and apply stemming
stemer = PorterStemmer()
def steming(content):

    if not isinstance(content, str):
        content = ''
    stem_content = re.sub('[^a-zA-Z]',' ', content)
    stem_content = stem_content.lower()
    stem_content = stem_content.split()
    stem_content = [stemer.stem(word) for word in stem_content if not word in stopwords.words('english')]
    stem_content = ' '.join(stem_content)

    return stem_content

In [22]:
# appling the steming function
df['Summary'] = df['Summary'].apply(steming)
df.head()

Unnamed: 0,Rate,Summary,Sentiment
114858,5.0,realli happi perform led tvit qualiti best sma...,2
32324,4.0,second thought cheap work,2
11198,3.0,good work,2
33810,4.0,worth price rs,2
171684,5.0,happi,2


In [23]:
# seperate the the summary as x for model traning
x= df['Summary'].values
print(x)

['realli happi perform led tvit qualiti best smart tv also thank flipkart deliveri time'
 'second thought cheap work' 'good work' ... 'averag product'
 'livpur water purifi super water purifi cove avail anay market even onlin alsow avail kindli provid cover market'
 'ok']


In [24]:
# seperate the the sentiment as y for model traning
y=  df['Sentiment'].values
print(y)

[2 2 2 ... 1 1 1]


In [25]:
# split the x , y into the train and text group
x_train, x_test, y_train, y_text = train_test_split(x, y, test_size=.2, stratify=y, random_state=2)
print(x_train.shape)

(24561,)


In [26]:
# initialize the TfidfVectorizer
vectorize = TfidfVectorizer()

# fit the vectorizer on the training data and transform it
x_train = vectorize.fit_transform(x_train)

# transform the test data using the fitted vectorizer
x_test = vectorize.transform(x_test)


In [27]:
# initialize the model
lg = LogisticRegression()
lg.fit(x_train, y_train)

# predict on the train data
x_train_prdiction_lg = lg.predict(x_train)
traning_data_accuracy_lg = accuracy_score(y_train, x_train_prdiction_lg)
print('train-->',traning_data_accuracy_lg)

# predict on the test data
x_text_prediction_lg = lg.predict(x_test)
text_data_accuracy_lg = accuracy_score(y_text, x_text_prediction_lg)
print('test-->',text_data_accuracy_lg)

train--> 0.8373437563617117
test--> 0.7816316560820713


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# initialize the model
rf = RandomForestClassifier(
    n_estimators=350,    # number of trees in the forest
    max_depth=25,        # maximum depth of the tree
    min_samples_split=5, # minimum number of samples required to split an internal node
    min_samples_leaf=2,  # minimum number of samples required to be at a leaf node
    bootstrap=True       # bootstrap samples are used when building trees
)

# fit the model to the training data
rf.fit(x_train, y_train)

# predict on the train data
x_train_prdiction_rf = rf.predict(x_train)
traning_data_accuracy_rf = accuracy_score(y_train, x_train_prdiction_rf)
print('train-->',traning_data_accuracy_rf)

# predict on the test data
x_text_prediction_rf = rf.predict(x_test)
text_data_accuracy_rf = accuracy_score(y_text, x_text_prediction_rf)
print('test-->',text_data_accuracy_rf)


train--> 0.7751313057285941
test--> 0.763393584106823


In [29]:
# initialize the model
xgb = XGBClassifier(
    n_estimators=450,       # number of boosting rounds (trees)
    max_depth=8,            # maximum depth of a tree
    learning_rate=0.15,     # step size shrinkage used to prevent overfitting
    subsample=0.75,         # subsample ratio of the training instance
)

# fit the model to the training data
xgb.fit(x_train, y_train)

# predict on the train data
x_train_prdiction_xgb = xgb.predict(x_train)
traning_data_accuracy_xgb = accuracy_score(y_train, x_train_prdiction_xgb)
print('train-->',traning_data_accuracy_rf)

# predict on the test data
x_text_prediction_xgb = xgb.predict(x_test)
text_data_accuracy_xgb = accuracy_score(y_text, x_text_prediction_xgb)
print('test-->',text_data_accuracy_xgb)


train--> 0.7751313057285941
test--> 0.782608695652174


In [32]:
# import the save model

import pickle
model = 'model.sav'
vectorizer = 'vectorizer.sav'
pickle.dump(xgb,open(model, 'wb'))
pickle.dump(vectorize,open(vectorizer, 'wb'))


In [31]:
# load the model
'''
model = pickle.load(open('model.sav','rb'))
prd = model.predict(x_test[20])
print(prd)

'''


"\nmodel = pickle.load(open('model.sav','rb'))\nprd = model.predict(x_test[20])\nprint(prd)\n\n"