In [2]:
#Natural Language Processing Project
#Sentiment Analysis on Google Play Store Applications user Review Data
#Dataset Collected from Kaggle
#Sentiment Analysis using NLTK

In [3]:
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#importing libraries for Data Cleaning & NLTK Processing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to C:\Users\Chirag
[nltk_data]     mahawar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#importing the dataset
dataset = pd.read_csv('googleplaystore_user_reviews.csv')

In [5]:
dataset.head()

Unnamed: 0,Translated_Review,Sentiment
0,I like eat delicious food. That's I'm cooking ...,Positive
1,This help eating healthy exercise regular basis,Positive
2,,
3,Works great especially going grocery store,Positive
4,Best idea us,Positive


In [6]:
#Dropping NA
dataset=dataset.dropna(axis=0)

In [7]:
data=dataset.values

In [8]:
len(data)

37427

In [9]:
#Cleaning the dataset
def clean(data):
    corpus = []
    all_stop=stopwords.words('english')
    for i in range(len(data)):
        review = re.sub('[^a-zA-Z]', ' ',data[i] )
        review = review.lower()
        review = review.split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in all_stop]
        review=[rev for rev in review if len(rev)>1]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [10]:
data=clean(dataset.iloc[:,0].values)

In [11]:
len(data)

37427

In [12]:
data[0]

'like eat delici food cook food case best food help lot also best shelf life'

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
le=LabelEncoder()

In [15]:
labels=le.fit_transform(dataset.iloc[:,1])

In [16]:
labels.shape

(37427,)

In [17]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)                      
X = cv.fit_transform(data).toarray()

In [18]:
#splitting the dataset into training & test set
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , labels , test_size = 0.20 , random_state = 0)

In [19]:
#using random Forest Classifier as Classification Model for NLP
#Fitting the Random Forest to the training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(30)
classifier.fit(X_train , y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
#predicting the test set results
y_pred = classifier.predict(X_test)

In [21]:
#making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test , y_pred)

In [22]:
cm

array([[1323,   61,  303],
       [  29,  914,  117],
       [ 106,   94, 4539]], dtype=int64)

In [23]:
#Calculating Accuracy of the Model
from sklearn.metrics import accuracy_score
print(round(accuracy_score(y_test,y_pred)*100,2),"%",sep=" ")

90.52 %


In [30]:
classifier.predict(cv.transform(clean(["I badly hate this app"])))

array([0], dtype=int64)

In [25]:
le.classes_

array(['Negative', 'Neutral', 'Positive'], dtype=object)

In [26]:
classifier.predict(cv.transform(clean(["I think I am loving this app"])))

array([2], dtype=int64)

In [28]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
X.shape

(37427, 1500)