In [97]:
import pyforest
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings 
warnings.filterwarnings('ignore')
import nltk          # NLP package
from nltk.corpus import stopwords      # Stopwords are most common words which are to ignore. 
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score    # Evaluation - performance matrics
from sklearn.naive_bayes import GaussianNB  # Naive Bayes theorem 
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
data = pd.read_csv("Restaurant_Reviews.tsv",sep = "\t")
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1
4,The selection on the menu was great and so were the prices.,1


In [98]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIRAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [99]:
# from nltk.stem.porter import PorterStemmer
# we do have other stemmer model like Snowball.Stemmer. Checkout the differences.
ps = PorterStemmer()

In [100]:
corpus_stem = []

for i in range(len(data)):
    review = re.sub('[^a-zA-Z]', ' ',data['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    
    corpus_stem.append(review)
    
print(corpus_stem)  

['wow love place', 'crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place worth time let alon vega', 'like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could happier', 'seem like go

In [101]:
wordnet = WordNetLemmatizer()
corpus_lem = []

for i in range(len(data)):
    review = re.sub('[^a-zA-Z]', ' ',data['Review'][i])
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    
    corpus_lem.append(review)
    
print(corpus_lem)  



In [102]:
# from sklearn.feature_extraction.text import TfidfVectorizer
def Tfidf_nlp_fn(corpus):
    tfidf_model = TfidfVectorizer()
    return tfidf_model.fit_transform(corpus).toarray()
    

In [103]:
stem_x = Tfidf_nlp_fn(corpus_stem)
lem_x = Tfidf_nlp_fn(corpus_lem)

print(stem_x.shape, lem_x.shape)

(1000, 1565) (1000, 1766)


In [104]:
# from sklearn.feature_extraction.text import CountVectorizer

def bow_cv(corpus):
    cv = CountVectorizer()
    
    return cv.fit_transform(corpus_stem).toarray()

In [105]:
cv_stem_x = bow_cv(corpus_stem)
cv_lem_x = bow_cv(corpus_lem)

print(cv_stem_x.shape, cv_lem_x.shape)

(1000, 1565) (1000, 1565)


In [106]:
y = data.iloc[:, 1].values
y.shape

(1000,)

In [107]:
# splitting the data into training and test for model building and prediction 

# from sklearn.model_selection import train_test_split
def my_split(nlp_model_x):
    x_train, x_test, y_train, y_test = train_test_split(nlp_model_x, y,train_size=0.80,random_state=101)
    return x_train, x_test, y_train, y_test

In [108]:
# Im using only Lemmatazition not stemming in both BOW and tfidf
x_train_lem_tf, x_test_lem_tf, y_train_lem_tf, y_test_lem_tf = my_split(lem_x)
x_train_cv_lem, x_test_cv_lem, y_train_cv_lem, y_test_cv_lem = my_split(cv_lem_x)

In [110]:
# from sklearn.naive_bayes import GaussianNB
# Build the ML Model to predict the target value

def ml_model(x,y):
    classifier = GaussianNB()
    return classifier.fit(x, y)
classifier_lem_tf = ml_model(x_train_lem_tf, y_train_lem_tf)
classifier_cv_lem = ml_model(x_train_cv_lem, y_train_cv_lem)

In [111]:
# predict the train and test data sets

def predict_dt(model,x_train,x_test):
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    return y_pred_train,y_pred_test

In [112]:
y_pred_train_l_tf,y_pred_test_l_tf = predict_dt(classifier_lem_tf,x_train_lem_tf,x_test_lem_tf)
y_pred_train_l_cv,y_pred_test_l_cv = predict_dt(classifier_cv_lem,x_train_cv_lem,x_test_cv_lem)

In [113]:
def conf_matrix(y_train,y_pred_train,y_test,y_pred_test):
    print(f"Confusion Matrix for train data")
    print(confusion_matrix(y_train, y_pred_train))
    print("####"*25)
    print(f"Confusion Matrix for test data")
    print(confusion_matrix(y_test, y_pred_test))

In [114]:
print(f"Confusion Matrix for tf-idf Lemmatized data using NB ML model")
conf_matrix(y_train_lem_tf,y_pred_train_l_tf,y_test_lem_tf,y_pred_test_l_tf)
print("\n")
print("*"*100)
print("\n")
print(f"Confusion Matrix for BOW lemmatized data using NB ML model")
conf_matrix(y_train_cv_lem,y_pred_train_l_cv,y_test_cv_lem,y_pred_test_l_cv)

Confusion Matrix for tf-idf Lemmatized data using NB ML model
Confusion Matrix for train data
[[356  42]
 [  0 402]]
####################################################################################################
Confusion Matrix for test data
[[59 43]
 [19 79]]


****************************************************************************************************


Confusion Matrix for BOW lemmatized data using NB ML model
Confusion Matrix for train data
[[336  62]
 [  0 402]]
####################################################################################################
Confusion Matrix for test data
[[55 47]
 [19 79]]


In [115]:
def Classification_re(y_train,y_pred_train,y_test,y_pred_test):
    print(f"Classification Report for train data")
    print(classification_report(y_train, y_pred_train))
    print("####"*25)
    print(f"Classification Report for test data")
    print(classification_report(y_test, y_pred_test))

In [116]:
print(f"Classification Report for tf-idf Lemmatized data using NB ML model")
Classification_re(y_train_lem_tf,y_pred_train_l_tf,y_test_lem_tf,y_pred_test_l_tf)
print("\n")
print("*"*100)
print("\n")
print(f"Classification Report for BOW lemmatized data using NB ML model")
Classification_re(y_train_cv_lem,y_pred_train_l_cv,y_test_cv_lem,y_pred_test_l_cv)

Classification Report for tf-idf Lemmatized data using NB ML model
Classification Report for train data
              precision    recall  f1-score   support

           0       1.00      0.89      0.94       398
           1       0.91      1.00      0.95       402

    accuracy                           0.95       800
   macro avg       0.95      0.95      0.95       800
weighted avg       0.95      0.95      0.95       800

####################################################################################################
Classification Report for test data
              precision    recall  f1-score   support

           0       0.76      0.58      0.66       102
           1       0.65      0.81      0.72        98

    accuracy                           0.69       200
   macro avg       0.70      0.69      0.69       200
weighted avg       0.70      0.69      0.69       200



****************************************************************************************************




In [27]:
def accuracy_report(y_train,y_pred_train,y_test,y_pred_test):
    print(f"Accuracy for train data")
    print(classification_report(y_train, y_pred_train))
    print("####"*25)
    print(f"Accuracy for test data")
    print(classification_report(y_test, y_pred_test))

In [117]:
print(f"Accuracy for tf-idf Lemmatized data using NB ML model")
accuracy_report(y_train_lem_tf,y_pred_train_l_tf,y_test_lem_tf,y_pred_test_l_tf)
print("\n")
print("*"*100)
print("\n")
print(f"Accuracy for BOW lemmatized data using NB ML model")
accuracy_report(y_train_cv_lem,y_pred_train_l_cv,y_test_cv_lem,y_pred_test_l_cv)

Accuracy for tf-idf Lemmatized data using NB ML model
Accuracy for train data
              precision    recall  f1-score   support

           0       1.00      0.89      0.94       398
           1       0.91      1.00      0.95       402

    accuracy                           0.95       800
   macro avg       0.95      0.95      0.95       800
weighted avg       0.95      0.95      0.95       800

####################################################################################################
Accuracy for test data
              precision    recall  f1-score   support

           0       0.76      0.58      0.66       102
           1       0.65      0.81      0.72        98

    accuracy                           0.69       200
   macro avg       0.70      0.69      0.69       200
weighted avg       0.70      0.69      0.69       200



****************************************************************************************************


Accuracy for BOW lemmatized data using 

In [29]:
# cross validation (k-fold method)
# from sklearn.model_selection import cross_val_score

accuracy = cross_val_score(classifier_lem_tf, x_test_lem_tf, y_pred_test_l_tf, cv = 15)
accuracy

array([0.35714286, 0.64285714, 0.35714286, 0.5       , 0.42857143,
       0.30769231, 0.46153846, 0.53846154, 0.69230769, 0.30769231,
       0.84615385, 0.61538462, 0.53846154, 0.46153846, 0.38461538])

In [30]:
accuracy = cross_val_score(classifier_cv_lem, x_test_cv_lem, y_pred_test_l_cv, cv = 15)
accuracy

array([0.35714286, 0.71428571, 0.35714286, 0.64285714, 0.57142857,
       0.46153846, 0.46153846, 0.38461538, 0.69230769, 0.23076923,
       0.69230769, 0.53846154, 0.38461538, 0.53846154, 0.53846154])

In [31]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(x_train_lem_tf,y_train_lem_tf)
y_pred = lr_model.predict(x_test_lem_tf)
lr_acc_score= accuracy_score(y_test_lem_tf,y_pred)

print("Accuracy Score:", round(lr_acc_score * 100,2),"%")
print("Confusion Matrix")
lr_cm = confusion_matrix(y_test_lem_tf, y_pred)
print(lr_cm)

Accuracy Score: 80.5 %
Confusion Matrix
[[82 20]
 [19 79]]


In [32]:
rfc_model = RandomForestClassifier(n_estimators=1000,random_state = 101,
                                max_features = 15, min_samples_split = 30)
rfc_model.fit(x_train_lem_tf,y_train_lem_tf)
y_pred = rfc_model.predict(x_test_lem_tf)
rfc_acc_score= accuracy_score(y_test_lem_tf,y_pred)
print("Accuracy Score:", round(rfc_acc_score * 100,2),"%")
print("Confusion Matrix")
rfc_cm = confusion_matrix(y_test_lem_tf, y_pred)
print(rfc_cm)

Accuracy Score: 79.5 %
Confusion Matrix
[[85 17]
 [24 74]]


In [33]:
from sklearn.svm import SVC
svc_model = SVC(C=100, gamma='auto')
svc_model.fit(x_train_lem_tf,y_train_lem_tf)
y_pred = svc_model.predict(x_test_lem_tf)
svc_acc_score= accuracy_score(y_test_lem_tf,y_pred)
print("Accuracy Score:", round(svc_acc_score * 100,2),"%")
print("Confusion Matrix")
svc_cm = confusion_matrix(y_test_lem_tf, y_pred)
print(svc_cm)

Accuracy Score: 69.0 %
Confusion Matrix
[[98  4]
 [58 40]]


In [47]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(x_train_lem_tf,y_train_lem_tf)
y_pred = nb_model.predict(x_test_lem_tf)
lr_acc_score= accuracy_score(y_test_lem_tf,y_pred)

print("Accuracy Score:", round(lr_acc_score * 100,2),"%")
print("Confusion Matrix")
lr_cm = confusion_matrix(y_test_lem_tf, y_pred)
print(lr_cm)

Accuracy Score: 69.0 %
Confusion Matrix
[[59 43]
 [19 79]]


# Using Pipeline¶
Pipeline is used to merge NLP model and ML model where tf-idf will convert the words to vector and predicted by given ML model

# PIPELINE+Randomforest

In [48]:
x_train_pipe, x_test_pipe, y_train_pipe, y_test_pipe = my_split(data['Review'])

In [35]:
classifier = Pipeline([('tfidf',TfidfVectorizer()),('classifier',
                                RandomForestClassifier(n_estimators = 500,max_depth = None,
                                max_features = 5, min_samples_split = 20))])

In [36]:
classifier.fit(x_train_pipe,y_train_pipe)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier',
                 RandomForestClassifier(max_features=5, min_samples_split=20,
                                        n_estimators=500))])

In [37]:
y_pred = classifier.predict(x_test_pipe)

In [38]:
print(confusion_matrix(y_test_pipe,y_pred))

[[71 31]
 [13 85]]


In [39]:
print(accuracy_score(y_test_pipe,y_pred))

0.78


# PIPELINE+SVC

In [133]:
classifier = Pipeline([('tfidf',TfidfVectorizer()),('classifier',
                                SVC( probability=True))])

In [134]:
classifier.fit(x_train_pipe,y_train_pipe)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', SVC(probability=True))])

In [135]:
y_pred = classifier.predict(x_test_pipe)

In [136]:
print(confusion_matrix(y_test_pipe,y_pred))

[[83 19]
 [15 83]]


In [137]:
print(accuracy_score(y_test_pipe,y_pred))

0.83


In [138]:
# create an iterator object with write permission - model.pkl
with open('sentiment_model_pipeline', 'wb') as files:
    pickle.dump(classifier, files)

<IPython.core.display.Javascript object>

# Streamlit App

In [139]:
%%writefile app.py

# import packages
import streamlit as st
import os
import numpy as np
 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 
# text preprocessing modules
from string import punctuation
import pickle
# text preprocessing modules
from nltk.tokenize import word_tokenize
 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re  # regular expression
import joblib
 
import warnings
 
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)
 
# load stop words
stop_words = stopwords.words("english")

# function to clean the text
@st.cache
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word
 
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"http\S+", " link ", text)
    text = re.sub(r"\b\d+(?:\.\d+)?\s+", "", text)  # remove numbers
 
    # Remove punctuation from text
    text = "".join([c for c in text if c not in punctuation])
 
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
 
    # Optionally, shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
 
    # Return a list of words
    return text
# functon to make prediction
@st.cache
def make_prediction(review):
 
    # clearn the data
    clean_review = text_cleaning(review)
 
    # load the model and make prediction
    with open('sentiment_model_pipeline' , 'rb') as f:
        model = pickle.load(f)
#     model = joblib.load("sentiment_model_pipeline.pkl")
 
    # make prection
    result = model.predict([clean_review])
 
    # check probabilities
    
    probas = model.predict_proba([clean_review])
    probability = "{:.2f}".format(float(probas[:, result]))
 
    return result, probability

# Set the app title
st.title("Sentiment Analyisis App")
st.write(
    "A simple machine laerning app to predict the sentiment of a movie's review")

# Declare a form to receive a movie's review
form = st.form(key="my_form")
review = form.text_input(label="Enter the text of your movie review")
submit = form.form_submit_button(label="Make Prediction")

if submit:
    # make prediction from the input text
    result, probability = make_prediction(review)
 
    # Display results of the NLP task
    st.header("Results")
 
    if int(result) == 1:
        st.write("This is a positive review with a probabiliy of ", probability)
    else:
        st.write("This is a negative review with a probabiliy of ", probability)

Overwriting app.py


In [None]:
!streamlit run app.py 