In [44]:
import joblib
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import pickle
import numpy as np
import re

In [45]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Fetch the Punkt tokenizer data needed by word_tokenize.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Text preprocessing:

In [47]:
# Start from NLTK's English stop-word list.
stpwrds = stopwords.words('english')

# Negation words carry sentiment, so they must survive stop-word filtering:
# take each of them out of the list.
for negation in ("not", "nor", "no"):
    stpwrds.remove(negation)

In [48]:
# Apostrophe characters accepted inside a contraction: the ASCII apostrophe
# plus the typographic variants (U+2019, U+02BC) found in copy-pasted text.
_APOSTROPHE = "['\u2019\u02bc]"

# (pattern, replacement) pairs, applied in order. The two irregular forms come
# first because the generic "n't" rule would otherwise turn "won't" into
# "wo not" and "can't" into "ca not".
_CONTRACTION_RULES = [
    # specific
    (re.compile(rf"won{_APOSTROPHE}t", re.IGNORECASE), "will not"),
    (re.compile(rf"can{_APOSTROPHE}t", re.IGNORECASE), "can not"),
    # general
    (re.compile(rf"n{_APOSTROPHE}t", re.IGNORECASE), " not"),
    (re.compile(rf"{_APOSTROPHE}re", re.IGNORECASE), " are"),
    (re.compile(rf"{_APOSTROPHE}s", re.IGNORECASE), " is"),
    (re.compile(rf"{_APOSTROPHE}d", re.IGNORECASE), " would"),
    (re.compile(rf"{_APOSTROPHE}ll", re.IGNORECASE), " will"),
    (re.compile(rf"{_APOSTROPHE}t", re.IGNORECASE), " not"),
    (re.compile(rf"{_APOSTROPHE}ve", re.IGNORECASE), " have"),
    (re.compile(rf"{_APOSTROPHE}m", re.IGNORECASE), " am"),
]


def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Matching is case-insensitive and accepts both the ASCII apostrophe and
    typographic apostrophes, so "Won't" and "can\u2019t" are expanded as well.
    Replacements are always lowercase. The "'s" rule is a heuristic: it
    expands to " is" even when the "'s" is possessive.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions.

    Returns
    -------
    str
        The text with every matched contraction replaced by its expansion.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase

**Apply stemming:**

Here we apply stemming, which reduces inflected words to their root form (i.e., the word stem):

In [49]:
# One shared Porter stemmer instance; stemming() calls porter.stem on each token.
porter = PorterStemmer()

In [50]:
def stemming(sent):
    """Return ``sent`` with every token replaced by its Porter stem.

    The sentence is split with ``word_tokenize`` and the stems are re-joined
    with single spaces.
    """
    stems = [porter.stem(token) for token in word_tokenize(sent)]
    return " ".join(stems)

In [51]:
def preprocess(sent):
    """Clean one raw review string into stemmed, space-separated text.

    Steps, in order: lowercase, strip URLs, strip HTML markup, expand
    contractions, drop tokens containing digits, drop stop words (per the
    module-level ``stpwrds`` list), and stem.

    Parameters
    ----------
    sent : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text as returned by ``stemming``.
    """
    X = sent.lower()
    # removing all the urls since they are not useful in classification
    X = re.sub(r"http\S+", "", X)

    # keep only the visible text, discarding all the html tags
    X = BeautifulSoup(X, 'lxml').get_text()

    # Decontracting
    X = decontracted(X)

    # removing words with numbers and then removing extra spaces at start and end
    # (raw string: "\S" and "\d" are invalid escapes in a plain string literal
    # and trigger a SyntaxWarning on recent Python versions)
    X = re.sub(r"\S*\d\S*", "", X).strip()

    # drop stop words
    X = ' '.join(w for w in X.split() if w not in stpwrds)

    X = stemming(X)
    return X

In [52]:
# Reuse the bag-of-words vectorizer and normalization scaler fitted during
# training (rather than refitting here) to avoid data leakage.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file
# from a trusted location.
bow_scaler_path = "/content/drive/MyDrive/applied ai/myself work/amazon fine food reviews/saved_vectorizer/bow_scaler.pkl"
with open(bow_scaler_path, 'rb') as pickle_file:
    bow, bow_scaler = pickle.load(pickle_file)

In [53]:
# Load the serialized classifier; the interactive loop below queries it via predict_proba.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
best_model = joblib.load("/content/drive/MyDrive/applied ai/myself work/amazon fine food reviews/saved_models/best_lgr.sav")

# Testing:

In [54]:
# Interactive check: classify typed reviews until the user enters 'q'.
while True:
    query = input("Input query:").strip()
    if query == 'q':
        break
    if not query:
        # Skip blank input rather than classifying an empty string.
        continue

    # Clean the text, then apply the loaded vectorizer and scaler.
    features = bow_scaler.transform(bow.transform([preprocess(query)]))

    # A single predict_proba call supplies both class probabilities.
    # NOTE(review): assumes column 0 is the negative class and column 1 the
    # positive class -- confirm against best_model.classes_.
    probabilities = best_model.predict_proba(features)[0]
    neg_prob, pos_prob = probabilities[0], probabilities[1]

    if pos_prob > 0.5:
        print(f"Positive with {pos_prob*100:.2f}% sure")
    else:
        print(f"Negative with {neg_prob*100:.2f}% sure")

**The End**