In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from nltk.tokenize import RegexpTokenizer
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 


In [2]:
import pandas as pd
import pickle

In [3]:
# Load the pre-processed review corpus (columns shown below: Review text and a 0/1 Sentiment label).
df = pd.read_csv('processed_data.csv')
# Empty CSV fields are parsed as NaN; a bare astype(str) would turn them into the
# literal text 'nan', which the vectorizer would then count as a real word.
# Replace missing reviews with an empty string before forcing the string dtype.
df['Review'] = df['Review'].fillna('').astype(str)
df.head()

Unnamed: 0,Review,Sentiment
0,book horrible possible rate lower star avoid r...,0
1,amazon reviews purchasing books especially ale...,0
2,book horrible possible rate lower star avid re...,0
3,whos writing reviews read repitition repititio...,0
4,picked book series eyre affair based purely pr...,0


In [4]:
class LemmaTokenizer:
    """Callable tokenizer: split a review into word tokens and lemmatize each one.

    Intended to be passed as the ``tokenizer`` argument of a vectorizer, which
    calls the instance once per document.
    """

    def __init__(self):
        # One lemmatizer per tokenizer instance, reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, reviews):
        """Return the list of lemmatized tokens for a single review string."""
        lemmas = []
        for token in word_tokenize(reviews):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

# Bag-of-words vectorizer whose tokens come from LemmaTokenizer.
#
# `token_pattern` is explicitly None: scikit-learn only uses token_pattern when
# `tokenizer` is None, so the previous r'\b[a-zA-Z]{3,}\b' pattern never ran
# (and triggered an "unused parameter" warning). The extracted features are
# unchanged by this edit.
# TODO(review): if tokens shorter than 3 letters really should be dropped,
# do the filtering inside LemmaTokenizer.__call__.
cv = CountVectorizer(tokenizer=LemmaTokenizer(),
                     strip_accents='unicode',
                     stop_words='english',
                     lowercase=True,
                     token_pattern=None,  # ignored anyway when a custom tokenizer is supplied
                     max_df=0.75,         # drop terms appearing in more than 75% of reviews
                     min_df=0.0)          # no lower bound on document frequency

# Learn the vocabulary on the whole corpus and build the document-term matrix.
text_counts = cv.fit_transform(df['Review'])



In [5]:
# Sanity check: (number of reviews, vocabulary size) of the document-term matrix.
text_counts.shape

(10000, 47156)

In [6]:
# Show the matrix summary (storage format, dtype, number of stored entries).
text_counts

<10000x47156 sparse matrix of type '<class 'numpy.int64'>'
	with 413692 stored elements in Compressed Sparse Row format>

In [None]:
# Target vector: the Sentiment column as a plain NumPy array, so it can be
# indexed with the integer index arrays produced by the cross-validation splitter.
y = df.loc[:, 'Sentiment'].to_numpy()
y

In [None]:
# Number of cross-validation folds.
k = 10
# Shuffled k-fold splitter; the fixed random_state keeps the folds reproducible.
kf = KFold(n_splits=k, shuffle=True, random_state=1)

In [None]:
# Multinomial Naive Bayes classifier, constructed with no arguments (library defaults).
MNB = MultinomialNB()

In [None]:
# k-fold cross-validation of the Naive Bayes classifier on the count features.
# NOTE(review): the vectorizer was fitted on the full corpus before splitting,
# so its vocabulary / max_df statistics have seen every test fold.
acc_score = []

for train_index, test_index in kf.split(text_counts):
    # Row-slice the sparse matrix and the label array with the fold's indices.
    X_train, X_test = text_counts[train_index], text_counts[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit() re-estimates the model from scratch for every fold.
    MNB.fit(X_train, y_train)
    pred_values = MNB.predict(X_test)

    # accuracy_score expects (y_true, y_pred) in that order.
    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

# Average over the scores actually collected (one per fold).
avg_acc_score = sum(acc_score) / len(acc_score)

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

In [None]:
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed even if pickling fails.
# NOTE: the pickle references LemmaTokenizer by name, so that class must be
# defined/importable wherever this file is loaded.
with open('CV_model.pkl', 'wb') as cv_file:
    pickle.dump(cv, cv_file)

In [None]:
# The cross-validation loop leaves MNB fitted on the training split of its last
# fold only; refit on the complete dataset so the saved model uses all the data.
MNB.fit(text_counts, y)
# Context manager closes the file deterministically.
with open('MNB_model.pkl', 'wb') as model_file:
    pickle.dump(MNB, model_file)

In [None]:
# Reload the saved vectorizer; the file is closed as soon as loading finishes.
# SECURITY: pickle.load executes arbitrary code from the file -- only load
# artifacts produced by a trusted source (here: this notebook).
with open('CV_model.pkl', 'rb') as cv_file:
    pickledCV_model = pickle.load(cv_file)

In [None]:
# Reload the saved classifier; the file is closed as soon as loading finishes.
# SECURITY: pickle.load executes arbitrary code from the file -- only load
# artifacts produced by a trusted source (here: this notebook).
with open('MNB_model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Smoke test of the reloaded artifacts on a single hand-written review.
# Uses descriptive names instead of rebinding the builtin `list`, which would
# break any later `list(...)` call in the same kernel.
review = 'best'
reviews = [review]
print(reviews)

# The vectorizer expects an iterable of documents and returns a sparse count matrix.
review_counts = pickledCV_model.transform(reviews)
print(review_counts)

pred = pickled_model.predict(review_counts)
print(pred[0])
if pred[0] == 1:
    print("positive review")
else:
    print("negative review")