# Create the same classifier that we created in Chapter_08

In [2]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
path = r'/Users/Jaan/Documents/gitHubCode/PythonMachineLearningBook/Chapter_08/data'
#define method to tokenize sentences and remove all HTML markups and other punctuations
def tokenizer(sentence):
    sentence = re.sub('<[^>]*>', '', sentence)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', sentence.lower())
    sentence = re.sub('[\W]', ' ', sentence.lower()) + ''.join(emoticons).replace('-', '')
    tokenized = [w for w in sentence.split() if w not in stop]
    return tokenized

#function to read and return one document at a time
def stream_docs(path):
    with open(path, 'r') as file:
        next(file) #skip header
        for line in file:
            text, label = line[:-3], int(line[-2])
#             print (text)
#             print(label)
#             print
            yield text, label
stream_docs(path = path + '/movie_data.csv')

#function that returns only specified number of documents from document stream
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration: 
        return None, None
    return docs, y

#train classifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error = 'ignore', n_features = 2 ** 21, 
                        preprocessor = None, tokenizer = tokenizer)
clf = SGDClassifier(loss = 'log', random_state = 1, n_iter = 1)
doc_stream = stream_docs(path = path +'/movie_data.csv')

classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size = 1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes = classes)
    
#test the model
X_test, y_test = get_minibatch(doc_stream, size = 5000)
X_test = vect.transform(X_test)
print ('Accuracy: %.3f' %clf.score(X_test, y_test))

#create and store classifier as variable called clf
clf = clf.partial_fit(X_train, y_train)

Accuracy: 0.868


# Serializing fitted scikit-learn estimators

In [3]:
import pickle
import os
dest = os.path.join('movieclassfier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), 
            protocol = 2)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), 
           protocol = 2)

Before executing the following commands, restart the kernel and start executing commands from the cell below. 

In [3]:
import os
os.getcwd()
os.chdir('/Users/Jaan/Documents/gitHubCode/PythonMachineLearningBook/Chapter_09/movieclassfier')
os.getcwd()

'/Users/Jaan/Documents/gitHubCode/PythonMachineLearningBook/Chapter_09/movieclassfier'

In [8]:
#load vectorizer and unpickled classifier
import pickle
import re
import os
from vectorizer import vect
clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb'))

In [9]:
#preprocess document samples and make predictions
import numpy as np
label = {0: 'negative', 1:'positive'}
example = ['I love this movie']
X = vect.transform(example)
print ('Predction: %s\nProbability: %.2f%%' %(label[clf.predict(X)[0]], np.max(clf.predict_proba(X) * 100)))

Predction: positive
Probability: 82.90%


# Setting up a SQLite database for data storage

In [16]:
#create review database and update it with two example rows of data
import sqlite3
import os
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute('''CREATE TABLE review_db 
            (review TEXT, 
            sentiment INTEGER, 
            date TEXT);''' )
example1 = 'I love this movie'
c.execute("INSERT INTO review_db (review, sentiment, date) VALUES(?, ?, DATETIME('now'))", (example1, 1))
example2 = 'I do not like this movie'
c.execute("INSERT INTO review_db (review, sentiment, date) VALUES(?, ?, DATETIME('now'))", (example2, 0))
conn.commit()
conn.close()

In [15]:
#use in case we need to drop the review table before needing to create it again
# conn = sqlite3.connect('reviews.sqlite')
# c = conn.cursor()
# c.executescript('drop table review_db')

<sqlite3.Cursor at 0x10432c030>

In [19]:
#check entries entered into the databse
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute("SELECT * FROM review_db WHERE date BETWEEN '2015-01-01 00:00:00' AND DATETIME('now')")
results = c.fetchall()
conn.close()
print (results)

[('I love this movie', 1, '2015-12-31 20:06:19'), ('I do not like this movie', 0, '2015-12-31 20:06:19')]
