# Load Python Libraries

In [1]:
from sklearn2pmml import PMMLPipeline, sklearn2pmml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn.externals import joblib
import time

# Load local datafile with 123,419 reviews containing text and sentiment (0,1) from .csv file

In [2]:
# Location of the local CSV export that holds the review text and sentiment labels.
DATA_PATH = '5_4_3_all_data.csv'

# The first row of the file supplies the column names.
df = pd.read_csv(DATA_PATH, header=0)

# Display dataframe head and shape

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,review,sentiment
0,no problem with constant improvement i switche...,1
1,i live minutes away from the nearest branch a...,1
2,i am very annoyed with this app i do like bein...,0
3,i dont see how so many people on the reviews c...,1
4,you can do almost every function the full site...,1


In [24]:
# Show the full text of the review stored under index label 1, followed by
# the (rows, columns) dimensions of the dataframe.
second_review = df.loc[1, 'review']
print(second_review)
print(df.shape)

i live  minutes away from the nearest branch and this app lets me do basically all of my banking  the main reason i stopped using local banks was lack of overdraft protection which unfortunately was a problem for my wife and i until she landed a decent paying job  being over drafted for bills should say we were having financial trouble but not allowing us the opportunity to deposit cash i made from tips the night before to catch it up and instead charging a painful  dollar fee was killing me  chase offers overdraft protection so it became a no brainer even though i live almost an hour away from the nearest branch  even considering this distance i usually dont have to worry about depositing cash or withdrawals just my paycheck being deposited which this app does beautifully and quickly with zero hassle  i can view my statements and every transaction quickly  it got old typing in my password all the time but since they added the finger print login its gold  i also have the security of kn

# Assign the review text and the sentiment labels to separate input and label pandas Series (`input_X`, `label_Y`), then assign their raw values to `X`, `Y`

In [26]:
# Split the frame into the model input (review text) and the target
# (sentiment label) columns.
input_X, label_Y = df['review'], df['sentiment']

In [27]:
# Take the raw values of each column for use as features (X) and labels (Y).
X, Y = input_X.values, label_Y.values

# Create a CountVectorizer object using Sklearn's CountVectorizer
###### After creating the CountVectorizer object, we fit and transform the review text taken from input_X in a single `fit_transform` call. The fit step learns a dictionary (vocabulary) of the tokens found in the reviews, limited here by `max_features` to the 900 most frequent terms. The transform step then uses that vocabulary to convert each review into a row of token counts, producing a document-term matrix.

In [28]:
# Build the bag-of-words features and time the operation.
# min_df = 1 keeps every term that occurs in at least one review, while
# max_features = 900 caps the vocabulary at the 900 most frequent terms.
tic = time.time()
vect = CountVectorizer(min_df = 1, max_features = 900)
# Read the raw text from input_X rather than from X: X is rebound to the
# sparse document-term matrix on the next line, so vectorizing X itself would
# make this cell fail when it is re-run (the matrix would be fed back in as
# if it were text).
X = vect.fit_transform(input_X.values)
toc = time.time()
print("Total time to fit_transform CountVectorizer :", (toc-tic))

('Total time to fit_transform CountVectorizer :', 3.961841106414795)


# Create a Logistic Regression Model using Sklearn's Linear Model Library

In [8]:
# Logistic regression classifier created with no arguments, i.e. with the
# library's default hyperparameters; it is trained on (X, Y) in the next
# code cell.
model = LogisticRegression()

In [18]:
# Train the classifier on (X, Y) and report the wall-clock training time
# in seconds.
tic = time.time()
model.fit(X, Y)
toc = time.time()
elapsed_fit = toc - tic
print("Total time to fit Logistic Regression Model :", elapsed_fit)

('Total time in seconds elapsed :', 1.4745359420776367)


# Create a PMML Pipeline object and add the Logistic Regression to it

In [13]:
# Wrap the classifier object as the only step of a PMML pipeline and time
# the construction. The CountVectorizer is not part of this pipeline.
tic = time.time()
pipeline_steps = [("classifier", model)]
myPMMLPipeline = PMMLPipeline(pipeline_steps)
toc = time.time()
print("Total time to build PMML Pipeline :", (toc-tic))

('Total time in seconds elapsed :', 0.0005259513854980469)


# Export PMML Pipeline with Logistic Regression model to .pmml file for use in SparkML

In [14]:
# Convert the pipeline to a PMML document on disk and time the conversion.
tic = time.time()
pmml_path = "LogisticRegressionCountVectorizerModel.pmml"
sklearn2pmml(myPMMLPipeline, pmml_path, with_repr = True)
toc = time.time()
print("Total time to convert Sklearn to PMML :", (toc - tic))

('Total time in seconds elapsed :', 1.479334831237793)


# Test prediction
### First we will use our CountVectorizer on a single review randomly pulled from the data set

In [15]:
# A one-element list, because the vectorizer expects an iterable of documents
# rather than a bare string.
singleReview = [
    "i love this app  it keeps me up to date on how much and what is in my acct i could never be without this app after having had it  i have been blessed to never once had a problem so far and ive been using it for around six months now  im never giving it up"
]

# Reuse the vocabulary learned earlier (transform only, no re-fitting) so the
# columns line up with the features the model was trained on.
vectorizedSentence = vect.transform(singleReview)

# Display the transformed review

In [22]:
# Inspect the sparse row: `data` holds the stored counts and `indices` the
# matching column positions (presumably a SciPy CSR matrix, as returned by
# CountVectorizer.transform -- confirm if the vectorizer is swapped out).
stored_counts = vectorizedSentence.data
stored_columns = vectorizedSentence.indices
print(stored_counts)
print(stored_columns)

[1 1 2 2 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 4 1 1 1 1 1 1 1 3 1 1 1 1 1 2 2 2 1
 1 1]
[ 10  20  42  53  60  85  89 172 184 266 290 317 330 346 348 363 372 383
 399 403 406 411 456 473 487 494 498 510 519 530 531 592 696 767 781 820
 835 861 878]


# Make a prediction on the vectorized review text

In [17]:
# Class probabilities for the single vectorized review, one column per class.
class_probabilities = myPMMLPipeline.predict_proba(vectorizedSentence)
print(class_probabilities)

[[ 0.07459858  0.92540142]]


# Export model to Pickle and Joblib

In [20]:
# Serialize the trained model with pickle. The context manager guarantees the
# file handle is flushed and closed even if pickle.dump raises, which the
# manual open()/close() pairing did not.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.
with open("pickleFile.pkl", 'wb') as pickle_file:
    pickle.dump(model, pickle_file)

In [21]:
# Persist the same model with joblib as well. dump() returns the list of
# file names it wrote, which is displayed as the cell output.
joblib_file = "joblibLRModel.sav"
joblib.dump(model, filename=joblib_file)

['joblibLRModel.sav']