In [1]:
import pandas as pd
import numpy as np

# Part I: Train classifiers

In [2]:
trainDf=pd.read_pickle('train.pkl')

In [4]:
trainDf.head()

Unnamed: 0,business_id,date,review_id,stars,text,user_id,active
467878,IZI9BdElfGB-WcHpd7Wg7Q,2014-02-20,CPj_ZWalJpnebGrFxV8ujg,5,Came here for a late lunch after being slightl...,PTcKYk_qU-WzbQjraE3TgQ,1
450196,Ja4stXdNYr39u5CZHMNtjw,2012-05-14,S1EKSngF9LVvFQ08Pye59Q,4,What a cute little place in Old Town! We went...,l5x4wzz9VOcQ22XMthU6uw,1
1295251,bjllusUsLq9zE4yZnLpOzg,2015-09-19,9ndOEDaRsYjETx0kl83h7w,3,Food was better than average. We got three dif...,76crp1vXF1A3LOoNP6zcUQ,1
379392,QdbwZGnBWdkdN2XAG7rwLA,2014-08-15,raKjAf09z4FtAqmqfb5lUw,5,Korean Style Fried Chicken has become the only...,4KbUaerUBoiUwZGMHKVFxQ,1
564809,WKOUTdVJS58E178JjhwidQ,2016-01-10,DQGL9V4vhF7wNBdVRcZ1VQ,4,"Cute little cafe, the crepe was so good. I tri...",XrYTMhY9YJvzX2pMepIz7A,1


In [6]:
X_train=trainDf['text'].values

In [10]:
Y_train=trainDf['active'].values

### Tokenize traning texts

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(339700, 180431)

In [43]:
type(X_train_counts)

scipy.sparse.csr.csr_matrix

### Count word frequency using tf-idf
*Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.
To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.
Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.
This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.*
————http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(339700, 180431)

### Train a naive Bayes classifier

In [11]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, Y_train)

#### Evaluate the performance on the test set

In [13]:
testDf=pd.read_pickle('test.pkl')

In [14]:
testDf.head()

Unnamed: 0,business_id,date,review_id,stars,text,user_id,active
840771,LbyCngxEveE7G6w2mqeueg,2015-10-12,PhBoxX0S_htKMGweVmA7iA,4,"Delicious old school pizza, only deliver in a ...",UGlJYHK0I7TM6yLB3Rkp2w,1
524329,5N8R7ALESZ30EoAzVJtabw,2015-11-26,2ZLg2o_yh3pvXnQhdnBCFg,4,Probably like a lot of you that reviewed your ...,HgCv2lm-GuACRanD7bSqyA,1
1022586,hPO2di89hgfsD9inbKl7BQ,2014-12-03,z48GmttobtQ28hmKChpSRQ,4,This Ulta has greatly improved! The customer s...,HV5HZf7DOQHiwCtNdor2Zg,1
795885,5B-V6vr6TbtMjrRaBAtHqQ,2012-09-27,Y8jWjRX69baonQSXKLtPHg,2,"Not my favorite place to go, but I've only bee...",CPodThzsqATG_Bc8a4an_w,1
769454,u-SJ5QUwrNquL9VnXwl8cg,2013-08-08,UnZirwkho6bJkffEhSWlug,1,"I try to like you Postinos, I really do but yo...",k6vOe0_qglL2PZlBldoKeQ,1


In [16]:
X_test=testDf['text'].values

In [17]:
Y_test=testDf['active'].values

In [18]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [19]:
predicted = clf.predict(X_test_tfidf)

In [20]:
np.mean(predicted == Y_test)

0.77375597579068833

We achieved 77.4% accuracy.

### Train a linear support vector machine (SVM) model

In [24]:
from sklearn.linear_model import SGDClassifier
svm_clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None).fit(X_train_tfidf, Y_train)

In [49]:
svm_clf.coef_

array([[-0.10710299, -0.03974785, -0.00047589, ...,  0.        ,
        -0.00014326,  0.        ]])

#### Evaluate the performance on the test set

In [25]:
svm_predicted = svm_clf.predict(X_test_tfidf)

In [26]:
np.mean(svm_predicted == Y_test)

0.79949603183948381

In [31]:
from sklearn import metrics
print(metrics.classification_report(Y_test, svm_predicted,target_names=['inactive','active']))

             precision    recall  f1-score   support

   inactive       0.79      0.82      0.80     42811
     active       0.81      0.78      0.79     42115

avg / total       0.80      0.80      0.80     84926



### Parameter tuning using grid search

In [40]:
from sklearn.model_selection import GridSearchCV
parameters = {'alpha': (1e-2, 1e-3)}
gs_clf = GridSearchCV(svm_clf, parameters)

In [44]:
gs_clf.fit(X_train_counts, Y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': (0.01, 0.001)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [45]:
gs_clf.best_params_

{'alpha': 0.01}

In [46]:
gs_clf.best_score_

0.826264350897851

In [50]:
svm_clf_tuned = SGDClassifier(loss='hinge', penalty='l2',alpha=0.01, random_state=42,max_iter=5, tol=None).fit(X_train_tfidf, Y_train)

In [53]:
svm_predicted_tuned = svm_clf_tuned.predict(X_test_tfidf)

In [54]:
np.mean(svm_predicted_tuned == Y_test)

0.73732425876645546