In [45]:
import pandas as pd
import numpy as np

# Part I: Train classifiers
### Split data into train set and test set

In [2]:
trainDf=pd.read_pickle('train.pkl')

In [4]:
trainDf.head()

Unnamed: 0,business_id,date,review_id,stars,text,user_id,active
467878,IZI9BdElfGB-WcHpd7Wg7Q,2014-02-20,CPj_ZWalJpnebGrFxV8ujg,5,Came here for a late lunch after being slightl...,PTcKYk_qU-WzbQjraE3TgQ,1
450196,Ja4stXdNYr39u5CZHMNtjw,2012-05-14,S1EKSngF9LVvFQ08Pye59Q,4,What a cute little place in Old Town! We went...,l5x4wzz9VOcQ22XMthU6uw,1
1295251,bjllusUsLq9zE4yZnLpOzg,2015-09-19,9ndOEDaRsYjETx0kl83h7w,3,Food was better than average. We got three dif...,76crp1vXF1A3LOoNP6zcUQ,1
379392,QdbwZGnBWdkdN2XAG7rwLA,2014-08-15,raKjAf09z4FtAqmqfb5lUw,5,Korean Style Fried Chicken has become the only...,4KbUaerUBoiUwZGMHKVFxQ,1
564809,WKOUTdVJS58E178JjhwidQ,2016-01-10,DQGL9V4vhF7wNBdVRcZ1VQ,4,"Cute little cafe, the crepe was so good. I tri...",XrYTMhY9YJvzX2pMepIz7A,1


In [47]:
trainDf.shape

(339700, 7)

In [3]:
X_train=trainDf['text'].values

In [14]:
Y_train=trainDf['active'].values

In [None]:
testDf=pd.read_pickle('test.pkl')

In [43]:
testDf.head()

Unnamed: 0,business_id,date,review_id,stars,text,user_id,active
840771,LbyCngxEveE7G6w2mqeueg,2015-10-12,PhBoxX0S_htKMGweVmA7iA,4,"Delicious old school pizza, only deliver in a ...",UGlJYHK0I7TM6yLB3Rkp2w,1
524329,5N8R7ALESZ30EoAzVJtabw,2015-11-26,2ZLg2o_yh3pvXnQhdnBCFg,4,Probably like a lot of you that reviewed your ...,HgCv2lm-GuACRanD7bSqyA,1
1022586,hPO2di89hgfsD9inbKl7BQ,2014-12-03,z48GmttobtQ28hmKChpSRQ,4,This Ulta has greatly improved! The customer s...,HV5HZf7DOQHiwCtNdor2Zg,1
795885,5B-V6vr6TbtMjrRaBAtHqQ,2012-09-27,Y8jWjRX69baonQSXKLtPHg,2,"Not my favorite place to go, but I've only bee...",CPodThzsqATG_Bc8a4an_w,1
769454,u-SJ5QUwrNquL9VnXwl8cg,2013-08-08,UnZirwkho6bJkffEhSWlug,1,"I try to like you Postinos, I really do but yo...",k6vOe0_qglL2PZlBldoKeQ,1


In [48]:
testDf.shape

(84926, 7)

In [44]:
X_test=testDf['text'].values
Y_test=testDf['active'].values

### Tokenize traning texts and count word frequency using tf-idf
Reference: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(339700, 180431)

In [14]:
type(X_train_counts)

scipy.sparse.csr.csr_matrix

In [57]:
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(339700, 180431)

In [49]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [58]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

### Train a naive Bayes classifier and evaluate the performance on the test set

In [59]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, Y_train)

In [60]:
predicted = clf.predict(X_test_tfidf)
np.mean(predicted == Y_test)

0.77591079292560583

We achieve 77.6% accuracy. Not bad!

### Train a linear support vector machine (SVM) model and evalute the performance on the test set

In [61]:
from sklearn.linear_model import SGDClassifier
svm_clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None).fit(X_train_tfidf, Y_train)

In [62]:
svm_clf.coef_

array([[-0.10526299, -0.04814701, -0.00158775, ..., -0.00067528,
        -0.00071631, -0.0008268 ]])

In [63]:
svm_predicted = svm_clf.predict(X_test_tfidf)

In [64]:
np.mean(svm_predicted == Y_test)

0.80533641052210159

In [65]:
from sklearn import metrics
print(metrics.classification_report(Y_test, svm_predicted,target_names=['inactive','active']))

             precision    recall  f1-score   support

   inactive       0.80      0.82      0.81     42811
     active       0.81      0.79      0.80     42115

avg / total       0.81      0.81      0.81     84926



#### SVM is able to predict whether a use in the test set is active by analyzing the contents of the review with an accuracy of 80%.

### Parameter tuning using grid search

In [66]:
from sklearn.model_selection import GridSearchCV
parameters = {'alpha': (1e-2, 1e-3)}
gs_clf = GridSearchCV(svm_clf, parameters)

In [67]:
gs_clf.fit(X_train_counts, Y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': (0.01, 0.001)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [68]:
gs_clf.best_params_

{'alpha': 0.01}

In [69]:
gs_clf.best_score_

0.826264350897851

In [70]:
svm_clf_tuned = SGDClassifier(loss='hinge', penalty='l2',alpha=0.01, random_state=42,max_iter=5, tol=None).fit(X_train_tfidf, Y_train)

In [71]:
svm_predicted_tuned = svm_clf_tuned.predict(X_test_tfidf)

In [72]:
np.mean(svm_predicted_tuned == Y_test)

0.72279396180203948

## Part II: Visualize & Analyze the result
Reference: https://buhrmann.github.io/tfidf-analysis.html

In [73]:
#holds a list of all the words in the tf-idf’s vocabulary
features=count_vect.get_feature_names()
features

['00',
 '000',
 '0000',
 '00000',
 '0000000',
 '0000000000000001',
 '0000000003',
 '00001',
 '00007651',
 '0002',
 '0002136',
 '0003s',
 '0004318',
 '0004318zz',
 '000dollars',
 '000hr',
 '000iu',
 '000km',
 '000kms',
 '000lbs',
 '000mg',
 '000miles',
 '000s',
 '000sf',
 '000x',
 '001',
 '0015',
 '002',
 '003',
 '0030',
 '004',
 '005',
 '0051',
 '006',
 '0067',
 '007',
 '00719',
 '00789',
 '0089',
 '009',
 '00____',
 '00_____',
 '00______',
 '00a',
 '00am',
 '00dlls',
 '00dollars',
 '00each',
 '00eur',
 '00for',
 '00h',
 '00hh',
 '00hr',
 '00hrs',
 '00ish',
 '00lb',
 '00noon',
 '00off',
 '00or',
 '00p',
 '00pm',
 '00pmish',
 '00s',
 '00tip',
 '00uhr',
 '01',
 '010',
 '0105',
 '0109',
 '011856',
 '012',
 '0131',
 '0135',
 '013672',
 '014',
 '0143',
 '016',
 '017',
 '0176',
 '019',
 '01am',
 '01pm',
 '02',
 '0200',
 '0213',
 '0214',
 '0222000',
 '0233',
 '025',
 '0279',
 '0295',
 '02am',
 '02pm',
 '03',
 '030',
 '0300',
 '033',
 '0330',
 '0359',
 '03607',
 '0362',
 '0368',
 '0383',
 '03a

Create a function that takes a single row of the tf-idf matrix (corresponding to a particular document), and return the n highest scoring words (or more generally tokens or features):

In [74]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

The result of a tf-idf is a sparse matrix, which doesn’t support all the usual matrix or array operations. So in order to apply the above function to inspect a particular document, we convert a single row into dense format first:

In [75]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [78]:
len(trainDf.loc[trainDf['active']==1])trainDf.iloc[168456]

168457

In [79]:
trainDf.iloc[168456]

business_id                               xmARZ_MuSfrQCXP4e8RQjg
date                                                  2013-03-03
review_id                                 DbTzR3TDySXvYjZRWCO10g
stars                                                          3
text           I want to be able to give Nello's 3.5 stars. \...
user_id                                   BuwzRmdVk0uu17qG49l37A
active                                                         1
Name: 291321, dtype: object

In [80]:
trainDf.iloc[168457]

business_id                               Eif660dJb3nqORM6LF3UYg
date                                                  2015-12-07
review_id                                 a6wkURhUcsDihsNqv0p8LQ
stars                                                          1
text           I was forced by the app to give it a star. Jus...
user_id                                   m44kwkYYFTOMpQM1DeIg4w
active                                                         0
Name: 52298, dtype: object

Due to the way I constructed the train data, *the active users are stored in the first 168457 rows in the trainDf, and the rows are all reviews that belong to inactive users.*

Try using top_feats_in_doc to find the top 10 most important words for the active user on row 0, and those for the active user on row 4.

In [81]:
top_feats_in_doc(X_train_tfidf,features,0,10)

Unnamed: 0,feature,tfidf
0,yellow,0.230024
1,dahl,0.221549
2,whey,0.203711
3,curry,0.189788
4,hungover,0.179498
5,herbal,0.172362
6,sautéed,0.163888
7,thinly,0.159444
8,scrumptious,0.155432
9,triple,0.151808


In [84]:
top_feats_in_doc(X_train_tfidf,features,4,10)

Unnamed: 0,feature,tfidf
0,crepe,0.506723
1,aromatic,0.365277
2,sweet,0.291273
3,latte,0.275677
4,overly,0.252598
5,cafe,0.233309
6,potato,0.224764
7,cute,0.221078
8,okay,0.203687
9,big,0.163168


In [34]:
def top_mean_feats(Xtr, features, grp_ids, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    D = Xtr[grp_ids].toarray()
    D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

It would be nice if I can have the top features for all active users, and all inactive users. However, due to the limitation of my laptop's memory, I can only run top_mean_feats for a subset of the users. 

In [None]:
top_mean_feats(X_train_tfidf,features, slice(0,5000))

In [7]:
D=X_train_tfidf[:5000].toarray()

In [8]:
D[D <= 0.1] = 0

In [9]:
tfidf_means = np.mean(D, axis=0)

In [23]:
len(tfidf_means)

180431

In [18]:
top_tfidf_feats(tfidf_means, features, 50)

Unnamed: 0,feature,tfidf
0,the,0.117207
1,and,0.055301
2,was,0.037273
3,to,0.032746
4,it,0.030709
5,is,0.022037
6,of,0.02037
7,you,0.019955
8,we,0.019106
9,they,0.012762


In [16]:
D2=X_train_tfidf[-5000:].toarray()

In [17]:
D2[D2 <= 0.1] = 0

In [19]:
tfidf_means2 = np.mean(D2, axis=0)

In [20]:
top_tfidf_feats(tfidf_means2, features, 50)

Unnamed: 0,feature,tfidf
0,the,0.068166
1,and,0.047413
2,to,0.030508
3,was,0.029605
4,we,0.023821
5,great,0.016369
6,she,0.014331
7,he,0.01429
8,my,0.014239
9,food,0.013874


In [22]:
D3=X_train_tfidf2[-5000:].toarray()

In [23]:
D3[D3 <= 0.1] = 0

In [24]:
tfidf_means3 = np.mean(D3, axis=0)

In [25]:
top_tfidf_feats(tfidf_means3, features, 50)

Unnamed: 0,feature,tfidf
0,great,0.014699
1,we,0.013122
2,food,0.012076
3,she,0.010914
4,he,0.010662
5,service,0.010655
6,was,0.010122
7,very,0.009971
8,best,0.009592
9,amazing,0.009336


In [26]:
D4=X_train_tfidf2[:5000].toarray()

In [27]:
D4[D4 <= 0.1] = 0

In [28]:
tfidf_means4 = np.mean(D4, axis=0)

In [29]:
top_tfidf_feats(tfidf_means4, features, 50)

Unnamed: 0,feature,tfidf
0,chicken,0.006611
1,we,0.006401
2,always,0.006118
3,pizza,0.005761
4,lunch,0.005708
5,was,0.005382
6,location,0.005242
7,good,0.005207
8,sushi,0.005145
9,burger,0.005068
