In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd drive/'My Drive'/'Data mining'/bbc_text_categorization

/content/drive/My Drive/Data mining/bbc_text_categorization


In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics as sklm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time
from sklearn.preprocessing import FunctionTransformer

In [None]:
DATA_PATH='data/bbc-text.csv'

## Data overview

In [None]:
df=pd.read_csv(DATA_PATH)
df.sample(10)

Unnamed: 0,category,text
883,politics,brown in appeal for labour unity gordon brown ...
1989,sport,hodgson shoulders england blame fly-half charl...
1587,sport,wenger signs new deal arsenal manager arsene w...
2179,entertainment,musicians to tackle us red tape musicians gro...
1867,sport,federer breezes into semi-finals roger federer...
1534,entertainment,oscar nominees gear up for lunch leonardo dica...
356,politics,green fear for transport ballot the green part...
36,tech,gamers snap up new sony psp gamers have bought...
1629,business,wmc says xstrata bid is too low australian min...
468,politics,guantanamo four free in weeks all four britons...


In [None]:
df.describe(include='all')

Unnamed: 0,category,text
count,2225,2225
unique,5,2126
top,sport,queen recruit singer for new tour the remainin...
freq,511,2


Note: There are some duplicate samples in data: 2225 sample but 2126 unique text values

In [None]:
df.drop_duplicates(inplace=True)
df.nunique()

category       5
text        2126
dtype: int64

In [None]:
df['category'].value_counts()

sport            504
business         503
politics         403
entertainment    369
tech             347
Name: category, dtype: int64

In [None]:
print('Example sample:')
print(f"- Label: {df['category'][0]}")
print('- Content:', df['text'][0], sep='\n')

Example sample:
- Label: tech
- Content:
tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised 

In [None]:
CATEGORIES=df['category'].unique().tolist()
print(CATEGORIES)

['tech', 'business', 'sport', 'entertainment', 'politics']


In [None]:
X=df['text'].to_numpy()
Y=df['category'].to_numpy()

In [None]:
np.unique(Y)

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

## Train test split

In [None]:
# Test set of 500 samples
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=500, random_state=42)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
# Test set 2, a subset of test set contains only 300 samples
x_test2, _, y_test2, __= train_test_split(x_test, y_test, test_size=300, random_state=42)
print(x_test2.shape, y_test2.shape)

(1626,) (1626,) (500,) (500,)
(200,) (200,)


## Metrics evaluation
Function to evaluate some metrics

In [None]:
def evaluate_metrics(y_true, y_pred, labels=None):
    metrics = sklm.precision_recall_fscore_support(y_true, y_pred)
    cm = sklm.confusion_matrix(y_true, y_pred, labels=labels)
    num_classes=cm.shape[0]
    
    # For each class, calculate metrics
    metrics=pd.DataFrame(columns=['support', 'True Positive Rate(TPR)', 'False Positive Rate(FPR)',
        'Sensitivity(SE)','Specificity(SP)', 'Accuracy(ACC)', 'Precision(P)','Recall(R)','F1-Score'])
    n_sample=np.sum(cm)
    for i in range(num_classes):
        # True positive rate, False Positive rate, Sensitivity, Specificity
        support=np.sum(cm[i,:])
        tp=cm[i,i]
        fn=support - tp
        fp=np.sum(cm[:, i])-tp
        tn=n_sample-tp-fn-fp
        tpr=tp/(tp+fn)
        fpr=fp/(fp+tn)
        se=tpr
        sp=1-fpr
        acc=(tp+tn)/n_sample
        precision=tp/(tp+fp)
        recall=tpr
        f1_s=2*precision*recall/(precision+recall)
        metrics.loc[labels[i]]=[support, tpr, fpr, se, sp, acc, precision, recall, f1_s]
    return cm, metrics

## Binary classification
Testing with binary classification problem: sport vs non-sport labels

### Transform labels to binary classification
Tranform label array for binary classes: sport vs non-sport

In [None]:
y_test_bin=y_test.copy()
y_test2_bin=y_test2.copy()

for arr in [y_train_bin, y_test_bin, y_test2_bin]:
    for i in range(arr.shape[0]):
        if arr[i] != 'sport':
            arr[i]='non-sport'
    # Verify
    print(arr.shape)
    print(np.unique(arr))
    print('----------')

(1626,)
['non-sport' 'sport']
----------
(500,)
['non-sport' 'sport']
----------
(200,)
['non-sport' 'sport']
----------


### Binary feature vector using CountVectorizer
Using CountVectorizer to construct binary features:   
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer

In [None]:
vectorizer=CountVectorizer(lowercase=True, analyzer='word', binary=True, stop_words='english', max_df=0.75, max_features=1000, ngram_range=(1,2))
x_train_bin=vectorizer.fit_transform(x_train).todense()
print(x_train_bin.shape)
print(x_train_bin)

(1626, 1000)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [None]:
print('Stopwords:', vectorizer.get_stop_words())
print('Vocabulary:', vectorizer.get_feature_names())

Stopwords: frozenset({'themselves', 'over', 'thin', 'might', 'whereupon', 'for', 'us', 'becoming', 'something', 'whither', 'back', 'do', 'interest', 'once', 'they', 'my', 'cant', 'here', 'con', 'beyond', 'her', 'done', 'meanwhile', 'side', 'whereafter', 'or', 'someone', 'besides', 'well', 'bottom', 'sincere', 'thru', 'without', 'almost', 'per', 'seems', 'yourself', 'least', 'fifteen', 'anywhere', 'four', 'hereby', 'latter', 'serious', 'himself', 'top', 'myself', 'towards', 'three', 'that', 'detail', 'can', 'thereafter', 'via', 'am', 'nothing', 'much', 'now', 'rather', 'a', 'still', 'we', 'any', 'latterly', 'anything', 'front', 'wherever', 're', 'again', 'thereupon', 'how', 'nevertheless', 'this', 'between', 'de', 'each', 'him', 'always', 'across', 'often', 'ours', 'would', 'both', 'found', 'off', 'less', 'herein', 'everywhere', 'his', 'some', 'the', 'everything', 'becomes', 'indeed', 'are', 'never', 'see', 'since', 'though', 'had', 'what', 'as', 'whereas', 'which', 'at', 'afterwards', 

In [None]:
x_train_bin[:1]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0

In [None]:
x_test_bin=vectorizer.transform(x_test).todense()
x_test2_bin=vectorizer.transform(x_test2).todense()
print(x_test_bin.shape, x_test2_bin.shape)

(500, 1000) (200, 1000)


#### Which Naive Bayes model performs better   
https://scikit-learn.org/stable/modules/naive_bayes.html

In [None]:
def train_and_test(clf, x_train, y_train, x_test, y_test):
    clf.fit(x_train_bin, y_train_bin)
    print(clf)
    print(f'Train set accuracy: {clf.score(x_train_bin, y_train_bin)}')
    print(f'Test set accuracy: {clf.score(x_test_bin, y_test_bin)}')

In [None]:
clfs=[MultinomialNB(), BernoulliNB(), CategoricalNB()]

BernoulliNB and CategoricalNB should produces the same result since features in data are all in binary form

In [None]:
for clf in clfs:
    train_and_test(clf, x_train_bin, y_train_bin, x_test_bin, y_test_bin)
    print('---------------------')

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Train set accuracy: 0.992619926199262
Test set accuracy: 0.996
---------------------
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Train set accuracy: 0.9895448954489545
Test set accuracy: 0.992
---------------------
CategoricalNB(alpha=1.0, class_prior=None, fit_prior=True)
Train set accuracy: 0.9895448954489545
Test set accuracy: 0.992
---------------------


In [None]:
clf=MultinomialNB()
clf.fit(x_train_bin, y_train_bin)
y_test_pred=clf.predict(x_test_bin)
y_test2_pred=clf.predict(x_test2_bin)

##### Test set of 500 samples

In [None]:
cm, metrics= evaluate_metrics(y_test_bin, y_test_pred, labels=['sport', 'non-sport'])
print('Confusion matrix')
print(cm)
metrics

Confusion matrix
[[117   2]
 [  0 381]]


Unnamed: 0,support,True Positive Rate(TPR),False Positive Rate(FPR),Sensitivity(SE),Specificity(SP),Accuracy(ACC),Precision(P),Recall(R),F1-Score
sport,119.0,0.983193,0.0,0.983193,1.0,0.996,1.0,0.983193,0.991525
non-sport,381.0,1.0,0.016807,1.0,0.983193,0.996,0.994778,1.0,0.997382


Verify result metrics, our evaluate_metrics() funtion should produce the same result as sklearn 's classification_report() function

In [None]:
report = sklm.classification_report(y_test_bin, y_test_pred, labels= ['sport', 'non-sport'])
print(report)

              precision    recall  f1-score   support

       sport       1.00      0.98      0.99       119
   non-sport       0.99      1.00      1.00       381

    accuracy                           1.00       500
   macro avg       1.00      0.99      0.99       500
weighted avg       1.00      1.00      1.00       500



##### Test set of 300 samples

In [None]:
cm2, metrics2= evaluate_metrics(y_test2_bin, y_test2_pred, labels=['sport', 'non-sport'])
print('Confusion matrix')
print(cm2)
metrics2

Confusion matrix
[[ 52   2]
 [  0 146]]


Unnamed: 0,support,True Positive Rate(TPR),False Positive Rate(FPR),Sensitivity(SE),Specificity(SP),Accuracy(ACC),Precision(P),Recall(R),F1-Score
sport,54.0,0.962963,0.0,0.962963,1.0,0.99,1.0,0.962963,0.981132
non-sport,146.0,1.0,0.037037,1.0,0.962963,0.99,0.986486,1.0,0.993197


### TF-IDF features vector using TfidfVectorizer

In [None]:
vectorizer=TfidfVectorizer(lowercase=True, analyzer='word', stop_words='english', max_df=0.75,
                           max_features=1000, ngram_range=(1,2), use_idf=True, smooth_idf=True)
x_train_tfidf=vectorizer.fit_transform(x_train).todense()
print(x_train_tfidf.shape)
print(x_train_tfidf)

(1626, 1000)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.13231895 0.         0.         ... 0.         0.         0.        ]
 [0.07246767 0.         0.         ... 0.         0.         0.        ]]


In [None]:
print('Stopwords:', vectorizer.get_stop_words())
print('Vocabulary:', vectorizer.get_feature_names())

Stopwords: frozenset({'themselves', 'over', 'thin', 'might', 'whereupon', 'for', 'us', 'becoming', 'something', 'whither', 'back', 'do', 'interest', 'once', 'they', 'my', 'cant', 'here', 'con', 'beyond', 'her', 'done', 'meanwhile', 'side', 'whereafter', 'or', 'someone', 'besides', 'well', 'bottom', 'sincere', 'thru', 'without', 'almost', 'per', 'seems', 'yourself', 'least', 'fifteen', 'anywhere', 'four', 'hereby', 'latter', 'serious', 'himself', 'top', 'myself', 'towards', 'three', 'that', 'detail', 'can', 'thereafter', 'via', 'am', 'nothing', 'much', 'now', 'rather', 'a', 'still', 'we', 'any', 'latterly', 'anything', 'front', 'wherever', 're', 'again', 'thereupon', 'how', 'nevertheless', 'this', 'between', 'de', 'each', 'him', 'always', 'across', 'often', 'ours', 'would', 'both', 'found', 'off', 'less', 'herein', 'everywhere', 'his', 'some', 'the', 'everything', 'becomes', 'indeed', 'are', 'never', 'see', 'since', 'though', 'had', 'what', 'as', 'whereas', 'which', 'at', 'afterwards', 

In [None]:
x_train_tfidf[:1]

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.05408359,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.03378191,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [None]:
x_test_tfidf=vectorizer.transform(x_test).todense()
x_test2_tfidf=vectorizer.transform(x_test2).todense()
print(x_test_tfidf.shape, x_test2_tfidf.shape)

(500, 1000) (200, 1000)


#### Which Naive Bayes model performs better   
https://scikit-learn.org/stable/modules/naive_bayes.html  
GaussianNB and MultinomialNB may work well with TF-IDF features vector

In [None]:
clfs=[GaussianNB(), MultinomialNB()]
for clf in clfs:
    train_and_test(clf, x_train_tfidf, y_train_bin, x_test_tfidf, y_test_bin)
    print('---------------------')

GaussianNB(priors=None, var_smoothing=1e-09)
Train set accuracy: 0.986469864698647
Test set accuracy: 0.98
---------------------
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Train set accuracy: 0.992619926199262
Test set accuracy: 0.996
---------------------


In [None]:
clf=MultinomialNB()
clf.fit(x_train_tfidf, y_train_bin)
y_test_pred=clf.predict(x_test_tfidf)
y_test2_pred=clf.predict(x_test2_tfidf)

##### Test set of 500 samples

In [None]:
cm, metrics= evaluate_metrics(y_test_bin, y_test_pred, labels=['sport', 'non-sport'])
print('Confusion matrix')
print(cm)
metrics

Confusion matrix
[[118   1]
 [  0 381]]


Unnamed: 0,support,True Positive Rate(TPR),False Positive Rate(FPR),Sensitivity(SE),Specificity(SP),Accuracy(ACC),Precision(P),Recall(R),F1-Score
sport,119.0,0.991597,0.0,0.991597,1.0,0.998,1.0,0.991597,0.995781
non-sport,381.0,1.0,0.008403,1.0,0.991597,0.998,0.997382,1.0,0.998689


##### Test set of 300 samples

In [None]:
cm2, metrics2= evaluate_metrics(y_test2_bin, y_test2_pred, labels=['sport', 'non-sport'])
print('Confusion matrix')
print(cm2)
metrics2

Confusion matrix
[[ 53   1]
 [  0 146]]


Unnamed: 0,support,True Positive Rate(TPR),False Positive Rate(FPR),Sensitivity(SE),Specificity(SP),Accuracy(ACC),Precision(P),Recall(R),F1-Score
sport,54.0,0.981481,0.0,0.981481,1.0,0.995,1.0,0.981481,0.990654
non-sport,146.0,1.0,0.018519,1.0,0.981481,0.995,0.993197,1.0,0.996587


## Multi-classes classification (5 classes)

### Binary features vector

#### Build a pipeline to find out best hyper parameters

In [None]:
def grid_search(pipeline, parameters, x_train, y_train):
    # find the best parameters for both the feature extraction and the classifier
    grid_searcher = GridSearchCV(pipeline, parameters, scoring='accuracy',
                               refit='accuracy', n_jobs=1,  verbose=10, cv=3)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_searcher.fit(x_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print("Best score: %0.3f" % grid_searcher.best_score_)
    print("Best parameters set:")
    best_estimator=grid_searcher.best_estimator_
    best_parameters = best_estimator.get_params()
    print(best_parameters)
    return grid_searcher, best_estimator

In [None]:
parameters = {
    'vectorizer__stop_words': (None, 'english'),
    'vectorizer__max_features': (100, 300, 1000, 3000, 10000),
    'vectorizer__ngram_range': ((1, 1), (1, 2), (1,3)),
}

In [None]:
bernoulli_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(lowercase=True, analyzer='word', binary=True, max_df=0.75)),
    ('todense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
    ('clf', BernoulliNB())
])
bernoulli_searcher, best_bernoulli_clf=grid_search(bernoulli_pipeline, parameters, x_train, y_train)

Performing grid search...
pipeline: ['vectorizer', 'todense', 'clf']
parameters:
{'vectorizer__max_features': (100, 300, 1000, 3000, 10000),
 'vectorizer__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vectorizer__stop_words': (None, 'english')}
Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, score=0.661, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, score=0.703, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, score=0.673, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.5s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, score=0.799, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.0s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, score=0.762, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.5s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, score=0.815, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    3.0s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None, score=0.648, total=   2.0s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    5.0s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None, score=0.679, total=   1.9s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    6.9s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None, score=0.659, total=   1.9s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.9s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english, score=0.797, total=   1.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english, score=0.760, total=   1.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english, score=0.803, total=   1.6s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 3), vectorizer__stop_words=None 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 3), vectorizer__stop_words=None, score=0.648, total=   4.3s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 3), vectorizer__stop_words=None 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 3), vectorizer__stop_words=None, score=0.

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  3.1min finished


done in 186.111s
Best score: 0.958
Best parameters set:
{'memory': None, 'steps': [('vectorizer', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.75, max_features=10000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)), ('todense', FunctionTransformer(accept_sparse=True, check_inverse=True,
                    func=<function <lambda> at 0x7f6f3ebb1268>,
                    inv_kw_args=None, inverse_func=None, kw_args=None,
                    validate=False)), ('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))], 'verbose': False, 'vectorizer': CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encodi

In [None]:
multinomial_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(lowercase=True, analyzer='word', binary=True, max_df=0.75)),
    ('todense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
    ('clf', MultinomialNB())
])
multinomial_searcher, best_multinomial_clf=grid_search(multinomial_pipeline, parameters, x_train, y_train)

Performing grid search...
pipeline: ['vectorizer', 'todense', 'clf']
parameters:
{'vectorizer__max_features': (100, 300, 1000, 3000, 10000),
 'vectorizer__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vectorizer__stop_words': (None, 'english')}
Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, score=0.729, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, score=0.727, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, score=0.731, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.5s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, score=0.812, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.0s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, score=0.810, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.5s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, score=0.810, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.9s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None, score=0.725, total=   2.0s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    4.9s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None, score=0.720, total=   1.9s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    6.8s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None, score=0.701, total=   1.9s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    8.8s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english, score=0.810, total=   1.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english, score=0.812, total=   1.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english, score=0.797, total=   1.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 3), vectorizer__stop_words=None 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 3), vectorizer__stop_words=None, score=0.725, total=   4.2s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 3), vectorizer__stop_words=None 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 3), vectorizer__stop_words=None, score=0.

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  3.0min finished


done in 185.893s
Best score: 0.964
Best parameters set:
{'memory': None, 'steps': [('vectorizer', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.75, max_features=10000, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)), ('todense', FunctionTransformer(accept_sparse=True, check_inverse=True,
                    func=<function <lambda> at 0x7f6f3de96f28>,
                    inv_kw_args=None, inverse_func=None, kw_args=None,
                    validate=False)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))], 'verbose': False, 'vectorizer': CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', 

In [None]:
pd.set_option('display.max_columns', 100)
pd.DataFrame(multinomial_searcher.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vectorizer__max_features,param_vectorizer__ngram_range,param_vectorizer__stop_words,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
29,2.724626,0.047175,0.321136,0.00209,10000,"(1, 3)",english,"{'vectorizer__max_features': 10000, 'vectorize...",0.968635,0.964945,0.95941,0.96433,0.003791,1
27,1.553121,0.007609,0.24773,0.005393,10000,"(1, 2)",english,"{'vectorizer__max_features': 10000, 'vectorize...",0.968635,0.964945,0.957565,0.963715,0.004602,2
25,0.583247,0.006003,0.165784,0.000643,10000,"(1, 1)",english,"{'vectorizer__max_features': 10000, 'vectorize...",0.964945,0.96679,0.957565,0.9631,0.003986,3
24,0.590531,0.002407,0.182325,0.004664,10000,"(1, 1)",,"{'vectorizer__max_features': 10000, 'vectorize...",0.961255,0.964945,0.957565,0.961255,0.003013,4
21,1.398708,0.011437,0.21882,0.002411,3000,"(1, 2)",english,"{'vectorizer__max_features': 3000, 'vectorizer...",0.964945,0.96679,0.95203,0.961255,0.006566,4
19,0.436595,0.00994,0.141678,0.002994,3000,"(1, 1)",english,"{'vectorizer__max_features': 3000, 'vectorizer...",0.9631,0.96679,0.953875,0.961255,0.005432,6
18,0.442474,0.003588,0.153002,0.001319,3000,"(1, 1)",,"{'vectorizer__max_features': 3000, 'vectorizer...",0.961255,0.9631,0.957565,0.96064,0.002301,7
26,1.888304,0.006479,0.340842,0.005894,10000,"(1, 2)",,"{'vectorizer__max_features': 10000, 'vectorize...",0.961255,0.961255,0.957565,0.960025,0.00174,8
23,2.578952,0.014529,0.304348,0.008537,3000,"(1, 3)",english,"{'vectorizer__max_features': 3000, 'vectorizer...",0.964945,0.9631,0.95203,0.960025,0.005703,8
15,1.390334,0.013922,0.201407,0.00168,1000,"(1, 2)",english,"{'vectorizer__max_features': 1000, 'vectorizer...",0.961255,0.964945,0.944649,0.95695,0.008827,10


#### Evaluate metrics

##### Test set of 500 samples

In [None]:
print('Accuracy:', best_multinomial_clf.score(x_test, y_test))
y_test_pred=best_multinomial_clf.predict(x_test)
cm, metric_df=evaluate_metrics(y_test, y_test_pred, labels=CATEGORIES)
print('Confusion matrix:')
print(cm)
metric_df

Accuracy: 0.97
Confusion matrix:
[[ 81   5   0   2   0]
 [  3 113   0   0   1]
 [  0   0 119   0   0]
 [  2   0   0  77   0]
 [  0   2   0   0  95]]


Unnamed: 0,support,True Positive Rate(TPR),False Positive Rate(FPR),Sensitivity(SE),Specificity(SP),Accuracy(ACC),Precision(P),Recall(R),F1-Score
tech,88.0,0.920455,0.012136,0.920455,0.987864,0.976,0.94186,0.920455,0.931034
business,117.0,0.965812,0.018277,0.965812,0.981723,0.978,0.941667,0.965812,0.953586
sport,119.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
entertainment,79.0,0.974684,0.004751,0.974684,0.995249,0.992,0.974684,0.974684,0.974684
politics,97.0,0.979381,0.002481,0.979381,0.997519,0.994,0.989583,0.979381,0.984456


##### Test set of 300 sample

In [None]:
print('Accuracy:', best_multinomial_clf.score(x_test2, y_test2))
y_test2_pred=best_multinomial_clf.predict(x_test2)
cm, metric_df=evaluate_metrics(y_test2, y_test2_pred, labels=CATEGORIES)
print('Confusion matrix:')
print(cm)
metric_df

Accuracy: 0.97
Confusion matrix:
[[34  3  0  0  0]
 [ 1 41  0  0  1]
 [ 0  0 54  0  0]
 [ 0  0  0 28  0]
 [ 0  1  0  0 37]]


Unnamed: 0,support,True Positive Rate(TPR),False Positive Rate(FPR),Sensitivity(SE),Specificity(SP),Accuracy(ACC),Precision(P),Recall(R),F1-Score
tech,37.0,0.918919,0.006135,0.918919,0.993865,0.98,0.971429,0.918919,0.944444
business,43.0,0.953488,0.025478,0.953488,0.974522,0.97,0.911111,0.953488,0.931818
sport,54.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
entertainment,28.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
politics,38.0,0.973684,0.006173,0.973684,0.993827,0.99,0.973684,0.973684,0.973684


### TF-IDF features vector

#### Build a pipeline to find out best hyper parameters

In [None]:
parameters = {
    'vectorizer__stop_words': (None, 'english'),
    'vectorizer__max_features': (100, 300, 1000, 3000, 10000),
    'vectorizer__ngram_range': ((1, 1), (1, 2), (1,3)), 
    'vectorizer__sublinear_tf':(True, False)
}

In [None]:
gaussian_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(lowercase=True, analyzer='word', use_idf=True, smooth_idf=True, max_df=0.75)),
    ('todense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
    ('clf', GaussianNB())
])
gaussian_searcher, best_bernoulli_clf = grid_search(gaussian_pipeline, parameters, x_train, y_train)

Performing grid search...
pipeline: ['vectorizer', 'todense', 'clf']
parameters:
{'vectorizer__max_features': (100, 300, 1000, 3000, 10000),
 'vectorizer__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vectorizer__stop_words': (None, 'english'),
 'vectorizer__sublinear_tf': (True, False)}
Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True, score=0.795, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True, score=0.782, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True, score=0.753, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False, score=0.808, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.9s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False, score=0.792, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.4s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False, score=0.742, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.8s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True, score=0.793, total=   0.4s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.3s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True, score=0.814, total=   0.4s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    3.7s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True, score=0.821, total=   0.4s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.1s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False, score=0.827, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False, score=0.815, total=   0.4s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False, score=0.817, total=   0.4s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None, vectorizer__sublinear_tf=True 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None, vectorizer__sublinear_tf=True, score=

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:  6.2min finished


done in 372.440s
Best score: 0.949
Best parameters set:
{'memory': None, 'steps': [('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.75,
                max_features=10000, min_df=1, ngram_range=(1, 2), norm='l2',
                preprocessor=None, smooth_idf=True, stop_words='english',
                strip_accents=None, sublinear_tf=True,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('todense', FunctionTransformer(accept_sparse=True, check_inverse=True,
                    func=<function <lambda> at 0x7f9012fe5c80>,
                    inv_kw_args=None, inverse_func=None, kw_args=None,
                    validate=False)), ('clf', GaussianNB(priors=None, var_smoothing=1e-09))], 'verbose': False, 'vectorizer': TfidfVectorizer(analyzer='word', binary=False, dec

In [None]:
multinomial_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(lowercase=True, analyzer='word', use_idf=True, smooth_idf=True, max_df=0.75)),
    ('todense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
    ('clf', MultinomialNB())
])
multinomial_searcher, best_multinomial_clf=grid_search(multinomial_pipeline, parameters, x_train, y_train)

Performing grid search...
pipeline: ['vectorizer', 'todense', 'clf']
parameters:
{'vectorizer__max_features': (100, 300, 1000, 3000, 10000),
 'vectorizer__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vectorizer__stop_words': (None, 'english'),
 'vectorizer__sublinear_tf': (True, False)}
Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True, score=0.838, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True, score=0.790, total=   0.6s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=True, score=0.806, total=   0.6s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.7s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False, score=0.828, total=   0.6s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.3s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False, score=0.786, total=   0.6s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.8s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=None, vectorizer__sublinear_tf=False, score=0.786, total=   0.6s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    3.4s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True, score=0.904, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    4.0s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True, score=0.902, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    4.5s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=True, score=0.876, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    5.1s remaining:    0.0s


[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False, score=0.902, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False, score=0.891, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english, vectorizer__sublinear_tf=False, score=0.878, total=   0.5s
[CV] vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None, vectorizer__sublinear_tf=True 
[CV]  vectorizer__max_features=100, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=None, vectorizer__sublinear_tf=True, score=

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:  6.3min finished


done in 378.601s
Best score: 0.970
Best parameters set:
{'memory': None, 'steps': [('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.75, max_features=3000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)), ('todense', FunctionTransformer(accept_sparse=True, check_inverse=True,
                    func=<function <lambda> at 0x7f901257ebf8>,
                    inv_kw_args=None, inverse_func=None, kw_args=None,
                    validate=False)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))], 'verbose': False, 'vectorizer': TfidfVectorizer(analyzer='word', binary=False, decod

In [None]:
pd.set_option('display.max_columns', 100)
pd.DataFrame(multinomial_searcher.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vectorizer__max_features,param_vectorizer__ngram_range,param_vectorizer__stop_words,param_vectorizer__sublinear_tf,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
43,1.389153,0.006463,0.261968,0.000568,3000,"(1, 2)",english,False,"{'vectorizer__max_features': 3000, 'vectorizer...",0.98155,0.964945,0.9631,0.969865,0.008297,1
47,2.802945,0.194739,0.364105,0.022864,3000,"(1, 3)",english,False,"{'vectorizer__max_features': 3000, 'vectorizer...",0.98155,0.964945,0.9631,0.969865,0.008297,1
59,2.633256,0.016991,0.362647,0.00207,10000,"(1, 3)",english,False,"{'vectorizer__max_features': 10000, 'vectorize...",0.98155,0.961255,0.964945,0.96925,0.008827,3
55,1.421249,0.012934,0.288123,0.005057,10000,"(1, 2)",english,False,"{'vectorizer__max_features': 10000, 'vectorize...",0.98155,0.961255,0.964945,0.96925,0.008827,3
38,0.404653,0.003656,0.188619,0.002124,3000,"(1, 1)",english,True,"{'vectorizer__max_features': 3000, 'vectorizer...",0.97786,0.97048,0.957565,0.968635,0.008388,5
58,2.642627,0.012092,0.366806,0.00335,10000,"(1, 3)",english,True,"{'vectorizer__max_features': 10000, 'vectorize...",0.979705,0.96679,0.95572,0.967405,0.009802,6
54,1.423741,0.008016,0.288487,0.004106,10000,"(1, 2)",english,True,"{'vectorizer__max_features': 10000, 'vectorize...",0.97786,0.964945,0.95941,0.967405,0.007731,6
39,0.396864,0.003677,0.184775,0.000552,3000,"(1, 1)",english,False,"{'vectorizer__max_features': 3000, 'vectorizer...",0.979705,0.9631,0.957565,0.96679,0.009408,8
46,2.66512,0.096048,0.367154,0.034214,3000,"(1, 3)",english,True,"{'vectorizer__max_features': 3000, 'vectorizer...",0.97417,0.96679,0.95941,0.96679,0.006026,9
42,1.394368,0.015838,0.26661,0.006961,3000,"(1, 2)",english,True,"{'vectorizer__max_features': 3000, 'vectorizer...",0.97417,0.96679,0.95941,0.96679,0.006026,9


#### Evaluate metrics

##### Test set of 500 samples

In [None]:
print('Accuracy:', best_multinomial_clf.score(x_test, y_test))
y_test_pred=best_multinomial_clf.predict(x_test)
cm, metric_df=evaluate_metrics(y_test, y_test_pred, labels=CATEGORIES)
print('Confusion matrix:')
print(cm)
metric_df

Accuracy: 0.978
Confusion matrix:
[[ 81   2   1   2   2]
 [  1 116   0   0   0]
 [  0   0 119   0   0]
 [  0   0   0  79   0]
 [  0   3   0   0  94]]


Unnamed: 0,support,True Positive Rate(TPR),False Positive Rate(FPR),Sensitivity(SE),Specificity(SP),Accuracy(ACC),Precision(P),Recall(R),F1-Score
tech,88.0,0.920455,0.002427,0.920455,0.997573,0.984,0.987805,0.920455,0.952941
business,117.0,0.991453,0.013055,0.991453,0.986945,0.988,0.958678,0.991453,0.97479
sport,119.0,1.0,0.002625,1.0,0.997375,0.998,0.991667,1.0,0.995816
entertainment,79.0,1.0,0.004751,1.0,0.995249,0.996,0.975309,1.0,0.9875
politics,97.0,0.969072,0.004963,0.969072,0.995037,0.99,0.979167,0.969072,0.974093


In [None]:
report = sklm.classification_report(y_test, y_test_pred, labels= ['tech', 'business','sport','entertainment', 'politics'])
print(report)

               precision    recall  f1-score   support

         tech       0.99      0.92      0.95        88
     business       0.96      0.99      0.97       117
        sport       0.99      1.00      1.00       119
entertainment       0.98      1.00      0.99        79
     politics       0.98      0.97      0.97        97

     accuracy                           0.98       500
    macro avg       0.98      0.98      0.98       500
 weighted avg       0.98      0.98      0.98       500



##### Test set of 300 sample

In [None]:
print('Accuracy:', best_multinomial_clf.score(x_test2, y_test2))
y_test2_pred=best_multinomial_clf.predict(x_test2)
cm, metric_df=evaluate_metrics(y_test2, y_test2_pred, labels=CATEGORIES)
print('Confusion matrix:')
print(cm)
metric_df

Accuracy: 0.98
Confusion matrix:
[[35  1  1  0  0]
 [ 1 42  0  0  0]
 [ 0  0 54  0  0]
 [ 0  0  0 28  0]
 [ 0  1  0  0 37]]


Unnamed: 0,support,True Positive Rate(TPR),False Positive Rate(FPR),Sensitivity(SE),Specificity(SP),Accuracy(ACC),Precision(P),Recall(R),F1-Score
tech,37.0,0.945946,0.006135,0.945946,0.993865,0.985,0.972222,0.945946,0.958904
business,43.0,0.976744,0.012739,0.976744,0.987261,0.985,0.954545,0.976744,0.965517
sport,54.0,1.0,0.006849,1.0,0.993151,0.995,0.981818,1.0,0.990826
entertainment,28.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
politics,38.0,0.973684,0.0,0.973684,1.0,0.995,1.0,0.973684,0.986667
