Test diferent Vectorizers and n-grams sizes

In [1]:
import pandas as pd

dataset = pd.read_excel('OpArticles_ADUs.xlsx')

## Cleanup and normalization

In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

corpus = []
stemmer = RSLPStemmer()
stopwords_list = stopwords.words('portuguese')
# Remover algumas palavras da lista, p.ex. "não"
stopwords_list.remove('não')

for i in range(0, dataset['tokens'].size):
    # get review, remove and lowercase non alpha chars
    review = re.sub('[^a-zA-Z\u00C0-\u00ff]', ' ', dataset['tokens'][i]).lower()
    # split into tokens, apply stemming and remove stop words
    review = ' '.join([stemmer.stem(w) for w in review.split() if not w in set(stopwords_list)])
    corpus.append(review)

print(corpus[:5])

['fact não apen frut ignor', 'hav hum jorn investig preocup aprofund contextual histór isenç relat preocup soc urg denunci muit peç real jorn', 'tud cómic fif', 'tod permit organiz faç total absurd sent', 'não faz rir cust poder']


## Generating a data set

We need to transform the data in the reduced-vocabulary corpus into a dataset that can be handled by machine learning models. Each review in our corpus is still rather unstructured: it is simply a lists of tokens. We will transform each review into a representation that makes use of the same set of features for the whole dataset.

In [3]:
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.naive_bayes import ComplementNB
import time

y = dataset['label']

def test_vectorizer(vectorizer):
    # Fit vectorizer
    X = vectorizer.fit_transform(corpus).toarray()

    # Fit CV
    start = time.time()
    scores = cross_validate(
        ComplementNB(),
        X,
        y,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
        scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'], n_jobs=2
    )
    stop = time.time()

    print(scores)

    print("Elapsed time: %0.2fs" % (stop - start))
    print("Mean accuracy: %0.2f" % scores['test_accuracy'].mean())
    print("Mean Precision: %0.2f" % scores['test_precision_weighted'].mean())
    print("Mean Recall: %0.2f" % scores['test_recall_weighted'].mean())
    print("Mean F1-score: %0.2f" % scores['test_f1_weighted'].mean())

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Bag-of-Words model

The simplest way to do it is to create a *bag-of-words* model, which ignores word sequence.

We can use scikit-learn's [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html), which converts a collection of text documents to a matrix of token counts.

In [5]:
test_vectorizer(CountVectorizer())

{'fit_time': array([13.07978487, 13.26759815, 13.37541723, 13.44195557,  8.8533721 ]), 'score_time': array([0.17496586, 0.16118813, 0.14487982, 0.17112231, 0.14049125]), 'test_accuracy': array([0.50164228, 0.49417737, 0.49507316, 0.49551971, 0.4874552 ]), 'test_precision_weighted': array([0.51073754, 0.50588003, 0.50736981, 0.50588012, 0.49840547]), 'test_recall_weighted': array([0.50164228, 0.49417737, 0.49507316, 0.49551971, 0.4874552 ]), 'test_f1_weighted': array([0.50528851, 0.49864392, 0.49973221, 0.4993699 , 0.4914467 ])}
Elapsed time: 39.20s
Mean accuracy: 0.49
Mean Precision: 0.51
Mean Recall: 0.49
Mean F1-score: 0.50


### 1-hot vectors

[CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) includes a parameter (*binary*) that allows us to represent each review as a 1-hot vector with a 0 or a 1 for each of the features, indicating whether the corresponding token appears in the review.

In [6]:
test_vectorizer(CountVectorizer(binary=True))

{'fit_time': array([15.1131978 , 15.06183267, 14.62184119, 14.46484876,  9.53496361]), 'score_time': array([0.20294929, 0.2479744 , 0.22663593, 0.27171206, 0.14503622]), 'test_accuracy': array([0.50104509, 0.49835772, 0.50343386, 0.49492234, 0.48775388]), 'test_precision_weighted': array([0.51123087, 0.50855918, 0.51569898, 0.50572692, 0.49794493]), 'test_recall_weighted': array([0.50104509, 0.49835772, 0.50343386, 0.49492234, 0.48775388]), 'test_f1_weighted': array([0.50502096, 0.50233345, 0.5080883 , 0.49900227, 0.491515  ])}
Elapsed time: 42.52s
Mean accuracy: 0.50
Mean Precision: 0.51
Mean Recall: 0.50
Mean F1-score: 0.50


### TF-IDF

We can adjust the counts of each word in a document by considering how many times it occurs in the document (its *term frequency TF*) and in how many documents it occurs (its *document frequency DF*). [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) provides a way to directly obtain TF-IDF weighted features: the term frequency of a word is multiplied by its *inverse* document frequency.

In [7]:
test_vectorizer(TfidfVectorizer())

{'fit_time': array([1.31466675, 1.1946764 , 1.63242769, 1.44752264, 1.11738658]), 'score_time': array([0.16918802, 0.21013141, 0.16723514, 0.13361049, 0.09616566]), 'test_accuracy': array([0.51149597, 0.49387877, 0.51567632, 0.50567503, 0.49671446]), 'test_precision_weighted': array([0.51092058, 0.49217059, 0.51140383, 0.50686052, 0.49588358]), 'test_recall_weighted': array([0.51149597, 0.49387877, 0.51567632, 0.50567503, 0.49671446]), 'test_f1_weighted': array([0.51071789, 0.49255129, 0.51326197, 0.50596719, 0.49574144])}
Elapsed time: 7.25s
Mean accuracy: 0.50
Mean Precision: 0.50
Mean Recall: 0.50
Mean F1-score: 0.50


### Bi-grams

In [8]:
test_vectorizer(CountVectorizer(ngram_range=(2,2)))

{'fit_time': array([170.74105906, 171.49962926, 152.12257886, 155.2970736 ,
       110.26333189]), 'score_time': array([3.92482615, 4.05506015, 3.99222088, 2.3081069 , 1.44701433]), 'test_accuracy': array([0.38996715, 0.37951627, 0.39205733, 0.3864994 , 0.38470729]), 'test_precision_weighted': array([0.5033966 , 0.48992763, 0.5111218 , 0.50719376, 0.50113247]), 'test_recall_weighted': array([0.38996715, 0.37951627, 0.39205733, 0.3864994 , 0.38470729]), 'test_f1_weighted': array([0.42145123, 0.41055452, 0.42512984, 0.42029431, 0.41590245])}
Elapsed time: 468.63s
Mean accuracy: 0.39
Mean Precision: 0.50
Mean Recall: 0.39
Mean F1-score: 0.42


In [9]:
test_vectorizer(CountVectorizer(binary=True, ngram_range=(2,2)))

{'fit_time': array([141.89739418, 132.04240179, 144.67197037, 140.43964195,
       100.1123383 ]), 'score_time': array([1.59695458, 2.98096132, 4.24064732, 2.09014583, 0.93750048]), 'test_accuracy': array([0.38996715, 0.38011347, 0.39146014, 0.3864994 , 0.38410992]), 'test_precision_weighted': array([0.50340146, 0.49049108, 0.5118461 , 0.50745018, 0.50134433]), 'test_recall_weighted': array([0.38996715, 0.38011347, 0.39146014, 0.3864994 , 0.38410992]), 'test_f1_weighted': array([0.42146972, 0.41111356, 0.42506059, 0.42029731, 0.41565417])}
Elapsed time: 407.18s
Mean accuracy: 0.39
Mean Precision: 0.50
Mean Recall: 0.39
Mean F1-score: 0.42


In [10]:
test_vectorizer(TfidfVectorizer(ngram_range=(2,2)))

{'fit_time': array([52.00000143, 50.53124881, 50.98924994, 43.15953207, 41.49654031]), 'score_time': array([2.03124785, 2.56250119, 0.74999928, 0.68750072, 0.54687262]), 'test_accuracy': array([0.39265452, 0.38071066, 0.39922365, 0.38410992, 0.39456392]), 'test_precision_weighted': array([0.51297967, 0.49177468, 0.52090689, 0.50650954, 0.5098002 ]), 'test_recall_weighted': array([0.39265452, 0.38071066, 0.39922365, 0.38410992, 0.39456392]), 'test_f1_weighted': array([0.42717849, 0.41194117, 0.43496743, 0.41887385, 0.42739032])}
Elapsed time: 164.66s
Mean accuracy: 0.39
Mean Precision: 0.51
Mean Recall: 0.39
Mean F1-score: 0.42


### UniBi-grams

In [11]:
test_vectorizer(CountVectorizer(ngram_range=(1,2)))

{'fit_time': array([149.90625048, 149.24988961, 150.29026389, 145.39967275,
       121.13327289]), 'score_time': array([4.53244662, 4.32931805, 2.3281188 , 4.06246328, 2.89272881]), 'test_accuracy': array([0.47715736, 0.45237384, 0.47028964, 0.46565114, 0.46057348]), 'test_precision_weighted': array([0.52288722, 0.49964685, 0.52276963, 0.51738276, 0.50667011]), 'test_recall_weighted': array([0.47715736, 0.45237384, 0.47028964, 0.46565114, 0.46057348]), 'test_f1_weighted': array([0.48824165, 0.46449026, 0.48447104, 0.47941111, 0.47122509])}
Elapsed time: 456.06s
Mean accuracy: 0.47
Mean Precision: 0.51
Mean Recall: 0.47
Mean F1-score: 0.48


In [12]:
test_vectorizer(CountVectorizer(binary=True, ngram_range=(1,2)))

{'fit_time': array([175.65081596, 183.20510912, 172.53594518, 166.27012944,
       127.27495408]), 'score_time': array([3.63869619, 2.44821453, 4.98171759, 5.38313293, 2.75274849]), 'test_accuracy': array([0.47446999, 0.45267244, 0.46969245, 0.46296296, 0.46057348]), 'test_precision_weighted': array([0.52168457, 0.49945424, 0.52215795, 0.51660163, 0.50720002]), 'test_recall_weighted': array([0.47446999, 0.45267244, 0.46969245, 0.46296296, 0.46057348]), 'test_f1_weighted': array([0.48596697, 0.46456337, 0.48406052, 0.47734425, 0.47134698])}
Elapsed time: 521.30s
Mean accuracy: 0.46
Mean Precision: 0.51
Mean Recall: 0.46
Mean F1-score: 0.48


In [13]:
test_vectorizer(TfidfVectorizer(ngram_range=(1,2)))

{'fit_time': array([73.1751225 , 69.58713055, 70.63749433, 64.50446773, 51.81103539]), 'score_time': array([1.89231372, 2.95600128, 1.6496942 , 3.15321493, 0.63205838]), 'test_accuracy': array([0.53926545, 0.50701702, 0.53090475, 0.51732378, 0.52060932]), 'test_precision_weighted': array([0.53905235, 0.50566168, 0.53096444, 0.51666578, 0.51939811]), 'test_recall_weighted': array([0.53926545, 0.50701702, 0.53090475, 0.51732378, 0.52060932]), 'test_f1_weighted': array([0.53785028, 0.5057579 , 0.53002529, 0.51601608, 0.51895145])}
Elapsed time: 227.85s
Mean accuracy: 0.52
Mean Precision: 0.52
Mean Recall: 0.52
Mean F1-score: 0.52


### UniBiTri-grams

In [None]:
test_vectorizer(CountVectorizer(ngram_range=(1,3)))

In [None]:
test_vectorizer(CountVectorizer(binary=True, ngram_range=(1,3)))

In [None]:
test_vectorizer(TfidfVectorizer(ngram_range=(1,3)))