# Test diferent Vectorizers and n-grams sizes

In [1]:
import pandas as pd

dataset = pd.read_excel('OpArticles_ADUs.xlsx')

#### Cleanup and normalization

In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

corpus = []
stemmer = RSLPStemmer()
stopwords_list = stopwords.words('portuguese')
# Remover algumas palavras da lista, p.ex. "não"
stopwords_list.remove('não')

for i in range(0, dataset['tokens'].size):
    # get review, remove and lowercase non alpha chars
    review = re.sub('[^a-zA-Z\u00C0-\u00ff]', ' ', dataset['tokens'][i]).lower()
    # split into tokens, apply stemming and remove stop words
    review = ' '.join([stemmer.stem(w) for w in review.split() if not w in set(stopwords_list)])
    corpus.append(review)

print(corpus[:5])

['fact não apen frut ignor', 'hav hum jorn investig preocup aprofund contextual histór isenç relat preocup soc urg denunci muit peç real jorn', 'tud cómic fif', 'tod permit organiz faç total absurd sent', 'não faz rir cust poder']


## Generating a data set

We need to transform the data in the reduced-vocabulary corpus into a dataset that can be handled by machine learning models. Each review in our corpus is still rather unstructured: it is simply a lists of tokens. We will transform each review into a representation that makes use of the same set of features for the whole dataset.

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
import sklearn.metrics as metrics
import time

y = dataset['label']

def test_vectorizer(vectorizer):
    # Fit vectorizer
    start = time.time()
    X = vectorizer.fit_transform(corpus).toarray()
    stop = time.time()
    print("Vectorizer fit time: %0.2fs" % (stop - start))
    print("(Number of samples, Number of features):", X.shape)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0, stratify=y, shuffle=True)

    start = time.time()
    clf = ComplementNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    stop = time.time()

    print("\nModel time: %0.2fs" % (stop - start))
    print("\nConfusion matrix:\n", metrics.confusion_matrix(y_test, y_pred))
    print("\nClassification report:\n", metrics.classification_report(y_test, y_pred))

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Bag-of-Words model

The simplest way to do it is to create a *bag-of-words* model, which ignores word sequence.

We can use scikit-learn's [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html), which converts a collection of text documents to a matrix of token counts.

In [5]:
test_vectorizer(CountVectorizer())

Vectorizer fit time: 0.39s
(Number of samples, Number of features): (16743, 8256)

Model time: 8.69s

Confusion matrix:
 [[317  30 247  47  92]
 [  6  54  52   9  12]
 [291  63 861 143 263]
 [ 51  15  94  96  26]
 [ 81  21 161  23 294]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.42      0.43      0.43       733
      Policy       0.30      0.41      0.34       133
       Value       0.61      0.53      0.57      1621
    Value(+)       0.30      0.34      0.32       282
    Value(-)       0.43      0.51      0.46       580

    accuracy                           0.48      3349
   macro avg       0.41      0.44      0.42      3349
weighted avg       0.50      0.48      0.49      3349



### 1-hot vectors

[CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) includes a parameter (*binary*) that allows us to represent each review as a 1-hot vector with a 0 or a 1 for each of the features, indicating whether the corresponding token appears in the review.

In [6]:
test_vectorizer(CountVectorizer(binary=True))

Vectorizer fit time: 0.30s
(Number of samples, Number of features): (16743, 8256)

Model time: 9.24s

Confusion matrix:
 [[315  28 253  48  89]
 [  6  58  48  10  11]
 [302  67 863 133 256]
 [ 50  17  91  97  27]
 [ 76  21 162  22 299]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.42      0.43      0.43       733
      Policy       0.30      0.44      0.36       133
       Value       0.61      0.53      0.57      1621
    Value(+)       0.31      0.34      0.33       282
    Value(-)       0.44      0.52      0.47       580

    accuracy                           0.49      3349
   macro avg       0.42      0.45      0.43      3349
weighted avg       0.50      0.49      0.49      3349



### TF-IDF

We can adjust the counts of each word in a document by considering how many times it occurs in the document (its *term frequency TF*) and in how many documents it occurs (its *document frequency DF*). [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) provides a way to directly obtain TF-IDF weighted features: the term frequency of a word is multiplied by its *inverse* document frequency.

In [18]:
test_vectorizer(TfidfVectorizer())

Vectorizer fit time: 0.37s
(Number of samples, Number of features): (16743, 8256)

Model time: 0.41s

Confusion matrix:
 [[300  28 288  42  75]
 [  6  41  70   9   7]
 [263  66 968 112 212]
 [ 42  11 115  93  21]
 [ 69  28 196  18 269]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.44      0.41      0.42       733
      Policy       0.24      0.31      0.27       133
       Value       0.59      0.60      0.59      1621
    Value(+)       0.34      0.33      0.33       282
    Value(-)       0.46      0.46      0.46       580

    accuracy                           0.50      3349
   macro avg       0.41      0.42      0.42      3349
weighted avg       0.50      0.50      0.50      3349



### Bi-grams

In [17]:
test_vectorizer(CountVectorizer(ngram_range=(2,2)))

Vectorizer fit time: 0.52s
(Number of samples, Number of features): (16743, 61558)

Model time: 67.52s

Confusion matrix:
 [[257 177 153  68  78]
 [  0  94  20  15   4]
 [289 424 584 131 193]
 [ 43  72  50 105  12]
 [ 63 141  98  17 261]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.39      0.35      0.37       733
      Policy       0.10      0.71      0.18       133
       Value       0.65      0.36      0.46      1621
    Value(+)       0.31      0.37      0.34       282
    Value(-)       0.48      0.45      0.46       580

    accuracy                           0.39      3349
   macro avg       0.39      0.45      0.36      3349
weighted avg       0.51      0.39      0.42      3349



In [9]:
test_vectorizer(CountVectorizer(binary=True, ngram_range=(2,2)))

Vectorizer fit time: 0.62s
(Number of samples, Number of features): (16743, 61558)

Model time: 59.72s

Confusion matrix:
 [[257 176 153  68  79]
 [  0  94  20  15   4]
 [289 424 583 132 193]
 [ 43  72  50 105  12]
 [ 63 141  98  17 261]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.39      0.35      0.37       733
      Policy       0.10      0.71      0.18       133
       Value       0.64      0.36      0.46      1621
    Value(+)       0.31      0.37      0.34       282
    Value(-)       0.48      0.45      0.46       580

    accuracy                           0.39      3349
   macro avg       0.39      0.45      0.36      3349
weighted avg       0.51      0.39      0.42      3349



In [10]:
test_vectorizer(TfidfVectorizer(ngram_range=(2,2)))

Vectorizer fit time: 0.56s
(Number of samples, Number of features): (16743, 61558)

Model time: 3.35s

Confusion matrix:
 [[263 173 157  63  77]
 [  0  93  22  15   3]
 [289 436 592 122 182]
 [ 41  75  53 105   8]
 [ 60 144 101  15 260]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.40      0.36      0.38       733
      Policy       0.10      0.70      0.18       133
       Value       0.64      0.37      0.47      1621
    Value(+)       0.33      0.37      0.35       282
    Value(-)       0.49      0.45      0.47       580

    accuracy                           0.39      3349
   macro avg       0.39      0.45      0.37      3349
weighted avg       0.51      0.39      0.43      3349



### UniBi-grams

In [11]:
test_vectorizer(CountVectorizer(ngram_range=(1,2)))

Vectorizer fit time: 0.82s
(Number of samples, Number of features): (16743, 69814)

Model time: 70.65s

Confusion matrix:
 [[286  58 204  96  89]
 [  1  84  30  13   5]
 [303 158 736 182 242]
 [ 37  27  66 127  25]
 [ 67  49 121  36 307]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.41      0.39      0.40       733
      Policy       0.22      0.63      0.33       133
       Value       0.64      0.45      0.53      1621
    Value(+)       0.28      0.45      0.35       282
    Value(-)       0.46      0.53      0.49       580

    accuracy                           0.46      3349
   macro avg       0.40      0.49      0.42      3349
weighted avg       0.51      0.46      0.47      3349



In [12]:
test_vectorizer(CountVectorizer(binary=True, ngram_range=(1,2)))

Vectorizer fit time: 0.66s
(Number of samples, Number of features): (16743, 69814)

Model time: 72.80s

Confusion matrix:
 [[283  61 201  96  92]
 [  1  85  29  13   5]
 [299 156 734 189 243]
 [ 37  28  64 129  24]
 [ 66  47 121  38 308]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.41      0.39      0.40       733
      Policy       0.23      0.64      0.33       133
       Value       0.64      0.45      0.53      1621
    Value(+)       0.28      0.46      0.35       282
    Value(-)       0.46      0.53      0.49       580

    accuracy                           0.46      3349
   macro avg       0.40      0.49      0.42      3349
weighted avg       0.51      0.46      0.47      3349



In [13]:
test_vectorizer(TfidfVectorizer(ngram_range=(1,2)))

Vectorizer fit time: 0.63s
(Number of samples, Number of features): (16743, 69814)

Model time: 4.49s

Confusion matrix:
 [[ 292   26  287   55   73]
 [   3   63   54   10    3]
 [ 272   59 1003  101  186]
 [  36   14  101  110   21]
 [  64   21  196   14  285]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.44      0.40      0.42       733
      Policy       0.34      0.47      0.40       133
       Value       0.61      0.62      0.61      1621
    Value(+)       0.38      0.39      0.38       282
    Value(-)       0.50      0.49      0.50       580

    accuracy                           0.52      3349
   macro avg       0.45      0.47      0.46      3349
weighted avg       0.52      0.52      0.52      3349



### Explore parameters

#### Skip accents

In [14]:
vect = TfidfVectorizer()
vect.fit_transform(corpus).toarray()
test_vectorizer(vect)

Vectorizer fit time: 0.30s
(Number of samples, Number of features): (16743, 8256)

Model time: 0.37s

Confusion matrix:
 [[300  28 288  42  75]
 [  6  41  70   9   7]
 [263  66 968 112 212]
 [ 42  11 115  93  21]
 [ 69  28 196  18 269]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.44      0.41      0.42       733
      Policy       0.24      0.31      0.27       133
       Value       0.59      0.60      0.59      1621
    Value(+)       0.34      0.33      0.33       282
    Value(-)       0.46      0.46      0.46       580

    accuracy                           0.50      3349
   macro avg       0.41      0.42      0.42      3349
weighted avg       0.50      0.50      0.50      3349



In [15]:
vect = TfidfVectorizer(strip_accents='unicode')
vect.fit_transform(corpus).toarray()
test_vectorizer(vect)

Vectorizer fit time: 0.40s
(Number of samples, Number of features): (16743, 7843)

Model time: 0.35s

Confusion matrix:
 [[298  23 292  46  74]
 [  8  38  70  10   7]
 [270  65 954 112 220]
 [ 38  12 117  95  20]
 [ 69  29 190  20 272]]

Classification report:
               precision    recall  f1-score   support

        Fact       0.44      0.41      0.42       733
      Policy       0.23      0.29      0.25       133
       Value       0.59      0.59      0.59      1621
    Value(+)       0.34      0.34      0.34       282
    Value(-)       0.46      0.47      0.46       580

    accuracy                           0.49      3349
   macro avg       0.41      0.42      0.41      3349
weighted avg       0.50      0.49      0.50      3349

