<a href="https://colab.research.google.com/github/crismunoz/NLP_examples/blob/master/TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import bs4 as bs  
import urllib.request
import nltk
nltk.download('punkt')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [73]:
raw_html = urllib.request.urlopen('https://blogs.oglobo.globo.com/miriam-leitao/post/azevedo-versus-trump-o-contexto.html')  
raw_html = raw_html.read()
article_html = bs.BeautifulSoup(raw_html, 'lxml')

In [74]:
article_paragraphs = article_html.article.find_all('p')
article_text = ''
for para in article_paragraphs:  
    article_text += para.text

In [75]:
corpus = nltk.sent_tokenize(article_text)

# Counter Vectorizer

In [76]:
vectorizer = CountVectorizer()
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [77]:
# use default nltk tokenizer
vectorizer.set_params(tokenizer=nltk.word_tokenize)

# remove Portuguese stop words
vectorizer.set_params(stop_words=stopwords)

# include 1-grams and 2-grams
vectorizer.set_params(ngram_range=(1, 2))

# ignore terms that appear in more than 50% of the documents
vectorizer.set_params(max_df=0.5)

# only keep terms that appear in at least 2 documents
vectorizer.set_params(min_df=2)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.5, max_features=None, min_df=2,
                ngram_range=(1, 2), preprocessor=None,
                stop_words=['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em',
                            'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se',
                            'na', 'por', 'mais', 'as', 'dos', 'como', 'mas',
                            'ao', 'ele', 'das', 'à', 'seu', 'sua', ...],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function word_tokenize at 0x7f624562e9d8>,
                vocabulary=None)

In [81]:
X = vectorizer.fit_transform(corpus)
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [69]:
print(vectorizer.get_feature_names())

['?', 'ameaça', 'americano', 'ano', 'anos', 'azevêdo', 'azevêdo ,', 'azevêdo deixou', 'biden', 'bob', 'brasileiro', 'conta', 'contexto', 'conversa', 'deixou', 'desde', 'diretor-geral', 'disse', 'disse azevêdo', 'diz', 'dois', 'encontro', 'encontro ,', 'eua', 'excelente', 'excelente .', 'fato', 'governo', 'importante', 'janeiro', 'mentirosa', 'mentiroso', 'nunca', 'omc', 'omc ,', 'organização', 'outro', 'presidente', 'presidente americano', 'sendo', 'testemunhas', 'trump', 'trump .', 'woodward', '“', '”']


# TF-IDF


In [70]:
vectorizer = TfidfVectorizer()
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [39]:
# use default nltk tokenizer
vectorizer.set_params(tokenizer=nltk.word_tokenize)

# remove Portuguese stop words
vectorizer.set_params(stop_words=stopwords)

# include 1-grams and 2-grams
vectorizer.set_params(ngram_range=(1, 2))

# ignore terms that appear in more than 50% of the documents
vectorizer.set_params(max_df=0.5)

# only keep terms that appear in at least 2 documents
vectorizer.set_params(min_df=2)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=2, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em',
                            'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se',
                            'na', 'por', 'mais', 'as', 'dos', 'como', 'mas',
                            'ao', 'ele', 'das', 'à', 'seu', 'sua', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function word_tokenize at 0x7f624562e9d8>,
                use_idf=True, vocabulary=None)

In [71]:
X = vectorizer.fit_transform(corpus)
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.1923588 , 0.        , 0.        , ..., 0.1923588 , 0.17168473,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.31332825, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [41]:
print(vectorizer.get_feature_names())

['?', 'ameaça', 'americano', 'ano', 'anos', 'azevêdo', 'azevêdo ,', 'azevêdo deixou', 'biden', 'bob', 'brasileiro', 'conta', 'contexto', 'conversa', 'deixou', 'desde', 'diretor-geral', 'disse', 'disse azevêdo', 'diz', 'dois', 'encontro', 'encontro ,', 'eua', 'excelente', 'excelente .', 'fato', 'governo', 'importante', 'janeiro', 'mentirosa', 'mentiroso', 'nunca', 'omc', 'omc ,', 'organização', 'outro', 'presidente', 'presidente americano', 'sendo', 'testemunhas', 'trump', 'trump .', 'woodward', '“', '”']
