# <span style="color:red">Importing Data and getting initial info</span>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import seaborn as sns

from unidecode import unidecode
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from wordcloud import WordCloud
from string import punctuation

# UNCOMMENT THIS PART WHEN RUNNING THIS NOTEBOOK FOR THE FIRST TIME
# nltk.download() 

In [2]:
data = pd.read_csv("./data/feed_2020-02-24.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            1000 non-null   int64 
 1   Title         1000 non-null   object
 2   Image         1000 non-null   object
 3   Link          1000 non-null   object
 4   Category      1000 non-null   object
 5   Content       1000 non-null   object
 6   Created at    1000 non-null   object
 7   Updated at    1000 non-null   object
 8   Button text   997 non-null    object
 9   Pick count    1000 non-null   int64 
 10  Collectible   1000 non-null   bool  
 11  Published at  1000 non-null   object
 12  Platform      1000 non-null   object
dtypes: bool(1), int64(2), object(10)
memory usage: 94.9+ KB


# <span style="color:red">Data Pre-processing</span>

In [4]:
feed_news = data.query('Category == "news"').copy()
feed_news = feed_news[['Id', 'Title', 'Content']]

### Creating a column with title and content together

In [5]:
feed_news['Title_and_Content'] = feed_news['Title'] + ' ' + feed_news['Content']

### Transforming text to lower_case

In [6]:
feed_news['Title_and_Content'] = feed_news['Title_and_Content'].str.lower()

### Removing stopwords

In [7]:
pattern = r'\b(?:{})\b'.format('|'.join(stopwords.words('portuguese'))) 
feed_news['Without_stopwords'] = feed_news['Title_and_Content'].str.replace(pattern, '')

### Removing punctuations

In [10]:
def remove_punctuations(text):
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(text)
    punctuations = list(punctuation)
    
    # TODO: Improve this code here
    # The goal is to remove the punctuations from the text
    print(words)
    for word in words:
        for char in word:
            print(word)
            word = word.replace(char, '') if char in punctuations else None

    return ' '.join(words)

In [11]:
feed_news['Without_punctuations'] = feed_news['Without_stopwords'].apply(remove_punctuations)

ex
None
-
executiva
None
None
None
None
None
None
None
None
fala
None
None
None
sobre
None
None
None
None
vida
None
None
None
propósito
None
None
None
None
None
None
None
None
-
ibc
None
None
coaching
None
None
None
None
None
None
None
esqueça
None
None
None
None
None
None
disseram
None
None
None
None
None
None
None
sobre
None
None
None
None
zona
None
None
None
conforto
None
None
None
None
None
None
None
.
mentira
None
None
None
None
None
None
,
existe
None
None
None
None
None
!
fico
None
None
None
pensando
None
None
None
None
None
None
None
trouxe
None
None
None
None
None
aqui
None
None
None
…
talvez
None
None
None
None
None
ficado
None
None
None
None
None
curioso
None
None
None
None
None
None
,
porque
None
None
None
None
None
todos
None
None
None
None
anos
None
None
None
impactado
None
None
None
None
None
None
None
None
enxurrada
None
None
None
None
None
None
None
None
textos
None
None
None
None
None
vídeos
None
None
None
None
None
alta
None
None
None
performance
None
None
None
None


None
None
None
None
None
jornal
None
None
None
None
None
local
None
None
None
None
,
chegou
None
None
None
None
None
andar
None
None
None
None
estabelecimento
None
None
None
None
None
None
None
None
None
None
None
None
None
None
abraçado
None
None
None
None
None
None
None
novo
None
None
None
amigo
None
None
None
None
.
cadela
None
None
None
None
None
adotada
None
None
None
None
None
None
compartilha
None
None
None
None
None
None
None
None
None
None
cobertor
None
None
None
None
None
None
None
cão
None
None
rua
None
None
rs
None
foto
None
None
None
viraliza
None
None
None
None
None
None
None
cadelinha
None
None
None
None
None
None
None
None
lana
None
None
None
compartilha
None
None
None
None
None
None
None
None
None
None
cobertor
None
None
None
None
None
None
None
cão
None
None
vive
None
None
None
situação
None
None
None
None
None
None
None
rua
None
None
,
separados
None
None
None
None
None
None
None
None
cerca
None
None
None
None
.
cobertor
None
None
None
None
None
None
None
inovador
No

algo
None
None
None
deixa
None
None
None
None
profundamente
None
None
None
None
None
None
None
None
None
None
None
None
emocionado
None
None
None
None
None
None
None
None
None
ver
None
None
filhos
None
None
None
None
None
reconhecendo
None
None
None
None
None
None
None
None
None
None
None
esforço
None
None
None
None
None
None
pais
None
None
None
fizeram
None
None
None
None
None
None
(
ainda
None
None
None
None
fazem
None
None
None
None
)
criação
None
None
None
None
None
None
,
demonstram
None
None
None
None
None
None
None
None
None
gratidão
None
None
None
None
None
None
None
tudo
None
None
None
.
menino
None
None
None
None
None
ajuda
None
None
None
None
comprar
None
None
None
None
None
None
cadeira
None
None
None
None
None
None
rodas
None
None
None
None
nova
None
None
None
melhor
None
None
None
None
None
amigo
None
None
None
None
amizade
None
None
None
None
None
None
sentimento
None
None
None
None
None
None
None
None
None
,
bens
None
None
None
preciosos
None
None
None
None
None
None
No

AttributeError: 'NoneType' object has no attribute 'replace'

### Removing accents

In [None]:
def remove_accents(text):
    return unidecode(text)

In [None]:
feed_news['Without_accents'] = feed_news['Without_punctuations'].apply(remove_accents)

### Removing morphological affixes from words (Stemmer)

In [None]:
def stem_words(text):    
    tokenizer = WordPunctTokenizer()
    stemmer = RSLPStemmer()
    words = tokenizer.tokenize(text)
    stemmed_words = map(lambda word : stemmer.stem(word), words)

    return ' '.join(stemmed_words)

In [None]:
feed_news['Without_morphological_affixes'] = feed_news['Without_accents'].apply(stem_words)
feed_news

# <span style="color:red">WordClouds and Frequency Distributions</span>

### Function to generate wordclouds given a text

In [None]:
def generate_wordcloud(text, figsize=(20,8)):
    wordcloud = WordCloud(max_words=100, 
                          width=900,
                          height=500,
                          max_font_size=350,
                          collocations=False,
                          normalize_plurals=False).generate(text)
    plt.figure(figsize=figsize)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

### Function to get the full content and title in one string

In [None]:
def full_content_and_title(series):
    return series.str.cat(sep=" ")

### Function to generate a frequency distribution 

In [None]:
def generate_freq_dist_plot(text, max_words=25):
    words = WordPunctTokenizer().tokenize(text)
    words_that_apper_the_most = pd.Series(words).value_counts().nlargest(max_words)

    ax = sns.barplot(x=words_that_apper_the_most.index, y=words_that_apper_the_most.values)
    ax.figure.set_size_inches(20, 7)
    
    return words_that_apper_the_most

### For raw data

In [None]:
text = full_content_and_title(feed_news['Title_and_Content'])

generate_wordcloud(text)
generate_freq_dist_plot(text)

### Without stop  words

In [None]:
text = full_content_and_title(feed_news['Without_stopwords'])

generate_wordcloud(text)
generate_freq_dist_plot(text)

### Without punctuations

In [None]:
text = full_content_and_title(feed_news['Without_punctuations'])

generate_wordcloud(text)
generate_freq_dist_plot(text)

### Without accents

In [None]:
text = full_content_and_title(feed_news['Without_accents'])
generate_wordcloud(text)
generate_freq_dist_plot(text)

### Without morphological affixes (Stemmed)

In [None]:
text = full_content_and_title(feed_news['Without_morphological_affixes'])
generate_wordcloud(text)
generate_freq_dist_plot(text)