In [84]:
# Imports

# Web scraping
import requests
from bs4 import BeautifulSoup

# Dataframes
import pandas as pd
import numpy as np

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [6]:
  # Google News home page URL (hl=pt-BR / gl=BR query parameters)
  url = 'https://news.google.com/home?hl=pt-BR&gl=BR&ceid=BR:pt-419'

  # Download the page. requests applies no timeout by default, so a stalled
  # connection would block this cell forever; give up after 30 seconds instead.
  resultado = requests.get(url, timeout=30)

In [3]:
 # Report success on HTTP 200; for any other status print an error message
if resultado.status_code == 200:
  print('Página raspada com sucesso.')
else:
  print(f"O request obteve status {resultado.status_code}. Verifique sua conexão!")

Página raspada com sucesso.


In [7]:
 # Parse the response body into a BeautifulSoup tree with the 'html.parser' backend
texto = resultado.text
soup = BeautifulSoup(markup=texto, features='html.parser')

In [None]:
# Preview only the beginning of the prettified HTML: printing the whole page
# floods the notebook output. Raise the slice limit to inspect more markup.
print(soup.prettify()[:2000])

In [10]:
# Headline anchors: <a> elements matched by the class string "DY5T1d RZIKme"
headlines = soup.find_all(name='a', class_="DY5T1d RZIKme")
# Keep only the text content of each matched anchor
heads1 = [anchor.text for anchor in headlines]

In [None]:
# Every <a> element that carries an href attribute
links = soup.find_all('a', href=True)
# Pull the href value out of each anchor by iterating the results directly
links_ = [anchor['href'] for anchor in links]

In [70]:
# Function to replace a leading "./" with "https://news.google.com/"
def sub(text):
    """Expand a "./"-relative Google News href into an absolute URL.

    Only a leading "./" is rewritten. Any other string is returned unchanged,
    including links that merely contain "./" somewhere in the middle
    (e.g. "../x" or "https://host/a./b"), which a blanket str.replace would
    corrupt.

    Args:
        text: The href string to normalise.

    Returns:
        The href with its leading "./" replaced by "https://news.google.com/",
        or the original string when it does not start with "./".
    """
    relative_prefix = './'
    if text.startswith(relative_prefix):
        return 'https://news.google.com/' + text[len(relative_prefix):]
    return text

# Pass every collected href through sub()
links_ = [sub(href) for href in links_]

# Drop repeated links (first occurrence wins), then discard any link whose
# text contains "publications", and return what is left as a plain list
clean_links = (
    pd.DataFrame(links_)
    .rename(columns={0: 'link'})
    .drop_duplicates()
    .reset_index(drop=True)
    .loc[lambda frame: ~frame['link'].str.contains("publications"), 'link']
    .to_list()
)

In [85]:
# Build the dataset in one step: headline text, the matching slice of
# clean_links (positions 4 through 37), and one hard-coded label per row.
# The 34 labels are written as consecutive runs, in row order.
news = pd.DataFrame({
    'headline': heads1,
    'links': clean_links[4:38],
    'labels': (
        ['Brasil'] * 4
        + ['elections'] * 5
        + ['weather'] * 5
        + ['Brasil']
        + ['Other'] * 2
        + ['Live'] * 2
        + ['Brasil'] * 2
        + ['politics']
        + ['Brasil']
        + ['elections']
        + ['Highlights'] * 5
        + ['politics', 'blogs', 'politics', 'blogs', 'economics']
    ),
})

In [114]:
# Show the first five rows of the dataset
news.head(n=5)

Unnamed: 0,headline,links,labels
0,Feliciano admite espalhar fake news contra o P...,https://news.google.com/articles/CBMieGh0dHBzO...,Brasil
1,Feliciano confirma que dissemina fake news ter...,https://news.google.com/articles/CAIiEOvS5aZBm...,Brasil
2,Veja a notícia falsa que mais está causando es...,https://news.google.com/articles/CAIiECda4EqUB...,Brasil
3,Feliciano admite espalhar fake news sobre fech...,https://news.google.com/articles/CAIiEGZQURfYt...,Brasil
4,Lula tem 45% e Bolsonaro 34% em pesquisa BTG/FSB,https://news.google.com/stories/CAAqNggKIjBDQk...,elections


In [122]:
# Dataset shape as a (rows, columns) tuple
news.shape

(34, 3)

## Multinomial Naive Bayes

In [89]:
# Headline texts (features) and their labels (targets) as NumPy arrays
x = np.array(news["headline"])
y = np.array(news["labels"])

# Split the raw texts BEFORE vectorizing, so the vocabulary is learned from the
# training headlines only. Fitting CountVectorizer on every row first lets
# words that occur only in the test split leak into the feature space.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

# Learn the vocabulary on the training split, then reuse it for the test split
cv = CountVectorizer()
X_train = cv.fit_transform(x_train_text)
X_test = cv.transform(x_test_text)

# Count matrix for all headlines in the training vocabulary; kept so that the
# name X defined by the earlier version of this cell is still available
X = cv.transform(x)

In [90]:
# Create the Multinomial Naive Bayes classifier and fit it on the training split
model = MultinomialNB().fit(X_train, y_train)
# Echo the fitted estimator as the cell output
model

MultinomialNB()

In [93]:
# Classify one unseen headline with the fitted vectorizer and model
new_headline = 'Áudios e vídeos revelam como os criminosos mais perigosos do Brasil pretendiam escapar de presídios'
# Vectorize with the already-fitted vocabulary, then convert to a dense array
headline_counts = cv.transform([new_headline])
data = headline_counts.toarray()
output = model.predict(data)
print(output)

['Brasil']


In [95]:
# Classify one unseen headline with the fitted vectorizer and model
new_headline = '55% dos brasileiros desaprovam governo Bolsonaro'
# Vectorize with the already-fitted vocabulary, then convert to a dense array
headline_counts = cv.transform([new_headline])
data = headline_counts.toarray()
output = model.predict(data)
print(output)

['elections']


In [120]:
# Classify one unseen headline with the fitted vectorizer and model
new_headline = 'Previsto calor para toda a semana no nordeste'
# Vectorize with the already-fitted vocabulary, then convert to a dense array
headline_counts = cv.transform([new_headline])
data = headline_counts.toarray()
output = model.predict(data)
print(output)

['weather']
