In [33]:
import json
import time

import markovify
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Grab Source Page from Buzzfeed Homepage

In [40]:
url = "https://www.buzzfeednews.com/"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page in later cells.
html_page = requests.get(url, timeout=10)
html_page.raise_for_status()

# Instantiate a soup object and grab headlines

In [41]:
soup = BeautifulSoup(html_page.content, 'html.parser')

In [42]:
# Collect every <a> element tagged with the story-card title-link CSS class.
story_cards = soup.find_all(
    "a",
    attrs={"class": "newsblock-story-card__title-link"},
)

In [43]:
# check the length
len(story_cards)

36

In [44]:
# check first element
story_cards[0]

<a class="newsblock-story-card__title-link" data-bfa="@a:post; @d:-1; @o:{dimension2:4939631,dimension4:Mahershala Ali Responded To His Co-Star Viggo Mortensen Saying The N-Word};@e:{obj_id:4939631,obj_type:post,p:-1,post_category:Culture,data_source:omnisearch,treatment:news,variation_id:};" data-bfa-impressions="true" href="https://www.buzzfeednews.com/article/davidmack/mahershala-ali-viggo-mortensens-n-word-green-book">
        Mahershala Ali Responded To His Co-Star Viggo Mortensen Saying The N-Word
      </a>

# Investigate where the headline sits inside each story card

In [45]:
# inspect and grab headlines from first story_card
first_card = story_cards[0]
first_card

<a class="newsblock-story-card__title-link" data-bfa="@a:post; @d:-1; @o:{dimension2:4939631,dimension4:Mahershala Ali Responded To His Co-Star Viggo Mortensen Saying The N-Word};@e:{obj_id:4939631,obj_type:post,p:-1,post_category:Culture,data_source:omnisearch,treatment:news,variation_id:};" data-bfa-impressions="true" href="https://www.buzzfeednews.com/article/davidmack/mahershala-ali-viggo-mortensens-n-word-green-book">
        Mahershala Ali Responded To His Co-Star Viggo Mortensen Saying The N-Word
      </a>

In [46]:
print(first_card.text)


        Mahershala Ali Responded To His Co-Star Viggo Mortensen Saying The N-Word
      


In [47]:
first_card.text.strip("\n").strip()

'Mahershala Ali Responded To His Co-Star Viggo Mortensen Saying The N-Word'

# Great, now let's do this for all of the headlines!!!

In [48]:
# A bare .strip() removes surrounding newlines together with the other
# whitespace, so one call yields the same cleaned headline text.
headlines = [anchor.text.strip() for anchor in story_cards]

In [50]:
headlines[:3]

['Mahershala Ali Responded To His Co-Star Viggo Mortensen Saying The N-Word',
 'Former James Bond Star Rosamund Pike Says The Movies Are “Ripe For An Incredible Amount Of Sexism”',
 'The Border Patrol Apprehended More Families Than Ever At The Border Last Month']

# Now that we have some headlines, let's play with generating new ones

In [34]:
markov = markovify.NewlineText("\n".join(headlines))

In [35]:
# Print up to ten generated headlines of at most 140 characters each.
for _ in range(10):
    new_headline = markov.make_short_sentence(140)
    # markovify returns None when it cannot build an acceptable sentence
    # within its retry budget; skip those instead of printing "None".
    if new_headline is not None:
        print(new_headline)

Kim Kardashian West, Lady Gaga, Jake Paul, And Other Celebrities Are Sharing Photos And Videos Of The Hospital And Back To Work
An Instagram Video Shows The Inside Of The Hospital And Back To Work
An Instagram Video Shows The Inside Of The Woolsey Fire Evacuations
The Border Patrol Apprehended More Families Than Ever At The Border Patrol Apprehended More Families Than Ever At The Border Last Month
Ruth Bader Ginsburg Is Already Out Of The Woolsey Fire Evacuations
Celebrities Are Sharing Photos And Videos Of The Hospital And Back To Work
The Border Patrol Apprehended More Families Than Ever At The Border Patrol Apprehended More Families Than Ever At The Border Last Month
California University Students Say Their Play About The Migrant Caravan Lately? Here's What It's Been Up To.
An Instagram Video Shows The Inside Of The Woolsey Fire Evacuations
An Instagram Video Shows The Inside Of The Woolsey Fire Evacuations


# This is cool, but more data would be helpful. Let's use Selenium to grab the first five pages of headlines
## Need:
### Chromedriver installed (I used Homebrew on macOS: 'brew cask install chromedriver')
### Selenium ('pip install selenium')
### Also the URL the site requests for more headlines, which takes inspecting the 'show more' click element on the page

In [59]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [71]:
# Open a Chrome session through Selenium (needs chromedriver installed);
# grab_buzzfeed_headlines_usa reads this module-level `browser`.
browser = webdriver.Chrome()

# Exclusive upper bound for the scraper's page counter: scraping stops before
# this page number is reached, so 6 means pages 1 through 5.
page_number_max = 6


def grab_buzzfeed_headlines_usa(page_number_max, driver=None, delay=1):
    """Scrape headlines from consecutive pages of the BuzzFeed News US feed.

    Pages 1 through ``page_number_max - 1`` are loaded one after another and
    the text of every story-card title link on each page is collected.

    Args:
        page_number_max: Exclusive upper bound on the page number; a value of
            6 scrapes pages 1-5. Values of 1 or less scrape nothing.
        driver: Selenium WebDriver used to load the pages. Defaults to the
            module-level ``browser`` created earlier in the notebook.
        delay: Seconds to pause after each page load before reading the
            headline elements.

    Returns:
        A list of non-empty headline strings in page order.
    """
    if driver is None:
        # Keep the original behaviour of driving the notebook's shared session.
        driver = browser

    url = "https://www.buzzfeednews.com/us/feed/home?page={}&flexpro_enabled=1"
    headlines = []
    for page_number in range(1, page_number_max):
        print("Scraping page number {}".format(page_number))
        driver.get(url.format(page_number))
        time.sleep(delay)
        # "class name" is the locator strategy string behind By.CLASS_NAME.
        # find_elements(by, value) works on both Selenium 3 and 4, whereas the
        # find_elements_by_* helpers were removed in Selenium 4.
        post_elems = driver.find_elements(
            "class name", "newsblock-story-card__title-link"
        )
        for post in post_elems:
            text = post.text.strip()
            # Elements without text add nothing to the corpus, so drop them.
            if text:
                headlines.append(text)
    return headlines

In [72]:
# Pass the page_number_max setting defined with the browser setup instead of
# repeating its value as a literal, so the limit is changed in one place.
headlines = grab_buzzfeed_headlines_usa(page_number_max)

Scraping page number 1
Scraping page number 2
Scraping page number 3
Scraping page number 4
Scraping page number 5


In [65]:
headlines

['Kim Kardashian West, Lady Gaga, Jake Paul, And Other Celebrities Are Also Forced To Flee California Wildfires',
 'These Pictures Show The Total Devastation Caused By California Wildfires',
 '164 People Are Sick And 1 Is Dead Due To A Salmonella Outbreak Linked To Raw Turkey',
 '16 Heartbreaking Books To Get You Through Your Own Heartache',
 '22 Of The Most Powerful Photos Of This Week',
 'This #MeToo Moment Is Tearing The Russian Internet Apart',
 'Amazon Removed Merchandise From Its Site Touting The Far-Right Group "Proud Boys"',
 'Scotland Will Be The First Country To Mandate LGBTI Education',
 'The ACLU Just Sued Trump Over His New Asylum Order',
 'Florida Is A Giant Mess Again And Lawsuits Are Flying Ahead Of Possible Recounts',
 'This Small Town Was Once A Progressive Fairy Tale. But In 2018, It’s Living A Far-Right Nightmare.',
 'An Infant’s Death Has Sparked A Heated Debate Around The "Free Birth" Movement',
 "Here's What Registered Dietitians Really Think About Oat Milk's Nut

In [85]:
markov = markovify.NewlineText("\n".join(headlines), state_size=2)

In [90]:
# Make ten generation attempts and keep the ones that produced a sentence.
new_headlines = []
for _ in range(10):
    new_headline = markov.make_sentence()
    # make_sentence() returns None when no acceptable sentence is found within
    # its retry budget; leave those out so later cells only see real strings.
    if new_headline is not None:
        new_headlines.append(new_headline)

In [92]:
# Report, per generated headline, whether it is a verbatim copy of a scraped one.
for new_headline in new_headlines:
    print(
        "x - {} is in headlines".format(new_headline)
        if new_headline in headlines
        else "o - {} is not in headlines".format(new_headline)
    )

o - Donald Trump Singled Out A CNN Reporter As An “Enemy Of The Thousand Oaks Shooter Allegedly Sexually Assaulted His Track And Field Coach In High School is not in headlines
o - The Biggest Moments Of The Biggest Democratic Stars Of The 2018 Midterm Elections is not in headlines
o - The Migrant Caravan Is Camping In A Car Crash Is Dead is not in headlines
o - Washington Rep. Suzan DelBene Will Run For His Old House Seat, Now That He’s Gone is not in headlines
o - The California Shooter Who Murdered At Least 12 People Died In A Car Crash Is Dead is not in headlines
o - Pot Stores Could Open In DC If Democrats Win Back The House And Republicans Will Keep The Senate is not in headlines
o - Here Are Some Of The Most Devastating Line In This Year's Campaign is not in headlines
o - These Old Photos Of Hot And Cool Grandparents Are A Great Break From The Wild Places That People Voted At Today is not in headlines
o - Kim Davis — That Anti-Gay Clerk In Kentucky — Was Just Elected As The First

In [93]:
new_headlines

['Donald Trump Singled Out A CNN Reporter As An “Enemy Of The Thousand Oaks Shooter Allegedly Sexually Assaulted His Track And Field Coach In High School',
 'The Biggest Moments Of The Biggest Democratic Stars Of The 2018 Midterm Elections',
 'The Migrant Caravan Is Camping In A Car Crash Is Dead',
 'Washington Rep. Suzan DelBene Will Run For His Old House Seat, Now That He’s Gone',
 'The California Shooter Who Murdered At Least 12 People Died In A Car Crash Is Dead',
 'Pot Stores Could Open In DC If Democrats Win Back The House And Republicans Will Keep The Senate',
 "Here Are Some Of The Most Devastating Line In This Year's Campaign",
 'These Old Photos Of Hot And Cool Grandparents Are A Great Break From The Wild Places That People Voted At Today',
 'Kim Davis — That Anti-Gay Clerk In Kentucky — Was Just Elected As The First Muslim Women Elected To Congress',
 'Californians Just Voted For The Next Fight']