# Web Scraping

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the article page. The timeout keeps this cell from hanging
# indefinitely if the server never answers, and raise_for_status() stops the
# notebook here on a 4xx/5xx reply instead of letting an error page be parsed
# by the cells below.
res = requests.get('https://www.nytimes.com/interactive/2017/06/23/opinion/trumps-lies.html', timeout = 30)
res.raise_for_status()

In [3]:
# Parse the downloaded HTML text into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup = res.text, features = 'lxml')

In [4]:
# Sanity check: the first <title> tag of the parsed page (same as soup.title).
soup.find('title')

<title data-react-helmet="true">President Trump’s Lies, the Definitive List - The New York Times</title>

In [5]:
# Only the text inside the <title> tag, without the surrounding markup.
soup.find('title').string

'President Trump’s Lies, the Definitive List - The New York Times'

# Format of Articles
<span class="short-desc">
    <strong>Jan. 21&nbsp;</strong>“I wasn't a fan of Iraq. I didn't want to go into Iraq.”
    <span class="short-truth">
        <a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a>
    </span>
</span>
<br>
<span class="short-desc"><strong>July 19&nbsp;</strong>“But the F.B.I. person really reports directly to the president of the United States, which is interesting.” <span class="short-truth"><a href="https://www.usatoday.com/story/news/politics/onpolitics/2017/07/20/fbi-director-reports-justice-department-not-president/495094001/" target="_blank">(He reports directly to the attorney general.)</a></span></span>

In [6]:
# Equivalent spellings of the same search:
#   soup.find_all('span', attrs = {'class': 'short-desc'})    # explicit attrs dict
#   soup('span', class_ = 'short-desc')                       # calling the soup directly implies find_all

# Every <span class="short-desc"> holds one dated statement plus its rebuttal.
articles = soup.find_all('span', class_ = 'short-desc')
len(articles)

116

In [7]:
# Inspect the last matched element to see the raw markup of one record.
articles[-1]

<span class="short-desc"><strong>July 19 </strong>“But the F.B.I. person really reports directly to the president of the United States, which is interesting.” <span class="short-truth"><a href="https://www.usatoday.com/story/news/politics/onpolitics/2017/07/20/fbi-director-reports-justice-department-not-president/495094001/" target="_blank">(He reports directly to the attorney general.)</a></span></span>

# Extracting the Date

In [8]:
# articles[0].find('strong')    # Long form of the .strong shortcut used below

# The <strong> text ends with a non-breaking space; drop that last character
# and append the year, which the page itself does not include.
articles[0].strong.text[:-1] + ', 2017'

'Jan. 21, 2017'

# Extracting the Lie

In [9]:
# Direct children of the first record: the <strong> date tag, the bare text
# node holding the quote, and the nested <span class="short-truth">.
articles[0].contents

[<strong>Jan. 21 </strong>,
 "“I wasn't a fan of Iraq. I didn't want to go into Iraq.” ",
 <span class="short-truth"><a href="https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the" target="_blank">(He was for an invasion before he was against it.)</a></span>]

In [10]:
# The quote is the second child (index 1) of the span. rstrip() removes the
# trailing space, then [1:-1] cuts off the opening and closing curly quotes.
articles[0].contents[1].string.rstrip()[1:-1]

"I wasn't a fan of Iraq. I didn't want to go into Iraq."

# Extracting the Explanation

In [11]:
# The rebuttal is the link text; [1:-1] removes the surrounding parentheses.
articles[0].find('a').get_text()[1:-1]

'He was for an invasion before he was against it.'

# Extracting the URL

In [12]:
# The source URL is the href attribute of the record's first <a> tag.
articles[0].a['href']

'https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the'

# Building the Data Set

In [13]:
# Collect one [date, lie, explanation, url] row per scraped record.
tabular_articles = []

for article in articles:
    # strip() removes the non-breaking space after the date without blindly
    # chopping the last character, so a date lacking that trailing space
    # still comes out intact. The page omits the year, hence the suffix.
    date = article.find('strong').text.strip() + ', 2017'
    # Second child of the span is the quote text: drop the trailing space,
    # then the opening and closing curly quotes.
    lie = article.contents[1].string.rstrip()[1:-1]
    # Look the link up once; it supplies both the rebuttal and its source.
    link = article.find('a')
    explanation = link.text[1:-1]    # [1:-1] removes the surrounding parentheses
    url = link['href']
    tabular_articles.append([date, lie, explanation, url])

tabular_articles[:2]

[['Jan. 21, 2017',
  "I wasn't a fan of Iraq. I didn't want to go into Iraq.",
  'He was for an invasion before he was against it.',
  'https://www.buzzfeed.com/andrewkaczynski/in-2002-donald-trump-said-he-supported-invading-iraq-on-the'],
 ['Jan. 21, 2017',
  'A reporter for Time magazine — and I have been on their cover 14 or 15 times. I think we have the all-time record in the history of Time magazine.',
  'Trump was on the cover 11 times and Nixon appeared 55 times.',
  'http://nation.time.com/2013/11/06/10-things-you-didnt-know-about-time/']]

# Applying Tabular Data Structure

In [14]:
# Turn the list of rows into a DataFrame with one named column per field.
trump_lies_df = pd.DataFrame(
    data = tabular_articles,
    columns = ['date', 'lie', 'explanation', 'url'],
)
trump_lies_df.head(n = 3)

Unnamed: 0,date,lie,explanation,url
0,"Jan. 21, 2017",I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,"Jan. 21, 2017",A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,"Jan. 23, 2017",Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...


In [15]:
# Convert the textual dates into real datetime values. Bracket access names
# the column explicitly and cannot be confused with a DataFrame attribute.
trump_lies_df['date'] = pd.to_datetime(trump_lies_df['date'])

In [16]:
# Show the first rows again to confirm the date column was converted.
trump_lies_df.head(n = 3)

Unnamed: 0,date,lie,explanation,url
0,2017-01-21,I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,2017-01-21,A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,2017-01-23,Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...


# Exporting the Data Set to a CSV File

In [17]:
# Write the frame as UTF-8 CSV; index = False leaves the row index out of the file.
trump_lies_df.to_csv(
    path_or_buf = '../data/trump-lies.csv',
    index = False,
    encoding = 'utf-8',
)

# Verification

In [18]:
# Read the exported file back to verify the round trip. parse_dates restores
# the date column as datetimes; without it the column would come back as
# plain strings and no longer match the frame that was written.
df = pd.read_csv('../data/trump-lies.csv', parse_dates = ['date'])
df.head(3)

Unnamed: 0,date,lie,explanation,url
0,2017-01-21,I wasn't a fan of Iraq. I didn't want to go in...,He was for an invasion before he was against it.,https://www.buzzfeed.com/andrewkaczynski/in-20...
1,2017-01-21,A reporter for Time magazine — and I have been...,Trump was on the cover 11 times and Nixon appe...,http://nation.time.com/2013/11/06/10-things-yo...
2,2017-01-23,Between 3 million and 5 million illegal votes ...,There's no evidence of illegal voting.,https://www.nytimes.com/2017/01/23/us/politics...


# N.B.
1. Heed the site's **robots.txt**
1. Add a delay between requests to the website being scraped
1. APIs are preferable to Web Scraping
1. Study **Scrapy**