In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

In [2]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

In [3]:
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)

In [4]:
def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [5]:
raw_html = simple_get('https://en.wikipedia.org/wiki/List_of_natural_disasters_in_the_United_States')
len(raw_html)

83197

In [6]:
no_html = simple_get('https://en.wikipedia.org/wiki/List_of_natural_disasters_in_the_United_States')
no_html is None 
True

True

In [31]:
from bs4 import BeautifulSoup
raw_html = open('disasters_page.html').read()
html = BeautifulSoup(raw_html, 'html.parser')

for i, td in enumerate(html.select('td')):
         print(i, td.text)

0 


1 

This article has multiple issues. Please help improve it or discuss these issues on the talk page. (Learn how and when to remove these template messages)








This article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed. (July 2018) (Learn how and when to remove this template message)











This article's factual accuracy may be compromised due to out-of-date information. Please update this article to reflect recent events or newly available information. (July 2018)





 (Learn how and when to remove this template message)


2 


3 
This article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed. (July 2018) (Learn how and when to remove this template message)

4 


5 
This article's factual accuracy may be compromised due to out-of-da

In [28]:
raw_html = simple_get('https://en.wikipedia.org/wiki/List_of_natural_disasters_in_the_United_States')
html = BeautifulSoup(raw_html, 'html.parser')
for i, tbody in enumerate(html.select('tbody')):
        print(i, tbody.text)

0 This article has multiple issues. Please help improve it or discuss these issues on the talk page. (Learn how and when to remove these template messages)

This article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed. (July 2018) (Learn how and when to remove this template message)
This article's factual accuracy may be compromised due to out-of-date information. Please update this article to reflect recent events or newly available information. (July 2018)

 (Learn how and when to remove this template message)
1 This article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed. (July 2018) (Learn how and when to remove this template message)
2 This article's factual accuracy may be compromised due to out-of-date information. Please update this article t

In [62]:
def get_names():
    """
    Downloads the page where the list of disasters is found
    and returns a list of strings, one per disaster
    """
    url = 'https://en.wikipedia.org/wiki/List_of_natural_disasters_in_the_United_States'
    response = simple_get(url)

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        names = set()
        for b in enumerate(html.select('b')):
            for name in tr.text.split('\n'):
                if len(name) > 0:
                    names.add(name.strip())
        return list(names)

    raise Exception('Error retrieving contents at {}'.format(url))

In [69]:
def get_hits_on_name(name):
    """
    Accepts a `name` of a disaster and returns the number
    of hits that disaster's Wikipedia page received in the 
    last 60 days, as an `int`
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'https://xtools.wmflabs.org/articleinfo/en.wikipedia.org/{}'
    response = simple_get(url_root.format(name))

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')

        hit_link = [b for b in html.select('b')
                    if tr['href'].find('latest-60') > -1]

        if len(hit_link) > 0:
            # Strip commas
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except:
                log_error("couldn't parse {} as an `int`".format(link_text))
        
    log_error('No pageviews found for {}'.format(name))
    return None

In [71]:
if __name__ == '__main__':
    print('Getting the list of names....')
    names = get_names()
    print('... done.n')

    results = []

    print('Getting stats for each name....')

    for name in names:
        try:
            hits = get_hits_on_name(name)
            if hits is None:
                hits = 1
            results.append((hits, name))
        except:
            results.append((-1, name))
            log_error('error encountered while processing '
                      '{}, skipping'.format(name))

    print('... done.n')

    results.sort()
    results.reverse()

    if len(results) > 5:
        top_marks = results[:5]
    else:
        top_marks = results

    print('\nThe most popular disasters are:')
    for (mark, disaster) in top_marks:
        print('{} with {} page views'.format(disaster, mark))

    no_results = len([res for res in results if res[0] == 1])
    print('\nBut we did not find results for '
        '{} disasters on the list'.format(no_results))

Getting the list of names....
... done.n
Getting stats for each name....
No pageviews found for Volcanic dust from a massive eruption by Mount Tambora in the Dutch East Indies (present Indonesia) in 1815 led to an abnormally cold summer in 1816 in the northeastern United States and eastern Canada. Cold weather inhibited crops, and frosts and snowstorms killed what did grow, leading to a localized famine.
No pageviews found for Famine (caused by volcano)
No pageviews found for Unknown
No pageviews found for 1816
No pageviews found for Year Without a Summer
... done.n

The most popular disasters are:
Year Without a Summer with 1 page views
Volcanic dust from a massive eruption by Mount Tambora in the Dutch East Indies (present Indonesia) in 1815 led to an abnormally cold summer in 1816 in the northeastern United States and eastern Canada. Cold weather inhibited crops, and frosts and snowstorms killed what did grow, leading to a localized famine. with 1 page views
Unknown with 1 page view

In [None]:
# Import dataframe into MySQL
import sqlalchemy
from sqlalchemy import create_engine
kwargs = dict(
username = 'root',
password = 'password',
database_ip = 'localhost',
database_name = 'ecommercedb',
)

from sqlalchemy import create_engine
#engine = create_engine("mysql+pymysql://root:"+'password'+"@localhost/ecommercedb")

conn_string = "mysql+pymysql://{username}:{password}@{database_ip}/{database_name}".format(**kwargs)
engine = create_engine(conn_string)
df.to_sql(con=engine, if_exists='replace', index=False,name='pres')