# USA states population

In [1]:
import plotly.plotly as py
import json
import urllib2
from bs4 import BeautifulSoup
from IPython.display import IFrame

Get the data from the Wikipedia page on the subject.

In [2]:
# Wikipedia article that the population figures are scraped from.
page_url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population'

# Embed the live page in the notebook (950 wide, 500 high) as a visual reference.
IFrame(src=page_url, width=950, height=500)

### Scrape the table

In [3]:
# Download the article and parse it into a BeautifulSoup tree.
# The parser is named explicitly so the resulting tree does not depend on
# which optional parsers happen to be installed, and the HTTP response is
# closed even if reading or parsing fails.
response = urllib2.urlopen(page_url)
try:
    soup = BeautifulSoup(response.read(), 'html.parser')
finally:
    response.close()

In [4]:
# Parallel lists filled by the scraping loop below: entry k of each list
# describes the same table row.
states, values, ranks = [], [], []

# N.B. the data of interest is in the first table of the page:
#   state names are in the third column,
#   values of interest are in the fourth column.

def parse_float(x):
    """Convert a number written with comma separators into a float.

    Every ',' is stripped from ``x`` before conversion, so ``'1,234'``
    becomes ``1234.0``.

    Args:
        x: text to convert; it must provide a ``replace`` method.

    Returns:
        The parsed float, or ``False`` when the text cannot be converted.
        N.B. ``0.0`` is falsy as well, so a caller that tests the result for
        truthiness cannot tell a parsed zero from a failure.
    """
    x_no_comma = x.replace(',', '')
    try:
        return float(x_no_comma)
    except (TypeError, ValueError):
        # Only conversion failures mean "not a number"; anything else
        # (KeyboardInterrupt, SystemExit, ...) must propagate to the caller.
        return False

# Walk every row of the first table on the page and keep those that carry a
# linked state name (third cell) and a parseable number (fourth cell).
# The row's position in the table is recorded as its rank.
# NOTE(review): using the row index as the rank presumably relies on the table
# being sorted with a single header row -- confirm against the live page.
for i, row in enumerate(soup.findAll('table')[0].findAll('tr')):
    tds = row.findAll('td')
    # Rows without <td> cells are skipped, as before; rows with fewer than
    # four cells are skipped too, since indexing tds[2] / tds[3] would
    # raise IndexError.
    if len(tds) < 4:
        continue

    anchors = tds[2].findAll('a')
    value_nodes = tds[3].contents
    # No link in the name cell, or an empty value cell: nothing to record.
    if not anchors or not value_nodes:
        continue

    value = parse_float(value_nodes[0])
    if value:
        states.append(anchors[0].contents[0])
        values.append(value)
        ranks.append(i)

### Convert USA state names to abbreviations

Plotly's `USA-states` location mode understands two-letter abbreviations, so we need to convert the full names that we scraped from the Wikipedia table.

In [5]:
# Fetch the name/abbreviation records used to translate full state names
# into two-letter codes. The response is closed explicitly so the
# connection is released even if the JSON cannot be decoded.
states_response = urllib2.urlopen('https://gist.githubusercontent.com/mshafrir/2646763/raw/8b0dbb93521f5d6889502305335104218454c2bf/states_titlecase.json')
try:
    states_titlecase = json.load(states_response)
finally:
    states_response.close()

states_titlecase[0]  # one item

{u'abbreviation': u'AL', u'name': u'Alabama'}

In [6]:
# Columns handed to the choropleth; entry k of each list refers to the same state.
locations, z, hovertext = [], [], []

names = []

# Join the scraped rows to the abbreviation records on the state name,
# ignoring case. Output order follows `states_titlecase`.
for entry in states_titlecase:
    for state_name, population, row_rank in zip(states, values, ranks):
        if entry['name'].lower() != state_name.lower():
            continue

        locations.append(entry['abbreviation'])
        z.append(population)
        hovertext.append(u'<b>Rank:</b> {0}<br>{1}'.format(row_rank, state_name))

        names.append(state_name)

### Make a choropleth

In [7]:
# Single choropleth trace: states are identified by their two-letter codes,
# coloured by `z`, and show the rank/name text on hover.
population_trace = {
    'type': 'choropleth',
    'locationmode': 'USA-states',
    'locations': locations,
    'z': z,
    'text': hovertext,
    'colorscale': 'Viridis'
}

# Fixed-size figure restricted to the USA scope; the projection settings
# are passed as an empty mapping.
figure_layout = {
    'title': 'USA states population in 2014',
    'titlefont': {'size': 24},
    'geo': {
        'scope': 'usa',
        'projection': {}
    },
    'autosize': False,
    'width': 800,
    'height': 580
}

py.iplot(
    {'data': [population_trace], 'layout': figure_layout},
    validate=False,
    filename='usa-states-population'
)

In [8]:
# Inject CSS styling in the NB
from IPython.display import display, HTML

# Read the stylesheet inside a `with` block so the file handle is closed
# as soon as its contents have been loaded.
with open('../_custom.css') as css_file:
    display(HTML(css_file.read()))