In [1]:
# these will install packages from the notebook!
#!pip install sklearn
#!pip install matplotlib
#!pip install numpy
#!pip install pandas


In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import datasets

In [3]:
iris = datasets.load_iris() # load the digits (data)
type(iris)

sklearn.datasets.base.Bunch

In [25]:
iris.keys()

['target_names', 'data', 'target', 'DESCR', 'feature_names']

In [26]:
list(iris.target_names) # names of potential labels

['setosa', 'versicolor', 'virginica']

In [27]:
list(iris.feature_names) # names of features

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [28]:
irisdf = pd.DataFrame(iris.data, columns=iris.feature_names)
irisdf['target'] = iris.target
# append target (response column) to your new dataframe

In [29]:
irisdf.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [30]:
irisdf.count() # no null values present

sepal length (cm)    150
sepal width (cm)     150
petal length (cm)    150
petal width (cm)     150
target               150
dtype: int64

In [31]:
irisdf.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,1.0
std,0.828066,0.433594,1.76442,0.763161,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [32]:
irisdf.dtypes

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target                 int64
dtype: object

In [33]:
irisdf_new = irisdf[ irisdf.target != 2 ]
# keep all rows which do not have target 2
# since target can be 0,1, and 2, and logistic regression must be binary,
# for this dataset, we will remove all rows which have a target of 2 to meet this requirement

In [34]:
irisdf_new.count()

sepal length (cm)    100
sepal width (cm)     100
petal length (cm)    100
petal width (cm)     100
target               100
dtype: int64

In [35]:
test_df = irisdf_new[ irisdf_new.index < 30 ]
# reserve 30 records for testing!
# we will refer to this dataset later.

test_df.count() 

sepal length (cm)    30
sepal width (cm)     30
petal length (cm)    30
petal width (cm)     30
target               30
dtype: int64

In [36]:
train_df = irisdf_new[ irisdf_new.index >= 30 ] 

# take 70 records for model training
train_df.count()

sepal length (cm)    70
sepal width (cm)     70
petal length (cm)    70
petal width (cm)     70
target               70
dtype: int64

In [37]:
predictor_cols = ["sepal length (cm)","sepal width (cm)",
                  "petal length (cm)","petal width (cm)"]
# here is list of features

In [38]:
# lets create our prediction dataframe (columns are strickly feature columns)
df_pred = train_df[predictor_cols]

In [39]:
df_pred.count()

sepal length (cm)    70
sepal width (cm)     70
petal length (cm)    70
petal width (cm)     70
dtype: int64

In [40]:
# Now time to fit our model, using StatsModels!
# to fit model you will use the training dataset, not the test dataset!
logit = LogisticRegression()
results = logit.fit(df_pred, train_df.target)
#print type(results)

In [41]:
# disable chained-assignment caution
pd.options.mode.chained_assignment = None
# input test_pred_df into the logit model and append column to original test_df
test_df['prediction'] = logit.predict(test_df[predictor_cols])

In [42]:
test_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,prediction
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
4,5.0,3.6,1.4,0.2,0,0


In [3]:
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read()
soup = BeautifulSoup(r,'lxml')
print type(soup)


<class 'bs4.BeautifulSoup'>


In [4]:
print soup.prettify()[0:1000]


<!DOCTYPE html>
<html class="no-js" lang="en-US" xml:lang="en-US" xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://ogp.me/ns/fb#">
 <head>
  <title>
   Legislative Alerts
  </title>
  <meta content="text/html; charset=utf-8" name="Content-Type"/>
  <meta content="en-US" name="Content-language"/>
  <meta content="" name="author"/>
  <meta content="" name="copyright"/>
  <meta content="" name="description"/>
  <meta content="" name="keywords"/>
  <meta content="TRUE" name="MSSmartTagsPreventParsing"/>
  <meta content="eZ Publish" name="generator"/>
  <meta content="Legislative Alerts" property="og:title"/>
  <meta content="http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts" property="og:url"/>
  <meta content="AFL-CIO" property="og:site_name"/>
  <meta content="http://www.aflcio.org/extension/aflcio/design/aflcio_user/images/facebook_aflcio_200x200.jpg" property="og:image"/>
  <meta content="non_profit" property="og:type"/>
  <meta content="288636237825618" property="

In [5]:
from IPython.display import HTML
HTML('<iframe src=http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts width=700 height=500></iframe>')


In [6]:
print soup.prettify()[28700:30500]


-Alerts/" id="siteSearch" method="post">
        <label for="keyword">
         Search Legislative Alerts:
        </label>
        <input id="keyword" name="SearchText" size="10" type="text" value=""/>
        <input alt="search" class="button" src="/extension/aflcio/design/aflcio_user/images/btn-search-blog.png" type="image"/>
        <span>
        </span>
       </form>
      </div>
      <div class="legisalerts_year_search">
       <!-- //navigation footer -->
       <form>
        <label for="children_years_EC">
         Browse by Year:
        </label>
        <select id="children_years">
         <option selected="selected" value="2016">
          2016
         </option>
         <option value="2015">
          2015
         </option>
         <option value="2014">
          2014
         </option>
         <option value="2013">
          2013
         </option>
         <option value="2012">
          2012
         </option>
         <option value="2011">
          2011
      

In [7]:
letters = soup.find_all("div", class_="ec_statements")

In [10]:
letters[0]

<div class="ec_statements">\n<div id="legalert_title"><a href="/Legislation-and-Politics/Legislative-Alerts/Letter-in-Support-of-Barack-Obama-s-Veto-of-DOL-Retirement-Rule-Override">Letter in Support of Barack Obama's Veto of DOL Retirement Rule Override</a></div>\n<div id="legalert_date">June 21, 2016</div>\n</div>

In [33]:
lobbying = {}
for element in letters:
    #print(element.a.get_text())
    lobbying[element.a.get_text()] = {}

Letter in Support of Barack Obama's Veto of DOL Retirement Rule Override
Myths v Facts Response - ILRWG H-2B Approps FY 2017
AFL-CIO Opposes H-2B  Returning Worker Amendment
Letter to House Education and the Workforce Committee Urging Them to Oppose the Resolution to Block the Dept. of Labor's Persuader Rule
Letter to Representatives in opposition to the Local Radio Freedom Act
Letter to Senators in opposition to the Local Radio Freedom Act
Letter to Senators urging them to oppose the conference report on the Customs Bill (H.R. 644)
Letter to Senators in support of the Workplace Action for a Growing Economy Act
Letter to Representatives in support of The Workplace Action for a Growing Economy Act
Letter to Representatives opposing the "Fairness in Class Action Litigation and Furthering Asbestos Claim Transparency Act"


In [34]:
lobbying

{u'AFL-CIO Opposes H-2B  Returning Worker Amendment': {},
 u"Letter in Support of Barack Obama's Veto of DOL Retirement Rule Override": {},
 u"Letter to House Education and the Workforce Committee Urging Them to Oppose the Resolution to Block the Dept. of Labor's Persuader Rule": {},
 u'Letter to Representatives in opposition to the Local Radio Freedom Act': {},
 u'Letter to Representatives in support of The Workplace Action for a Growing Economy Act': {},
 u'Letter to Representatives opposing the "Fairness in Class Action Litigation and Furthering Asbestos Claim Transparency Act"': {},
 u'Letter to Senators in opposition to the Local Radio Freedom Act': {},
 u'Letter to Senators in support of the Workplace Action for a Growing Economy Act': {},
 u'Letter to Senators urging them to oppose the conference report on the Customs Bill (H.R. 644)': {},
 u'Myths v Facts Response - ILRWG H-2B Approps FY 2017': {}}

In [31]:
letters[0].div['id']

'legalert_title'

In [32]:
prefix = "www.aflcio.org"


In [35]:
for element in letters:
    lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]

In [39]:
str(letters[0].find(id="legalert_date").get_text())

'June 21, 2016'

In [40]:
for element in letters:
    date = element.find(id="legalert_date").get_text()
    lobbying[element.a.get_text()]["date"] = date


In [41]:
lobbying

{u'AFL-CIO Opposes H-2B  Returning Worker Amendment': {'date': u'June 13, 2016',
  'link': 'www.aflcio.org/Legislation-and-Politics/Legislative-Alerts/AFL-CIO-Opposes-H-2B-Returning-Worker-Amendment'},
 u"Letter in Support of Barack Obama's Veto of DOL Retirement Rule Override": {'date': u'June 21, 2016',
  'link': 'www.aflcio.org/Legislation-and-Politics/Legislative-Alerts/Letter-in-Support-of-Barack-Obama-s-Veto-of-DOL-Retirement-Rule-Override'},
 u"Letter to House Education and the Workforce Committee Urging Them to Oppose the Resolution to Block the Dept. of Labor's Persuader Rule": {'date': u'May 17, 2016',
  'link': 'www.aflcio.org/Legislation-and-Politics/Legislative-Alerts/Letter-to-House-Education-and-the-Workforce-Committee-Urging-Them-to-Oppose-the-Resolution-to-Block-the-Dept.-of-Labor-s-Persuader-Rule'},
 u'Letter to Representatives in opposition to the Local Radio Freedom Act': {'date': u'February 19, 2016',
  'link': 'www.aflcio.org/Legislation-and-Politics/Legislative-A

In [42]:
for item in lobbying.keys():
    print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t" + "date: " + lobbying[item]["date"] + "\n\n" 


Letter to Senators urging them to oppose the conference report on the Customs Bill (H.R. 644): 
	link: www.aflcio.org/Legislation-and-Politics/Legislative-Alerts/Letter-to-Senators-urging-them-to-oppose-the-conference-report-on-the-Customs-Bill-H.R.-644
	date: February 09, 2016


Letter to Representatives opposing the "Fairness in Class Action Litigation and Furthering Asbestos Claim Transparency Act": 
	link: www.aflcio.org/Legislation-and-Politics/Legislative-Alerts/Letter-to-Representatives-opposing-the-Fairness-in-Class-Action-Litigation-and-Furthering-Asbestos-Claim-Transparency-Act
	date: January 05, 2016


AFL-CIO Opposes H-2B  Returning Worker Amendment: 
	link: www.aflcio.org/Legislation-and-Politics/Legislative-Alerts/AFL-CIO-Opposes-H-2B-Returning-Worker-Amendment
	date: June 13, 2016


Letter to Representatives in opposition to the Local Radio Freedom Act: 
	link: www.aflcio.org/Legislation-and-Politics/Legislative-Alerts/Letter-to-Representatives-in-opposition-to-the-Local