In [128]:
from selenium import webdriver
import bs4
import requests
import pandas as pd
import re
import os

In [3]:
def get_soup(domain, path, timeout=30):
    '''
    Fetch the page at url (domain + path) via Requests and
    return its html parsed via Beautiful Soup (lxml parser).

    domain  -- scheme and host, e.g. 'http://conferences.oreilly.com'
    path    -- path (plus any query string), appended to domain as-is
    timeout -- seconds to wait on the server before giving up (default 30)

    Raises requests.exceptions.HTTPError on a 4xx/5xx response so that an
    error page is never parsed as if it were the requested content.
    '''
    # use requests to grab page via url; without a timeout a stalled
    # server would block the notebook indefinitely
    url = domain + path
    r = requests.get(url, timeout=timeout)

    # fail loudly on HTTP errors instead of returning soup of an error page
    r.raise_for_status()

    # create soup from the decoded response body
    return bs4.BeautifulSoup(r.text, 'lxml')

In [4]:
# webpage domain: base URL that the schedule paths and the relative
# session links scraped from them are appended to when fetching pages
domain = 'http://conferences.oreilly.com'

In [5]:
# list-view schedule path; the date (YYYY-MM-DD) is the only part that
# differs between the two schedule pages
schedule_path = '/strata/hadoop-big-data-ny/public/schedule/grid/public/{0}?view=list'

# get parsed html of wed and thurs schedule pages
soup_wed = get_soup(domain, schedule_path.format('2016-09-28'))
soup_thurs = get_soup(domain, schedule_path.format('2016-09-29'))

# confirm results: preview only the start of the page rather than dumping
# the whole document into the cell output (print(...) with a single
# argument behaves the same on Python 2 and 3)
print(soup_wed.prettify()[:1000])

In [7]:
# session detail pages are the anchors whose href contains 'schedule/detail/'
detail_href = re.compile('schedule/detail/')

# pull the matching anchors from each day's schedule page
topic_wed = soup_wed.find_all('a', href=detail_href)
topic_thurs = soup_thurs.find_all('a', href=detail_href)

# single plain list: wed anchors first, then thurs
topic_links = list(topic_wed)
topic_links.extend(topic_thurs)

In [8]:
# one column per attribute, built in the same order as topic_links
hrefs = [anchor.get('href') for anchor in topic_links]
titles = [anchor.text for anchor in topic_links]

# dictionary of links and their anchor text
link_dict = {
    'link': hrefs,
    'topic': titles,
}

# dataframe with one row per session link
sessions_df = pd.DataFrame(link_dict)
sessions_df.head()

Unnamed: 0,link,topic
0,/strata/hadoop-big-data-ny/public/schedule/det...,Parallel SQL and analytics with Solr
1,/strata/hadoop-big-data-ny/public/schedule/det...,JupyterLab: The evolution of the Jupyter Notebook
2,/strata/hadoop-big-data-ny/public/schedule/det...,Designing a location intelligence platform for...
3,/strata/hadoop-big-data-ny/public/schedule/det...,The future of column-oriented data processing ...
4,/strata/hadoop-big-data-ny/public/schedule/det...,Beyond Hadoop at Yahoo: Interactive analytics ...


In [116]:
# one entry per session link, in sessions_df order; each entry is the
# find_all() result for that page's description <div>
div_desc = []

# location of the chromedriver executable: taken from the CHROMEDRIVER_PATH
# environment variable when set, otherwise the original local path
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH', "C:/Users/ffarmer/Downloads/chromedriver.exe")

# initialize web driver to get dynamic page source
driver = webdriver.Chrome(chromedriver_path)

# add session descriptions to list
try:
    for path in sessions_df['link']:  # for each session link
        driver.get(domain + path)  # open link
        page_html = driver.page_source.encode('utf-8')  # save page source
        soup = bs4.BeautifulSoup(page_html, 'lxml')  # parse html
        # save description divs for this page, append to list
        div_desc.append(soup.find_all('div', class_='en_session_description description'))
finally:
    # kill driver even when a page load or parse fails, so the browser
    # process is not left running
    driver.quit()

# confirm results (first two pages only; the full list is very large)
div_desc[:2]

[[<div class="en_session_description description">\n<h2>Description</h2>\n<p>Analytics has increasingly become a major focus for Apache Solr, the primary search engine in the Hadoop stack. Yonik Seeley explores recent Apache Solr features in the areas of faceting and analytics, including parallel <span class="caps">SQL</span>, streaming expressions, distributed join, and distributed graph queries. Given the increasing number of APIs and techniques that can be brought to bear, Yonik also covers the trade-offs of different approaches and strategies for maximizing scalability.</p>\n</div>],
 [<div class="en_session_description description">\n<h2>Description</h2>\n<p>Project Jupyter provides building blocks for interactive and exploratory computing that make science and data science reproducible across over 40 programming languages (Python, Julia, R, etc.). Central to the project is the Jupyter Notebook, a web-based, interactive computing platform that allows users to author data- and code

In [124]:
def _description_text(res_set):
    '''
    Return the plain-text description for one find_all() result.

    res_set -- the result of find_all() for a page's description <div>
               (one item of div_desc); only its first element is used

    The text of every <p> inside the div is extracted with get_text(), so
    embedded tags (e.g. <span>, <a>) contribute all of their text, and the
    paragraphs are joined with a single space. Returns '' when the page had
    no description div or the div holds no non-blank <p>, which keeps the
    result aligned one-to-one with div_desc.
    '''
    if not res_set:
        return ''

    paragraphs = [p.get_text() for p in res_set[0].find_all('p')]

    # skip whitespace-only paragraphs so they do not add stray separators
    return ' '.join(text for text in paragraphs if text.strip())


# Note: some of the descriptions are broken up over multiple <p> tags,
# and <p> tags may contain embedded tags; both are combined into one string
descriptions = [_description_text(res_set) for res_set in div_desc]

# confirm results, should be 193 (total sessions)
print('Parsed ' + str(len(descriptions)) + ' descriptions')

Parsed 193 descriptions


In [125]:
# add descriptions to dataframe as a new 'descriptions' column
# (descriptions has one entry per row, in row order: div_desc was filled by
# iterating over sessions_df['link'] and descriptions by iterating over div_desc)
sessions_df['descriptions'] = descriptions

# confirm results
sessions_df.head()

Unnamed: 0,link,topic,descriptions
0,/strata/hadoop-big-data-ny/public/schedule/det...,Parallel SQL and analytics with Solr,Analytics has increasingly become a major focu...
1,/strata/hadoop-big-data-ny/public/schedule/det...,JupyterLab: The evolution of the Jupyter Notebook,Project Jupyter provides building blocks for i...
2,/strata/hadoop-big-data-ny/public/schedule/det...,Designing a location intelligence platform for...,CartoDB has enabled hundreds of thousands of u...
3,/strata/hadoop-big-data-ny/public/schedule/det...,The future of column-oriented data processing ...,"In pursuit of speed and efficiency, big data p..."
4,/strata/hadoop-big-data-ny/public/schedule/det...,Beyond Hadoop at Yahoo: Interactive analytics ...,Yahoo initially built Hadoop as an answer to a...


In [132]:
# change directory to save file; skipped when the working directory is
# already data_train so that re-running this cell neither fails nor
# descends into a nested data_train folder
if os.path.basename(os.getcwd()) != 'data_train':
    os.chdir('data_train')
os.getcwd()

'C:\\Users\\ffarmer\\Documents\\Code\\strata-notes_text-analysis\\data_train'

In [135]:
# pickle file name; relative, so it is written to the current working directory
pickle_name = 'strata_sessions.pkl'

# save dataframe as pickle
sessions_df.to_pickle(pickle_name)