In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests

In [36]:
# Download the main Wikipedia article for The Bachelor (U.S.) and, from its navbox, the page
# of every season that has one.  Result: wikiPages, key = season number, val = BeautifulSoup
# element for that season's wiki page.
#
# Both requests use a timeout and raise_for_status(), so a hung or failed download stops
# here with a clear HTTP error, instead of an error page being parsed and the failure only
# showing up later as an AttributeError on None.

#get wiki for all bachelor seasons
allseasons = requests.get("https://en.wikipedia.org/wiki/The_Bachelor_(U.S._TV_series)#Seasons", timeout=30)
allseasons.raise_for_status()
#make beautiful soup element
soup = BeautifulSoup(allseasons.text, "html.parser")

#get the table cell that has links to each season
seasons = soup.find("table", attrs={"class":"navbox"}).find("td", attrs={"class":"navbox-list navbox-odd hlist"})
seasons = seasons.find("div", attrs={"style":"padding:0em 0.25em"}).find("ul")

urls = []                           #list of links to season-specific page
seasonNums = []                     #list of seasons w/ wiki pages (no seasons 1-4 or 6-8)
seasonNum = 1                       #season number = position of the item in the navbox list
for item in seasons.find_all("li"): #for each item in list of seasons
    if (seasonNum == 20):           #don't include season 20, b/c no contestants listed yet
        break
    season = item.find("a")         #get url tag
    if season is not None:          #if has url link, get url text
        #each entry is stored wrapped in literal double quotes (format kept unchanged in
        #case other cells read `urls`); the quotes are stripped again before the request
        urls.append("\"https://en.wikipedia.org" + season.get("href") + "\"")
        seasonNums.append(seasonNum) #add season number to list
    seasonNum += 1                  #items without a link still advance the season number

wikiPageText = []                   #init list of wiki site text, for all seasons
for url in urls:
    site = requests.get(url[1:-1], timeout=30)  #strip the wrapping quotes, get web-site
    site.raise_for_status()         #stop on 4xx/5xx rather than parsing an error page
    soup = BeautifulSoup(site.text, "html.parser") #make BS element
    wikiPageText.append(soup)       #add web-site text to list

wikiPages = dict(zip(seasonNums, wikiPageText)) #key=season, val=Soup Elem(wiki page text)

In [203]:
# For each season in wiki, make list of dictionaries - one dictionary for each contestant.
# dictionary name = seasonsDict
#             key = season number
#           value = list of dictionaries for that season (one for each table row)
#
# For contestant dictionaries:
#            keys = name, age, hometown, occupation, eliminated, season
#          values = associated values to fields, as scraped from wiki
#                   (table columns 0-4 in that order; season comes from seasonNums)
#
# To test contestant dictionaries:
#         print seasonsDict[season][contestant][fieldname]
#    eg:  print seasonsDict[9][10]['name']  -- get name for season 9, contestant 10
#
# Note: Wiki does not have pages dedicated to Seasons 1-4, or 6-8. Also, Wiki does not list
# contestants for season 20.  Those Seasons will be added to the dictionary later.
#
# NOTE(review): the cell cleaning below works on str(td.contents) and matches "[u'", i.e. it
# appears to rely on the Python 2 repr of a list holding one unicode string -- confirm
# before running under Python 3.

seasonsDict = dict()                #key = season num, val=list of contestant dictionaries
allContestants = dict()             #keys = name/age/etc, values = associated data
                                    #(not referenced anywhere else in this cell)

for sn in seasonNums:
    seasonPage = wikiPages[sn]      #get BS element for this season
    #walk down to the article body, then take the first sortable wikitable in it
    seasonPage = seasonPage.find("div", attrs={"id":"content"}).find("div", attrs={"id":"bodyContent"})
    seasonPage = seasonPage.find("div", attrs={"id":"mw-content-text"})
    seasonPage = seasonPage.find("table", attrs={"class":"wikitable sortable"})
    
    listOfContestantDicts = []          #list of dicts for each contestant
    
    numtr = 0                           #num rows (one per contestant)
    for tr in seasonPage.find_all("tr"):#for each contestant listed,
        if (numtr == 0):                #skip first row (column headers)
            numtr += 1
            continue

        #a row without any <td> cells still yields a dict, holding only 'season'
        contestantDict = dict()         #init new dict for contestant
        numtd = 0                       #column number
        for td in tr.find_all("td"):    #for each column of data,
            
            #column 0: name
            if (numtd == 0):
                name = str(td.contents)
                if ("<b>" in name):
                    td.find("b")                        #NOTE(review): result is discarded
                    name = str(td.contents)[4:-5]       #drop leading "[<b>" and trailing "</b>]"
                if ("[u'" in name):                     #if in format "[u'name']",
                    name = name.encode('utf8')[3:-2]    #format to get 'name'
                if ("<span class" in name):
                    td.find("span", attrs={"class":"nowrap"}) #NOTE(review): result is discarded
                    tag = "<span class='nowrap'>"       #start tag before name
                    name = str(td.contents)[len(tag)+1:]#cut out start tag (+1 for the leading "[")
                    end = name.index("<")               #get start point of end tag
                    name = name[:end]                   #cut out end tag
                    trashTag = "style=\"display:none;\">" #weird tag to cut from a name
                    if (trashTag in name):
                        #drops len(trashTag)+1 leading chars and the last char
                        name = name[(len(trashTag)+1):-1] 
                if ("<sup" in name):                    #if name has "name', <sup ...",
                    end2 = name.index("<sup")           #format to get name
                    name = name[:end2-3]                #also drop the 3 chars before "<sup"
                contestantDict['name'] = name           #add name to dict

            #column 1: age
            if (numtd == 1):
                age = str(td.contents)
                if ("<b>" in age):
                    td.find("b")                        #NOTE(review): result is discarded
                    age = str(td.contents)[4:-5]        #drop leading "[<b>" and trailing "</b>]"
                if ("[u'" in age):                      
                    age = age.encode('utf8')[3:5]       #keep the two characters after "[u'"
                contestantDict['age'] = age
            #column 2: hometown
            if (numtd == 2):
                td.find("a")                #NOTE(review): result is discarded
                home = td.get("href")       #overwritten on the next line, so unused
                #the first positional argument of get_text is the separator, so this joins
                #the cell's text pieces with the literal string "title"
                home = td.get_text("title")
                if ("title" in home):                #format oddity in season 19, contest 1
                    indx = home.index("title")       #keep only the first text piece
                    home = home[:indx]
                contestantDict['hometown'] = home
            #column 3: occupation
            if (numtd == 3):
                job = str(td.contents)
                if ("<b>" in job):
                    td.find("b")                        #NOTE(review): result is discarded
                    job = str(td.contents)[4:-5]        #drop leading "[<b>" and trailing "</b>]"
                if ("[u'" in job):                     
                    job = job.encode('utf8')[3:-2]      #drop leading "[u'" and trailing "']"
                contestantDict['occupation'] = job   
            #column 4: elimination info, stored under the key 'eliminated'
            if (numtd == 4):
                elim = str(td.contents)
                if ("<b>" in elim):
                    td.find("b")                        #NOTE(review): result is discarded
                    elim = str(td.contents)[4:-5]       #drop leading "[<b>" and trailing "</b>]"
                if ("[u'" in elim):                      
                    elim = elim.encode('utf8')[3:-2]    #drop leading "[u'" and trailing "']"
                contestantDict['eliminated'] = elim
            numtd += 1
        numtr += 1
        contestantDict['season'] = sn   #include season num in dict
        
        listOfContestantDicts.append(contestantDict) #add dict to list of dicts
        
    seasonsDict[sn] = listOfContestantDicts  #key = season num, val=list of contestant dicts

#no seasons: 1-4, 6-8, 20
#to test: print seasonsDict [season][contestant][fieldname], eg:
#print seasonsDict[9][10]['name']

Lucy 


In [None]:
#get data for seasons 1-4 and 6-8
#possible source:  http://abc.go.com/primetime/bachelor/index?pn=photos#t=31304

#good source: http://www.realitywanted.com/shows/the-bachelor/season-8-paris

In [218]:
# Get data for Season 8, add to dictionary

def _parseSeason8Blurb(blurb, season):
    """Turn one contestant blurb into a new contestant dictionary.

    The blurb is expected to look like
        "<name>, <age>, is a(n) <occupation> who ... resides in <hometown>."
    (layout inferred from the markers this code searches for -- confirm against the page).

    Returns a dict with the keys name, age, occupation, hometown and season; all values
    are strings except season, which is passed through unchanged.
    Raises ValueError when one of the expected markers is missing.
    """
    blurb = blurb.strip()

    #name = everything before the first comma
    firstComma = blurb.index(',')
    name = blurb[:firstComma].strip()

    #age = text between the first and the second comma
    afterName = blurb[firstComma + 1:]
    age = afterName[:afterName.index(',')].strip()

    #occupation = text between the earliest "is a " / "is an " and the next "who"
    jobtags = [(blurb.find(tag, firstComma), tag) for tag in ("is a ", "is an ")]
    jobtags = [(pos, tag) for (pos, tag) in jobtags if pos != -1]
    if not jobtags:
        raise ValueError("no 'is a'/'is an' marker in blurb: %r" % (blurb,))
    jobIndex, jobtag = min(jobtags)
    jobStart = jobIndex + len(jobtag)
    occupation = blurb[jobStart:blurb.index("who", jobStart)].strip()

    #hometown = text after "resides in ", without the closing period
    hometag = "resides in "
    hometown = blurb[blurb.index(hometag) + len(hometag):].strip()
    if hometown.endswith('.'):
        hometown = hometown[:-1]

    return {'name': name, 'age': age, 'occupation': occupation,
            'hometown': hometown, 'season': season}


def getSeason8(season8Site):
    """Scrape the Season 8 contestants linked from one gallery index page.

    season8Site -- url of a gallery index page listing Season 8 contestants.

    Every contestant page linked from the index is downloaded and its blurb parsed into
    a fresh dictionary (keys: name, age, occupation, hometown, season).  The dictionaries
    are appended to the notebook-level seasonsDict under key 8, skipping entries that are
    already there so the cell can be re-run, and are also returned as a list.

    Earlier versions wrote into the leftover global contestantDict (overwriting the last
    wiki contestant and never touching seasonsDict) and stored one character of the blurb
    as the season; both are fixed here.

    Raises an HTTP error from requests when a download fails, ValueError when a blurb does
    not have the expected layout, and AttributeError when a page lacks the expected markup.
    """
    SEASON = 8
    SITE_ROOT = "http://realitytv.about.com"

    #get site with Bachelor Season 8 Contestants
    seasonEight = requests.get(season8Site, timeout=30)
    seasonEight.raise_for_status()
    #make beautiful soup element
    season8 = BeautifulSoup(seasonEight.text, "html.parser")

    #get the list that has links to each contestant page
    eight = season8.find("body", attrs={"id":"imagegalleryIndexPage"})
    eight = eight.find("main", attrs={"id":"main"})
    eight = eight.find("div", attrs={"class":"container"})
    eight = eight.find_all("div", attrs={"class":"row"})[1]
    eight = eight.find("div", attrs={"class":"col col-11"}).find("div", attrs={"class":"row"})
    eight = eight.find("div", attrs={"class":"col col-8"})
    eight = eight.find("div", attrs={"class":"content widget gallery-index-content"})
    eight = eight.find("ul")

    urls8 = []                       #list of urls for season 8 contestant pages
    for item in eight.find_all("li"):#for each contestant in list of season 8 contestants
        url8 = item.find("a")        #get url tag
        if url8 is not None and url8.get("href"):   #if has url link, get url
            urls8.append(SITE_ROOT + url8.get("href"))

    contestants = []                 #one new dict per contestant page
    for link in urls8:
        site8 = requests.get(link, timeout=30)
        site8.raise_for_status()
        cont8 = BeautifulSoup(site8.text, "html.parser") #get soup element

        #walk down to the sub-heading that holds the one-line contestant blurb
        c8 = cont8.find("body", attrs={"id":"imagegalleryPage"})
        c8 = c8.find("main", attrs={"class":"slab"})
        c8 = c8.find("div", attrs={"class":"container"})
        c8 = c8.find_all("div", attrs={"class":"row"})[1]
        c8 = c8.find("div", attrs={"class":"col col-11"})
        c8 = c8.find("div", attrs={"id":"contentIntro"})
        c8 = c8.find("div", attrs={"class":"row"})
        c8 = c8.find("div", attrs={"class":"col col-6"})
        c8 = c8.find("div", attrs={"class":"muted subheading"}).getText()

        contestants.append(_parseSeason8Blurb(c8, SEASON))

    #add to the season dictionary; skip duplicates so re-running does not double the list
    seasonList = seasonsDict.setdefault(SEASON, [])
    for contestant in contestants:
        if contestant not in seasonList:
            seasonList.append(contestant)

    return contestants

#run 'getSeason8' once for each of the two pages that carry season 8 data
season8GalleryPages = (
    "http://realitytv.about.com/od/thebachelor8/ig/Ladies-of-The-Bachelor--Paris/",
    "http://realitytv.about.com/od/thebachelor8/ig/Ladies-of-The-Bachelor--Paris/index.01.htm#step-heading",
)
for season8GalleryPage in season8GalleryPages:
    getSeason8(season8GalleryPage)

29
29
