In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests

In [29]:
#get wiki for all bachelor seasons
allseasons = requests.get("https://en.wikipedia.org/wiki/The_Bachelor_(U.S._TV_series)#Seasons")
soup = BeautifulSoup(allseasons.text, "html.parser") #make soup element

#get the table cell that has links to each episode
seasons = soup.find("table", attrs={"class":"navbox"}).find("td", attrs={"class":"navbox-list navbox-odd hlist"})
seasons = seasons.find("div", attrs={"style":"padding:0em 0.25em"}).find("ul")

urls = []                           #list of links to season-specific page
seasonNums = []                     #list of seasons w/ wiki pages (no seasons 1-4 or 6-8)
seasonNum = 1                       #season number
for item in seasons.find_all("li"): #for each item in list of seasons
    if (seasonNum == 20):           #don't include season 20, b/c no contestants listed yet
        break
    season = item.find("a")         #get url tag
    if season is not None:          #if has url link, get url text
        urls.append("\"https://en.wikipedia.org" + season.get("href") + "\"")
        seasonNums.append(seasonNum) #add season number to list 
    seasonNum += 1
    
wikiPageText = []                   #init list of wiki site text, for all seasons
for url in urls:
    site = requests.get(url[1:-1])  #get web-site for that url
    soup = BeautifulSoup(site.text, "html.parser") #make BS element
    wikiPageText.append(soup)       #add web-site text to list

wikiPages = dict(zip(seasonNums, wikiPageText)) #key=season, val=Soup Elem(wiki page text)

In [30]:
# For each season in wiki, make list of dictionaries - one dictionary for each contestant.
# dictionary name = seasonsDict
#             key = season number
#           value = list of dictionaries for that season (one for each contestant)
#             
# For contestant dictionaries:
#            keys = name, age, hometown, occupation, elimination, season
#          values = associated values to fields, as scraped from wiki
#
# To test contestant dictionaries:
#         print seasonsDict[season][contestant][fieldname]
#    eg:  print seasonsDict[9][10]['name']  -- get name for season 9, contestant 10
#
# Note: Wiki does not have pages dedicated to Seasons 1-4, or 6-8. I added 2, 4, 6, and 8
# below, from other sources. Contestants for episode 20 are not added, because they are
# not public yet.
#

seasonsDict = dict()                #key = season num, val=list of contestant dictionaries
allContestants = dict()             #keys = name/age/etc, values = associated data

for sn in seasonNums:
    seasonPage = wikiPages[sn]      #get BS element for this season
    seasonPage = seasonPage.find("div", attrs={"id":"content"}).find("div", attrs={"id":"bodyContent"})
    seasonPage = seasonPage.find("div", attrs={"id":"mw-content-text"})
    seasonPage = seasonPage.find("table", attrs={"class":"wikitable sortable"})
    
    listOfContestantDicts = []          #list of dicts for each contestant
    
    numtr = 0                           #num rows (one per contestant)
    for tr in seasonPage.find_all("tr"):#for each contestant listed,
        if (numtr == 0):                #skip first row (column headers)
            numtr += 1
            continue

        contestantDict = dict()         #init new dict for contestant
        numtd = 0                       #column number
        for td in tr.find_all("td"):    #for each column of data,
            
            if (numtd == 0):
                name = str(td.contents)
                if ("<b>" in name):
                    td.find("b")
                    name = str(td.contents)[4:-5]
                if ("[u'" in name):                     #if in format "[u'name']",
                    name = name.encode('utf8')[3:-2]    #format to get 'name'
                if ("<span class" in name):
                    td.find("span", attrs={"class":"nowrap"})
                    tag = "<span class='nowrap'>"       #start tag before name
                    name = str(td.contents)[len(tag)+1:]#cut out start tag
                    end = name.index("<")               #get start point of end tag
                    name = name[:end]                   #cut out end tag
                    trashTag = "style=\"display:none;\">" #weird tag to cut from a name
                    if (trashTag in name):
                        name = name[(len(trashTag)+1):-1] 
                if ("<sup" in name):                    #if name has "name', <sup ...",
                    end2 = name.index("<sup")           #format to get name
                    name = name[:end2-3]
                if ("</b" in name):
                    name = name[:name.index("</b")]
                contestantDict['name'] = name           #add name to dict

            if (numtd == 1):
                age = str(td.contents)
                if ("<b>" in age):
                    td.find("b")
                    age = str(td.contents)[4:-5]
                if ("[u'" in age):                      
                    age = age.encode('utf8')[3:5]       
                contestantDict['age'] = age
            if (numtd == 2):
                td.find("a")
                home = td.get("href")       
                home = td.get_text("title")
                if ("title" in home):                #format oddity in season 19, contest 1
                    indx = home.index("title")
                    home = home[:indx]
                contestantDict['hometown'] = home
            if (numtd == 3):
                job = str(td.contents)
                if ("<b>" in job):
                    td.find("b")
                    job = str(td.contents)[4:-5]
                if ("[u'" in job):                     
                    job = job.encode('utf8')[3:-2]      
                contestantDict['occupation'] = job   
            if (numtd == 4):
                elim = str(td.contents)
                if ("<b>" in elim):
                    td.find("b")
                    elim = str(td.contents)[4:-5]
                if ("[u'" in elim):                      
                    elim = elim.encode('utf8')[3:-2]      
                contestantDict['elimination'] = elim
            numtd += 1
        numtr += 1
        contestantDict['season'] = sn   #include season num in dict
        
        listOfContestantDicts.append(contestantDict) #add dict to list of dicts
        
    seasonsDict[sn] = listOfContestantDicts  #key = season num, val=list of contestant dicts

#seasons not added yet: 1-4, 6-8, 20
#to test dict so far: print seasonsDict [season][contestant][fieldname], eg:
#print seasonsDict[9][10]['name']

In [None]:
copySeasonsDict = seasonsDict;

In [4]:
# Haven't found data for seasons 1, 3, 7.  Below are notes, including data on winners.


# SEASON 1 
# WINNER - Amanda Marsh, a 23-year-old event planner from Chanute, Kansas
# 
# first names: Kim, Cathy, Trista, Amy, Alexa, LaNease, Tina, Angelique, Rhonda, 
#             Christina, Katie, Amanda, Angela, Melissa, Shannon
# source for winner: http://www.courant.com/hc-amandawinner-ph-photo.html


# SEASON 3 
# WINNER - Jen Schefft, a 26-year-old publicist from Mentor, Ohio
#
# names: jen schefft, kirsten Buschbacher, Tina Panas, Cristina Costa, Anne-Michelle Seiler,
# Liz Terzo, Amber Stoke, Audree Shelton, Heather Barry, Tina Sevier, Amy Plinska,
# Christina Sztanko, Elizabeth, Rachel, Shannon Ford, Amy Greenspan, Angela Polimeri,
# Brooke Vermeulen, Courtney Chan, Ginny Edwards, Jennifer Buttacavoli, Kerri, Kristen,
# Stephanie, Tiffany Sandels
#
# source: http://www.tvsa.co.za/shows/viewshowseasons.aspx?showId=2925&season=3
# source for first names: http://www.realitywanted.com/shows/the-bachelor/season-8-paris


# SEASON 7
# WINNER: Sarah Brice, a 24-year-old nurse from McKinney, Texas
#
# first names: 
# Anitra, elim episode 4
# Brenda, elim episode 1
# Carrie, elim episode 2
# Danushka, elim episode 1
# Emilie, elim episode 1
# Geitan, elim episode 1
# Gina-Marie,elim episode 2
# Heather,elim episode 1
# Jenny, elim episode 3
# Kara, elim episode 3
# Katie, elim episode 1
# Kerry, elim episode 2
# Kimberley, elim episode 5
# Kindle, elim episode 4
# Kristen, elim episode 1
# Kristina, elim episode 1
# Kyshawn, elim episode 1
# Megan, elim episode 2
# Siomara, elim episode 1
# Valerie, elim episode 1
# Sarah Welchelim episode 6
#
# source for names and elim episodes: http://tvdatabase.wikia.com/wiki/Category:Bachelor
# source for winner Brice: http://www.realitytvworld.com/news/abc-bachelor-charlie-oconnell-picks-sarah-brice-rejects-krisily-kennedy-in-seventh-season-finale-3504.php

In [31]:
# Get data for Seasons 2, 4 and 6.
#
# Despite much effort, I could not get the text from the'realitytvword.com' sources below.
# The Beautiful Soup elements did not match the "inspect element" html tags.  
# (Oddly, I was able to scrape from 'realitytv.about.com' for season 8 - see below.)
#
# I tried the following suggestions, but they did not work:
#    https://www.reddit.com/r/learnpython/comments/2nqhzw/how_come_a_websites_page_source_html_is_different/
#    http://stackoverflow.com/questions/26913316/beautiful-soup-doesnt-get-full-webpage
# To keep on schedule, I hand-entered the data. If time permits, I will come back to this.
#
# SEASON 2 SOURCES
# 1) http://www.realitytvworld.com/#$$nxtmgs&&BYitVosLEeWjwgrBiYTF8Q$$
# 2) winner: http://www.realitywanted.com/shows/the-bachelor/season-2
#
# SEASON 4 SOURCES
# 1) http://www.realitytvworld.com/#$$nxtgih&&Dvr38or5EeW1VRL/9wgFGw$$
# 2) http://draheid.com/archives/bachelor4/messages/1452262/1105037.html
# 3) winner: http://www.realitywanted.com/shows/the-bachelor/season-4
#
# SEASON 6 SOURCES
# 1) Source:(looks different in Safari versus Chrome) http://www.realitytvworld.com/#$$nxt6je&&9hhYJIraEeWixQqIPWP/qw$$
# 2) Alt. Source: http://www.realitytvworld.com/news/abc-releases-identities-of-sixth-bachelor-edition-bachelorettes-2880.php")
# 3) FYI, source without occupations, but with pictures: 
# (http://community.realitytvworld.com/cgi-sys/cgiwrap/rtvw2/community/dcboard.cgi?az=printer_format&om=894&forum=DCForumID42)
# 4) Source of winner name: http://www.realitywanted.com/shows/the-bachelor/season-6
# 5) Mary Delgado source: http://www.sptimes.com/2003/09/26/Tampabay/No_wedding_bells__jus.shtml
#

#make array of data for contestants in Season 2
season2 = ["Heather, a 23-year-old sales coordinator who currently resides in Walnut Creek, CA",  
"Lori, a 26-year-old public relations representative who currently resides in Dallas, TX",  
"Heather, a 30-year-old flight attendant who currently resides in Watauga, TX",  
"Amber, a 25-year-old therapist who currently resides in Chapel Hill, NC",  
"Cari, a 28-year-old elementary school teacher who currently resides in Granite City, IL",  
"Christy, a 24-year-old radiologic technologist who currently resides in Avondale, AZ",  
"Hayley, a 28-year-old store manager who currently resides in Dana Point, CA",  
"Camille, a 29-year-old actress/model who currently resides in Los Angeles, CA",  
"Kyla Faye, a 22-year-old recording artist who currently resides in Midvale, UT",  
"Erin, a 25-year-old national magazine who currently resides in Chester, PA",  
"Frances, a 30-year-old strategic planning analyst who currently resides in San Francisco, CA",  
"Dana, a 24-year-old radio sales who currently resides in Beverly Hills, CA",  
"Merrilee, a 27-year-old teacher who currently resides in Forked River, NJ",  
"Suzi, a 27-year-old communications specialist who currently resides in Richmond, VA",  
"Anindita, a 27-year-old attorney who currently resides in New York, NY",  
"Fatima, a 22-year-old student who currently resides in Long Beach, CA",  
"Helene Eksterowicz, a 27-year-old school psychologist who currently resides in Glouchester, NJ",  
"Brooke Nicole, a 22-year-old student who currently resides in Tuscaloosa, AL",  
"Liangy, a 30-year-old paralegal who currently resides in Coral Gables, FL",  
"Erin, a 23-year-old interior designer who currently resides in Houston, TX",  
"Suzanne, a 32-year-old flight attendant who currently resides in Redondo Beach, CA",  
"Angela, a 26-year-old registered nurse who currently resides in Kansas City, MO",  
"Shannon, a 25-year-old graphic artist who currently resides in Hicksville, NY",  
"Christi Diane, a 23-year-old financial advisors asst. who currently resides in Eagle, ID",  
"Gwen, a 31-year-old executive recruiter who currently resides in Chester Springs, PA"] 


#make array of data for contestants in Season 4
season4= ["Brooke, a 24-year-old Teacher who currently resides in Bartlett, TN", 
"Lee-Ann, a 24-year-old Second Grade Teacher who currently resides in  Athens, GA", 
"Shea, a 25-year-old Firefighter who currently resides in Shreveport, LA", 
"Mary, a 35-year-old Sales Manager who currently resides in Tampa, FL", 
"Lindsay, a 23-year-old Professional Dancer who currently resides in Los Angeles, CA", 
"Estella Gardinier, a 27-year-old Mortgage Broker who currently resides in Beverly Hills, CA", 
"Lanah, a 27-year-old Event Coordinator who currently resides in Poolesville, MD", 
"Jenny, a 30-year-old Marketing Director who currently resides in Austin, TX", 
"Kristi, a 24-year-old Loan Processor who currently resides in Chicago, IL", 
"Lindsay, a 25-year-old Pharmaceutical Sales who currently resides in Mauldin, SC", 
"Shelly, a 26-year-old Pharmaceutical Sales who currently resides in Wanwatosa, WI", 
"Kelly Jo, a 23-year-old Director of Community Relations who currently resides in  Kalamazoo, MI", 
"Antoinette, a 30-year-old Senior Account Manager who currently resides in Philadelphia, PA", 
"Stacey, 26-year-old a Hair Stylist who currently resides in  Massillon, OH", 
"Heather, a 24-year-old Recent College Graduate who currently resides in Chicago, IL",
"Meredith, a 29-year-old Model/ Makeup Artist who currently resides in West Hollywood, CA", 
"Misty, a 23-year-old Radio Promotions Assistant who currently resides in Dallas, TX", 
"Christine, a 24-year-old Administrative Assistant who currently resides in Corona, CA", 
"Jenn, a 26-year-old Elementary School Teacher who currently resides in La Jolla, CA", 
"Leona, a 25-year-old Realtor's Assistant who currently resides in Chicago, IL", 
"Samantha, a 25-year-old Kitchen Designer who currently resides in Chicago, IL", 
"Julie, a 29-year-old Sales/ Modeling who currently resides in Louisville, KY", 
"Karin, a 32-year-old Mortgage Consultant who currently resides in Brooklyn Park, MN", 
"Lauren, a 24-year-old Retail Buyer who currently resides in Redondo Beach, CA", 
"Darla, a 26-year-old Attorney who currently resides in Gainesville, FL"] 

#make array of data for contestants in Season 6
season6 = ["Abby, a 29-year-old acrobat who currently resides in Henderson, NV", 
"Alma Rubenstein, a 35-year-old cafe owner who currently resides in Astoria, OR",
"Amanda, a 27-year-old cosmetics buyer who currently resides in New York, NY", 
"Amy, a 27-year-old marketing consultant who currently resides in San Diego, CA", 
"Andrea, a 33-year-old dental hygienist who currently resides in Denver, CO", 
"Ashley, a 31-year-old teacher who currently resides in Santa Barbara, CA", 
"Carolyn, a 36-year-old financial advisor who currently resides in Tulsa, OK", 
"Cheresse, a 31-year-old advertising director who currently resides in St. Louis, MO", 
"Cynthia, a 37-year-old charity foundations director who currently resides in Hermosa Beach, CA", 
"Elizabeth, a 28-year-old in pharmaceutical sales who currently resides in Chicago, IL", 
"Jayne, a 37-year-old dog groomer, who currently resides in Key Largo, FL",
"Jennifer, a 31-year-old account executive who currently resides in Seattle, WA", 
"Kelly, a 34-year-old actress who currently resides in Beverly Hills, CA", 
"Kerry, a 31-year-old nurse who currently resides in San Francisco, CA", 
"Kristie, a 32-year-old bar owner who currently resides in Windsor, Canada", 
"Kristin, a 27-year-old office manager who currently resides in Pensacola, FL", 
"Krysta, a 28-year-old financial analyst who currently resides in Oklahoma City, OK", 
"Leina, a 28-year-old advertising associate who currently resides in Chula Vista, CA", 
"Lisa, a 33-year-old teacher who currently resides in West Palm Beach, FL", 
"Melinda, a 39-year-old photographer who currently resides in Nashville, TN", 
"Natalie, a 34-year-old in retail sales who currently resides in Santa Monica, CA", 
"Nicole, a 28-year-old executive recruiter who currently resides in Libertyville, IL", 
"Susie, a 32-year-old insurance broker who currently resides in Hollywood, CA", 
"Tanya, a 31-year-old teacher who currently resides in Plano, Texas", 
"Wende, a 28-year-old model who currently resides in Austin, Texas",
"Mary Delgado, a 35-year-old real estate agent who currently resides in Tampa Bay, FL"]

In [32]:
# Add contestant data for seasons 2, 4 and 6 to dictionary 'contestantDict'.
#
# param : array of strings with contestant data
# param : season number
# param : winner name
def addNonWikiData(contestantArray, seasonNum, winnerName):
    for line in contestantArray:
        firstComma = line.index(',')                    #parse string
        startAge = line.index(" a ")
        jobTag = "year-old "     
        startJob = line.index(jobTag)
        homeTag = "currently resides in "
        startHome = line.index(homeTag)
    
        contestantDict['name'] = line[:firstComma]      #put field data into dictionary
        contestantDict['age'] = line[startAge+3:startAge+5]
        contestantDict['hometown'] = line[startHome + len(homeTag):]
        contestantDict['occupation'] = line[startJob + len(jobTag):line.index("who")-1]
        contestantDict['season'] = seasonNum
    
        if (winnerName in line):                       #if this is the Winner,
            contestantDict['elimination'] = "Winner"   #add 'winner' to 'elimination' field
        else:
            contestantDict['elimination'] = "unknown"
        listOfContestantDicts.append(contestantDict)   #add dict to list of dicts
        
    #key = season, val=list of contestant dicts
    seasonsDict[seasonNum] = listOfContestantDicts  
    
    
#add contestant data for seasons 4 and 6 to the contestant dictionary
addNonWikiData(season2, 2, "Eksterowicz")
addNonWikiData(season4, 4, "Gardinier")
addNonWikiData(season6, 6, "Delgado")


In [58]:
# Get data for Season 8, add to dictionary

#def getSeason8(season8Site):
copySeasonsDict = seasonsDict;
seasonEight = requests.get("http://realitytv.about.com/od/thebachelor8/ig/Ladies-of-The-Bachelor--Paris/")                #get site
season8= BeautifulSoup(seasonEight.text, "html.parser")#make beautiful soup element

#get the table cell that has links to each episode
eight = season8.find("body", attrs={"id":"imagegalleryIndexPage"})
eight = eight.find("main", attrs={"id":"main"})
eight = eight.find("div", attrs={"class":"container"})
eight = eight.find_all("div", attrs={"class":"row"})[1]
eight = eight.find("div", attrs={"class":"col col-11"}).find("div", attrs={"class":"row"})
eight = eight.find("div", attrs={"class":"col col-8"})
eight = eight.find("div", attrs={"class":"content widget gallery-index-content"})
eight = eight.find("ul")


urls8 = []                       #list of urls for season 8 contestant pages
for item in eight.find_all("li"):#for each contestant in list of season 8 contestants
    url8 = item.find("a")        #get url tag
    if url8 is not None:         #if has url link, get url 
        urls8.append("\"http://realitytv.about.com" + url8.get("href") + "\"")

#add contestant site leftover from next page
#urls8.append("\"http://realitytv.about.com/od/thebachelor8/ig/Ladies-of-The-Bachelor--Paris/Shiloh-of-The-Bachelor--Paris.htm\"")
        
cont8Sites = []                  #list of soup objects for season 8 contestant sites
for link in urls8:
    site8 = requests.get(link[1:-1]) 
    soup8 = BeautifulSoup(site8.text, "html.parser") #get soup element
    cont8Sites.append(soup8)     #add soup element to list
        
iteration = 0
for cont8 in cont8Sites:         #for each soup element (one per contestant site),
    c8 = cont8.find("body", attrs={"id":"imagegalleryPage"}) #find data
    c8 = c8.find("main", attrs={"class":"slab"})
    c8 = c8.find("div", attrs={"class":"container"})
    c8 = c8.find_all("div", attrs={"class":"row"})[1]
    c8 = c8.find("div", attrs={"class":"col col-11"})
    c8 = c8.find("div", attrs={"id":"contentIntro"})
    c8 = c8.find("div", attrs={"class":"row"})
    c8 = c8.find("div", attrs={"class":"col col-6"})
    c8 = c8.find("div", attrs={"class":"muted subheading"}).getText()
    

    #get name
    firstComma = c8.index(',')
    contestantDict['name'] = c8[:firstComma]           #add name to dict
        
    #get age
    substrC8 = c8[firstComma+2:]
    secondComma = substrC8.index(',')
    contestantDict['age'] = substrC8[:secondComma]
        
    #get hometown
    hometag = "resides in "
    if (hometag not in c8):
        hometag = "living in "
    homeIndex = c8.index(hometag)
    contestantDict['hometown'] = c8[(homeIndex+len(hometag)):-1]
        
    #get job
    jobtag = "is a "
    endjobtag = " who"
    if ("is an" in c8):
        jobtag = "is an "
    if("works in" in c8):   #has format "Tara, 23, works in X and currently resides in Y"
        jobtag = "works in "
        endjobtag = " and currently resides"
    if("is the" in c8):
        jobtag = "is the "
        endjobtag = " and currently resides"
    if (endjobtag not in c8):
        endjobtag = " living in"
    contestantDict['occupation'] = c8[(c8.index(jobtag)+len(jobtag)):(c8.index(endjobtag))]   #add name to dict

    #get elimination week
    if ("Sarah Stone" in name):         #hard-code season 8 winner
        contestantDict['elimination'] = "Winner"
    else:
        contestantDict['elimination'] = "unknown"
    
    #add season
    contestantDict['season'] = 8
        
    #add dict to list of dicts
    if (contestantDict) not in listOfContestantDicts:
        listOfContestantDicts.append(contestantDict)   

print listOfContestantDicts
#seasonsDict[8] = listOfContestantDicts  #key = season num, val=list of contestant dicts
    
#call 'getSeason8' to scrape data from both sites that have season 8 data
#getSeason8("http://realitytv.about.com/od/thebachelor8/ig/Ladies-of-The-Bachelor--Paris/")
#getSeason8("http://realitytv.about.com/od/thebachelor8/ig/Ladies-of-The-Bachelor--Paris/index.01.htm#step-heading")

[{'name': 'Whitney Bischoff', 'hometown': u'Louisville, Kentucky', 'age': '29', 'season': 19, 'elimination': 'Winner', 'occupation': 'Fertility Nurse'}, {'name': 'Becca Tilley', 'hometown': u'Shreveport, Louisiana', 'age': '26', 'season': 19, 'elimination': 'Runner-up', 'occupation': 'Chiropractic Assistant'}, {'name': 'Kaitlyn Bristowe', 'hometown': u'Leduc, Alberta', 'age': '29', 'season': 19, 'elimination': 'Eliminated in week 9', 'occupation': 'Dance Instructor'}, {'name': 'Jade Roper', 'hometown': u'Gering, Nebraska', 'age': '28', 'season': 19, 'elimination': 'Eliminated in week 8', 'occupation': 'Cosmetics Developer'}, {'name': 'Carly Waddell', 'hometown': u'Arlington, Texas', 'age': '29', 'season': 19, 'elimination': 'Eliminated in week 7', 'occupation': 'Cruise Ship Singer'}, {'name': 'Britt Nilsson', 'hometown': u'Hollywood, California', 'age': '27', 'season': 19, 'elimination': 'Eliminated in week 7', 'occupation': 'Waitress'}, {'name': 'Megan Bell', 'hometown': u'Nashville, 

In [52]:
#import json
#fd = open("tempdata/seasonsDict.json", "w")   #save dictionary to disk
#json.dump(seasonsDict, fd)
#fd.close()

#del seasonsDict
#with open("tempdata/seasonsDict.json", "r") as fd: 
#    seasonsDict = json.load(fd)               #reload 


6


In [57]:
print seasonsDict[8]

[{'name': 'Whitney Bischoff', 'hometown': u'Louisville, Kentucky', 'age': '29', 'season': 19, 'elimination': 'Winner', 'occupation': 'Fertility Nurse'}, {'name': 'Becca Tilley', 'hometown': u'Shreveport, Louisiana', 'age': '26', 'season': 19, 'elimination': 'Runner-up', 'occupation': 'Chiropractic Assistant'}, {'name': 'Kaitlyn Bristowe', 'hometown': u'Leduc, Alberta', 'age': '29', 'season': 19, 'elimination': 'Eliminated in week 9', 'occupation': 'Dance Instructor'}, {'name': 'Jade Roper', 'hometown': u'Gering, Nebraska', 'age': '28', 'season': 19, 'elimination': 'Eliminated in week 8', 'occupation': 'Cosmetics Developer'}, {'name': 'Carly Waddell', 'hometown': u'Arlington, Texas', 'age': '29', 'season': 19, 'elimination': 'Eliminated in week 7', 'occupation': 'Cruise Ship Singer'}, {'name': 'Britt Nilsson', 'hometown': u'Hollywood, California', 'age': '27', 'season': 19, 'elimination': 'Eliminated in week 7', 'occupation': 'Waitress'}, {'name': 'Megan Bell', 'hometown': u'Nashville, 

In [27]:
#convert lists of python dictionaries to one pandas dataframe

#print seasonsDict[season][contestant][fieldname]
print seasonsDict[9][1]['name']

seasons = [2,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19] #no seasons: 1, 3, 7
fieldnames =['name', 'age', 'hometown', 'occupation', 'elimination', 'season']
contSeries = pd.Series()
bigList = []

cDicts = []
for season in seasons:
    listOfDicts = seasonsDict[season]
    for l in listOfDicts:
        d={}
        d['name']=l['name']
        d['age']=l['age']
        d['hometown']=l['hometown']
        d['occupation']=l['occupation']
        d['elimination']=l['elimination']
        d['season']=l['season']
        cDicts.append(d)
        
contestantDF = pd.DataFrame(cDicts)
contestantDF.drop_duplicates()
contestantDF.shape
contestantDF.head(100)



Sadie Murray


Unnamed: 0,age,elimination,hometown,name,occupation,season
0,29,Winner,"Louisville, Kentucky",Whitney Bischoff,Fertility Nurse,19
1,26,Runner-up,"Shreveport, Louisiana",Becca Tilley,Chiropractic Assistant,19
2,29,Eliminated in week 9,"Leduc, Alberta",Kaitlyn Bristowe,Dance Instructor,19
3,28,Eliminated in week 8,"Gering, Nebraska",Jade Roper,Cosmetics Developer,19
4,29,Eliminated in week 7,"Arlington, Texas",Carly Waddell,Cruise Ship Singer,19
5,27,Eliminated in week 7,"Hollywood, California",Britt Nilsson,Waitress,19
6,24,Quit in week 6,"Nashville, Tennessee",Megan Bell,Make-Up Artist,19
7,28,Eliminated in week 6,"Hudsonville, Michigan",Kelsey Poe,Guidance Counselor,19
8,26,Eliminated in week 6,"Great Falls, Virginia",Ashley Iaconetti,Nanny/Freelance Journalist,19
9,21,Eliminated in week 5,"Maple Valley, Washington",Mackenzie Deonigi,Dental Assistant,19
