# WESTMINSTER DOG SHOW DATA

## Part 1: scraping best in show data

In [1]:
# housekeeping
from pathlib import Path
import pprint as pp

#analysis
import requests
import lxml.html
import pandas as pd

In [2]:
# Source page listing every Westminster best-in-show winner.
BIS_url = "https://www.westminsterkennelclub.org/best-in-show-winners"

# Fetch the page. The timeout stops a stalled connection from hanging the
# kernel, and raise_for_status() makes an HTTP error (404, 500, ...) fail here
# instead of silently parsing an error page further down.
BIS_response = requests.get(BIS_url, timeout=30)
BIS_response.raise_for_status()

# Raw HTML text and its parsed lxml tree; later cells query BIS_dom.
BIS_pg = BIS_response.text
BIS_dom = lxml.html.fromstring(BIS_pg)

In [3]:
# Column headings are the bold (<strong>) cells found inside table bodies.
BIS_header = BIS_dom.cssselect("tbody tr td strong")

# Print the headings exactly as they appear on the page (a plain loop, rather
# than a list comprehension used only for its side effect).
for subheading in BIS_header:
    print(subheading.text)

# Lower-cased headings are used later as the DataFrame column names.
header = [subheading.text.lower() for subheading in BIS_header]

YEAR
JUDGE(S)
BREED
DOG
OWNER(S)


In [4]:
# The first <table> on the page is the one that gets parsed; grab all of its rows.
BIS_table = BIS_dom.cssselect("table")[0]
rows = BIS_table.cssselect('tr')

# manually removing faulty rows
# The deletions run one after another, so each index refers to the list as it
# stands after the previous deletion (i.e. the indices have already shifted).
for faulty_position in (0, 5, 16):
    del rows[faulty_position]

# One inner list per remaining row, holding the direct .text of each child cell.
champions = []
for row_el in rows:
    champions.append([cell.text for cell in row_el.getchildren()])


In [5]:
# Scraped rows as a DataFrame, with the lower-cased page headings as columns.
raw_df = pd.DataFrame(data=champions, columns=header)

# working copy -- raw_df is kept aside as the unmodified scrape result
BIS_df = raw_df.copy(deep=True)

# Preview the first five rows.
BIS_df.head(n=5)

Unnamed: 0,year,judge(s),breed,dog,owner(s)
0,1907,*Not Recorded,Smooth Fox Terrier,Ch. Warren Remedy,Winthrop Rutherfurd
1,1908,*Not Recorded,Smooth Fox Terrier,Ch. Warren Remedy,Winthrop Rutherfurd
2,1909,*Not Recorded,Smooth Fox Terrier,Ch. Warren Remedy,Winthrop Rutherfurd
3,1910,*Not Recorded,Smooth Fox Terrier,Ch. Sabine Rarebit,Sabine Kennels
4,1911,*Not Recorded,Scottish Terrier,Ch. Tickle Em Jock,"A. Albright, Jr."


In [6]:
# getting rid of judges
# Build the working frame from raw_df instead of dropping in place: an in-place
# drop raises KeyError when the cell is re-run (the column is already gone),
# whereas this version gives the same result on every run.
BIS_df = raw_df.drop(columns=['judge(s)'])
BIS_df

Unnamed: 0,year,breed,dog,owner(s)
0,1907,Smooth Fox Terrier,Ch. Warren Remedy,Winthrop Rutherfurd
1,1908,Smooth Fox Terrier,Ch. Warren Remedy,Winthrop Rutherfurd
2,1909,Smooth Fox Terrier,Ch. Warren Remedy,Winthrop Rutherfurd
3,1910,Smooth Fox Terrier,Ch. Sabine Rarebit,Sabine Kennels
4,1911,Scottish Terrier,Ch. Tickle Em Jock,"A. Albright, Jr."
...,...,...,...,...
111,2019,Wire Fox Terrier,GCHB CH King Arthur Van Foliny Home,Victor Malzoni Jr
112,2020,Standard Poodle,GCHP CH Stone Run Afternoon Tea,Connie S Unger & William Lee
113,2021,Pekingese,GCHG CH Pequest Wasabi,Sandra Middlebrooks & Peggy Steinman & Iris Lo...
114,2022,Bloodhound,GCHB Flessner's Toot My Own Horn,Chris & Bryan Flessner & Heather Helmer & Tina...


In [9]:
# Inspect the column dtypes of the working frame.
# NOTE(review): the saved output lists every column, including `year`, as
# object -- convert `year` before doing any numeric work on it.
BIS_df.dtypes

year        object
breed       object
dog         object
owner(s)    object
dtype: object

In [12]:
# Tally best-in-show wins per breed label.
# NOTE(review): labels are counted verbatim. The saved outputs show both
# "Poodle (Standard)" and "Standard Poodle", which would be tallied as
# different breeds -- confirm whether labels need normalizing first.
BIS_breed = BIS_df['breed'].value_counts()

top_ten_breeds = BIS_breed.iloc[:10]
unique_breed_total = len(BIS_breed)

print(top_ten_breeds)
print("-------------------------------------")
print("Unique breeds that have won best in show:", unique_breed_total)

breed
Wire Fox Terrier            15
Scottish Terrier             8
English Springer Spaniel     6
Pekingese                    5
Smooth Fox Terrier           4
Sealyham Terrier             4
Boxer                        4
Poodle (Standard)            4
Doberman Pinscher            4
Airedale Terrier             4
Name: count, dtype: int64
-------------------------------------
Unique breeds that have won best in show: 51


In [76]:
# export as CSV
destination = Path('./data/BIS_data.csv')

# to_csv does not create missing folders, so make sure ./data exists first;
# exist_ok=True keeps the cell safe to re-run.
destination.parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame's row index out of the file.
BIS_df.to_csv(destination, index=False)

-----

-----

-----