# Westminster dog show data
## Part 2: Group winners
Grabbing the group-winner data for 1924–2023, one page per group, e.g.:
https://www.westminsterkennelclub.org/about-sensation/history/herding-group-winners

In [339]:
# housekeeping
from tqdm import tqdm
from pathlib import Path
import pprint as pp

# analysis
import requests
import lxml.html
import pandas as pd
import re
import random


### 1. `get_rows` helper function
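- takes in a table element, the group name, and a row index
- appends each non-empty table row (plus the group name) to `groups_matrix` and returns the running `groups_matrix` so far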

In [324]:
def get_rows(table, group, index):
    """Append every data row of ``table`` to the shared ``groups_matrix``.

    Parameters
    ----------
    table : element exposing ``cssselect``
        The table whose ``tr`` rows are read; the first row is skipped.
    group : str
        Group label appended as the last item of every row.
    index : int
        Not used. The original code incremented it twice per row without
        ever reading or returning it; the parameter is kept only so existing
        calls keep working.

    Returns
    -------
    list of list of str
        The module-level ``groups_matrix`` (the same list object), after the
        rows of this table have been appended to it.
    """
    # Skip the first <tr> of the table.
    rows = table.cssselect('tr')[1:]
    print(f"Fetching {group} table rows")
    print("-------------------------------")

    for row_el in tqdm(rows):
        children = row_el.getchildren()

        # Rows without any cells carry no data; skip them.
        if len(children) == 0:
            continue

        champion = [cell.text_content() for cell in children]
        champion.append(group)
        groups_matrix.append(champion)

    return groups_matrix

### 2. `DOM_to_matrix` wrapper function:
- takes in `page`, the page's HTML string, and `sub_url`, the page's relative URL
- returns the matrix, an `n x 5` list of lists, with the following cols:
    - year, breed, dog, owner(s), group

In [330]:
def DOM_to_matrix(page, sub_url):
    """Parse one group-winners page and add its table rows to the matrix.

    Parameters
    ----------
    page : str
        HTML source of a group-winners page.
    sub_url : str
        Relative URL of the page, e.g. ``"herding-group-winners"``. The text
        before ``-group-winners`` is title-cased and used as the group label.

    Returns
    -------
    list of list of str
        The value returned by ``get_rows``, i.e. the running matrix with the
        rows of this page appended.

    Raises
    ------
    ValueError
        If ``page`` contains no ``<table>`` or ``sub_url`` does not contain
        ``-group-winners``.
    """
    # Parse the HTML that was passed in. The original read the global
    # `page_html` here and silently ignored the `page` argument.
    group_dom = lxml.html.fromstring(page)

    # grabbing table
    tables = group_dom.cssselect("table")
    if not tables:
        raise ValueError(f"no <table> found in page for {sub_url!r}")
    group_table = tables[0]

    # grabbing group: raw string, and no backslash before '-' (not a valid escape)
    m = re.search(r'.*(?=-group-winners)', sub_url)
    if m is None:
        raise ValueError(f"cannot derive group name from {sub_url!r}")
    group_m = m.group(0).title()

    # grabbing rows in table
    return get_rows(group_table, group_m, 0)

In [331]:
# function test with 1 page
# groups_matrix = []
# DOM_to_matrix(page_html, url)

## Variable initialization

In [332]:
# Shared accumulator: get_rows() appends one list per table row to it.
groups_matrix = []

# Index page listing all group pages. The links seem to be nested inside a
# script, so they could not be selected from this page; kept for reference.
index_URL = "https://www.westminsterkennelclub.org/tag?id=5afcae7ddcb2e405b315c336&name=Group%20Winners"

# Work-around: build each page's relative URL from the known group slugs.
base_URL = "https://www.westminsterkennelclub.org/about-sensation/history/"
rel_urls = [
    f"{slug}-group-winners"
    for slug in (
        "sporting",
        "hound",
        "working",
        "terrier",
        "toy",
        "non-sporting",
        "herding",
    )
]

## Scraping, storing, matrix creation

In [333]:
# adapted from Jeremy's script - grabbing + storing in HTML files.
## maybe do it concurrently?????? if list of URLs was long. Refer to Kai's doc.

# Start from an empty matrix so re-running this cell does not append every
# row a second time. clear() keeps the same list object the helpers use.
groups_matrix.clear()

for url in tqdm(rel_urls):
    dest = Path("table-pgs/" + f"{url}" + ".html")

    if dest.exists(): # ... load it from file
        print("-------------------------------")
        print(f"file \"{url}\" exists! Loading\n")
        with open(dest) as f:
            page_html = f.read()

    else: # ... fetch it
        page_url = base_URL + url
        print("Fetching " + page_url)
        response = requests.get(page_url, timeout=30)
        # Fail loudly rather than caching an error page as if it were data.
        response.raise_for_status()
        page_html = response.text

        # ... and then save it to file (creating the cache folder if needed)
        dest.parent.mkdir(parents=True, exist_ok=True)
        with open(dest, "w") as f:
            f.write(page_html)

    # Parse the page whether it came from the cache or from the network;
    # the original skipped this step for freshly fetched pages.
    DOM_to_matrix(page_html, url)

  0%|                                                                                 | 0/7 [00:00<?, ?it/s]

-------------------------------
file "sporting-group-winners" exists! Loading

Fetching Sporting table rows
-------------------------------



100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 31835.32it/s][A


-------------------------------
file "hound-group-winners" exists! Loading

Fetching Hound table rows
-------------------------------



100%|████████████████████████████████████████████████████████████████████| 94/94 [00:00<00:00, 14943.89it/s][A


-------------------------------
file "working-group-winners" exists! Loading

Fetching Working table rows
-------------------------------



100%|██████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 11435.72it/s][A
 43%|███████████████████████████████▎                                         | 3/7 [00:00<00:00, 12.14it/s]

-------------------------------
file "terrier-group-winners" exists! Loading

Fetching Terrier table rows
-------------------------------



100%|██████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 14237.57it/s][A


-------------------------------
file "toy-group-winners" exists! Loading

Fetching Toy table rows
-------------------------------



100%|███████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 7461.99it/s][A


-------------------------------
file "non-sporting-group-winners" exists! Loading

Fetching Non-Sporting table rows
-------------------------------



100%|███████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 7656.74it/s][A
 86%|██████████████████████████████████████████████████████████████▌          | 6/7 [00:00<00:00, 16.19it/s]

-------------------------------
file "herding-group-winners" exists! Loading

Fetching Herding table rows
-------------------------------



100%|█████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 1025.06it/s][A
100%|█████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 14.30it/s]


In [334]:
# sanity check

# random.sample raises ValueError when asked for more items than exist,
# so cap the sample at the number of rows actually collected.
sample_size = min(20, len(groups_matrix))

print("Matrix is 5 x", len(groups_matrix))
print("\n-------------------------------\n")
print(f"{sample_size} random winning dogs from the matrix:\n")
pp.pprint(random.sample(groups_matrix, sample_size))

Matrix is 5 x 635

-------------------------------

20 random winning dogs from the matrix:

[['2005', 'Pekingese', 'Ch. Yakee If Only', 'Kit Woodruff', 'Toy'],
 ['1964',
  'Whippet',
  'Ch. Courtenay Fleetfoot of Pennyworth',
  'Pennyworth Kennels',
  'Hound'],
 ['1962',
  'Poodle (Miniature)',
  'Ch. Crikora Commotion',
  'Mrs. J. Donald Duncan',
  'Non-Sporting'],
 ['1955',
  'English Springer Spaniel',
  'Ch. King Peter of Salilyn',
  'Mrs. F. H. Gasow',
  'Sporting'],
 ['1929',
  'Wire Fox Terrier',
  'Eden Aristocrat of Wildoaks',
  'Mr. & Mrs. R. C. Bondy',
  'Terrier'],
 ['1980',
  'Gordon Setter',
  'Ch. Ben-Wen’s Benjy McDee',
  'Marie Annello & B. Perlstein',
  'Sporting'],
 ['1993',
  'Borzoi',
  'Ch. Fox Run’s Ivy Rose',
  'Ronald Mater & Joanne Hack',
  'Hound'],
 ['1935', 'Pomeranian', 'Wonder Son', 'Miss E. G. Hydon', 'Toy'],
 ['1931', 'Greyhound', 'Gamecock Duke of Wales', 'George S. West', 'Hound'],
 ['1945',
  'Cocker Spaniel (Black)',
  'Ch. Stockdale Town Talk',
  

## groups df creation

In [335]:
# `group_dom` only exists as a local inside DOM_to_matrix, so on a fresh
# kernel it is undefined here. Re-parse the page most recently loaded by the
# scraping loop (`page_html`) to read the column titles.
group_dom = lxml.html.fromstring(page_html)
groups_header = group_dom.cssselect("tbody tr td strong")

# text_content() also copes with <strong> cells that contain nested markup,
# where `.text` would be None.
header = [cell.text_content().strip().lower() for cell in groups_header]
header.append('group')

raw_df = pd.DataFrame(groups_matrix, columns=header)
groups_df = raw_df.copy()
groups_df

Unnamed: 0,year,breed,dog,owner(s),group
0,2023,English Setter,GCHB CH Winchester's An Apple A Day JH,S Nordstrom & C Hodges & R Barnes & S McGraw &...,Sporting
1,2022,English Setter,GCHS CH Ciara N' Honeygait Belle Of The Ball F...,Van Jacobsen & Lee Afdahl & Amanda & Vito Ciar...,Sporting
2,2021,German Shorthaired Pointer,GCHS CH Clarity Reach The sky VJK-Myst,V. Nunes-Atkinson & Yvonne Hassler-Deterding &...,Sporting
3,2020,Golden Retriever,GCHP CH Hillock's Jack Daniel's RA JH CA RATN ...,"Tammy Tomlinson , Jim Cohen & Robert Samios",Sporting
4,2019,Sussex Spaniel,GCh. Kamand’s Full Of Beans @ Erinhill,Karen Ann Toner & Amanda W Toner,Sporting
...,...,...,...,...,...
630,1987,German Shepherd Dog,Ch. Covy Tucker Hill’s Manhattan,Shirlee Braunstein & Jane A. Firestone,Herding
631,1986,German Shepherd Dog,Ch. Covy Tucker Hill’s Manhattan,Shirlee Braunstein & Jane A. Firestone,Herding
632,1985,Shetland Sheepdog,Ch. Rhodan’s The Windwalker,Kathleen Schmutz,Herding
633,1984,German Shepherd Dog,Ch. Covy Tucker Hill’s Manhattan,Shirlee Braunstein & Jane A. Firestone,Herding


In [341]:
# export as CSV
destination = Path('./data/groups_data.csv')
# to_csv does not create missing folders, so make sure ./data exists first.
destination.parent.mkdir(parents=True, exist_ok=True)
groups_df.to_csv(destination, index=False)

---

---

---