# Westminster dog show data
## Part 2: Group winners
Scraping the group winners' data from 1924 to 2023, one page per group, e.g.:
https://www.westminsterkennelclub.org/about-sensation/history/herding-group-winners

In [303]:
# make ur life easier libs
from tqdm import tqdm
from pathlib import Path
import pprint as pp

# analysis
import requests
import lxml.html
import pandas as pd
import re
import random


### 1. `get_rows` helper function
- takes in a table element, the group name, and a row index
- appends each non-empty row to the running `groups_matrix` and returns it

In [304]:
# working script

def get_rows(table, group, index):
    """Append every non-empty data row of ``table`` to the global ``groups_matrix``.

    The first ``tr`` of the table is skipped (presumably the header row), and
    rows without child cells are ignored. Each remaining row becomes a list of
    its cells' text content with ``group`` appended as the last item.

    Parameters
    ----------
    table : element exposing ``cssselect`` (e.g. an lxml ``<table>`` element)
        The table whose rows are read.
    group : str
        Label appended as the final column of every row.
    index : int
        Unused. Kept only so existing callers that pass it keep working.

    Returns
    -------
    list of list of str
        The module-level ``groups_matrix``, which is mutated in place.
    """
    rows = table.cssselect('tr')[1:]
    print(f"Fetching {group} table rows")
    print("-------------------------------")

    for row_el in tqdm(rows):
        cells = row_el.getchildren()

        # Some rows carry no cells at all; there is nothing to record for them.
        if not cells:
            continue

        champion = [cell.text_content() for cell in cells]
        champion.append(group)
        groups_matrix.append(champion)

    return groups_matrix

### 2. `DOM_to_matrix` wrapper function:
- takes in a `page_html` string, and `url` with relative path
- returns matrix, an `n x 5` list of lists, with the following cols:
    - year, breed, dog, owner(s), group

In [305]:
def DOM_to_matrix(page, sub_url):
    """Parse one group-winners page and add its table rows to ``groups_matrix``.

    Parameters
    ----------
    page : str
        HTML source of the page.
    sub_url : str
        Relative URL of the page, e.g. ``"herding-group-winners"``. Everything
        before ``-group-winners`` is used as the group label.

    Returns
    -------
    list of list of str
        The module-level ``groups_matrix`` after the page's rows were appended
        by ``get_rows``.

    Raises
    ------
    ValueError
        If ``sub_url`` does not contain ``-group-winners``.
    IndexError
        If the page contains no ``<table>`` element.
    """
    # Parse the argument, not whatever global happens to be named `page_html`.
    group_dom = lxml.html.fromstring(page)

    # grabbing table
    group_table = group_dom.cssselect("table")[0]

    # grabbing group: the text preceding "-group-winners" in the relative url
    match = re.search(r'.*(?=-group-winners)', sub_url)
    if match is None:
        raise ValueError(
            f"Cannot derive a group name from {sub_url!r}: "
            "expected it to contain '-group-winners'"
        )
    group_name = match.group(0)

    # grabbing rows in table; get_rows does not use its index argument
    return get_rows(group_table, group_name, 0)

In [314]:
# function test with 1 page
# groups_matrix = []
# DOM_to_matrix(page_html, url)

## Variable initialization

In [315]:
# Index page listing the group-winner pages. The links could not be
# chain-selected from it (possibly because they are generated by a script -
# TODO confirm), so the relative paths are listed by hand below.
index_URL = "https://www.westminsterkennelclub.org/tag?id=5afcae7ddcb2e405b315c336&name=Group%20Winners"

# work-around: base path plus one relative url per group
base_URL = "https://www.westminsterkennelclub.org/about-sensation/history/"
rel_urls = [
    "sporting-group-winners",
    "hound-group-winners",
    "working-group-winners",
    "terrier-group-winners",
    "toy-group-winners",
    "non-sporting-group-winners",
    "herding-group-winners",
]

# Running matrix shared by the helper functions: one inner list per winning dog.
groups_matrix = []

## Scraping, storing, matrix creation

In [316]:
# adapted from Jeremy's script - grabbing + storing in HTML files.
## maybe do it concurrently?????? if list of URLs was long. Refer to Kai's doc.

# Start from an empty matrix so that re-running this cell does not append
# every row a second time.
groups_matrix.clear()

for url in tqdm(rel_urls):
    dest = Path("table-pgs/" + f"{url}" + ".html")

    if dest.exists(): # ... load it from file
        print("-------------------------------")
        print(f"file \"{url}\" exists! Loading\n")
        with open(dest) as f:
            page_html = f.read()

    else: # ... fetch it
        page_url = base_URL + url
        print("Fetching " + page_url)
        response = requests.get(page_url, timeout=30)
        # Fail loudly instead of caching an error page as if it were data.
        response.raise_for_status()
        page_html = response.text

        # ... and then save it to file (creating the cache folder if needed)
        dest.parent.mkdir(parents=True, exist_ok=True)
        with open(dest, "w") as f:
            f.write(page_html)

    # function call to create 7 group matrix; runs for cached AND freshly
    # fetched pages so a first run on an empty cache also fills the matrix
    DOM_to_matrix(page_html, url)

  0%|                                                                                 | 0/7 [00:00<?, ?it/s]

-------------------------------
file "sporting-group-winners" exists! Loading

Fetching sporting table rows
-------------------------------



100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 14507.14it/s][A


-------------------------------
file "hound-group-winners" exists! Loading

Fetching hound table rows
-------------------------------



100%|████████████████████████████████████████████████████████████████████| 94/94 [00:00<00:00, 12268.24it/s][A


-------------------------------
file "working-group-winners" exists! Loading

Fetching working table rows
-------------------------------



100%|██████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 13586.42it/s][A
 43%|███████████████████████████████▎                                         | 3/7 [00:00<00:00, 25.34it/s]

-------------------------------
file "terrier-group-winners" exists! Loading

Fetching terrier table rows
-------------------------------



100%|███████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 7976.66it/s][A


-------------------------------
file "toy-group-winners" exists! Loading

Fetching toy table rows
-------------------------------



100%|██████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 15314.87it/s][A


-------------------------------
file "non-sporting-group-winners" exists! Loading

Fetching non-sporting table rows
-------------------------------



100%|██████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 23111.00it/s][A
 86%|██████████████████████████████████████████████████████████████▌          | 6/7 [00:00<00:00, 25.18it/s]

-------------------------------
file "herding-group-winners" exists! Loading

Fetching herding table rows
-------------------------------



100%|████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 14074.85it/s][A
100%|█████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 26.33it/s]


In [317]:
# sanity check

# random.sample raises ValueError when asked for more items than the list
# holds, so cap the sample at the number of rows actually collected.
sample_size = min(20, len(groups_matrix))

print("Matrix is 5 x", len(groups_matrix))
print("\n-------------------------------\n")
print(f"{sample_size} random winning dogs from the matrix:\n")
pp.pprint(random.sample(groups_matrix, sample_size))

Matrix is 5 x 635

-------------------------------

20 random winning dogs from the matrix:

[['1973',
  'Afghan Hound',
  'Ch. Khayam’s Apollo',
  'Dr. & Mrs. Doyle Rogers',
  'hound'],
 ['1926',
  'Chow Chow',
  'Ch. Victorious of Tien',
  'Mrs. H. Earl Hoover',
  'non-sporting'],
 ['2008',
  'Sealyham Terrier',
  'Ch. Efbe’s Hidalgo At Goodspice',
  'Margery Good & Richard Good & Sandra Middlebrooks',
  'terrier'],
 ['2011',
  'Pekingese',
  'GCh. Palacegarden Malachy',
  'Iris Love & S. Middlebrooks & D. Fitzpatrick',
  'toy'],
 ['1991',
  'Scottish Deerhound',
  'Ch. Fernhill’s Phantom CD',
  'Miranda Levin & Barbara Heidenreich',
  'hound'],
 ['1986',
  'German Shepherd Dog',
  'Ch. Covy Tucker Hill’s Manhattan',
  'Shirlee Braunstein & Jane A. Firestone',
  'herding'],
 ['2004',
  'Sussex Spaniel',
  'Ch. Clussexx Three D Grinchy Glee',
  'Cecilia Ruggles & Beth Dowd',
  'sporting'],
 ['2011',
  'Scottish Deerhound',
  'GCH. Foxcliffe Hickory Wind',
  'Sally Sweatt & Cecilia L D

## groups df creation

In [323]:
# The DOM built inside DOM_to_matrix is local to that function, so re-parse the
# most recently loaded page here to read the column names.
# NOTE(review): assumes every group page uses the same header cells - confirm.
group_dom = lxml.html.fromstring(page_html)
groups_header = group_dom.cssselect("tbody tr td strong")
header = [cell.text.lower() for cell in groups_header]
header.append('group')  # get_rows appends the group label as the last column

raw_df = pd.DataFrame(groups_matrix, columns=header)
df = raw_df.copy()
df

Unnamed: 0,year,breed,dog,owner(s),group
0,2023,English Setter,GCHB CH Winchester's An Apple A Day JH,S Nordstrom & C Hodges & R Barnes & S McGraw &...,sporting
1,2022,English Setter,GCHS CH Ciara N' Honeygait Belle Of The Ball F...,Van Jacobsen & Lee Afdahl & Amanda & Vito Ciar...,sporting
2,2021,German Shorthaired Pointer,GCHS CH Clarity Reach The sky VJK-Myst,V. Nunes-Atkinson & Yvonne Hassler-Deterding &...,sporting
3,2020,Golden Retriever,GCHP CH Hillock's Jack Daniel's RA JH CA RATN ...,"Tammy Tomlinson , Jim Cohen & Robert Samios",sporting
4,2019,Sussex Spaniel,GCh. Kamand’s Full Of Beans @ Erinhill,Karen Ann Toner & Amanda W Toner,sporting
...,...,...,...,...,...
630,1987,German Shepherd Dog,Ch. Covy Tucker Hill’s Manhattan,Shirlee Braunstein & Jane A. Firestone,herding
631,1986,German Shepherd Dog,Ch. Covy Tucker Hill’s Manhattan,Shirlee Braunstein & Jane A. Firestone,herding
632,1985,Shetland Sheepdog,Ch. Rhodan’s The Windwalker,Kathleen Schmutz,herding
633,1984,German Shepherd Dog,Ch. Covy Tucker Hill’s Manhattan,Shirlee Braunstein & Jane A. Firestone,herding
