# Westminster dog show data
## Part 2: Group winners
Grabbing the data from the group winners from 1924 - 2023
https://www.westminsterkennelclub.org/about-sensation/history/herding-group-winners

In [1]:
# housekeeping
from tqdm import tqdm
from pathlib import Path
import pprint as pp

# analysis
import requests
import lxml.html
import pandas as pd
import re
import random


### 1. `get_rows` helper function
- takes in a table
- returns the running group_matrix so far

In [2]:
def get_rows(table, group, index):
    rows = table.cssselect('tr')[1:]
    print(f"Fetching {group} table rows")
    print("-------------------------------")
    
    for row_el in tqdm(rows):
        children = row_el.getchildren()
        
        #DEBUG
        # print(f"Currently on row: {index}" )
        # print("list is " + str(len(children)) + " items long")
        # print("-----------------------------")

        if len(children) == 0:
            # print("Empty row - reindexed & skipped!")
            # print("-----------------------------")
            continue       
        
        champion = list(map(lambda x: x.text_content(), children))
        champion.append(group)
        index = index + 1
        
        groups_matrix.append(champion)
        index = index + 1
        
    return groups_matrix

### 2. `DOM_to_matrix` wrapper function:
- takes in `page_html` string, and `url` with relative path
- returns matrix, `n x n` list of lists, with the following cols:
    - group, year, breed, name, owners

In [3]:
def DOM_to_matrix(page, sub_url):
    group_dom = lxml.html.fromstring(page_html)
    index = 0

    # grabbing table 
    group_table = group_dom.cssselect("table")[0]

    # grabbing group
    pattern = '.*(?=\-group-winners)'
    m = re.search(pattern, sub_url)
    group_m = m.group(0).title()
    
    # grabbing rows in table
    get_rows(group_table, group_m, index)

    return groups_matrix

In [331]:
# function test with 1 page
# groups_matrix = []
# DOM_to_matrix(page_html, url)

## Variable initialization

In [4]:
groups_matrix = []

# url selection nested within script? not able to chain select from index.
index_URL = "https://www.westminsterkennelclub.org/tag?id=5afcae7ddcb2e405b315c336&name=Group%20Winners"

# work-around
base_URL = "https://www.westminsterkennelclub.org/about-sensation/history/"
rel_urls = ["sporting-group-winners",
            "hound-group-winners",
            "working-group-winners",
            "terrier-group-winners",
            "toy-group-winners",
            "non-sporting-group-winners",
            "herding-group-winners",
           ]

## Scraping, storing, matrix creation

In [5]:
# adapted from Jeremy's script - grabbing + storing in HTML files.
## maybe do it concurrently?????? if list of URLs was long. Refer to Kai's doc.

for url in tqdm(rel_urls):
    dest = Path("../table-pgs/" + f"{url}" + ".html")
    
    if dest.exists(): # ... load it from file
        page_html = open(dest).read()
        print("-------------------------------")
        print(f"file \"{url}\" exists! Loading\n")
        page_html = open(dest).read()

        # function call to create 7 group matrix
        DOM_to_matrix(page_html, url)
            
    else: # ... fetch it
        page_url = base_URL + url
        print("Fetching " + page_url)
        page_html = requests.get(page_url).text
        
        # ... and then save it to file
        with open(dest, "w") as f:
            f.write(page_html)

  0%|                                                                                 | 0/7 [00:00<?, ?it/s]

-------------------------------
file "sporting-group-winners" exists! Loading

Fetching Sporting table rows
-------------------------------



100%|███████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 2864.10it/s][A
 14%|██████████▍                                                              | 1/7 [00:00<00:00,  7.69it/s]

-------------------------------
file "hound-group-winners" exists! Loading

Fetching Hound table rows
-------------------------------



100%|█████████████████████████████████████████████████████████████████████| 94/94 [00:00<00:00, 5755.18it/s][A
 29%|████████████████████▊                                                    | 2/7 [00:00<00:00,  7.71it/s]

-------------------------------
file "working-group-winners" exists! Loading

Fetching Working table rows
-------------------------------



100%|██████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 15586.47it/s][A


-------------------------------
file "terrier-group-winners" exists! Loading

Fetching Terrier table rows
-------------------------------



100%|██████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 27202.51it/s][A


-------------------------------
file "toy-group-winners" exists! Loading

Fetching Toy table rows
-------------------------------



100%|██████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 10233.96it/s][A
 71%|████████████████████████████████████████████████████▏                    | 5/7 [00:00<00:00, 11.52it/s]

-------------------------------
file "non-sporting-group-winners" exists! Loading

Fetching Non-Sporting table rows
-------------------------------



100%|███████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 8994.54it/s][A


-------------------------------
file "herding-group-winners" exists! Loading

Fetching Herding table rows
-------------------------------



100%|█████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 7544.68it/s][A
100%|█████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.68it/s]


In [9]:
# sanity check

print("Matrix is 5 x", len(groups_matrix))
print("\n-------------------------------\n")
print("20 random winning dogs from the matrix:\n")
pp.pprint(random.sample(groups_matrix, 20))

Matrix is 5 x 635

-------------------------------

20 random winning dogs from the matrix:

[['1971',
  'Norwegian Elkhound',
  'Ch. Vin Melca’s Vagabond',
  'Patricia V. Craige',
  'Hound'],
 ['1993',
  'English Springer Spaniel',
  'Ch. Salilyn’s Condor',
  'D. & R. Herzig, MD & J. Gasow',
  'Sporting'],
 ['1956', 'Bloodhound', 'Ch. Fancy Bombardier', 'Tom & Pearl Sheahan', 'Hound'],
 ['1973',
  'Afghan Hound',
  'Ch. Khayam’s Apollo',
  'Dr. & Mrs. Doyle Rogers',
  'Hound'],
 ['2010',
  'Poodle (Toy)',
  'Ch. Smash Jp Moon Walk',
  'Ron Scott & Debbie Burke',
  'Toy'],
 ['1987', 'Bloodhound', 'Ch. Viking’s Thor', 'Charles Sexton', 'Hound'],
 ['1964',
  'Sealyham Terrier',
  'Ch. Alcide of Axe',
  'Pool Forge Kennels',
  'Terrier'],
 ['1993',
  'Komondor',
  'Ch. Lojosmegyi Dahu Digal',
  'Patricia Turner & Anna Quigley',
  'Working'],
 ['2002',
  'Affenpinscher',
  'Ch. Yarrow’s Super Nova',
  'Dr. & Mrs. William Truesdale',
  'Toy'],
 ['1926',
  'Collie (Rough)',
  'Laund Hero of 

## groups df creation

In [8]:
group_dom = 
groups_header = group_dom.cssselect("tbody tr td strong")
header = list(map(lambda x: x.text.lower(), groups_header))
header.append('group')

raw_df = pd.DataFrame(groups_matrix, columns=header)
groups_df = raw_df.copy()
groups_df

NameError: name 'header' is not defined

In [341]:
# export as CSV
destination = Path('./data/groups_data.csv')
groups_df.to_csv(destination, index=False)

---

---

---