In [22]:
from scrape.crawler import PbCrawler
from scrape.html import get_title
from scrape.query_builder import QueryBuilder
from app.constants import BikeCategory, Region
from functools import partial
from app import create_app, Brand
import requests

# Purpose of Notebook
---
This notebook will analyze some subset of pink bike data posts to determine what are some common bike models for each bike brand. The `PbCrawler` will walk though each page of the pink bike postings and extract each title. Given a list of titles, we will perform some simple summary statistics / data manipulation to determine common bike models that are associated with each brand. Emphasis on the word simple.

The Bike model will be extracted from the title using two steps:
1. Perform a regex match from a known bike brand
2. Split on the regex match (if present) and take the second half of the title (referred to as the "ending")
3. Strip the "ending" of the title from leading / trailing white space
4. Take the first word (lol, works better than you think with large numbers)

# Part I: Setup PbCrawler
---
The `PbCrawler` initialization takes in two key parameters:
1. `QueryBuilder`: Build out query strings for any given `base_url`
2. `mods`: List of functions that each extract data from BeautifulSoup nodes and returns a tuple with the structure `(key:str, value:str)`

In [23]:
base_url = "https://www.pinkbike.com/buysell/list/"
category = BikeCategory.ENDURO.value
region = Region.NORTH_AMERICA.value

qb = (QueryBuilder(base_url)
     .upsert_query("category", category)
     .upsert_query("region", region))

crawler = PbCrawler(qb, mods=[get_title])

Using the `current_url` property, we can look at the url that would be accessed if were to call the `scrape_current_page` method.

In [3]:
crawler.current_url

'https://www.pinkbike.com/buysell/list/?category=2&region=3&page=1'

# Get Bike Brands
---
The `app_context` for flask apps enables database access under its scope. All bike brands will be retrieved from the database and used for this analysis.

In [24]:
app = create_app()
brands = []
with app.app_context():
    brands.extend(Brand.query.all())

In [25]:
brands[0]

<Brand(value = rocky mountain)>

# Data Processing Functions
---
Below are two helper functions to extract words from the titles. 
1. `create_brand_matcher` returns a function that cross references title with a list of regex patterns of bike brands using regex.search
2. `split_title_on_match` splits the title on the matching word (if any) and grabs the second half of the title

In [26]:
results = list(crawler.scrape_current_page()) # use this to check out the scraped data

https://www.pinkbike.com/buysell/list/?category=2&region=3&page=1


In [27]:
import re
from typing import Pattern

def create_brand_matcher(brands:list[Brand]) -> callable:
    patterns = [re.compile(brand.value, re.IGNORECASE) for brand in brands]
    
    def _inner(s):
        for pattern in patterns:
            result = re.search(pattern, s)
            if result:
                return result
    return _inner

def split_title_on_match(title:str, matcher:callable):
    match = matcher(title)
    if match:
        ending = title[match.end():].lower().strip()
        model = ending.split(" ")[0]
        pattern = match.re.pattern
        return (pattern, model)
    return tuple()


brand_extractor = create_brand_matcher(brands)

# Pipeline
---
Code below will scrape through 50 pages of pink bike postings and scrape the titles. Upon completion of scraping web page data, the program will sleep for 2s to minimize the load on Pink Bike servers. After sleeping,
the next page is then hit. 

In [28]:
import time
n = 50
brand_models = dict()
for post in crawler.crawl(50, patience=2):
    match = split_title_on_match(post["title"], brand_extractor)
    if match:
        pattern, ending = match
        if pattern not in brand_models:
            brand_models[pattern] = []
        brand_models[pattern].append(ending)

https://www.pinkbike.com/buysell/list/?category=2&region=3&page=1
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=2
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=3
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=4
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=5
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=6
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=7
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=8
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=9
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=10
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=11
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=12
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=13
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=14
https://www.pinkbike.com/buysell/list/?category=2&region=3&page=15
http

In [32]:
from collections import Counter
count_dict = {key:Counter(values) for key,values in brand_models.items()}

# Conclusion
This process was rather hilarious, since it would be more accurate to just go to each company site and do it by hand. Still had a good time, and now we have 
some bike model data for a MVP product. Going forward, it may just be wiser to use current scraping methods to extract bike models from each company site. This will give is more information into the tiers of 
product they offer for each bike model. Don't get me started with Specialized's abyss of bike tiers ....

See the intial results below. Most popular bike brands were compiled to a bikes.json file.

In [33]:
count_dict

{'trek': Counter({'fuel': 12,
          'roscoe': 3,
          'remedy': 10,
          'slash': 4,
          'top': 1,
          '9.9': 1,
          'project': 2,
          'stache': 1,
          'superfly': 2,
          'full': 1,
          'scratch': 1}),
 'pivot': Counter({'mach': 10,
          'firebird': 11,
          'trail': 3,
          'switchblade': 5,
          'cycles': 1}),
 'santa cruz': Counter({'bronson': 21,
          'nomad': 17,
          'megatower': 12,
          'tallboy': 11,
          '5010': 11,
          'hightower': 14,
          'e': 1,
          ',': 1,
          '5010c': 1,
          'heckler.': 1,
          'juliana': 1,
          'a': 1,
          'blur': 2,
          '(megatower)': 1}),
 'orbea': Counter({'occam': 1, 'wild': 1, 'oiz': 1, 'rallon': 1}),
 'evil': Counter({'': 1,
          'offering': 16,
          'wreckoning': 5,
          'calling': 2,
          'following': 5,
          'follow': 1,
          'wreckoning(push': 1,
          '-': 1,
   