In [None]:
# Import dependencies
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import unidecode 
from config import api_key
import requests
import random

In [None]:
# page to scrape: article listing the 50 most popular dog breeds and their origins
breeds_url = 'https://stacker.com/stories/2454/origins-50-most-popular-dog-breeds'

# ChromeDriverManager().install() supplies the driver path that splinter is
# given; the Chrome browser is started in non-headless mode
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# visit top 50 dog breeds website and capture the rendered page source
try:
    browser.visit(breeds_url)
    html = browser.html
finally:
    # the browser is not needed once the HTML has been captured; quitting it
    # here keeps the Chrome/chromedriver processes from being left running,
    # even when the page visit fails
    browser.quit()

# parse the captured HTML so the slides can be searched by tag and class
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# gather the text container of every slide on the page, then remove the
# first container so that only the breed entries are left in `results`
# (presumably the first slide is the article intro -- confirm against the page)
results = soup.find_all('div', attrs={'class': 'ct-slideshow__slide__text-container'})
results.pop(0)

In [None]:
# accumulators for the scraped breed info; the three lists stay index-aligned
# because every slide appends exactly one value to each of them
breed_list, rank_list, origin_list = [], [], []

# walk over each breed slide collected in `results`
for slide in results:

    # the caption holds "<rank>. <breed name>": split on the first '. ' only,
    # so a breed name that itself contains '. ' is left intact
    caption = slide.find(class_="ct-slideshow__slide__text-container__caption").text
    caption_parts = caption.split('. ', 1)
    rank_list.append(caption_parts[0].strip())
    # unidecode transliterates the breed name to plain ASCII
    breed_list.append(unidecode.unidecode(caption_parts[1].strip()))

    # flatten the description to text with '<br>' between its text pieces and
    # keep the first piece, which is expected to read "<label>: <origin>"
    description = slide.find('div', class_="ct-slideshow__slide__text-container__description")
    first_piece = description.get_text(separator='<br>', strip=True).split('<br>')[0]
    # everything after the first ': ' is the place of origin
    origin_list.append(first_piece.split(': ', 1)[1])

In [None]:
# initialize empty lists to store origin coordinates
latitude_list = []
longitude_list = []

# Google Maps Geocoding endpoint; defined once because it is the same for
# every request
base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'

# the same origin can appear for several breeds, so each distinct location is
# looked up once and the answer is reused
geocode_cache = {}


def _geocode(location):
    """Look up the coordinates of a place name with the Google Maps Geocoding API.

    Parameters
    ----------
    location : str
        Place name taken from ``origin_list``.

    Returns
    -------
    tuple
        ``(lat, lng)`` of the first result returned by the API, or
        ``(np.nan, np.nan)`` when the request fails, the response is not
        valid JSON, or the response holds no usable result.
    """
    try:
        # `params` lets requests URL-encode the place name, so characters such
        # as '&' or '#' cannot corrupt the query string; the timeout keeps a
        # stalled connection from hanging the notebook
        response = requests.get(
            base_url,
            params={'address': location, 'key': api_key},
            timeout=10,
        )
        response.raise_for_status()
        coords = response.json()['results'][0]['geometry']['location']
        return coords['lat'], coords['lng']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # best-effort lookup: only network/HTTP errors, undecodable JSON and
        # empty or malformed results fall back to NaN; anything else
        # (e.g. KeyboardInterrupt) still propagates.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        return np.nan, np.nan


# loop through locations in origin list and use google maps API to get latitude/longitude
for location in origin_list:
    if location not in geocode_cache:
        geocode_cache[location] = _geocode(location)
    lat, lon = geocode_cache[location]
    # append both values together so the two lists stay aligned with origin_list
    latitude_list.append(lat)
    longitude_list.append(lon)

In [None]:
# generate random float to adjust latitude and longitude so that markers
# from same location won't stack directly on top of each other.
#
# One offset is drawn per entry in origin_list rather than a fixed 50: the
# lists are zipped together afterwards, and zip stops at the shortest input,
# so a fixed count would silently drop breeds if more than 50 were scraped.
#
# A dedicated, seeded generator makes the offsets identical on every run
# without altering the state of the global `random` module.
jitter_rng = random.Random(42)
lat_adj_factor = [jitter_rng.uniform(-2, 2) for _ in origin_list]
lng_adj_factor = [jitter_rng.uniform(-2, 2) for _ in origin_list]

In [None]:
# column labels, listed in the same order as the lists zipped below
origin_columns = ['breed_name', 'rank', 'origin', 'lat_unadj', 'lng_unadj',
                  'lat_adj_factor', 'lng_adj_factor']

# pair the lists element-wise so that each tuple becomes one dataframe row
# (zip stops at the shortest list)
origin_rows = list(zip(breed_list, rank_list, origin_list,
                       latitude_list, longitude_list,
                       lat_adj_factor, lng_adj_factor))

origin_df = pd.DataFrame(origin_rows, columns=origin_columns)

In [None]:
# add the final marker coordinates: the geocoded value shifted by its random
# adjustment factor (rows whose geocoding produced NaN remain NaN)
origin_df = origin_df.assign(
    lat=origin_df['lat_unadj'].add(origin_df['lat_adj_factor']),
    lng=origin_df['lng_unadj'].add(origin_df['lng_adj_factor']),
)

In [None]:
# load the cleaned dog breed data that the origin data gets merged against
dogs_df = pd.read_csv('data/cleaned_data.csv')

# keep only the breed name and id columns, as a separate dataframe
dogids_df = dogs_df[['name', 'id']].copy()

In [None]:
# give both dataframes a lowercase version of the breed name; this shared
# `name_lower` column is the key they are merged on
dogids_df = dogids_df.assign(name_lower=dogids_df['name'].str.lower())
origin_df = origin_df.assign(name_lower=origin_df['breed_name'].str.lower())

In [None]:
# inner join on the lowercase name: only breeds found in both dataframes
# are kept
merge_df = pd.merge(origin_df, dogids_df, on='name_lower', how='inner')

In [None]:
# remove the scraped breed name, the merge key and the raw adjustment factors,
# which are no longer needed after the merge
merge_df = merge_df.drop(
    ['breed_name', 'name_lower', 'lat_adj_factor', 'lng_adj_factor'],
    axis='columns',
)

In [None]:
# export to csv
# merge_df.to_csv('data/breed_origins.csv')