In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import nbimporter
from function import combine_data
import time

In [2]:
# A few season pages were not scraped properly from HGTV the first time around.
# Load the original scrape and list every season number that is absent from it.

og_scraped_df = pd.read_csv('Resources/House_Hunters_International_Seasons_Grouped.csv', index_col = 0)

# The first column of the last row is used as the (exclusive) upper bound of the season range.
last_season = og_scraped_df.iloc[-1,0]
scraped_seasons = og_scraped_df['Season'].tolist()

missing_seasons = []
for season in np.arange(1, last_season):
    if season not in scraped_seasons:
        missing_seasons.append(season)
missing_seasons

[4, 18]

In [None]:
# Some seasons in the original scrape were recorded with fewer than 3 episodes, which suggests the
# page was only partially scraped, so they are queued for rescraping too. The last 6 rows of that
# selection are excluded because the most recent seasons correctly have only 1 episode.

short_seasons = og_scraped_df.loc[og_scraped_df['Episode'] < 3, :].iloc[:-6,:]['Season'].tolist()

# Merge the two lists keeping first-seen order and dropping repeats (dict.fromkeys). This keeps the
# cell idempotent: running it more than once no longer appends the same seasons again, which would
# otherwise queue the same pages to be scraped several times.
missing_seasons = list(dict.fromkeys(missing_seasons + short_seasons))
missing_seasons

In [None]:
# The way HGTV builds season URLs changed over time, so the URL suffix depends on the season number:
#   season 1       -> '1a'  (season 1 is split across two separate pages, '1a' and '100')
#   seasons < 145  -> '<season>00'
#   seasons > 144  -> '<season>'
# The three groups are appended in that order.

rescrape_urls = []
rescrape_urls.extend(
    f'https://www.hgtv.com/shows/house-hunters-international/episodes/{season}a'
    for season in missing_seasons if season == 1
)
rescrape_urls.extend(
    f'https://www.hgtv.com/shows/house-hunters-international/episodes/{season}00'
    for season in missing_seasons if season < 145
)
rescrape_urls.extend(
    f'https://www.hgtv.com/shows/house-hunters-international/episodes/{season}'
    for season in missing_seasons if season > 144
)
rescrape_urls

In [None]:
# Rescrape every page that was determined to have errors and collect the episode data into
# four parallel lists (episode number text, link, title, description).

number_list = []
url_list = []
title_list = []
description_list = []

# Resolving/installing chromedriver does not depend on the URL, so do it once up front
# instead of on every iteration.
executable_path = {'executable_path': ChromeDriverManager().install()}

for url in rescrape_urls:
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        browser.visit(url)
        html = browser.html
    finally:
        # Always close the browser, even if the visit fails, so Chrome windows are not leaked.
        browser.quit()

    soup = bs(html, 'html.parser')

    episodes = soup.find('div', id = 'mod-episode-list-1')

    if episodes is None:
        # find() returns None when the episode list container is absent; report the page and
        # carry on rather than losing everything scraped so far to an AttributeError.
        print(f'No episode list found at {url}; skipping.')
    else:
        episode_number = episodes.find_all('span', class_='m-EpisodeCard__a-AssetInfo')
        episode_url = episodes.find_all('a')
        episode_description = episodes.find_all('p', class_='m-EpisodeCard__a-Description')

        # combine_data is imported from the 'function' notebook; it is expected to return four
        # lists (numbers, urls, titles, descriptions) -- NOTE(review): confirm against its definition.
        numbers, urls, titles, descriptions = combine_data(episode_number, episode_url, episode_description)

        # extend() appends in place instead of rebuilding the whole list on every page.
        number_list.extend(numbers)
        url_list.extend(urls)
        title_list.extend(titles)
        description_list.extend(descriptions)

    # Pause between pages to avoid hammering the site.
    time.sleep(1)

In [None]:
# Assemble the four lists returned by the web scraper into a single DataFrame, one column per list.

episode_dict = {
    'Number': number_list,
    'Title': title_list,
    'Description': description_list,
    'Link': url_list,
}

episodes_df = pd.DataFrame(episode_dict)

In [None]:
# Data cleaning: remove duplicate episodes and separate the 'Number' column into 'Season' and 'Episode'.

# drop_duplicates returns a new DataFrame instead of modifying the original, so the result must be
# assigned back -- otherwise the duplicates are never removed. reset_index gives the de-duplicated
# frame a fresh, contiguous index (and an independent copy to add columns to).
episodes_df = episodes_df.drop_duplicates(subset = ['Title']).reset_index(drop = True)

# 'Number' is split on ', ' into its season part and its episode part.
episodes_df[['Season','Episode']] = episodes_df['Number'].str.split(', ', expand = True)

# Strip the text labels; 'Season' is left as a string, 'Episode' is converted to an integer.
episodes_df['Season'] = episodes_df['Season'].str.replace('Season ', '', regex = False)
episodes_df['Episode'] = episodes_df['Episode'].str.replace('Episode ', '', regex = False).astype(int)
episodes_df

In [None]:
# Save the cleaned rescrape results to disk.
rescraped_csv_path = 'Resources/House_Hunters_International_Rescraped.csv'
episodes_df.to_csv(rescraped_csv_path)