In [2]:
import pandas as pd

In [3]:
# Load the scraped episode list, using the CSV's first column as the row index.
ep_df = pd.read_csv(
    'Resources/House_Hunters_International.csv',
    index_col=0,
)

In [4]:
# Clean the scraped data: remove duplicate episodes (matched on 'Title'), then
# separate the 'Number' column (e.g. 'Season 152, Episode 9' or
# 'Specials, Episode 1') into 'Season' and 'Episode' columns with the label
# words stripped away.

# drop_duplicates returns a new Data Frame instead of modifying ep_df in place,
# so the result must be assigned back for the de-duplication to take effect.
ep_df = ep_df.drop_duplicates(subset=['Title'])

ep_df[['Season', 'Episode']] = ep_df['Number'].str.split(', ', expand=True)

# Vectorised string replacement; regex=False treats the pattern as literal text.
# 'Season' is kept as a string because the 'Specials' rows have no season number.
ep_df['Season'] = ep_df['Season'].str.replace('Season ', '', regex=False)
ep_df['Episode'] = ep_df['Episode'].str.replace('Episode ', '', regex=False).astype(int)

ep_df.to_csv('Resources/House_Hunters_International_Cleaned.csv')

In [5]:
# To check whether everything was scraped properly, count the episodes captured
# for each 'Season'. The 'Specials' row is filtered out before the cast to int.

season_group_df = (
    ep_df.groupby('Season')['Episode']
    .count()
    .reset_index()
    .loc[lambda counts: counts['Season'] != 'Specials']
    .astype(int)
)

In [13]:
# It seems like some seasons are skipped and others may be missing episodes.
# Put the counts in season order and spot-check a random sample of rows here;
# the table is exported to CSV separately for closer analysis in Excel.

season_group_df = season_group_df.sort_values(['Season']).reset_index(drop=True)

# A fixed random_state keeps the displayed rows the same on every run, and
# capping n avoids a ValueError when fewer than 20 seasons are present.
season_group_df.sample(n=min(20, len(season_group_df)), random_state=0)

Unnamed: 0,Season,Episode
77,83,13
112,123,13
110,120,13
109,119,13
14,17,13
70,76,13
58,64,13
3,5,13
136,149,14
5,7,8


In [12]:
# Export the per-season counts so they can be sorted in Excel to check whether a
# season is skipped. That analysis can be found in the 'Resources' folder.

season_group_df.to_csv(
    'Resources/House_Hunters_International_Seasons_Grouped.csv'
)

In [27]:
# The 'rescrape' notebook filled in the blanks in our data. Stack the two Data
# Frames with pd.concat and de-duplicate on 'Title' so that no episode ends up
# in the final table twice.

rescraped_ep_df = pd.read_csv('Resources/House_Hunters_International_Rescraped.csv')

# NOTE(review): 'Unnamed: 0' is presumably the saved index of the rescraped CSV,
# which is read above without index_col - confirm against the rescrape notebook.
compined_ep_df = (
    pd.concat([ep_df, rescraped_ep_df])
    .drop_duplicates(subset=['Title'])
    .drop(columns=['Unnamed: 0'])
    .reset_index(drop=True)
)

compined_ep_df.to_csv('Resources/House_Hunters_International_Final.csv')

compined_ep_df

Unnamed: 0,Number,Title,Description,Link,Season,Episode
0,"Specials, Episode 1",Revisiting Big Renovations in the South of France,House Hunters International is revisiting the ...,www.hgtv.com/shows/house-hunters-international...,Specials,1
1,"Specials, Episode 2","Revisiting Remote Renovations in Australia, Ne...",House Hunters International checks back in wit...,www.hgtv.com/shows/house-hunters-international...,Specials,2
2,"Specials, Episode 3",Revisiting Historic Renovations in Italy and F...,House Hunters International wondered what happ...,www.hgtv.com/shows/house-hunters-international...,Specials,3
3,"Specials, Episode 4","Revisiting Euro Renovations in Berlin, Copenha...",House Hunters International travels across Eur...,www.hgtv.com/shows/house-hunters-international...,Specials,4
4,"Specials, Episode 5","Revisiting Latin Renovations in Belize, Panama...",House Hunters International retraces the steps...,www.hgtv.com/shows/house-hunters-international...,Specials,5
...,...,...,...,...,...,...
2108,"Season 152, Episode 9",Untapped Potential in Siem Reap,A globetrotting Australian fell in love with C...,www.hgtv.com/shows/house-hunters-international...,152,9
2109,"Season 152, Episode 10",Cooking Up a Fresh Start in Cambodia,A couple and their dog are relocating within C...,www.hgtv.com/shows/house-hunters-international...,152,10
2110,"Season 152, Episode 11",The Da Nang Dilemma,Newlyweds leave their comfortable lives in Aus...,www.hgtv.com/shows/house-hunters-international...,152,11
2111,"Season 152, Episode 12",Bargaining in Bratislava,A young professional longing for a European ex...,www.hgtv.com/shows/house-hunters-international...,152,12
