In [1]:
import pandas as pd

In [2]:
# Load the scraped episode list; the first CSV column is used as the row index.
ep_df = pd.read_csv(
    'Resources/House_Hunters_International.csv',
    index_col=0,
)

In [3]:
# Clean the scraped data: remove episodes with a repeated 'Title', then split the 'Number'
# column on ', ' into separate 'Season' and 'Episode' columns and strip the words
# 'Season ' / 'Episode ' from them.

# drop_duplicates returns a new DataFrame rather than modifying ep_df, so the result must be
# assigned back; otherwise the duplicates silently stay in the data.
ep_df = ep_df.drop_duplicates(subset=['Title'])

ep_df[['Season', 'Episode']] = ep_df['Number'].str.split(', ', expand=True)

# 'Season' is kept as text because it can hold non-numeric labels such as 'Specials'
# (filtered out later when counting episodes per season). 'Episode' is cast to int.
ep_df['Season'] = ep_df['Season'].str.replace('Season ', '', regex=False)
ep_df['Episode'] = ep_df['Episode'].str.replace('Episode ', '', regex=False).astype(int)

ep_df.to_csv('Resources/House_Hunters_International_Cleaned.csv')

In [4]:
# To check whether everything was scraped properly, count the episodes captured per season.
# The 'Specials' group is excluded so the remaining season labels can be cast to integers.

season_group_df = (
    ep_df.groupby('Season')['Episode']
    .count()
    .reset_index()
    .loc[lambda counts: counts['Season'] != 'Specials']
    .astype(int)
)

In [5]:
# Some seasons appear to be skipped and others may be missing episodes. As a quick first
# check, order the counts by season number and look at a random sample of rows.

ordered_counts = season_group_df.sort_values(by='Season')
season_group_df = ordered_counts.reset_index(drop=True)
season_group_df.sample(20)

Unnamed: 0,Season,Episode
29,32,13
12,14,13
38,41,13
26,29,13
40,43,13
15,17,13
49,52,13
35,38,13
17,20,13
33,36,13


In [6]:
# Save the per-season counts as a CSV in the 'Resources' folder so they can be opened in
# Excel, sorted, and checked for skipped seasons.

season_group_df.to_csv(
    'Resources/House_Hunters_International_Seasons_Grouped.csv'
)

In [None]:
# The 'rescrape' notebook filled in the blanks in our data. Stack the original and rescraped
# episodes with pd.concat, then drop repeated titles so that no episode appears twice.

rescraped_ep_df = pd.read_csv('Resources/House_Hunters_International_Rescraped.csv')

stacked_ep_df = pd.concat([ep_df, rescraped_ep_df])
unique_ep_df = stacked_ep_df.drop_duplicates(subset=['Title'])

# Remove the 'Unnamed: 0' column and replace the old row labels with a fresh default index.
compined_ep_df = unique_ep_df.drop('Unnamed: 0', axis='columns').reset_index(drop=True)

compined_ep_df.to_csv('Resources/House_Hunters_International_Final.csv')

compined_ep_df