In [None]:
import numpy as np
import pandas as pd

import re
import requests
from bs4 import BeautifulSoup
from helper.data_wrangling import load_and_clean_movies_df
from helper.scrap_helper import *

# Make sure that any change in the dependencies is reflected in the notebook (from the ML course)
%load_ext autoreload
%autoreload 2

In [2]:
# Load the movies dataframe through the project helper imported from helper.data_wrangling.
df_movies = load_and_clean_movies_df()

Let's merge the full movies dataframe with the one listing the selected movies and their IMDb IDs, so that we only work with this subset of movies.

In [None]:
# Read the selected-movies file and give the columns labelled "0" and "2"
# the readable names used in the rest of the notebook.
selected_movies = pd.read_csv('data/merged_data.csv').rename(
    columns={"0": "wikipedia_ID", "2": "title"}
)

# Inner join on the Wikipedia ID: keep only the movies present in both frames.
merged_data = df_movies.merge(selected_movies, on='wikipedia_ID', how='inner')

In [5]:
# Total number of movies in the merged subset.
n_total = len(merged_data)

# Rows where neither the release month nor the release year equals the '<NA>' placeholder.
# NOTE: despite its name, `no_dates` counts the movies that DO have both fields.
has_month = merged_data.release_month != '<NA>'
has_year = merged_data.release_year != '<NA>'
no_dates = len(merged_data.loc[has_month & has_year])

print(f"There are {no_dates} movies among {n_total} for which we have the year and month of release.")

There are 18181 movies among 28415 for which we have the year and month of release.


Let's now use the Wikipedia ID we have for each movie to scrape information about the release dates from Wikidata.

In [None]:
# Keep a copy of the movies dataframe as it is before the date columns are renamed
merged_scraped = merged_data.copy()

# Suffix the existing date columns with `_x`
merged_data = merged_data.rename(
    columns={
        "release_year": "release_year_x",
        "release_month": "release_month_x",
    }
)

# Working frame: the ID and the dates we already have, plus empty `_y` columns
# that are going to be filled by scraping the dates
dates_merged = merged_data[['wikipedia_ID', 'release_year_x', 'release_month_x']].assign(
    release_year_y='',
    release_month_y='',
)

In [None]:
# Save the file that the scraping loop will write its results to.
# The pandas writer is `DataFrame.to_csv`; `tocsv` does not exist and raises AttributeError.
dates_merged.to_csv('dates_scraped.csv', index=False)

In [10]:
# Number of movies already scraped. The scraping loop starts at this row offset,
# in case we need to relaunch the scraping while some data is already scraped.
number_scraped = 0

We have all we need to scrape! Let's do it:

In [None]:
# TO RUN ONLY IF 'dates_scraped.csv' ALREADY EXISTS
# Reload the file when we need to continue a scraping run that was already started.
# NOTE(review): this does not restore `number_scraped`; set it to the number of rows
# already scraped before resuming, otherwise the loop starts again from the first row.
dates_merged = pd.read_csv('dates_scraped.csv', sep=',')

In [None]:
# Scrape the release date of every movie from row offset `number_scraped` onwards.
#
# We iterate over the `wikipedia_ID` column directly instead of using
# `DataFrame.iterrows()`: iterrows builds one Series per row and does not
# preserve dtypes across columns, so an integer ID can come back as a float
# (e.g. 12345.0) when the other columns are numeric -- which can happen once
# `dates_merged` has been reloaded from 'dates_scraped.csv'.
pending_ids = dates_merged['wikipedia_ID'].iloc[number_scraped:]

for idx, w_id in pending_ids.items():
    # NOTE(review): element [1] of the helper's return value is presumably the
    # wikidata page/identifier expected by `get_release_date` -- confirm in helper.scrap_helper.
    wikidata = wikidata_from_wikipedia_id(w_id)[1]
    year, month = format_date_numeric(get_release_date(wikidata))
    dates_merged.loc[idx, 'release_year_y'] = year
    dates_merged.loc[idx, 'release_month_y'] = month

    # Checkpoint after every movie so an interrupted run loses no scraped dates,
    # and advance the offset so re-running this cell continues from the next row.
    dates_merged.to_csv('dates_scraped.csv', index=False)
    number_scraped += 1