# Use Wikipedia as an alternative data source

In [19]:
import numpy as np
import pandas as pd
import wikipedia
import re
import timeit
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import bs4
import pickle
import json

## Load Formatted Data

In [3]:
# Load every pre-formatted table from its pickle file.
# NOTE(review): unpickling executes arbitrary code, so these files must come
# from a trusted source.

# Tables whose names suggest one row per entity (grouping is naming-based,
# not verified against the data).
movie_df = pd.read_pickle("../../data/Tables/movie_df.pkl")
actor_df = pd.read_pickle("../../data/Tables/actor_df.pkl")
character_df = pd.read_pickle("../../data/Tables/character_df.pkl")
country_df = pd.read_pickle("../../data/Tables/country_df.pkl")
genre_df = pd.read_pickle("../../data/Tables/genre_df.pkl")
language_df = pd.read_pickle("../../data/Tables/language_df.pkl")

# Tables whose names suggest links between the entities above (presumably
# association tables -- confirm against the schema).
comes_from_df = pd.read_pickle("../../data/Tables/comes_from_df.pkl")
is_of_type_df = pd.read_pickle("../../data/Tables/is_of_type_df.pkl")
spoken_languages_df = pd.read_pickle("../../data/Tables/spoken_languages_df.pkl")
belongs_to_df = pd.read_pickle("../../data/Tables/belongs_to_df.pkl")
play_df = pd.read_pickle("../../data/Tables/play_df.pkl")
appears_in_df = pd.read_pickle("../../data/Tables/appears_in_df.pkl")

## Parser

In [4]:
# Infobox row labels that get_movie_data treats as section headers by default.
default_movie_entries = {
    "Directed by", "Written by", "Produced by", "Starring",
    "Cinematography", "Edited by", "Music by", "Distributed by",
    "Release date", "Running time", "Country", "Language",
    "Budget", "Box office", "Screenplay by", "Based on",
    "Release dates", "Story by", "Director of Animation",
    "Languages", "Country of origin", "Original language",
    "Executive producer", "Producer", "Production", "Production company",
    "Distributor", "Original release", "Picture format", "Audio format",
    "Original network", "Release", "Editor", "Composer", "Countries",
    "Production locations", "Camera setup",
}

def get_movie_data(page_id,entry_keys=default_movie_entries):
    """Fetch the infobox fields and the summary of a Wikipedia page.

    The page HTML is parsed, the first ``<table>`` carrying the ``infobox``
    class is located, and the text fragments of that table's first
    descendant are split into sections: every fragment equal to one of
    ``entry_keys`` opens a section, and the fragments up to the next such
    key (or the end of the table) become that section's values.

    Args:
        page_id: Wikipedia page id, forwarded to ``wikipedia.page(pageid=...)``.
        entry_keys: Collection of infobox labels to extract.

    Returns:
        A dict mapping each label found to the list of text fragments that
        follow it, plus a ``"Summary"`` entry holding ``page.summary``.
        When a label occurs several times, the last occurrence wins.

    Raises:
        ValueError: If the page has no (or an empty) infobox table.
        Any exception raised by ``wikipedia.page`` propagates unchanged.
    """
    # Load and parse the main Wikipedia page.
    page = wikipedia.page(pageid=page_id)
    page_parser = BeautifulSoup(page.html(),"html.parser")
    table_data = page_parser.find("table",class_="infobox")
    if table_data is None:
        raise ValueError(f"No infobox table found on Wikipedia page {page_id}.")

    # Only the first descendant of the table is scanned, as before; take it
    # lazily instead of materializing every descendant.
    first_descendant = next(iter(table_data.descendants), None)
    if first_descendant is None:
        raise ValueError(f"Empty infobox table on Wikipedia page {page_id}.")

    # Drop fragments that start with a parenthesis, comma, newline or square
    # bracket (layout residue such as separators and footnote markers).
    # Raw string: "\A", "\[" and "\]" are not valid *string* escapes.
    noise_pattern = re.compile(r"\A[(,),\n,\[,\]]")
    table_data_list = [s for s in first_descendant.strings
                       if noise_pattern.match(s) is None]

    # Positions of the section headers, plus a sentinel marking the end so
    # the last section extends to the end of the list.
    entry_indices = [idx for (idx,entry) in enumerate(table_data_list)
                     if entry in entry_keys]
    entry_indices.append(len(table_data_list))

    table_data_dict = {table_data_list[start]: table_data_list[start+1:stop]
                       for start,stop in zip(entry_indices,entry_indices[1:])}
    table_data_dict["Summary"] = page.summary
    return table_data_dict

## Pipeline

In [5]:
def retrieve_wikipedia_data(movie_ids,wikipedia_data_dict,faulty_ids,
                            entry_keys=default_movie_entries,verbose=False):
    """Collect Wikipedia data for the given page ids, in place.

    Each id is passed to ``get_movie_data``. On success the result is stored
    in ``wikipedia_data_dict`` under that id; on failure the id is appended
    to ``faulty_ids`` and processing continues with the next id.

    Args:
        movie_ids: Iterable of Wikipedia page ids to fetch.
        wikipedia_data_dict: Dictionary updated in place with ``id -> data``.
        faulty_ids: List extended in place with the ids that failed.
        entry_keys: Infobox labels forwarded to ``get_movie_data``.
        verbose: When true, print a message for every id that fails.

    Returns:
        None. Results are delivered through the two containers.
    """
    for idx in tqdm(movie_ids):
        try:
            wikipedia_data_dict[idx] = get_movie_data(idx,entry_keys=entry_keys)
        # Best-effort: any ordinary error marks the id as faulty. A bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit, making
        # a long run impossible to stop.
        except Exception:
            if verbose:
                print(f"Unable to fetch data for id {idx}.")
            faulty_ids.append(idx)

#### Initialization of data containers

In [6]:
# Output containers filled in place by retrieve_wikipedia_data:
# successfully fetched data keyed by id, and the ids that failed.
wikipedia_data_dict = {}
faulty_ids = list()

#### Pipeline Execution

In [7]:
# Select the index values of the movies whose release date is missing and
# fetch their Wikipedia data into the shared containers.
missing_release_date = movie_df["release_date"].isna()
no_release_date_movie_ids = list(movie_df[missing_release_date].index)
retrieve_wikipedia_data(
    no_release_date_movie_ids,
    wikipedia_data_dict,
    faulty_ids,
    entry_keys=default_movie_entries,
    verbose=False,
)

  0%|          | 0/6902 [00:00<?, ?it/s]



  lis = BeautifulSoup(html).find_all('li')


#### Format dictionary

In [28]:
# Optional: build a flattened version of the output dictionary, i.e. one
# record per movie with its id stored under "movie_id" next to the fetched
# fields (a fetched field with that same name would take precedence).
wikipedia_data_list = [
    {"movie_id": movie_id, **movie_fields}
    for movie_id, movie_fields in wikipedia_data_dict.items()
]

#### Save Results

In [32]:
# Destination files for the fetched data and for the ids that failed.
fetched_data_path = '../../data/Wikipedia/no_release_date_movies.json'
faulty_ids_path = '../../data/Wikipedia/faulty_no_release_date_movies.pkl'

# Fetched data goes to JSON (text mode).
with open(fetched_data_path, 'w') as json_file:
    json.dump(wikipedia_data_dict, json_file)

# Failed ids go to a binary pickle, using the highest available protocol.
with open(faulty_ids_path, 'wb') as pickle_file:
    pickle.dump(faulty_ids, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

---