In [1]:
import requests
import bs4 as bs
import pandas as pd

#### First, we get our movie dataframe from the dataset given

In [2]:
columns = ["WikiID", "FreebaseID", "Name", "ReleaseDate", "BoxOfficeRevenue", "Runtime", "Language", "Country", "Genres"]
movies = pd.read_csv('../../data/movie.metadata.tsv', sep='\t', names=columns, index_col=False)

<h4> As we want to relate the FreebaseID we have with the IMDB id, we are going to look for the latter in <a> https://www.wikidata.org </a>. First, we get the path to the film web page with the function <i>getLinks</i>, and then we get the IMDB id with the function <i>getFreeIDandIMDBid</i>. Both functions are based on the HTML tags of said webpage after consultating it and makes requests to WikiData.

In [3]:
def getLinks (t):
    """
    Get the path to the film webpage
    Argument:
        t: it is a string that contains the name of the film already prepared for the URL.
        -----> t = "City+of+the+dead"
    Returns:
        An array with all the paths that wikidata has for the film
    """
    # This is the URL we need to search for the film webpage path
    addr = "https://www.wikidata.org/w/index.php?go=Go&search="+t+"&search="+t+"&title=Special:Search&ns0=1&ns120=1&searchToken=dattnx8r0gvcfzgyehr2g6vg2"
    
    # We request the previous URL and get the HTML
    r = requests.get(addr)
    soup = bs.BeautifulSoup(r.text, "html.parser")

    # This is the array that will contain all the paths and we are returning
    link = []

    # We know the path we are looking for is contained in a div with the class ="mw-search-result-heading", inside the element <a> with the tag "href"
    for div in soup.find_all('div',{"class":"mw-search-result-heading"}):
        for a in div:
            link.append(a.get("href"))
            break

    return link

In [4]:
def getFreeIDandIMDBid (link):
    """
    Get the Freebase id and the IMDB id of the film from its wikidata webpage
    Argument:
        link: it is a string with the path for the URL film webpage.
        -----> link = "wiki/Q869644"
    Returns:
        The Freebase id and the IMDB id of the film 
    """
    # We create the whole URL with the baseURL (addr2) and the path (link)
    addr2 = "https://www.wikidata.org"
    url = addr2 + link

    # We request the previous URL and get the HTML
    r2 = requests.get(url)
    soup2 = bs.BeautifulSoup(r2.text, "html.parser")

    # The variables we are returning, which will contain the Freebase id and IMDB id
    freebaseid = ""
    imbdid = ""

    # We know the Freebase id is contain in a div with id ="P646", in the element a with the class ="wb-external-id external"
    for div in soup2.find_all('div', {"id": "P646"}):
        for a in div.find_all('a', {"class": "wb-external-id external"}):
            freebaseid = a.get_text()
            break

    # We know the IMDB id is contain in a div with id ="P345", in the element a with the class ="wb-external-id external"
    for div in soup2.find_all('div', {"id": "P345"}):
        for a in div.find_all('a', {"class": "wb-external-id external"}):
            imbdid = a.get_text()
            break
    
    return freebaseid, imbdid

In [5]:
# First, we divide the number of movies into 5 (number of files to parallel the search)
top = round(len(movies)/5)

# We get an array with the names of the movies we are going to make the search for
## If it has crashed, it would be movies1 = movies[top+len(dict_fbid_to_imbd2):top*2]
movies1 = movies[top:top*2].Name


In [10]:
# This dict will have the Freebase id as key and the IMDB id as its value
dict_fbid_to_imbd2 = {}

# For every title movie in the array:
for m1 in movies1:
    # We prepare the title for introducing it into an URL
    t1 = ("+").join(m1.split(" "))
    # We get the paths for the movie webpage
    links1 = getLinks(t1)
    # The Freebase id and IMDB id we are going to store in the dict
    freebaseid1 = ""
    imbid1 = ""
    # We search for the Freebase id and IMDB id in every path we received until we got them
    for link1 in links1:
        fid1, iid1 = getFreeIDandIMDBid(link1)
        if (fid1 != "" and freebaseid1 ==""):
            freebaseid1 = fid1
        if (iid1 != "" and imbid1 ==""):
            imbid1 = iid1
        if (imbid1 != "" or freebaseid1 != ""):
            break
    dict_fbid_to_imbd2[freebaseid1] = imbid1

dict_fbid_to_imbd2



{'/m/098tl5': 'tt0366505',
 '/m/09916h': 'tt0057586',
 '/m/0dr6f6': 'tt0106682',
 '/m/02qzrt_': 'tt0404163',
 '/m/0h97jr7': 'tt0089561',
 '/m/03hftz3': 'tt0029708',
 '/m/03bzqfs': 'tt0014357',
 '/m/0f_5yn': 'tt0089092',
 '/m/07s4s06': 'tt0171357',
 '/m/03fpx': '',
 '/m/02k52j': '',
 '/m/0gj9990': 'tt0360197',
 '/m/0kvbgs': 'tt0125712',
 '/m/0gjc3jj': '',
 '/m/05zzfjc': 'tt0143248',
 '/m/0yjgv': '',
 '/m/0b6p5m_': 'tt0020004',
 '/m/0ftcbx': 'tt0811785',
 '/m/02899y8': '',
 '/m/02stxk': 'tt0057372',
 '/m/0hgpgxl': 'tt2186875',
 '': '',
 '/m/040690k': 'tt0010255',
 '/m/0gj9c2z': 'tt0172738',
 '/m/0fcvkj': 'tt0066212',
 '/m/02ptl55': '',
 '/m/0h3m7gn': '',
 '/m/0crcw9h': 'tt0180801',
 '/m/0b6dntr': 'tt0058722',
 '/m/08v6y0': 'tt0459663',
 '/m/09k4glk': 'tt1612143',
 '/m/04f2hzd': 'tt0268255',
 '/m/047pzmk': 'tt0841108',
 '/m/01s7w3': 'tt0117998',
 '/m/09xph': '',
 '/m/02h9qh': '',
 '/m/0j45ws9': 'tt2014255',
 '/m/0g57zks': 'tt1579585',
 '/m/0j9q2pl': 'tt5986568',
 '/m/0dhrhx': 'tt0823240',

In [11]:
# We store the dict as a global variable, so we can use it in another file
%store dict_fbid_to_imbd2

Stored 'dict_fbid_to_imbd2' (dict)
