In [1]:
import requests
import bs4 as bs
import pandas as pd

#### First, we get our movie dataframe from the dataset given

In [2]:
columns = ["WikiID", "FreebaseID", "Name", "ReleaseDate", "BoxOfficeRevenue", "Runtime", "Language", "Country", "Genres"]
movies = pd.read_csv('../../data/movie.metadata.tsv', sep='\t', names=columns, index_col=False)

<h4> As we want to relate the FreebaseID we have with the IMDB id, we are going to look for the latter in <a> https://www.wikidata.org </a>. First, we get the path to the film web page with the function <i>getLinks</i>, and then we get the IMDB id with the function <i>getFreeIDandIMDBid</i>. Both functions are based on the HTML tags of said webpage after consultating it and makes requests to WikiData.

In [3]:
def getLinks (t):
    """
    Get the path to the film webpage
    Argument:
        t: it is a string that contains the name of the film already prepared for the URL.
        -----> t = "City+of+the+dead"
    Returns:
        An array with all the paths that wikidata has for the film
    """
    # This is the URL we need to search for the film webpage path
    addr = "https://www.wikidata.org/w/index.php?go=Go&search="+t+"&search="+t+"&title=Special:Search&ns0=1&ns120=1&searchToken=dattnx8r0gvcfzgyehr2g6vg2"
    
    # We request the previous URL and get the HTML
    r = requests.get(addr)
    soup = bs.BeautifulSoup(r.text, "html.parser")

    # This is the array that will contain all the paths and we are returning
    link = []

    # We know the path we are looking for is contained in a div with the class ="mw-search-result-heading", inside the element <a> with the tag "href"
    for div in soup.find_all('div',{"class":"mw-search-result-heading"}):
        for a in div:
            link.append(a.get("href"))
            break

    return link

In [4]:
def getFreeIDandIMDBid (link):
    """
    Get the Freebase id and the IMDB id of the film from its wikidata webpage
    Argument:
        link: it is a string with the path for the URL film webpage.
        -----> link = "wiki/Q869644"
    Returns:
        The Freebase id and the IMDB id of the film 
    """
    # We create the whole URL with the baseURL (addr2) and the path (link)
    addr2 = "https://www.wikidata.org"
    url = addr2 + link

    # We request the previous URL and get the HTML
    r2 = requests.get(url)
    soup2 = bs.BeautifulSoup(r2.text, "html.parser")

    # The variables we are returning, which will contain the Freebase id and IMDB id
    freebaseid = ""
    imbdid = ""

    # We know the Freebase id is contain in a div with id ="P646", in the element a with the class ="wb-external-id external"
    for div in soup2.find_all('div', {"id": "P646"}):
        for a in div.find_all('a', {"class": "wb-external-id external"}):
            freebaseid = a.get_text()
            break

    # We know the IMDB id is contain in a div with id ="P345", in the element a with the class ="wb-external-id external"
    for div in soup2.find_all('div', {"id": "P345"}):
        for a in div.find_all('a', {"class": "wb-external-id external"}):
            imbdid = a.get_text()
            break
    
    return freebaseid, imbdid

In [6]:
# First, we divide the number of movies into 5 (number of files to parallel the search)
top = round(len(movies)/5)

# We get an array with the names of the movies we are going to make the search for
## If it has crashed, it would be movies1 = movies[top*3+len(dict_fbid_to_imbd4):top*4]
movies1 = movies[top*3:top*4].Name

In [8]:
# This dict will have the Freebase id as key and the IMDB id as its value
dict_fbid_to_imbd4 = {}

# For every title movie in the array:
for m1 in movies1:
    # We prepare the title for introducing it into an URL
    t1 = ("+").join(m1.split(" "))
    # We get the paths for the movie webpage
    links1 = getLinks(t1)
    # The Freebase id and IMDB id we are going to store in the dict
    freebaseid1 = ""
    imbid1 = ""
    # We search for the Freebase id and IMDB id in every path we received until we got them
    for link1 in links1:
        fid1, iid1 = getFreeIDandIMDBid(link1)
        if (fid1 != "" and freebaseid1 ==""):
            freebaseid1 = fid1
        if (iid1 != "" and imbid1 ==""):
            imbid1 = iid1
        if (imbid1 != "" or freebaseid1 != ""):
            break
    dict_fbid_to_imbd4[freebaseid1] = imbid1

dict_fbid_to_imbd4



{'/m/0fg1g': '',
 '/m/0640hs3': 'tt0079086',
 '/m/07k2fr': 'tt0154587',
 '/m/019nwm': 'tt0046000',
 '/m/0h3pnf3': 'tt0122271',
 '/m/085n3x': 'tt0115610',
 '/m/02pyxdl': 'tt0059920',
 '/m/09rx6gz': 'tt0042923',
 '/m/088v19': 'tt0033149',
 '/m/08ghlq': 'tt0069803',
 '/m/05f8tn': 'tt0080881',
 '/m/04mnzm': '',
 '/m/0crw9rw': 'tt1135489',
 '/m/055gwx': 'tt0210234',
 '/m/0jww20l': 'tt0483167',
 '/m/06znqkb': 'tt0043412',
 '/m/019nwz': 'tt0046085',
 '/m/03d7vp7': 'tt0046460',
 '/m/04pxsv': 'tt0116441',
 '/m/0sm28_4': 'nm5809539',
 '/m/0806_kt': 'tt0095386',
 '/m/01y215': 'tt0283226',
 '/m/05b2flg': 'tt0089645',
 '': 'tt0389551',
 '/m/0crrmmd': 'tt0062825',
 '/m/02z72bt': 'tt0815245',
 '/m/0hnd7nk': 'tt0156961',
 '/m/01jcwr': '',
 '/m/03qb2q': '',
 '/m/0cs0ckt': 'tt0075614',
 '/m/02162': '',
 '/m/0cm97d2': 'tt4184350',
 '/m/02bxcy': '',
 '/m/03h1nhw': 'tt0177482',
 '/m/08j3jt': 'tt0035419',
 '/m/03c9_f1': 'tt0099076',
 '/m/027ycyf': 'tt0934870',
 '/m/0hrc3pc': 'tt1813523',
 '/m/01hqr5m': '',


In [9]:
# We store the dict as a global variable, so we can use it in another file
%store dict_fbid_to_imbd4

Stored 'dict_fbid_to_imbd4' (dict)
