In [1]:
import requests
import bs4 as bs
import pandas as pd

#### First, we get our movie dataframe from the dataset given

In [2]:
columns = ["WikiID", "FreebaseID", "Name", "ReleaseDate", "BoxOfficeRevenue", "Runtime", "Language", "Country", "Genres"]
movies = pd.read_csv('../../data/movie.metadata.tsv', sep='\t', names=columns, index_col=False)

<h4> As we want to relate the FreebaseID we have with the IMDB id, we are going to look for the latter in <a> https://www.wikidata.org </a>. First, we get the path to the film web page with the function <i>getLinks</i>, and then we get the IMDB id with the function <i>getFreeIDandIMDBid</i>. Both functions are based on the HTML tags of said webpage after consultating it and makes requests to WikiData.

In [4]:
def getLinks (t):
    """
    Get the path to the film webpage
    Argument:
        t: it is a string that contains the name of the film already prepared for the URL.
        -----> t = "City+of+the+dead"
    Returns:
        An array with all the paths that wikidata has for the film
    """
    # This is the URL we need to search for the film webpage path
    addr = "https://www.wikidata.org/w/index.php?go=Go&search="+t+"&search="+t+"&title=Special:Search&ns0=1&ns120=1&searchToken=dattnx8r0gvcfzgyehr2g6vg2"
    
    # We request the previous URL and get the HTML
    r = requests.get(addr)
    soup = bs.BeautifulSoup(r.text, "html.parser")

    # This is the array that will contain all the paths and we are returning
    link = []

    # We know the path we are looking for is contained in a div with the class ="mw-search-result-heading", inside the element <a> with the tag "href"
    for div in soup.find_all('div',{"class":"mw-search-result-heading"}):
        for a in div:
            link.append(a.get("href"))
            break

    return link

In [5]:
def getFreeIDandIMDBid (link):
    """
    Get the Freebase id and the IMDB id of the film from its wikidata webpage
    Argument:
        link: it is a string with the path for the URL film webpage.
        -----> link = "wiki/Q869644"
    Returns:
        The Freebase id and the IMDB id of the film 
    """
    # We create the whole URL with the baseURL (addr2) and the path (link)
    addr2 = "https://www.wikidata.org"
    url = addr2 + link

    # We request the previous URL and get the HTML
    r2 = requests.get(url)
    soup2 = bs.BeautifulSoup(r2.text, "html.parser")

    # The variables we are returning, which will contain the Freebase id and IMDB id
    freebaseid = ""
    imbdid = ""

    # We know the Freebase id is contain in a div with id ="P646", in the element a with the class ="wb-external-id external"
    for div in soup2.find_all('div', {"id": "P646"}):
        for a in div.find_all('a', {"class": "wb-external-id external"}):
            freebaseid = a.get_text()
            break

    # We know the IMDB id is contain in a div with id ="P345", in the element a with the class ="wb-external-id external"
    for div in soup2.find_all('div', {"id": "P345"}):
        for a in div.find_all('a', {"class": "wb-external-id external"}):
            imbdid = a.get_text()
            break
    
    return freebaseid, imbdid

In [6]:
# First, we divide the number of movies into 5 (number of files to parallel the search)
top = round(len(movies)/5)

# We get an array with the names of the movies we are going to make the search for
## If it has crashed, it would be movies1 = movies[top*2+len(dict_fbid_to_imbd3):top*3]
movies1 = movies[top*2:top*3].Name


In [9]:
# This dict will have the Freebase id as key and the IMDB id as its value
dict_fbid_to_imbd3 = {}

# For every title movie in the array:
for m1 in movies1:
    # We prepare the title for introducing it into an URL
    t1 = ("+").join(m1.split(" "))
    # We get the paths for the movie webpage
    links1 = getLinks(t1)
    # The Freebase id and IMDB id we are going to store in the dict
    freebaseid1 = ""
    imbid1 = ""
    # We search for the Freebase id and IMDB id in every path we received until we got them
    for link1 in links1:
        fid1, iid1 = getFreeIDandIMDBid(link1)
        if (fid1 != "" and freebaseid1 ==""):
            freebaseid1 = fid1
        if (iid1 != "" and imbid1 ==""):
            imbid1 = iid1
        if (imbid1 != "" or freebaseid1 != ""):
            break
    dict_fbid_to_imbd3[freebaseid1] = imbid1

dict_fbid_to_imbd3



{'/m/0fq2t_4': '',
 '/m/07f8dc': '',
 '/m/09ggmyr': '',
 '/m/01f66n': 'tt0060453',
 '/m/0dmjhw': 'tt0019611',
 '/m/06169j': 'tt0106102',
 '/m/03fz8rg': '',
 '/m/0424gt': 'tt0099044',
 '/m/03zddt': '',
 '/m/0kvb4t': 'tt0058754',
 '/m/04083xt': 'tt0034525',
 '/m/0dxmnh': 'tt0061420',
 '/m/0h75ll0': 'tt0466367',
 '/m/0h3wpp_': 'tt1814830',
 '/m/02wx450': '',
 '/m/012z21mr': '',
 '/m/02720hj': 'tt0173582',
 '/m/043n8_w': 'tt0080341',
 '/m/02hzj9': '',
 '/m/0gh664': 'tt0264578',
 '': '',
 '/m/028bcn0': 'tt0050432',
 '/m/02x29nw': '',
 '/m/020mjt': '',
 '/m/0fb2hj': 'tt0100821',
 '/m/02qvc9j': 'tt0068310',
 '/m/02r3_g5': 'tt0388389',
 '/m/030w8h': '',
 '/m/04946n': 'tt0430484',
 '/m/0ch0vpg': 'tt0130989',
 '/m/04cnrb': 'tt0093502',
 '/m/02pkfmy': 'tt0203408',
 '/m/0cn__0x': 'tt0075833',
 '/m/01kgrx': 'tt0489049',
 '/m/0gsklq': 'tt0198837',
 '/m/01lshv9': '',
 '/m/0gdtdd': 'tt0094119',
 '/m/065y0b0': 'tt0174058',
 '/m/0gtrg': '',
 '/m/0fpj9bv': 'tt0084606',
 '/m/0b6jrhk': 'tt0095796',
 '/m/01

In [10]:
# We store the dict as a global variable, so we can use it in another file
%store dict_fbid_to_imbd3

Stored 'dict_fbid_to_imbd3' (dict)
