In [2]:
import requests
import bs4 as bs
import pandas as pd


#### First, we get our movie dataframe from the dataset given

In [3]:
columns = ["WikiID", "FreebaseID", "Name", "ReleaseDate", "BoxOfficeRevenue", "Runtime", "Language", "Country", "Genres"]
movies = pd.read_csv('../../data/movie.metadata.tsv', sep='\t', names=columns, index_col=False)

<h4> As we want to relate the FreebaseID we have with the IMDB id, we are going to look for the latter in <a> https://www.wikidata.org </a>. First, we get the path to the film web page with the function <i>getLinks</i>, and then we get the IMDB id with the function <i>getFreeIDandIMDBid</i>. Both functions are based on the HTML tags of said webpage after consultating it and makes requests to WikiData.

In [4]:
def getLinks (t):
    """
    Get the path to the film webpage
    Argument:
        t: it is a string that contains the name of the film already prepared for the URL.
        -----> t = "City+of+the+dead"
    Returns:
        An array with all the paths that wikidata has for the film
    """
    # This is the URL we need to search for the film webpage path
    addr = "https://www.wikidata.org/w/index.php?go=Go&search="+t+"&search="+t+"&title=Special:Search&ns0=1&ns120=1&searchToken=dattnx8r0gvcfzgyehr2g6vg2"
    
    # We request the previous URL and get the HTML
    r = requests.get(addr)
    soup = bs.BeautifulSoup(r.text, "html.parser")

    # This is the array that will contain all the paths and we are returning
    link = []

    # We know the path we are looking for is contained in a div with the class ="mw-search-result-heading", inside the element <a> with the tag "href"
    for div in soup.find_all('div',{"class":"mw-search-result-heading"}):
        for a in div:
            link.append(a.get("href"))
            break

    return link

In [5]:
def getFreeIDandIMDBid (link):
    """
    Get the Freebase id and the IMDB id of the film from its wikidata webpage
    Argument:
        link: it is a string with the path for the URL film webpage.
        -----> link = "wiki/Q869644"
    Returns:
        The Freebase id and the IMDB id of the film 
    """
    # We create the whole URL with the baseURL (addr2) and the path (link)
    addr2 = "https://www.wikidata.org"
    url = addr2 + link

    # We request the previous URL and get the HTML
    r2 = requests.get(url)
    soup2 = bs.BeautifulSoup(r2.text, "html.parser")

    # The variables we are returning, which will contain the Freebase id and IMDB id
    freebaseid = ""
    imbdid = ""

    # We know the Freebase id is contain in a div with id ="P646", in the element a with the class ="wb-external-id external"
    for div in soup2.find_all('div', {"id": "P646"}):
        for a in div.find_all('a', {"class": "wb-external-id external"}):
            freebaseid = a.get_text()
            break

    # We know the IMDB id is contain in a div with id ="P345", in the element a with the class ="wb-external-id external"
    for div in soup2.find_all('div', {"id": "P345"}):
        for a in div.find_all('a', {"class": "wb-external-id external"}):
            imbdid = a.get_text()
            break
    
    return freebaseid, imbdid

In [6]:
# First, we divide the number of movies into 5 (number of files to parallel the search)
top = round(len(movies)/5)

# We get an array with the names of the movies we are going to make the search for
## If it has crashed, it would be movies1 = movies[len(dict_fbid_to_imbd1):top]
movies1 = movies[:top].Name


In [23]:
# This dict will have the Freebase id as key and the IMDB id as its value
dict_fbid_to_imbd1 = {}

# For every title movie in the array:
for m1 in movies1:
    # We prepare the title for introducing it into an URL
    t1 = ("+").join(m1.split(" "))
    # We get the paths for the movie webpage
    links1 = getLinks(t1)
    # The Freebase id and IMDB id we are going to store in the dict
    freebaseid1 = ""
    imbid1 = ""
    # We search for the Freebase id and IMDB id in every path we received until we got them
    for link1 in links1:
        fid1, iid1 = getFreeIDandIMDBid(link1)
        if (fid1 != "" and freebaseid1 ==""):
            freebaseid1 = fid1
        if (iid1 != "" and imbid1 ==""):
            imbid1 = iid1
        if (imbid1 != "" or freebaseid1 != ""):
            break
    dict_fbid_to_imbd1[freebaseid1] = imbid1

dict_fbid_to_imbd1



{'/m/03vyhn': 'tt0228333',
 '/m/08yl5d': 'tt0245916',
 '/m/0crgdbh': 'tt0094806',
 '/m/0285_cd': 'tt0094320',
 '/m/01mrr1': 'tt0083949',
 '/m/03cfc81': 'tt0002894',
 '/m/05pdd86': 'tt0963966',
 '/m/0gstb': '',
 '/m/06_y2j7': 'tt0200545',
 '/m/02wwdnm': '',
 '/m/017n1p': 'tt0021335',
 '/m/07kjkz6': 'tt0072157',
 '/m/0gffwj': 'tt0119548',
 '/m/0cd17': '',
 '/m/04cqrs4': 'tt0278891',
 '/m/02r52hc': 'tt0033888',
 '/m/02k338': '',
 '/m/09yn5x': '',
 '/m/0gyryjt': 'tt0061637',
 '/m/0gj30jx': 'tt0104601',
 '/m/017n1b': 'tt0020823',
 '/m/02wjqm': 'tt0133122',
 '/m/05p45cv': 'tt0892904',
 '/m/0h964n1': 'tt0055997',
 '/m/0g4_n3m': 'tt0022289',
 '/m/047t9rb': 'tt0026167',
 '/m/05pckh0': 'tt0053820',
 '/m/0523t_1': '',
 '/m/05f5_5y': 'tt0079552',
 '/m/06zw99': '',
 '/m/02rc_h4': 'tt0367546',
 '/m/02pml15': 'tt0255668',
 '/m/01lz54': '',
 '/m/0dxtw': '',
 '': 'tt1671738',
 '/m/04j0lfk': 'tt0366182',
 '/m/0404m8g': '',
 '/m/0fq260_': 'tt1849787',
 '/m/0b6c_nw': 'tt0178022',
 '/m/0bh9fbk': '',
 '/m/0

In [25]:
# We store the dict as a global variable, so we can use it in another file
%store dict_fbid_to_imbd1

Stored 'dict_fbid_to_imbd1' (dict)
