In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import math
# import re

In [2]:
current_year = 2019

# Year -> edition number for the festivals whose Wikipedia page titles use an
# ordinal ("76th_Venice_...") rather than the year. Years with no entry are
# simply absent from the mapping.

# Venice: the first seven editions are listed one by one, followed by two
# unbroken runs (1947-1972 and 1979-current_year).
venice = {year: edition
          for edition, year in enumerate([1932, 1934, 1935, 1936, 1937, 1938, 1939], start=1)}
venice.update({year: 8 + (year - 1947) for year in range(1947, 1972 + 1)})
venice.update({year: 36 + (year - 1979) for year in range(1979, current_year + 1)})

# Berlin: one edition per year from 1951 up to current_year.
berlin = {year: (year - 1951) + 1 for year in range(1951, current_year + 1)}

dict_year = {'venice': venice, 'berlin': berlin}

In [3]:
def _ordinal(n):
    '''Return n followed by its English ordinal suffix, e.g. 1 -> '1st', 12 -> '12th', 23 -> '23rd'.'''
    if (n // 10) % 10 == 1: # a tens digit of 1 (10-19, 110-119, ...) always takes 'th'
        return '%dth' % n
    return '%d%s' % (n, {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th'))


def get_urls(festival, from_year, to_year):
    '''
    Return list of tuples (url, festival, year) with the urls of the relevant festival Wikipedia pages,
    the corresponding festivals, and the corresponding years.
    
    Keyword arguments:
    festival -- string for which festival: 'cannes', 'venice', or 'berlin'
    from_year -- the first year in a range of festival years, inclusive
    to_year -- the last year in a range of festival years, inclusive
    Note: The range between from_year and to_year must only include valid years.
    
    Cannes pages are titled by year; Venice and Berlin pages are titled by edition ordinal,
    which is looked up in the module-level dict_year.
    
    Raises:
    ValueError -- if festival is not one of the three supported names
                  (previously any other value was silently treated as 'berlin')
    KeyError -- if a Venice/Berlin year in the range has no entry in dict_year
    '''
    wiki = 'https://en.wikipedia.org/wiki/'
    years = range(from_year, to_year+1)
    
    if festival == 'cannes':
        return [(wiki + str(year) + '_Cannes_Film_Festival', festival, year) for year in years]
    
    page_suffix = {'venice': '_Venice_International_Film_Festival',
                   'berlin': '_Berlin_International_Film_Festival'}
    if festival not in page_suffix:
        raise ValueError("unknown festival %r; expected 'cannes', 'venice' or 'berlin'" % (festival,))
    
    return [(wiki + _ordinal(dict_year[festival][year]) + page_suffix[festival], festival, year)
            for year in years]

In [4]:
def _cell_text(cell):
    '''Return the cell text with surrounding newlines removed, or 'N/A' if nothing is left.'''
    return cell.get_text().strip('\n') or 'N/A'


def _first_link(*cells):
    '''Return the absolute URL of the first <a> found in the given cells (in order), or 'N/A'.'''
    for cell in cells:
        anchor = cell.find('a')
        if anchor:
            return 'https://en.wikipedia.org' + anchor.get('href')
    return 'N/A'


def _all_links(cell):
    '''Return the absolute URLs of every <a> in the cell joined by single spaces, or 'N/A' if there are none.'''
    hrefs = ['https://en.wikipedia.org' + a.get('href') for a in cell.find_all('a')]
    return ' '.join(hrefs) if hrefs else 'N/A'


def get_festival_table(url, festival, year_festival):
    '''
    Return dataframe for festival formatted as table on Wikipedia.
    
    url -- url of the relevant festival
    festival -- string for which festival, e.g. 'cannes', 'venice', or 'berlin'
    year_festival -- the year of the relevant festival
    
    For Cannes, tables[0] is the table of films selected to compete for the Palme d'Or
    For Venice, tables[0] is the table of films selected to compete for the Golden Lion
    For Berlin, tables[0] is the table of films selected to compete for the Golden Bear
    
    Only the first table with class 'wikitable' is read, and its first row is skipped as the header.
    Two row layouts are handled:
      - first cell has colspan="2": [title, director, country] (no separate original title)
      - otherwise:                  [English title, original title, director, country]
    A row is flagged as winner (1) when its <tr> carries a style attribute.
    NOTE(review): presumably that is the highlighted winner row on Wikipedia -- verify per page.
    NOTE(review): cells shifted by rowspan are not compensated for; such rows would be misread or
    raise IndexError, exactly as before.
    
    Raises:
    requests.HTTPError -- if the page request returns an error status
    ValueError -- if the page contains no table with class 'wikitable'
    '''
    r = requests.get(url)
    r.raise_for_status() # fail on the HTTP error itself rather than on parsing an error page
    soup = BeautifulSoup(r.content, 'lxml')
    tables = soup.find_all('table', {'class': 'wikitable'})
    if not tables:
        raise ValueError('no wikitable found at ' + url)
    
    columns = ['title_english', 'title_original', 'director', 'country', 'winner',
               'festival', 'year_festival', 'link_film', 'link_director']
    rows = []
    
    for tr in tables[0].find_all('tr')[1:]:
        tds = tr.find_all('td')
        if not tds: # header-only (<th>) rows carry no film data
            continue
        
        italic = tds[0].find('i')
        
        if tds[0].get('colspan') == '2':
            # title spans both title columns, so every later column sits one index earlier
            title_original = 'N/A'
            director_cell, country_cell = tds[1], tds[2]
            link_film = _first_link(tds[0])
        else:
            title_original = _cell_text(tds[1])
            director_cell, country_cell = tds[2], tds[3]
            # fall back to the original-title cell when the English title is not linked
            link_film = _first_link(tds[0], tds[1])
        
        rows.append({'title_english': italic.get_text() if italic else 'N/A',
                     'title_original': title_original,
                     'director': _cell_text(director_cell),
                     'country': _cell_text(country_cell),
                     'winner': 1 if tr.has_attr('style') else 0,
                     'festival': festival,
                     'year_festival': year_festival,
                     'link_film': link_film,
                     'link_director': _all_links(director_cell)})
    
    return pd.DataFrame(rows, columns=columns)

In [5]:
# TWO POSSIBLE TO-DOS: 1) country, 2) films/directors that don't have links (current code sets to 'N/A', which isn't ideal)
def get_festival_bullet(url, festival, year_festival):
    '''
    Return dataframe for festival formatted as a bulleted list on Wikipedia.
    
    url -- url of the relevant festival
    festival -- string for which festival: 'cannes'; any other value is parsed with the Venice layout
    year_festival -- the year of the relevant festival
    
    The list of films is the <ul> that follows the competition heading (for Cannes the <ul> sits
    inside the <div> following the heading). For every <li>:
      - title_english is the text of the first <i>, title_original the text of the next sibling <i>
      - link_film is the link inside the first <i>
      - director / link_director come from the <a> tags that are direct children of the <li>
    Anything that cannot be found is recorded as 'N/A'. country is always 'N/A' (not parsed here).
    
    winner is 0 for every row for Venice and for Cannes 1946, 1947, 1951 and 1968; otherwise a film
    is flagged 1 when its link appears in the "Palme d'Or" item of the "Official awards" list.
    
    Raises:
    requests.HTTPError -- if the page request returns an error status
    AttributeError -- if the expected heading/list structure is not present on the page
    '''
    r = requests.get(url)
    r.raise_for_status() # fail on the HTTP error itself rather than on parsing an error page
    soup = BeautifulSoup(r.content, 'lxml')
    
    # What a failed lookup raises: attribute access on None (AttributeError), subscripting None
    # (TypeError), or a tag without href (KeyError). Anything else should propagate.
    lookup_failed = (AttributeError, TypeError, KeyError)
    
    title_english = []
    title_original = []
    # country = []
    director = []
    link_film = []
    link_director = []
    
    if festival == 'cannes':
        list_of_films = soup.find('span', {'id': ["In_competition_-_Feature_film", "In_competition_–_Feature_film", "In_competition_-_Feature_films", "Feature_film_competition", "Feature_films_competition", "Films_in_competition"]}).parent.find_next_sibling('div').ul.find_all('li')
    else: # venice
        list_of_films = soup.find('span', {'id': ["In-Competition_films", "Films_premiered"]}).parent.find_next_sibling('ul').find_all('li')
    
    for child in list_of_films:
        
        # title_english
        try:
            title_english.append(child.i.get_text())
        except lookup_failed:
            title_english.append('N/A')
        
        # title_original
        try:
            title_original.append(child.i.find_next_sibling('i').get_text())
        except lookup_failed:
            title_original.append('N/A')

        # link_film
        try:
            link_film.append('https://en.wikipedia.org' + child.i.a['href'])
        except lookup_failed:
            link_film.append('N/A')
        
        # come back to country later b/c venice 1969, 1970, 1971, 1972, 1979 + maybe just use the country in the API
        
        # the following code does not take into account directors that don't have links, but are only found in the HTML text.
        # I have decided to leave out films and directors that don't have links because I will assume that means those directors are less important.
        list_of_directors = child.find_all('a', recursive=False)
        if list_of_directors:
            director.append(', '.join(d.get_text() for d in list_of_directors).strip(', '))
            link_director.append(' '.join('https://en.wikipedia.org' + d['href'] for d in list_of_directors).strip(' '))
        else:
            director.append('N/A')
            link_director.append('N/A')
    
    no_awards_parsed = (festival == 'cannes' and year_festival in [1946, 1947, 1951, 1968]) or festival == 'venice'
    if no_awards_parsed:
        winner = 0
    else:
        # all links associated to the Palme d'Or
        palmedor_item = soup.find('span', {'id': "Official_awards"}).parent.find_next_sibling('ul').find('a', {'title': "Palme d'Or"}).parent
        palmedor = ['https://en.wikipedia.org' + a['href'] for a in palmedor_item.find_all('a')]
        winner = [1 if link in palmedor else 0 for link in link_film]
        
    df_tmp = pd.DataFrame({'title_english': title_english,
                           'title_original': title_original,
                           'director': director,
                           'country': 'N/A',
                           'winner': winner,
                           'festival': festival,
                           'year_festival': year_festival,
                           'link_film': link_film,
                           'link_director': link_director})
     
    return df_tmp

In [6]:
def get_festival_dfs(urls, data_format):
    '''
    Given urls from get_urls() and the format of the films on Wikipedia,
    return the overall festival dataframe.
    
    urls -- must be from get_urls(), i.e. an iterable of (url, festival, year) tuples
    data_format -- the format of the films on Wikipedia: 'table'; any other value is handled as 'bullet'
    
    Each tuple is printed before its page is scraped, as a progress log. The per-page frames are
    collected and concatenated once (DataFrame.append was removed in pandas 2.0 and re-copied the
    whole frame on every call). An empty urls gives an empty frame with the expected columns.
    '''
    columns = ['title_english', 'title_original', 'director', 'country', 'winner', 'festival', 'year_festival', 'link_film', 'link_director']
    
    frames = []
    for url in urls:
        print(url)
        page_url, festival, year_festival = url[0], url[1], url[2]
        if data_format == 'table':
            frames.append(get_festival_table(page_url, festival, year_festival))
        else: # bullet
            frames.append(get_festival_bullet(page_url, festival, year_festival))
    
    if not frames:
        return pd.DataFrame(columns=columns)
    
    return pd.concat(frames, ignore_index=True)[columns]

In [7]:
# Which parser handles which festival year (all ranges inclusive).
#
# CANNES
# table: 1985, 1991, 1993 to 1994, 2007 to 2019
# bullet: 1946 to 1947, 1949 ("directed by"), 1951 to 1979, 1981 to 1984, 1986 to 1990, 1992, 1995 to 2006
# exceptions: 1946 (films fine, awards manual, "directed by"), 1947 (films fine, awards manual),
#             1948 (nonexistent), 1950 (nonexistent), 1951 (films fine, awards manual),
#             1968 (films fine, winner left at 0), 1980 (diff tag format, manual)
#
# VENICE
# table: 1939, 1947 to 1968, 1980 to 2019
# bullet: 1934 to 1938, 1969 to 1972, 1979
# exceptions: 1932 (films not available, so treat as nonexistent, will not include in dataset),
#             1969 (country), 1970 (country), 1971 (country), 1972 (country), 1979 (country)
#
# BERLIN
# table: 1951 to 2009, 2011 to 2019
# exceptions: 2010 (diff table format, manual)

urls_table = (
    get_urls('cannes', 1985, 1985)
    + get_urls('cannes', 1991, 1991)
    + get_urls('cannes', 1993, 1994)
    + get_urls('cannes', 2007, 2019)
    + get_urls('venice', 1939, 1939)
    + get_urls('venice', 1947, 1968)
    + get_urls('venice', 1980, 2019)
    + get_urls('berlin', 1951, 2009)
    + get_urls('berlin', 2011, 2019)
)
urls_bullet = (
    get_urls('cannes', 1946, 1947)
    + get_urls('cannes', 1949, 1949)
    + get_urls('cannes', 1951, 1979)
    + get_urls('cannes', 1981, 1984)
    + get_urls('cannes', 1986, 1990)
    + get_urls('cannes', 1992, 1992)
    + get_urls('cannes', 1995, 2006)
    + get_urls('venice', 1934, 1938)
    + get_urls('venice', 1969, 1972)
    + get_urls('venice', 1979, 1979)
)

df_festival_table = get_festival_dfs(urls_table, 'table')
df_festival_bullet = get_festival_dfs(urls_bullet, 'bullet')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df_festival = pd.concat([df_festival_table, df_festival_bullet], ignore_index=True)

# intermediate snapshot, written before the manual fixes applied in later cells
df_festival.to_csv('./data/festivals_tmp.csv', index_label='id')

('https://en.wikipedia.org/wiki/1985_Cannes_Film_Festival', 'cannes', 1985)
('https://en.wikipedia.org/wiki/1991_Cannes_Film_Festival', 'cannes', 1991)
('https://en.wikipedia.org/wiki/1993_Cannes_Film_Festival', 'cannes', 1993)
('https://en.wikipedia.org/wiki/1994_Cannes_Film_Festival', 'cannes', 1994)
('https://en.wikipedia.org/wiki/2007_Cannes_Film_Festival', 'cannes', 2007)
('https://en.wikipedia.org/wiki/2008_Cannes_Film_Festival', 'cannes', 2008)
('https://en.wikipedia.org/wiki/2009_Cannes_Film_Festival', 'cannes', 2009)
('https://en.wikipedia.org/wiki/2010_Cannes_Film_Festival', 'cannes', 2010)
('https://en.wikipedia.org/wiki/2011_Cannes_Film_Festival', 'cannes', 2011)
('https://en.wikipedia.org/wiki/2012_Cannes_Film_Festival', 'cannes', 2012)
('https://en.wikipedia.org/wiki/2013_Cannes_Film_Festival', 'cannes', 2013)
('https://en.wikipedia.org/wiki/2014_Cannes_Film_Festival', 'cannes', 2014)
('https://en.wikipedia.org/wiki/2015_Cannes_Film_Festival', 'cannes', 2015)
('https://en

('https://en.wikipedia.org/wiki/15th_Berlin_International_Film_Festival', 'berlin', 1965)
('https://en.wikipedia.org/wiki/16th_Berlin_International_Film_Festival', 'berlin', 1966)
('https://en.wikipedia.org/wiki/17th_Berlin_International_Film_Festival', 'berlin', 1967)
('https://en.wikipedia.org/wiki/18th_Berlin_International_Film_Festival', 'berlin', 1968)
('https://en.wikipedia.org/wiki/19th_Berlin_International_Film_Festival', 'berlin', 1969)
('https://en.wikipedia.org/wiki/20th_Berlin_International_Film_Festival', 'berlin', 1970)
('https://en.wikipedia.org/wiki/21st_Berlin_International_Film_Festival', 'berlin', 1971)
('https://en.wikipedia.org/wiki/22nd_Berlin_International_Film_Festival', 'berlin', 1972)
('https://en.wikipedia.org/wiki/23rd_Berlin_International_Film_Festival', 'berlin', 1973)
('https://en.wikipedia.org/wiki/24th_Berlin_International_Film_Festival', 'berlin', 1974)
('https://en.wikipedia.org/wiki/25th_Berlin_International_Film_Festival', 'berlin', 1975)
('https://

('https://en.wikipedia.org/wiki/1997_Cannes_Film_Festival', 'cannes', 1997)
('https://en.wikipedia.org/wiki/1998_Cannes_Film_Festival', 'cannes', 1998)
('https://en.wikipedia.org/wiki/1999_Cannes_Film_Festival', 'cannes', 1999)
('https://en.wikipedia.org/wiki/2000_Cannes_Film_Festival', 'cannes', 2000)
('https://en.wikipedia.org/wiki/2001_Cannes_Film_Festival', 'cannes', 2001)
('https://en.wikipedia.org/wiki/2002_Cannes_Film_Festival', 'cannes', 2002)
('https://en.wikipedia.org/wiki/2003_Cannes_Film_Festival', 'cannes', 2003)
('https://en.wikipedia.org/wiki/2004_Cannes_Film_Festival', 'cannes', 2004)
('https://en.wikipedia.org/wiki/2005_Cannes_Film_Festival', 'cannes', 2005)
('https://en.wikipedia.org/wiki/2006_Cannes_Film_Festival', 'cannes', 2006)
('https://en.wikipedia.org/wiki/2nd_Venice_International_Film_Festival', 'venice', 1934)
('https://en.wikipedia.org/wiki/3rd_Venice_International_Film_Festival', 'venice', 1935)
('https://en.wikipedia.org/wiki/4th_Venice_International_Film_

In [8]:
# Preview only the first rows instead of rendering the whole scraped frame
df_festival.head(10)

Unnamed: 0,title_english,title_original,director,country,winner,festival,year_festival,link_film,link_director
0,Adieu Bonaparte,وداعا بونابرت,Youssef Chahine,Egypt,0,cannes,1985,https://en.wikipedia.org/wiki/Adieu_Bonaparte,https://en.wikipedia.org/wiki/Youssef_Chahine
1,Birdy,,Alan Parker,United States,0,cannes,1985,https://en.wikipedia.org/wiki/Birdy_(film),https://en.wikipedia.org/wiki/Alan_Parker
2,Bliss,,Ray Lawrence,Australia,0,cannes,1985,https://en.wikipedia.org/wiki/Bliss_(1985_film),https://en.wikipedia.org/wiki/Ray_Lawrence_(fi...
3,Chicken with Vinegar,Poulet au vinaigre,Claude Chabrol,France,0,cannes,1985,https://en.wikipedia.org/wiki/Chicken_with_Vin...,https://en.wikipedia.org/wiki/Claude_Chabrol
4,The Coca-Cola Kid,,Dušan Makavejev,Australia,0,cannes,1985,https://en.wikipedia.org/wiki/The_Coca-Cola_Kid,https://en.wikipedia.org/wiki/Du%C5%A1an_Makav...
5,Colonel Redl,Oberst Redl,István Szabó,Hungary,0,cannes,1985,https://en.wikipedia.org/wiki/Colonel_Redl,https://en.wikipedia.org/wiki/Istv%C3%A1n_Szab...
6,Derborence,,Francis Reusser,Switzerland,0,cannes,1985,https://en.wikipedia.org/wiki/Derborence_(film),https://en.wikipedia.org/wiki/Francis_Reusser
7,Détective,Détective,Jean-Luc Godard,France,0,cannes,1985,https://en.wikipedia.org/wiki/D%C3%A9tective_(...,https://en.wikipedia.org/wiki/Jean-Luc_Godard
8,Farewell to the Ark,さらば箱舟,Shūji Terayama,Japan,0,cannes,1985,https://en.wikipedia.org/wiki/Farewell_to_the_Ark,https://en.wikipedia.org/wiki/Sh%C5%ABji_Teray...
9,Insignificance,,Nicolas Roeg,United Kingdom,0,cannes,1985,https://en.wikipedia.org/wiki/Insignificance_(...,https://en.wikipedia.org/wiki/Nicolas_Roeg


#### Manual

In [None]:
# CANNES
# table: 1985, 1991, 1993 to 1994, 2007 to 2019, inclusive
# bullet: 1949 ("directed by"), 1952 to 1967, 1969 to 1979, 1981 to 1984, 1986 to 1990, 1992, 1995 to 2006, inclusive
# exceptions: 1946 (films fine, awards manual, "directed by"), 1947 (films fine, awards manual), 1948 (nonexistent), 1950 (nonexistent), 1951 (films fine, awards manual), 1968 (films fine, no awards given), 1980 (diff tag format, manual)

# VENICE
# table: 1939, 1947 to 1968, 1980 to 2019, inclusive
# bullet: 1934 to 1938, 1969 to 1972, 1979, inclusive
# exceptions: 1932 (films not available, so treat as nonexistent, will not include in dataset), 1969 (country), 1970 (country), 1971 (country), 1972 (country), 1979 (country)

# BERLIN
# table: 1951 to 2009, 2011 to 2019, inclusive
# bullet: none
# exceptions: 2010 (diff table format, manual)

In [9]:
# 1946 Cannes -- fill in the winner flags that get_festival_bullet leaves at 0 for this year

url_cannes_1946 = 'https://en.wikipedia.org/wiki/1946_Cannes_Film_Festival'
r_cannes_1946 = requests.get(url_cannes_1946)
soup_cannes_1946 = BeautifulSoup(r_cannes_1946.content, 'lxml')

# In the list following the "Official awards" heading, the Palme d'Or entry
# holds a nested list; every link in that nested list counts as a winner link.
awards_list_1946 = soup_cannes_1946.find('span', {'id': "Official_awards"}).parent.find_next_sibling('ul')
palmedor_item_1946 = awards_list_1946.find('a', {'title': "Palme d'Or"}).parent
palmedor = ['https://en.wikipedia.org' + a['href'] for a in palmedor_item_1946.ul.find_all('a')]

# Select the Cannes 1946 rows once and reuse the mask for reading and writing.
is_cannes_1946 = (df_festival['festival'] == 'cannes') & (df_festival['year_festival'] == 1946)
link_film = df_festival.loc[is_cannes_1946, 'link_film']
winner = [int(link in palmedor) for link in link_film]
df_festival.loc[is_cannes_1946, 'winner'] = winner

In [10]:
# 1947 Cannes -- fill in the winner flags that get_festival_bullet leaves at 0 for this year

url_cannes_1947 = 'https://en.wikipedia.org/wiki/1947_Cannes_Film_Festival'
r_cannes_1947 = requests.get(url_cannes_1947)
r_cannes_1947.raise_for_status() # fail on the HTTP error itself rather than on parsing an error page
soup_cannes_1947 = BeautifulSoup(r_cannes_1947.content, 'lxml')

# On this page every link in the list following the "Awards" heading is treated as a winner link
# (there is no "Palme d'Or" filter here, unlike the 1946 and 1951 cells).
awards_list_1947 = soup_cannes_1947.find('span', {'id': "Awards"}).parent.find_next_sibling('ul')
palmedor = ['https://en.wikipedia.org' + a['href'] for a in awards_list_1947.find_all('a')]

is_cannes_1947 = (df_festival['festival'] == 'cannes') & (df_festival['year_festival'] == 1947)
link_film = df_festival.loc[is_cannes_1947, 'link_film']
winner = [1 if link in palmedor else 0 for link in link_film]
df_festival.loc[is_cannes_1947, 'winner'] = winner

# Manual override for a film the link matching above does not catch. It is restricted to the
# Cannes 1947 rows so that an identically titled entry elsewhere is never flagged by accident.
df_festival.loc[is_cannes_1947 & (df_festival['title_english'] == 'Antoine et Antoinette'), 'winner'] = 1

In [11]:
# 1951 Cannes -- fill in the winner flags that get_festival_bullet leaves at 0 for this year

url_cannes_1951 = 'https://en.wikipedia.org/wiki/1951_Cannes_Film_Festival'
r_cannes_1951 = requests.get(url_cannes_1951)
soup_cannes_1951 = BeautifulSoup(r_cannes_1951.content, 'lxml')

# In the list following the "Awards" heading, the Palme d'Or entry holds a
# nested list; every link in that nested list counts as a winner link.
awards_list_1951 = soup_cannes_1951.find('span', {'id': "Awards"}).parent.find_next_sibling('ul')
palmedor_item_1951 = awards_list_1951.find('a', {'title': "Palme d'Or"}).parent
palmedor = ['https://en.wikipedia.org' + a['href'] for a in palmedor_item_1951.ul.find_all('a')]

# Select the Cannes 1951 rows once and reuse the mask for reading and writing.
is_cannes_1951 = (df_festival['festival'] == 'cannes') & (df_festival['year_festival'] == 1951)
link_film = df_festival.loc[is_cannes_1951, 'link_film']
winner = [int(link in palmedor) for link in link_film]
df_festival.loc[is_cannes_1951, 'winner'] = winner

In [12]:
# 1980 Cannes -- parsed by hand because the list items use a different tag format

url_cannes_1980 = 'https://en.wikipedia.org/wiki/1980_Cannes_Film_Festival'
r_cannes_1980 = requests.get(url_cannes_1980)
r_cannes_1980.raise_for_status() # fail on the HTTP error itself rather than on parsing an error page
soup_cannes_1980 = BeautifulSoup(r_cannes_1980.content, 'lxml')

title_english = []
director = []
link_film = []
link_director = []

for child in soup_cannes_1980.find('span', {'id': "In_competition_-_Feature_film"}).parent.find_next_sibling('ul').find_all('li'):
    # The item text is split on ' by ': the part before is the title, the part after the director.
    # An item without the separator gets director 'N/A' instead of raising IndexError.
    text = child.get_text()
    parts = text.split(' by ')
    title_english.append(parts[0].strip(' ') if text else 'N/A')
    director.append(parts[1].strip(' ') if len(parts) > 1 else 'N/A')
    
    # The first link is taken as the film, any further links as the director(s).
    links = ['https://en.wikipedia.org' + a['href'] for a in child.find_all('a')]
    link_film.append(links[0] if links else 'N/A')
    # 'N/A' (not an empty string) when only the film is linked, matching the rest of the dataset
    link_director.append(' '.join(links[1:]) or 'N/A')

palmedor = [] # all links associated to the Palme d'Or
for child in soup_cannes_1980.find('span', {'id': "Official_awards"}).parent.find_next_sibling('ul').find('a', {'title': "Palme d'Or"}).parent.ul.find_all('a'):
    palmedor.append('https://en.wikipedia.org' + child['href'])
winner = [1 if link in palmedor else 0 for link in link_film]

df_cannes_1980 = pd.DataFrame({'title_english': title_english,
                               'title_original': 'N/A',
                               'director': director,
                               'country': 'N/A',
                               'winner': winner,
                               'festival': 'cannes',
                               'year_festival': 1980,
                               'link_film': link_film,
                               'link_director': link_director})

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell adds the 1980 rows again.
df_festival = pd.concat([df_festival, df_cannes_1980], ignore_index=True)

In [13]:
# 2010 Berlin -- parsed by hand because the table has a different format
# (columns read here: title, director, country; the country is taken from a data-sort-value attribute)

url_berlin_2010 = 'https://en.wikipedia.org/wiki/60th_Berlin_International_Film_Festival'
r_berlin_2010 = requests.get(url_berlin_2010)
r_berlin_2010.raise_for_status() # fail on the HTTP error itself rather than on parsing an error page
soup_berlin_2010 = BeautifulSoup(r_berlin_2010.content, 'lxml')
tables = soup_berlin_2010.find_all('table', {'class': 'wikitable'})
    
title_english = []
director = []
country = []
winner = []
link_film = []
link_director = []

trs = tables[0].find_all('tr')[1:] # first row is the header
for tr in trs:
    tds = tr.find_all('td')
    if not tds: # header-only (<th>) rows carry no film data
        continue
    
    italic = tds[0].find('i')
    title_english.append(italic.get_text() if italic else 'N/A')
    director.append(tds[1].get_text().strip('\n'))
    country.append(tds[2].span['data-sort-value'])
    
    # a title without a link is recorded as 'N/A' instead of raising AttributeError
    film_anchor = tds[0].find('a')
    link_film.append('https://en.wikipedia.org' + film_anchor.get('href') if film_anchor else 'N/A')
    
    director_links = ['https://en.wikipedia.org' + link.get('href') for link in tds[1].find_all('a')]
    link_director.append(' '.join(director_links) if director_links else 'N/A')
    
    # winner: flagged when the row carries a style attribute
    winner.append(1 if tr.has_attr('style') else 0)

df_berlin_2010 = pd.DataFrame({'title_english': title_english,
                               'title_original': 'N/A',
                               'director': director,
                               'country': country,
                               'winner': winner,
                               'festival': 'berlin',
                               'year_festival': 2010,
                               'link_film': link_film,
                               'link_director': link_director,})

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell adds the 2010 rows again.
df_festival = pd.concat([df_festival, df_berlin_2010], ignore_index=True)

In [14]:
# Trim leading/trailing spaces from every text column of the combined frame.
for text_column in ['title_english', 'title_original', 'director', 'country', 'link_film', 'link_director']:
    df_festival[text_column] = [value.strip(' ') for value in df_festival[text_column]]

In [15]:
# Write the final dataset; the row index is stored under the column label 'id'.
final_csv_path = './data/festivals.csv'
df_festival.to_csv(final_csv_path, index_label='id')