In [25]:
import wikipediaapi
from bs4 import BeautifulSoup
import requests
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON, CSV

# Shared configuration: a single User-Agent string is handed to both clients below.
user_agent = "Mus/1.0"
# Wikipedia client bound to the English edition (language='en').
wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')
# SPARQL client pointed at the Wikidata query endpoint.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)

In [26]:
def get_films_by_year(year):
    """Return the titles listed in the Wikipedia category "<year> films".

    Args:
        year: Value interpolated into the category name ``Category:{year} films``.

    Returns:
        A list of page titles for the category members whose namespace is
        ``wikipediaapi.Namespace.MAIN``; members in any other namespace are
        skipped. An empty list (after printing a notice) when the category
        page does not exist.
    """
    category_name = f"Category:{year} films"
    category = wiki_wiki.page(category_name)

    # Nothing to enumerate when the category itself is missing.
    if not category.exists():
        print(f"Category '{category_name}' does not exist.")
        return []

    # Walk the members in their original order, keeping main-namespace pages only.
    titles = []
    for member in category.categorymembers.values():
        if member.ns == wikipediaapi.Namespace.MAIN:
            titles.append(member.title)
    return titles


In [27]:
def _fetch_html(url):
    """Fetch ``url`` and return the response body, or None on any failure.

    A network-level error is printed and reported as None; a non-200 status
    is reported as None as well.
    """
    try:
        # Identify the client with the notebook's User-Agent and bound the
        # wait so one stalled connection cannot hang a long scraping loop.
        response = requests.get(url, headers={"User-Agent": user_agent}, timeout=30)
    except requests.RequestException as exc:
        print(f"Request to '{url}' failed: {exc}")
        return None
    if response.status_code != 200:
        return None
    return response.content


def get_id(page_title):
    """Look up the Wikidata item ID and Wikipedia page ID of an article.

    The article HTML is fetched, its "More information about this page" link
    is followed, and the two-cell table rows of that page are scanned for the
    "Wikidata item ID" and "Page ID" labels.

    Args:
        page_title: Title of the English Wikipedia article.

    Returns:
        A ``(wikidata_id, page_id)`` tuple of stripped strings. Either element
        is None when its row is absent, and ``(None, None)`` is returned when
        the page does not exist or cannot be fetched. A tuple is returned on
        every path so callers can always unpack the result.
    """
    # Reuse the module-level client instead of building a new one per call.
    page = wiki_wiki.page(page_title)

    if not page.exists():
        print(f"Page '{page_title}' does not exist.")
        return None, None

    html = _fetch_html(page.canonicalurl)
    if html is None:
        print(f"Failed to retrieve page '{page_title}'.")
        return None, None

    soup = BeautifulSoup(html, 'html.parser')

    # The sidebar "Page information" link leads to the page holding both IDs.
    info_link = soup.find("a", title="More information about this page")
    href = info_link.get("href") if info_link is not None else None
    if not href:
        print(f"No page information link found for '{page_title}'.")
        return None, None

    info_html = _fetch_html("https://en.wikipedia.org" + href)
    if info_html is None:
        print(f"Failed to retrieve page information for '{page_title}'.")
        return None, None

    page_info_soup = BeautifulSoup(info_html, 'html.parser')

    # Default both IDs so a missing row yields None instead of an unbound name.
    wikidata_id = None
    page_id = None
    for row in page_info_soup.find_all('tr'):
        cells = row.find_all('td')
        # Only label/value rows (exactly two cells) are of interest.
        if len(cells) != 2:
            continue
        label = cells[0].text
        value = cells[1].text.strip()
        if "Wikidata item ID" in label:
            wikidata_id = value
        elif "Page ID" in label:
            page_id = value

    return wikidata_id, page_id



In [28]:
def get_wikidata_info(wikidata_id, sparqlAgent):
    """Query Wikidata for the film metadata of a single item.

    The query requires the item to be an instance of film (wd:Q11424) with at
    least one release date, and optionally collects box office, runtime,
    languages, countries, genres, review scores, awards received, award
    nominations and capital cost. Multi-valued label fields are joined with
    ", " via GROUP_CONCAT. Because the result is grouped by ?runtime and
    ?capitalCost, an item with several such values yields several rows.

    Args:
        wikidata_id: Wikidata item ID such as "Q47703".
        sparqlAgent: Object exposing ``setQuery``, ``setReturnFormat`` and
            ``query().convert()``; its query and return format are overwritten.

    Returns:
        The ``results.bindings`` list of the JSON response (possibly empty).
        An empty list is also returned, after printing a notice, when
        ``wikidata_id`` is not of the form "Q<digits>", in which case no
        request is sent.
    """
    # The ID is spliced verbatim into the query text, so accept only a plain
    # item ID; anything else (scraped junk, stray SPARQL syntax) is treated as
    # "no result", which callers already handle.
    is_item_id = (
        isinstance(wikidata_id, str)
        and wikidata_id.isascii()
        and wikidata_id[:1] == "Q"
        and wikidata_id[1:].isdigit()
    )
    if not is_item_id:
        print(f"'{wikidata_id}' is not a valid Wikidata item ID.")
        return []

    query = f"""
      SELECT ?film ?filmLabel (MIN(?releaseDate) AS ?earliestReleaseDate) (MAX(?boxOffice) AS ?highestBoxOffice) ?runtime 
            (GROUP_CONCAT(DISTINCT ?languageLabel; separator=", ") AS ?languages) 
            (GROUP_CONCAT(DISTINCT ?countryLabel; separator=", ") AS ?countries) 
            (GROUP_CONCAT(DISTINCT ?genreLabel; separator=", ") AS ?genres)
            (GROUP_CONCAT(DISTINCT ?reviewScoreLabel; separator=", ") AS ?reviewScores)
            (GROUP_CONCAT(DISTINCT ?awardLabel; separator=", ") AS ?awardsReceived)
            (GROUP_CONCAT(DISTINCT ?nominatedAwardLabel; separator=", ") AS ?awardsNominated)
            ?capitalCost WHERE {{
        BIND(wd:{wikidata_id} AS ?film)  # Using the specific Wikidata movie ID
        
        ?film wdt:P31 wd:Q11424;  # Instance of film
              wdt:P577 ?releaseDate.
        
        OPTIONAL {{ ?film wdt:P2142 ?boxOffice. }}
        OPTIONAL {{ ?film wdt:P2047 ?runtime. }}
        OPTIONAL {{ ?film wdt:P364 ?language. ?language rdfs:label ?languageLabel. FILTER(LANG(?languageLabel) = "en") }}
        OPTIONAL {{ ?film wdt:P495 ?country. ?country rdfs:label ?countryLabel. FILTER(LANG(?countryLabel) = "en") }}
        OPTIONAL {{ ?film wdt:P136 ?genre. ?genre rdfs:label ?genreLabel. FILTER(LANG(?genreLabel) = "en") }}
        OPTIONAL {{ ?film wdt:P444 ?reviewScoreLabel. }}
        OPTIONAL {{ ?film wdt:P166 ?award. ?award rdfs:label ?awardLabel. FILTER(LANG(?awardLabel) = "en") }}
        OPTIONAL {{ ?film wdt:P1411 ?nominatedAward. ?nominatedAward rdfs:label ?nominatedAwardLabel. FILTER(LANG(?nominatedAwardLabel) = "en") }}
        OPTIONAL {{ ?film wdt:P2130 ?capitalCost. }}

        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
      }}
      GROUP BY ?film ?filmLabel ?runtime ?capitalCost
      """

    sparqlAgent.setQuery(query)
    sparqlAgent.setReturnFormat(JSON)
    results = sparqlAgent.query().convert()

    return results['results']['bindings']

In [None]:
def _binding_value(row, key):
    """Return the ``value`` bound to ``key`` in a SPARQL result row, or None.

    Both an unbound variable and an empty string are reported as None, so an
    aggregate that matched nothing is treated the same as a missing field.
    """
    cell = row.get(key)
    if cell is None:
        return None
    return cell.get('value') or None


def _split_labels(joined):
    """Split a ", "-joined label string into a list; None stays None."""
    return joined.split(", ") if joined is not None else None


def get_film_info(film_title):
    """Collect Wikipedia and Wikidata metadata for one film.

    Args:
        film_title: Title of the film's English Wikipedia article.

    Returns:
        None (after printing a notice) when no Wikidata ID can be determined.
        Otherwise a dict with the keys, in this order: ``page_id``,
        ``wikidata_id``, ``film``, ``release_date``, ``box_office``,
        ``runtime``, ``languages``, ``countries``, ``genres``,
        ``reviewScores``, ``awardsReceived``, ``awardsNominated``,
        ``capitalCost``. Fields Wikidata does not provide are None, and
        ``film`` falls back to ``film_title``. ``genres``, ``reviewScores``,
        ``awardsReceived`` and ``awardsNominated`` are lists; genres longer
        than two words are dropped and the word "film" is removed from the
        remaining ones.
    """
    # get_id may signal failure with a bare None rather than a pair, so guard
    # the unpacking instead of letting it raise TypeError.
    ids = get_id(film_title)
    wikidata_id, page_id = ids if ids is not None else (None, None)

    if wikidata_id is None:
        print(f"Failed to retrieve Wikidata ID for '{film_title}'.")
        return None

    film_info = get_wikidata_info(wikidata_id, sparql)

    # Defaults double as the result when the query returns no rows.
    info = {
        'page_id': page_id,
        'wikidata_id': wikidata_id,
        'film': film_title,
        'release_date': None,
        'box_office': None,
        'runtime': None,
        'languages': None,
        'countries': None,
        'genres': None,
        'reviewScores': None,
        'awardsReceived': None,
        'awardsNominated': None,
        'capitalCost': None,
    }

    # The query can return several rows for one film; as before, the last
    # row's values win.
    for film in film_info:
        info['film'] = _binding_value(film, 'filmLabel') or film_title
        info['release_date'] = _binding_value(film, 'earliestReleaseDate')
        info['box_office'] = _binding_value(film, 'highestBoxOffice')
        info['runtime'] = _binding_value(film, 'runtime')
        info['languages'] = _binding_value(film, 'languages')
        info['countries'] = _binding_value(film, 'countries')

        genres = _split_labels(_binding_value(film, 'genres'))
        if genres is not None:
            # Keep short genre names only and strip the redundant word "film".
            genres = [genre.replace("film", "").strip()
                      for genre in genres if len(genre.split()) <= 2]
        info['genres'] = genres

        info['reviewScores'] = _split_labels(_binding_value(film, 'reviewScores'))
        info['awardsReceived'] = _split_labels(_binding_value(film, 'awardsReceived'))
        info['awardsNominated'] = _split_labels(_binding_value(film, 'awardsNominated'))

        info['capitalCost'] = _binding_value(film, 'capitalCost')

    return info

In [30]:
# Try the pipeline on a single title; `infos` holds whatever get_film_info returns for it.
infos = get_film_info("The Godfather")

In [31]:
# Build a one-row frame from the single-film dict: orient='index' puts the
# fields on the rows, .T turns them into columns, and page_id becomes the index.
df = pd.DataFrame.from_dict(infos, orient='index').T.set_index('page_id')

df

Unnamed: 0_level_0,wikidata_id,film,release_date,box_office,runtime,languages,countries,genres,reviewScores,awardsReceived,awardsNominated,capitalCost
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2466773,Q47703,The Godfather,1972-03-15T00:00:00Z,250341816,175,"Italian, English","Italy, United States of America","[drama, epic, crime, thriller, gangster, histo...","[100/100, 97%, 9.4/10]",[Golden Globe Award for Best Motion Picture – ...,"[Academy Award for Best Director, Academy Awar...",6000000


In [32]:
# Maps each year to the list of titles that get_films_by_year returns for it.
films_since_2013 = {}

for year in range(2013, 2025):  # 2013 through 2024; range() excludes the stop value, so raise it to add years
    films = get_films_by_year(year)
    films_since_2013[year] = films
    print(f"{year}: Retrieved {len(films)} films.")

2013: Retrieved 3325 films.
2014: Retrieved 3478 films.
2015: Retrieved 3504 films.
2016: Retrieved 3447 films.
2017: Retrieved 3310 films.
2018: Retrieved 3253 films.
2019: Retrieved 3317 films.
2020: Retrieved 2268 films.
2021: Retrieved 2580 films.
2022: Retrieved 2887 films.
2023: Retrieved 2827 films.
2024: Retrieved 2191 films.


In [33]:
# Scrape every collected title year by year and write one CSV per year.
# film_dict maps a film title to its metadata dict for the year in progress.
film_dict = {}

for year, films in films_since_2013.items():
    print(f"\n{year} films:")
    for film in films:
        film_info = get_film_info(film)
        if film_info is not None:
            film_dict[film] = film_info
            print(f'processed film {film}.')
        else:
            print(f"Failed to retrieve information for '{film}'.")

    # With a dict of dicts, orient='index' already gives one row per film and
    # one column per field. A trailing .T (needed only for a single flat dict)
    # would flip this to one column per film, so it is deliberately omitted.
    df = pd.DataFrame.from_dict(film_dict, orient='index')
    df.to_csv(f'films_{year}.csv')

    # Start the next year from an empty dict so its CSV holds that year only.
    film_dict = {}

    print(f'Processed films for {year}.')
        
        


2013 films:
processed film +1 (film).
processed film 1 (2013 film).
processed film The 1 Up Fever.
processed film 1,001 Apples.
processed film 2 Autumns, 3 Winters.
processed film 2 Guns.
processed film 3 Days in Havana.
processed film 3 Dots.
processed film 3 Geezers!.
processed film 3 Peas in a Pod.
processed film 3G (film).
processed film 3G Love.
processed film 3x3D.
processed film 5 Sundarikal.
processed film 6 (film).
processed film 6-5=2.
processed film 7 Assassins.
processed film 7 Pecados Rurais.
processed film 7th Floor.
processed film 8-pallo.
processed film 9 Full Moons.
processed film 9 Meter.
processed film 9 Month Stretch.
processed film 10 Minutes (2013 film).
processed film 10 Rules for Sleeping Around.
processed film 10%: What Makes a Hero?.
processed film 10,000 Hours (film).


KeyboardInterrupt: 