In [1]:
from contextlib import redirect_stdout
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import wikipedia
import langdetect
import difflib
from typing import Optional

In [3]:
if 1:

    print(2)

2


In [5]:
def pascal(n: int):
    res = []
    res.append([1])
    if n == 1:
        return res
    res.append([1, 1])
    if n == 2:
        return res
    for idx in range(2, n):
        prev = res[idx - 1]
        curr = []
        for i in range(len(prev) + 1):
            j = i - 1
            if i >= 0 and i < len(prev):
                v_i = prev[i]
            else:
                v_i = 0
            if j >= 0 and j < len(prev):
                v_j = prev[j]
            else:
                v_j = 0
            curr.append(v_i + v_j)
        res.append(curr)
    return res


pascal(5)

[[1], [1, 1], [1, 2, 1], [1, 3, 3, 1], [1, 4, 6, 4, 1]]

In [52]:
# IO Function
def read_input_movies(file: str = "input.txt") -> list[str]:
    """Read a list of movies from a give text file.

    Args:
        file (str, optional): The path to the file. Defaults to "input.txt".

    Returns:
        list[str]: The list of all movies specified in the text file.
    """
    with open(file) as f:
        lines = f.readlines()
    return [line.strip() for line in lines]


# Scrape The infobox of movies
def scrape_infobox(infobox: bs4.element.Tag, lang: str) -> dict:
    """Generate the key-value mapping of the informations found in

    Args:
        infobox (bs4.element.Tag): _description_
        lang (str): _description_

    Returns:
        _type_: _description_
    """

    tr_in_info_box = infobox.find_all("tr")
    mapping = {}
    for tr in tr_in_info_box:
        th = tr.find("th")
        if not th:
            continue
        key = th.text.strip()
        entries = tr.find("td")
        if not entries:
            continue
        links = entries.find_all("a" if lang == "fr" else "li")
        values = []
        for link in links:
            values.append(link.text.strip())

        if len(values) == 0:
            values.append(entries.text.strip())

        mapping[key] = values
    mapping["lang"] = lang
    return mapping


# Clean titles input
def clean_inputs(titles: list[str]) -> list[str]:
    """Clean titles given by the user so that they match the casing of the URL

    Args:
        titles (list[str]): the input titles

    Returns:
        list[str]: the input titles formatted
    """
    # TODO : Add more cases as we go !
    processed_titles = []
    for title in titles:
        split = title.split()
        # capitalized = [word.capitalize() for word in split]
        processed_titles.append("_".join(split))
    return processed_titles


def clean_outputs(infos: dict[str, any]) -> dict[str, any]:
    """Clean the output dictionary so that they all have a corresponding format

    Args:
        infos (dict): dictionary containing the corresponding information retrieved for a given movie

    Returns:
        dict[str, any]: cleaned dictionary
    """
    TRANSLATE_CATEGORIES = {
        "Directed by": "Réalisation",
        "Starring": "Acteurs principaux",
        "Release date": "Sortie",
        "Running time": "Durée",
        "Country": "Pays de production",
    }

    # Rename only one Genre Category
    if "Genres" in infos.keys():
        infos["Genre"] = infos.pop("Genres")

    if infos["lang"] == "en":
        keys = list(infos.keys())
        for key in keys:
            if key in TRANSLATE_CATEGORIES:
                infos[TRANSLATE_CATEGORIES[key]] = infos.pop(key)

    # Regex match year only in release date
    if "Sortie" in infos.keys():
        infos["Sortie"] = [re.search(r"\d+\d+\d+\d+", infos["Sortie"][0]).group(0)]

    # Clean the outputs
    for key, value in infos.items():
        if type(value) == str or not value:
            continue

        infos[key] = [re.sub(r"\[.*\]", "", v) for v in value if v and v != ""]
    return infos

In [53]:
def search_potential_articles(titles: list[str], k: int = 5) -> dict[str, list[str]]:
    """Search potential articles corresponding to titles in the list using the Wikipedia Python API.
    It will search the french wikipedia for title that are detected in french and the english one for any other language.

    Note : This is to prevent querying languages that I dont speak as my movie list mainly contain these two languages.
    We might want to use a more accurate language detector.

    Args:
        titles (list[str]): List of the movie titles.

    Returns:
        dict[str, list[str]]: _description_
    """

    potential_articles = {}
    for title in titles:
        if langdetect.detect(title) == "fr":
            wikipedia.set_lang("fr")
            wiki_lang = "fr"
        else:
            wikipedia.set_lang("en")
            wiki_lang = "en"

        articles = wikipedia.search(title + "_(film)", results=k)
        potential_articles[title] = (articles, wiki_lang)
        # print(title, wiki_lang)
    return potential_articles


def check_title_overlap(title: str, target: str) -> bool:
    """Check if the title is contained in the target or the other way around.

    Args:
        title (str): The seach title.
        target (str): The page title that we want to compare it to.

    Returns:
        bool: True if title is totally contained in target or the other way around.
    """
    return title.lower() in target.lower() or target.lower() in title.lower()


def get_most_likely_article(
    title: str, potential_articles: list[str], verbose: bool = True
) -> Optional[str]:
    """Determines the most likely article name from a list of potential articles.
    Use string similarity and heuristics to determine the best match.

    Args:
        title (str): The title we search.
        potential_articles (list[str]): The list of articles retrieved on Wikipedia.
        verbose (bool, optional): Toggle detailed print in case of error. Defaults to True.

    Returns:
        Optional[str]: The most likely article title or None if the answer is too unsure.
    """
    if not potential_articles:
        return None

    similarities = [
        difflib.SequenceMatcher(None, title, p).ratio() for p in potential_articles
    ]
    highest_similartiy_index = np.argmax(similarities)

    # If the title is completely contained in page 0, return it.
    top_article = potential_articles[0]
    if check_title_overlap(title, top_article):
        return top_article

    # If we're confident enough in the first page and it either mentions 'film' or has the highest similarity in the list.
    if similarities[0] >= 0.6:
        #
        if (
            "film" in top_article
            or similarities[0] >= similarities[highest_similartiy_index]
        ):
            return top_article

    # If none of the pages are similar to the query, assume that there is a spelling error and return None
    if similarities[highest_similartiy_index] < 0.6:
        if verbose:
            print(
                f"The retrived pages for {title} are pretty uncertain. Try changing the spelling to one of the following : \n {potential_articles}"
            )
        return None
    return potential_articles[highest_similartiy_index]

In [54]:
def detect_genre_for_en_lang(page: wikipedia.wikipedia.WikipediaPage) -> Optional[str]:
    summary = page.summary[:256].split(" ")
    genre = None
    if "film" in summary:
        idx_film = summary.index("film")
        if "American" in summary:
            genre = " ".join(summary[summary.index("American") + 1 : idx_film])
        else:
            genre = " ".join(summary[summary.index("film") - 4 : summary.index("film")])
    return genre


def scrape_wikipedia_article(title: str, article: str, lang: str):

    # Get the page off wikipedia
    page = wikipedia.page(title=article, auto_suggest=False)

    # Parse it using bs4
    good_soup = BeautifulSoup(page.html(), "html.parser")

    movies_found = {}
    # Look for the 'infobox_v3' div of the page which is the movie header.

    if lang == "fr":
        tag_type = "div"
        infobox_class = "infobox_v3"
    else:
        tag_type = "table"
        infobox_class = "infobox"
    infoboxes = good_soup.find_all(tag_type, class_=infobox_class)
    if len(infoboxes) > 0:
        infos = scrape_infobox(infoboxes[0], lang)

        if lang == "en":
            genre = detect_genre_for_en_lang(page)
            infos["Genre"] = [genre]

        infos = clean_outputs(infos)
        print(f"Retrieved informations for {title} !")
        movies_found[title] = infos
        print(title, infos)
    else:
        print(f"Could not find any infoboxes on the page of {title} !")
        return None
    return infos

In [58]:
# Post processing
def convert_duration(duration):
    if duration:
        mins = duration.replace("\xa0", " ").split(" ")[0]
        try:
            int_mins = int(mins)
            display_hours = int_mins // 60
            display_mins = int_mins % 60
            display_mins = (
                f"0{display_mins}" if display_mins < 10 else f"{display_mins}"
            )
            return (
                f"{display_hours}h{display_mins}"
                if display_hours > 0
                else f"{display_mins}mins"
            )
        except ValueError:
            return None
    else:
        return None


def get_informations(movie_infos, categorie, index: int = None):
    # Check that the categorie exist in the retrived informations o.w return None
    if categorie not in movie_infos:
        return None

    #
    info = movie_infos[categorie]
    if not index:
        return info

    if index and len(info) >= index:
        return info[index]

    return None


def movie_to_df_row(title, movie_infos):
    # TODO: Move Constants as global variables
    REALISATEUR = "Réalisation"
    GENRE = "Genre"
    ACTEURS = "Acteurs principaux"
    SORTIE = "Sortie"
    DUREE = "Durée"
    PAYS_PROD = "Pays de production"
    # Get values then aggregate
    realisation = get_informations(movie_infos, REALISATEUR, 0)
    print(title, movie_infos[GENRE])
    genre = get_informations(movie_infos, GENRE, 0)
    premier_role = get_informations(movie_infos, ACTEURS, 0)
    second_role = get_informations(movie_infos, ACTEURS, 1)
    sortie = get_informations(movie_infos, SORTIE, 0)
    duree = get_informations(movie_infos, DUREE, 0)
    duree = convert_duration(duree)
    pays_production = get_informations(movie_infos, PAYS_PROD, 0)
    return [
        title,
        None,  # Note
        None,  # Remarques
        realisation,
        genre,
        premier_role,
        second_role,
        sortie,
        duree,
        None,  # Rythme
        None,  # Accessibilité
        None,  # Violence
        None,  # Recompenses TODO: add this
        pays_production,
    ]


def write_movies_df(movies):
    # TODO: Move this as a global variable
    categories = [
        "Nom",
        "Note",
        "Remarques",
        "Réalisateur",
        "Style/Genre",
        "Acteur principal",
        "Second rôle",
        "Année de sortie",
        "Durée",
        "Rythme",
        "Accessibilité",
        "Violence/Effrayant",
        "Récompense Oscar etc",
        "Pays producteur",
    ]

    processed_movies = []

    for movie, infos in movies.items():
        processed_movies.append(movie_to_df_row(movie, infos))

    return pd.DataFrame(processed_movies, columns=categories)


def save_as_excel(df: pd.DataFrame, file: str = "output.xlsx"):
    df.to_excel(file, index=False)

In [56]:
titles = read_input_movies()
potential_articles = search_potential_articles(titles)

movies_articles = {}
for title, (page_list, lang) in potential_articles.items():
    most_likely_article = get_most_likely_article(title, page_list)
    # print(title, "-", most_likely_article)
    movies_articles[title] = (most_likely_article, lang)


movies_infos = {}
cumul = 0
for title, (article, lang) in movies_articles.items():
    if not article:
        continue

    # if cumul > 5:
    #     break
    # cumul += 1
    wikipedia.set_lang(lang)
    print(title, article, lang)
    infos = scrape_wikipedia_article(title, article, lang)
    if infos:
        infos["lang"] = lang
    movies_infos[title] = infos
    print()

The retrived pages for Fog hill of five element are pretty uncertain. Try changing the spelling to one of the following : 
 ['Silent Hill 2', 'List of films: F', 'List of cult films: F', 'Alice in Chains', 'Escape from New York']
The retrived pages for Sexy Dance 2 are pretty uncertain. Try changing the spelling to one of the following : 
 ['Sexy, Sexy, Sexy', 'Sexy and I Know It', 'Dance Dance (film)', 'Sexy Boy (Air song)', 'You Sexy Thing']
Armageddon Armageddon (1998 film) en
Retrieved informations for Armageddon !
Armageddon {'Screenplay by': ['Jonathan Hensleigh', 'J. J. Abrams'], 'Adaptation by': ['Tony Gilroy', 'Shane Salerno'], 'Story by': ['Robert Roy Pool', 'Jonathan Hensleigh'], 'Produced by': ['Michael Bay', 'Jerry Bruckheimer', 'Gale Anne Hurd'], 'Cinematography': ['John Schwartzman'], 'Edited by': ['Mark Goldblatt', 'Chris Lebenzon', 'Glen Scantlebury'], 'Music by': ['Trevor Rabin'], 'Productioncompanies': ['Touchstone Pictures', 'Jerry Bruckheimer Films', 'Valhalla Moti

In [59]:
write_movies_df(movies_infos)

Armageddon ['science fiction disaster']
Project X ['found footage teen comedy']
L'enfant, la taupe, le renard et le cheval ['aventure', 'animation']
La planète sauvage ['Animation', 'Aventures', 'Science-fiction']
Spartacus ['epic historical drama']
Inspecteur Harry ['policier']
The Art of Flight ['Red Bull sponsored documentary']
Get Out ['psychological black horror']
Des hommes sans loi ['Biopic', 'drame', 'action', 'film de gangsters']
The King's avatar []


IndexError: list index out of range

In [39]:
re.search(r"\d+\d+\d+\d+", "July 1, 1998 (1998-07-01)").group(0)

'1998'

### Archives

In [None]:
# titles = [
#     "Armageddon",
#     "Project X",
#     "L'enfant, la taupe, le renard et le cheval",
#     "La Planète sauvage",
#     "Spartacus",
#     "Inspecteur Harry",
#     "The Art of Flight",
#     "Get Out",
#     "Des hommes sans loi",
#     "Fog hill of five element",
#     "The King's avatar",
#     "L'oeil du mal",
#     "La classe à l'Américaine",
#     "La Sociologie est un sport de combat",
#     "Les rivières pourpres",
#     "Propaganda la fabrique du consentement",
#     "Rio Bravo",
#     "Bottle Rocket",
#     "Tirangle of Sadness",
#     "Mario",
#     "Transformers : Rise of the Beasts",
#     "Nope",
#     "Le sens de la fête",
#     "Interstella 5555",
#     "Spiderman : Into the Spider-verse",
#     "Spiderman : Across the Spider-verse",
#     "Human Traffic",
#     "Sexy Dance 2",
#     "Sexy Dance 3D",
#     "Twilight 1",
#     "Ready Player One",
#     "The Interview",
#     "Your Name",
#     "I am Legend",
#     "La planète au trésor",
#     "Buzz L'éclair",
#     "Barbie",
#     "Good Luck to you Leo Grande",
#     "Summer Palace",
#     "District 9",
#     "Signes",
#     "Asteroid City",
#     "Passengers",
#     "Perfect Days",
#     "La zone d'intérêt",
#     "Le règne animal",
#     "Simple comme Sylvain",
# ]


# category_map = {
#     "Titre": "Nom",
#     "Note": "Note",
#     "Remarques": "Remarques",
#     "Réalisation": "Réalisateur",
#     "Genres": "Style/Genre",
#     "Acteurs principaux": "Acteur principal",
#     "Acteurs principaux": "Second rôle",
#     "Sortie": "Année de sortie",
#     "Durée": "Durée",
#     "Rythme": "Rythme",
#     "Accessibilité": "Accessibilité",
#     "Violence/Effrayant": "Violence/Effrayant",
#     "Récompense": "Récompense Oscar etc",
#     "Pays de production": "Pays producteur",
# }

# def scrape_infobox_english(infobox):
#     tr_in_info_box = infobox.find_all("tr")
#     mapping = {}
#     for tr in tr_in_info_box:
#         th = tr.find("th")
#         if not th:
#             continue
#         key = th.text.strip()
#         entries = tr.find("td")
#         if not entries:
#             continue
#         lists = entries.find_all("li")
#         # print(lists)
#         values = []
#         for link in lists:
#             # print(link.text.strip())
#             values.append(link.text.strip())
#         if len(values) == 0:
#             values.append(entries.text.strip())
#         mapping[key] = values
#     return mapping

# import copy
# import re

# infos = copy.deepcopy(movies_infos["Bottle Rocket"])


# def clean_outputs_2(infos: dict[str, any]) -> dict[str, any]:
#     """Clean the output dictionary so that they all have a corresponding format

#     Args:
#         infos (dict): dictionary containing the corresponding information retrieved for a given movie

#     Returns:
#         dict[str, any]: cleaned dictionary
#     """
#     # Transform in only on Genre Category
#     if "Genres" in infos.keys():
#         infos["Genre"] = infos.pop("Genres")

#     for key, value in infos.items():
#         if type(value) == str:
#             continue
#         infos[key] = [re.sub(r"\[.*\]", "", v) for v in value if v != ""]
#     return infos


# clean_outputs(infos)
# # Query Wikipedia URL


# def is_homonym_page(soup):
#     # TODO : Add more conditions for EN wiki and other pages
#     homonym_divs = soup.find_all("div", {"id": "homonymie"})
#     return len(homonym_divs) > 0


# urls_to_try = [
#     "https://fr.wikipedia.org/wiki/{}_(film)",
#     "https://fr.wikipedia.org/wiki/{}",
# ]

# processing_functions = [
#     lambda x: x,
#     lambda x: "_".join([word.capitalize() for word in x.split("_")]),
# ]


# def soup_soup_wikipedia_movie(title: str) -> bs4.element.ResultSet:
#     """_summary_

#     Args:
#         title (str): _description_

#     Returns:
#         bs4.element.ResultSet: _description_
#     """

#     is_homonym = False
#     good_soup = None
#     # Try all URLs versions
#     for url in urls_to_try:
#         for pf in processing_functions:
#             # Get page
#             processed_title = pf(title)
#             formatted_url = url.format(processed_title)
#             page = requests.get(formatted_url)
#             print(formatted_url, page.status_code)

#             # Check it exists
#             if page.status_code == 404:
#                 continue

#             # Soup it :frog:
#             good_soup = BeautifulSoup(page.content, "html.parser")

#             # Check that it's not an homonym page
#             is_homonym = is_homonym_page(good_soup)
#             if is_homonym:
#                 print(
#                     f"{title}'s page point to Homonym page, you might want to check it out : {formatted_url}"
#                 )
#                 print(
#                     "You might also want to change the spelling to a less generic name."
#                 )
#             return good_soup

#     return good_soup

# def run_pipeline(titles):
#     acc = 0
#     movies = {}

#     clean_titles = clean_inputs(titles)
#     for title, processed_title in zip(titles, clean_titles):
#         print(f"\nProcessing {title} ...")
#         processed_title = "_".join(title.split())
#         good_soup = soup_soup_wikipedia_movie(processed_title)
#         if good_soup is None:
#             print(f"Could not find a matching page for {title}.")
#             print("You might also want to change the spelling to a less generic name.")
#             continue
#         infoboxes = good_soup.find_all("div", class_="infobox_v3")
#         if len(infoboxes) > 0:
#             infos = scrape_infobox(infoboxes[0])
#             infos = clean_outputs(infos)
#             print(f"Retrieved informations for {title} !")
#             print(title, infos)
#             movies[title] = infos
#             if "Réalisation" in infos.keys():
#                 acc += 1
#         print(type(infoboxes))
#     print(f"Accuracy {acc / len(titles) * 100} %")
#     return movies


# def scrape_wikipedia(titles: list[str]) -> tuple[dict[str, dict[str, str]], list[str]]:
#     """Scrape the wikipedia pages for a given list of movie titles.

#     Args:
#         titles (list[str]): List of movie titles to search.

#     Returns:
#         tuple[dict[str, dict[str, str]], list[str]]: A dictionnary for found movies mapping titles to retrieved infos and a list of all titles that were not found.
#     """
#     movies_found = {}
#     movies_not_found = []

#     processed_titles = clean_inputs(titles)
#     for title, processed_title in zip(titles, processed_titles):
#         # Scrape Wikipedia
#         print(f"\nProcessing {title} ...")
#         print(processed_title)
#         good_soup = soup_soup_wikipedia_movie(processed_title)

#         # If the result is empty, continue and mark the title as not found.
#         if not good_soup:
#             print(f"Could not find a matching page for {title}.")
#             print("You might also want to change the spelling to a less generic name.")
#             movies_not_found.append(title)
#             movies_found[title] = {}
#             continue

#         # Else look for the 'infobox_v3' div of the page which is the movie header.
#         infoboxes = good_soup.find_all("div", class_="infobox_v3")
#         if len(infoboxes) > 0:
#             infos = scrape_infobox(infoboxes[0])
#             infos = clean_outputs(infos)
#             print(f"Retrieved informations for {title} !")
#             movies_found[title] = infos
#         else:
#             print(f"Could not find any infobox_v3 on the page of {title} !")
#             print(infoboxes)

#     return movies_found, movies_not_found