# Remakes 

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import requests
import re
import requests
from urllib.parse import urlparse, unquote
from tqdm import tqdm
import wptools



## Acquiring the data from Wikipedia

We've acquired the data from this [Wikipedia page](https://en.wikipedia.org/wiki/List_of_film_remakes_(A%E2%80%93M)). Since the data are in the tables, we extract the table elements using the `BeautifulSoup` library.

In [2]:
urls = [
    "https://en.wikipedia.org/wiki/List_of_film_remakes_(A%E2%80%93M)#A",
    "https://en.wikipedia.org/wiki/List_of_film_remakes_(N%E2%80%93Z)#Z",
]
tables = []

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find_all("table", class_="wikitable")
    tables.extend(table)

## Parsing the Webpage Tables

After acquiring the data, we use the `BeatingSoup` library to parse the tables into a structure dataframe format. Then, we will do a manual post-clean up to ensure the data is in the correct format.


In [3]:
def unwrap(cell):
    """
    Unwrap a cell from its <p> tag if it exists.
    """
    if cell.p:
        return cell.p
    return cell


def parse_cell(cell):
    """
    Parse a cell with a single-entry in it.
    The expected format is "Title (YYYY)" or "Title (YYYY-YYYY)", or even "Title (YYYY-YY)".
    Also, we expect a hyperlink to the wikipedia page of the movie.

    Args:
        cell: BeautifulSoup tag of the cell.

    Returns:
        title: str, title of the movie.
        year: str, year of the movie. `None` if not found.
        url: str, wikipedia page of the movie. `None` if not found.
    """
    # if cell type is a, then it is a hyperlink
    if cell.name == "a":
        url = cell["href"]
    else:
        url = cell.a["href"].strip() if cell.a else None
    title_year = cell.get_text()
    title = re.sub(r"\((\d{4}(?:-\d{2,4})?)\)", "", title_year).strip()
    match = re.search(r"\((\d{4}(?:-\d{2,4})?)\)", title_year)
    if match:
        year = match.group(1)
    else:
        year = None
    return title, year, url


def parse_multi_cell(cell):
    """
    Parse a cell with multiple entries in it.
    The expected format is a list of <p> tags, <i> tags, and <a> tags.
    
    Args:
        cell: BeautifulSoup tag of the cell.
        
    Returns:
        titles: list, titles of the movies.
        years: list, years of the movies. `None` for each entry if not found.
        urls: list, wikipedia page of the movies. `None` for each if not found.
    """
    titles = []
    years = []
    urls = []
    years = re.findall(r"\((\d{4}(?:-\d{2,4})?)\)", cell.get_text())
    for tag in cell.find_all(["p", "i", "a"], recursive=False):
        tag = unwrap(tag)
        if tag.name == "a":
            url = tag["href"]
        else:
            url = cell.a["href"].strip() if cell.a else None
        title = tag.get_text()
        titles.append(title)
        urls.append(url)
    if len(title) != len(years) and len(years) == 0:
        # NOTE: fallback to filling all years with None if no years has been found.
        years = [None] * len(titles)
    assert len(titles) == len(years) == len(urls)
    return titles, years, urls


def convert_to_df(table):
    rows = table.find_all("tr")
    data = {
        "original": [],
        "original_year": [],
        "original_url": [],
        "remake": [],
        "remake_year": [],
        "remake_url": [],
    }
    for row in rows[1:]:
        cols = row.find_all(["td", "th"])
        if len(cols) == 0:
            continue
        if len(cols) < 3:
            remake_titles, remake_years, remake_urls = parse_multi_cell(cols[0])
            original_title = data["original"][-1]
            original_year = data["original_year"][-1]
            original_url = data["original_url"][-1]
        else:
            original_title, original_year, original_url = parse_cell(cols[0])
            remake_titles, remake_years, remake_urls = parse_multi_cell(cols[1])
        for remake_title, remake_year, remake_url in zip(
            remake_titles, remake_years, remake_urls
        ):
            data["original"].append(original_title)
            data["original_year"].append(original_year)
            data["original_url"].append(original_url if original_url else None)
            data["remake"].append(remake_title)
            data["remake_year"].append(remake_year)
            data["remake_url"].append(remake_url if remake_url else None)

    df = pd.DataFrame(data)
    return df

In [4]:
df_list = []
for table in tables:
    df = convert_to_df(table)
    df_list.append(df)
df = pd.concat(df_list, ignore_index=True)
df

Unnamed: 0,original,original_year,original_url,remake,remake_year,remake_url
0,12 Angry Men,1957,/wiki/12_Angry_Men_(1957_film),12 Angry Men (1997),1997,/wiki/12_Angry_Men_(1997_film)
1,13 Assassins,1963,/wiki/13_Assassins_(1963_film),13 Assassins,2010,/wiki/13_Assassins_(2010_film)
2,13 Ghosts,1960,/wiki/13_Ghosts,Thirteen Ghosts,2001,/wiki/Thirteen_Ghosts
3,13 Tzameti,2005,/wiki/13_Tzameti,13,2010,/wiki/13_(2010_film)
4,3 Idiots,2009,/wiki/3_Idiots,Nanban,2012,/wiki/Nanban_(2012_film)
...,...,...,...,...,...,...
733,Yellow Sky,1948,/wiki/Yellow_Sky,The Jackals,1967,/wiki/The_Jackals
734,Yojimbo,1961,/wiki/Yojimbo_(film),A Fistful of Dollars,1964,/wiki/A_Fistful_of_Dollars
735,Yojimbo,1961,/wiki/Yojimbo_(film),Last Man Standing,1996,/wiki/Last_Man_Standing_(1996_film)
736,"Yours, Mine and Ours",1968,"/wiki/Yours,_Mine_and_Ours_(1968_film)","Yours, Mine & Ours",2005,"/wiki/Yours,_Mine_%26_Ours_(2005_film)"


## Post Cleaning

### Years with Manual Intervention

A small subset of the data does not have the year of the movie. We will manually add the year of the movie.


#### Exploring Messy Data

In [5]:
df[df["original_year"].isnull() | df["remake_year"].isnull()]

Unnamed: 0,original,original_year,original_url,remake,remake_year,remake_url
246,The Godfather (1972–90),,/wiki/The_Godfather_(film_series),Godfather (2007),2007.0,/wiki/Godfather_(2007_film)
347,Kaththi,2014.0,/wiki/Kaththi,Khaidhi No. 150,,/wiki/Khaidi_No._150
452,My Favorite Wife,1940.0,/wiki/My_Favorite_Wife,Something's Got to Give,,/wiki/Something%27s_Got_to_Give


#### Manual Intervention

In [6]:
df.loc[246, ['original', 'original_year', 'remake', 'remake_year']] = ["The Godfather", "1972", "Godfather", "2007"]
df.iloc[246]

original                             The Godfather
original_year                                 1972
original_url     /wiki/The_Godfather_(film_series)
remake                                   Godfather
remake_year                                   2007
remake_url             /wiki/Godfather_(2007_film)
Name: 246, dtype: object

In [7]:
df.loc[347, ['remake_year']] = ["2017"]
df.iloc[347]

original                      Kaththi
original_year                    2014
original_url            /wiki/Kaththi
remake                Khaidhi No. 150
remake_year                      2017
remake_url       /wiki/Khaidi_No._150
Name: 347, dtype: object

In [8]:
df.loc[452, ['remake_year']] = ["1962"]
df.iloc[452]

original                        My Favorite Wife
original_year                               1940
original_url              /wiki/My_Favorite_Wife
remake                   Something's Got to Give
remake_year                                 1962
remake_url       /wiki/Something%27s_Got_to_Give
Name: 452, dtype: object

#### Formatting

In [9]:
df["original_year"] = df["original_year"].astype(int)
df["remake_year"] = df["remake_year"].astype(int)
df

Unnamed: 0,original,original_year,original_url,remake,remake_year,remake_url
0,12 Angry Men,1957,/wiki/12_Angry_Men_(1957_film),12 Angry Men (1997),1997,/wiki/12_Angry_Men_(1997_film)
1,13 Assassins,1963,/wiki/13_Assassins_(1963_film),13 Assassins,2010,/wiki/13_Assassins_(2010_film)
2,13 Ghosts,1960,/wiki/13_Ghosts,Thirteen Ghosts,2001,/wiki/Thirteen_Ghosts
3,13 Tzameti,2005,/wiki/13_Tzameti,13,2010,/wiki/13_(2010_film)
4,3 Idiots,2009,/wiki/3_Idiots,Nanban,2012,/wiki/Nanban_(2012_film)
...,...,...,...,...,...,...
733,Yellow Sky,1948,/wiki/Yellow_Sky,The Jackals,1967,/wiki/The_Jackals
734,Yojimbo,1961,/wiki/Yojimbo_(film),A Fistful of Dollars,1964,/wiki/A_Fistful_of_Dollars
735,Yojimbo,1961,/wiki/Yojimbo_(film),Last Man Standing,1996,/wiki/Last_Man_Standing_(1996_film)
736,"Yours, Mine and Ours",1968,"/wiki/Yours,_Mine_and_Ours_(1968_film)","Yours, Mine & Ours",2005,"/wiki/Yours,_Mine_%26_Ours_(2005_film)"


### Clean Up URLs


#### Adding Base URL

URLs does not include the domain name in most of the cases. We clean the URLs by adding the domain name to the URLs.

In [10]:
def clean_url(url):
    if url is None:
        return None
    if url.startswith("http"):
        return url
    return f"https://en.wikipedia.org{url}"


In [11]:
df["original_url"] = df["original_url"].apply(clean_url)
df["remake_url"] = df["remake_url"].apply(clean_url)
df

Unnamed: 0,original,original_year,original_url,remake,remake_year,remake_url
0,12 Angry Men,1957,https://en.wikipedia.org/wiki/12_Angry_Men_(19...,12 Angry Men (1997),1997,https://en.wikipedia.org/wiki/12_Angry_Men_(19...
1,13 Assassins,1963,https://en.wikipedia.org/wiki/13_Assassins_(19...,13 Assassins,2010,https://en.wikipedia.org/wiki/13_Assassins_(20...
2,13 Ghosts,1960,https://en.wikipedia.org/wiki/13_Ghosts,Thirteen Ghosts,2001,https://en.wikipedia.org/wiki/Thirteen_Ghosts
3,13 Tzameti,2005,https://en.wikipedia.org/wiki/13_Tzameti,13,2010,https://en.wikipedia.org/wiki/13_(2010_film)
4,3 Idiots,2009,https://en.wikipedia.org/wiki/3_Idiots,Nanban,2012,https://en.wikipedia.org/wiki/Nanban_(2012_film)
...,...,...,...,...,...,...
733,Yellow Sky,1948,https://en.wikipedia.org/wiki/Yellow_Sky,The Jackals,1967,https://en.wikipedia.org/wiki/The_Jackals
734,Yojimbo,1961,https://en.wikipedia.org/wiki/Yojimbo_(film),A Fistful of Dollars,1964,https://en.wikipedia.org/wiki/A_Fistful_of_Dol...
735,Yojimbo,1961,https://en.wikipedia.org/wiki/Yojimbo_(film),Last Man Standing,1996,https://en.wikipedia.org/wiki/Last_Man_Standin...
736,"Yours, Mine and Ours",1968,"https://en.wikipedia.org/wiki/Yours,_Mine_and_...","Yours, Mine & Ours",2005,"https://en.wikipedia.org/wiki/Yours,_Mine_%26_..."


#### Handling Missing URLs

In [12]:
df[df["original_url"].isnull() | df["remake_url"].isnull()]

Unnamed: 0,original,original_year,original_url,remake,remake_year,remake_url
95,Brewster's Millions,1914,https://en.wikipedia.org/wiki/Brewster%27s_Mil...,To Ryca!,2016,
144,Dehleez,1983,,Oonche Log,1985,https://en.wikipedia.org/wiki/Oonche_Log_(1985...
260,Haq Mehar,1985,,Sanam Bewafa,1991,https://en.wikipedia.org/wiki/Sanam_Bewafa
509,La Otra,1945,https://en.wikipedia.org/wiki/La_Otra_(film),Killer in the Mirror,1986,
515,Bazar e Husan,1988,,Pati Patni Aur Tawaif,1990,https://en.wikipedia.org/wiki/Pati_Patni_Aur_T...
634,A Storm in Summer,1970,,A Storm in Summer,2000,https://en.wikipedia.org/wiki/A_Storm_in_Summer


#### Manual Intervention

Manually fill the missing urls with other languages Wikipedia pages. We did not found any links for other ones.


In [13]:
df.loc[95,    ['remake_url']] = "https://pt.wikipedia.org/wiki/T%C3%B4_Ryca"
df.loc[144, ['original_url']] = "https://en.wikipedia.org/wiki/Dehleez_(film)"
df.loc[515, ['original_url']] = "https://en.wikipedia.org/wiki/Bazar-e-Husn"

## Get Wikidata IDs

In [14]:
def fetch_actual_title(url):
    """Fetch the actual title of a Wikipedia page by scraping the HTML."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.find("h1", {"id": "firstHeading"}).text
    return title


def get_qid(wikipedia_url):
    if wikipedia_url is None:
        return None
    parsed_url = urlparse(wikipedia_url)
    language_code = parsed_url.netloc.split(".")[0]
    page_title = fetch_actual_title(wikipedia_url)
    api_url = f"https://{language_code}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "pageprops",
        "titles": page_title,
        "format": "json",
    }
    response = requests.get(api_url, params=params)
    data = response.json()
    pages = data.get("query", {}).get("pages", {})
    for _, page_info in pages.items():
        wikidata_id = page_info.get("pageprops", {}).get("wikibase_item")
        if wikidata_id:
            return wikidata_id
    return None


original_wikidata_ids = [get_qid(url) for url in tqdm(df["original_url"].tolist())]
remake_wikidata_ids = [get_qid(url) for url in tqdm(df["remake_url"].tolist())]

df["original_wikidata_id"] = original_wikidata_ids
df["remake_wikidata_id"] = remake_wikidata_ids

100%|██████████| 738/738 [07:04<00:00,  1.74it/s]
100%|██████████| 738/738 [08:39<00:00,  1.42it/s]


In [15]:
df[df["original_wikidata_id"].isnull() | df["remake_wikidata_id"].isnull()]

Unnamed: 0,original,original_year,original_url,remake,remake_year,remake_url,original_wikidata_id,remake_wikidata_id
260,Haq Mehar,1985,,Sanam Bewafa,1991,https://en.wikipedia.org/wiki/Sanam_Bewafa,,Q2220230
509,La Otra,1945,https://en.wikipedia.org/wiki/La_Otra_(film),Killer in the Mirror,1986,,Q3915489,
634,A Storm in Summer,1970,,A Storm in Summer,2000,https://en.wikipedia.org/wiki/A_Storm_in_Summer,,Q2300421


## Save the Output

In [16]:
root_dir = Path("../data/remake")
root_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(root_dir / "remakes.csv", index=False)