### Imports

In [1]:
import pandas as pd
import requests
import json
import concurrent.futures
from bs4 import BeautifulSoup

from helpers.readers import read_dataframe

### Loading CMU movies

In [2]:
cmu_movies = read_dataframe(name='cmu/movies', preprocess=True, usecols=[
    "Wikipedia movie ID", 
    "Freebase movie ID", 
    "Movie name", 
    "Movie release date", 
    "Movie box office revenue", 
    "Movie runtime", 
    "Movie languages", 
    "Movie countries", 
    "Movie genres",
])

Preprocess logs:
✅ Fixed Movie Languages inside Movie Countries
✅ Removed Deseret characters
✅ Movie release date splitted to three columns: Movie release Year, Movie release Month, Movie release Day
✅ Seperated freebase identifiers from Movie Languages, Movie Countries and Movie Genres
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81741 entries, 0 to 81740
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Wikipedia movie ID        81741 non-null  int32  
 1   Freebase movie ID         81741 non-null  string 
 2   Movie name                81741 non-null  string 
 3   Movie box office revenue  8401 non-null   float64
 4   Movie runtime             61291 non-null  float32
 5   Movie release Year        74839 non-null  Int16  
 6   Movie release Month       42667 non-null  Int8   
 7   Movie release Day         39373 non-null  Int8   
 8   Movie languages           81741 non-null  string

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie box office revenue,Movie runtime,Movie release Year,Movie release Month,Movie release Day,Movie languages,Movie countries,Movie genres
0,975900,/m/03vyhn,Ghosts of Mars,14010832.0,98.0,2001,8,24,English,United States of America,"Thriller,Science Fiction,Horror,Adventure,Supe..."


### Scraping wikipedia infobox

![](https://iili.io/Jzt025J.png)

Two weaknesses:
- Uses the english version of wikipedia, non english or less popular movies may not have an english webpage
- Less popular movies don't have an infobox

In [None]:
def scrape_infobox(wiki_movie_id):
    url = f"https://en.wikipedia.org/?curid={wiki_movie_id}"
    response = requests.get(url)
    infobox_data = {}

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        infobox = soup.find('table', {'class': 'infobox'})

        if infobox:
            rows = infobox.find_all('tr')
            for row in rows:
                header = row.find('th')
                if header:
                    header_text = header.text.strip()
                    value = row.find('td')
                    if value:
                        # Some values might have references/citations, which you might want to remove
                        for sup in value.find_all('sup'):
                            sup.decompose()
                        infobox_data[header_text] = value.get_text(separator=" ", strip=True)

    return infobox_data

In [None]:
def save_results(results, filename):
    with open(filename, 'w') as f:
        json.dump(results, f)

In [None]:
def process_movie_id(wiki_id):
    try:
        movie_infobox = scrape_infobox(wiki_id)
        return wiki_id, movie_infobox
    except Exception as e:
        print(f"Error processing {wiki_id}: {e}")
        return wiki_id, None

In [3]:
movie_wiki_ids = cmu_movies["Wikipedia movie ID"].tolist()
results = []
max_workers = 10  # Adjust based on network/environment

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(tqdm(executor.map(process_movie_id, movie_wiki_ids), total=len(movie_wiki_ids)))

save_results(results, 'infobox_results.json')
print("Scraping completed.")

  0%|          | 0/81741 [00:00<?, ?it/s]

Error processing 3380491: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error processing 27375141: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error processing 28131017: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error processing 14193740: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error processing 31431586: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error processing 4287489: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error processing 1

### Checking results

In [4]:
import json
import pandas as pd

with open('./generated/infobox_results.json', 'r') as file:
    results = json.load(file)

failed_wiki_ids = [wiki_id for wiki_id, data in results if not data]

print("Failed Wiki IDs:", failed_wiki_ids)

Failed Wiki IDs: [18998739, 9997961, 33427105, 20604092, 31025505, 23799443, 16686643, 19929835, 11717027, 3222645, 32229606, 12142896, 32839744, 30693170, 12491178, 31025540, 18496109, 28327554, 8955475, 22644083, 6792375, 16672517, 33519683, 5771666, 15457461, 30624204, 13931033, 16572418, 33544694, 22757028, 22557257, 28858056, 26726624, 20611297, 9139404, 18848953, 19961614, 4038269, 15890474, 665676, 18900393, 7267251, 30729485, 7458959, 26283021, 2298689, 7280802, 9782958, 20695060, 8088400, 33616397, 18053600, 9993314, 31157561, 9993323, 15753282, 25131827, 7586366, 27044179, 9985468, 36410662, 11921020, 34090675, 23404570, 6109092, 8913767, 34992168, 22644468, 28379587, 9919006, 28492939, 5445021, 34916655, 9919009, 19490946, 6602958, 30531735, 7612646, 28692419, 28379461, 9308502, 34953631, 9879584, 28275468, 967367, 8781041, 28374748, 33551059, 21482599, 36052259, 28492562, 15284386, 27278051, 29510726, 27744173, 24833392, 5922485, 19931996, 6109156, 2267337, 35784403, 619094

In [5]:
len(failed_wiki_ids)

6584

In [6]:
unique_keys = set()
for _, infobox in results:
    if infobox:
        unique_keys.update(infobox.keys())

print("Unique Keys:", unique_keys)

Unique Keys: {'Albanian', 'Tax ID no.', 'Subsidiaries', 'Music', 'Cause of death', 'Former names', 'Debut', "Chosŏn'gŭl", 'Launch date', 'Release date', 'Japanese', 'Followed by', 'Language', 'Education', 'Brands', 'Play(s)', 'Commercial', 'Samkhya', 'Editing by', 'Adaptation by', 'Class', 'Begins', 'TV adaptations', 'Programmer(s)', 'Position(s)', 'Unit', 'President', 'Buy rate', 'Producer(s)', 'Spanish', 'Born', 'Chinese', 'Text', 'Organization', 'Significant other', 'Elevation', 'Greek', 'Relative(s)', 'Teleplay by', 'Purpose', 'First organized', 'Animationservices', '2001', 'Nation from', 'Director(s)', 'Parent', 'Position:', 'Licensed\xa0from', 'Height:', 'Narration by', 'Relatives', 'Sinhala', 'Origin', 'Partnerships', 'Country\xa0(sports)', 'Hannah Montana tour chronology', 'Label', 'Best Comedy Series', 'Ground', 'Creator(s)', 'Animators', 'Thought sequences dialogue by', 'Italian', 'Other names', 'Activity sectors', 'Format(s)', 'Theme music composer', 'Dialogs, lyrics', 'Even

A lot of unique keys, keep only the useful ones

In [7]:
# for example the keys from Avengers2012
# https://en.wikipedia.org/wiki/The_Avengers_(2012_film)
desired_keys = [
    'Directed by', 'Screenplay by', 'Story by', 'Based on', 
    'Produced by', 'Starring', 'Cinematography', 'Edited by', 
    'Music by', 'Production company', 'Distributed by', 
    'Release dates', 'Running time', 'Country', 'Language', 
    'Budget', 'Box office'
]

In [8]:
processed_data = []

for wiki_id, infobox in results:
    if infobox:
        row = [infobox.get(key) for key in desired_keys]
    else:
        row = [None] * len(desired_keys)
    processed_data.append([wiki_id, *row])

column_names = ['Wikipedia movie ID', *desired_keys]

In [10]:
df = pd.DataFrame(processed_data, columns=column_names)
df.fillna(value=pd.NA, inplace=True)
df

Unnamed: 0,Wikipedia movie ID,Directed by,Screenplay by,Story by,Based on,Produced by,Starring,Cinematography,Edited by,Music by,Production company,Distributed by,Release dates,Running time,Country,Language,Budget,Box office
0,975900,John Carpenter,,,,Sandy King,Ice Cube Natasha Henstridge Jason Statham Pam ...,Gary B. Kibbe,Paul C. Warschilka,John Carpenter,,Sony Pictures Releasing,,98 minutes,United States,English,$28 million,$14 million
1,3196793,Edward Lucas,,,,,,,,,Rocket Science Laboratories,,,44–45 minutes,,,,
2,28463795,Sølve Skagen,,,,,Frank Krog Kristin Kajander Anne Krigsvoll,,,,,,,83 minutes,Norway,Norwegian,,
3,9363483,Donald Cammell,Donald Cammell China Cammell,,Mrs. White by Andrew Klavan,Sue Baden-Powell Cassian Elwes Elliott Kastner...,David Keith Cathy Moriarty,Larry McConkey,Terry Rawlings,Rick Fenn Nick Mason,,Cannon Films,9 May 1987 ( 1987-05-09 ) ( Cannes ) 20 May 19...,111 minutes,United Kingdom,English,$2.8 million,
4,261236,Robert van Ackeren,,,,Robert van Ackeren Dieter Geissler [ de ],Gudrun Landgrebe Mathieu Carrière Hanns Zischl...,Jürgen Jürges,Tanja Schmidbauer,Peer Raben,,,,106 minutes,West Germany,German,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81736,35228177,Sid Bennett Director of Animation - Steve Gomez,,Charlie Foley Vaibhav Bhatt,,"Darlow Smithson, Tom Brisley",,,,,,Discovery Communications Animal Planet,,90 minutes,,English,,
81737,34980460,Ian Palmer,,,,Teddy Leifer Ian Palmer,,Michael Doyle Ian Palmer,Ollie Huddleston,Ilan Eshkeri,,,,96 minutes,Ireland,,,
81738,9971909,Bob Einstein,,,,Jonathan Haze Tom Smothers,Rich Little Herb Voland Bruce Kirby Diahn Will...,,,Bob Emenegger,,Fine Films,,66 min,United States,English,,
81739,913762,,,,,,,,,,,,,,,,,


In [11]:
len(df["Box office"].dropna())

18410

In [5]:
df.to_parquet('./generated/scraped_cmu_movies_infobox.parquet', compression='brotli')

In [22]:
subset = df.drop('Wikipedia movie ID', axis=1)

df[subset.notna().any(axis=1)]

Unnamed: 0,Wikipedia movie ID,Directed by,Screenplay by,Story by,Based on,Produced by,Starring,Cinematography,Edited by,Music by,Production company,Distributed by,Release dates,Running time,Country,Language,Budget,Box office
0,975900,John Carpenter,,,,Sandy King,Ice Cube Natasha Henstridge Jason Statham Pam ...,Gary B. Kibbe,Paul C. Warschilka,John Carpenter,,Sony Pictures Releasing,,98 minutes,United States,English,$28 million,$14 million
1,3196793,Edward Lucas,,,,,,,,,Rocket Science Laboratories,,,44–45 minutes,,,,
2,28463795,Sølve Skagen,,,,,Frank Krog Kristin Kajander Anne Krigsvoll,,,,,,,83 minutes,Norway,Norwegian,,
3,9363483,Donald Cammell,Donald Cammell China Cammell,,Mrs. White by Andrew Klavan,Sue Baden-Powell Cassian Elwes Elliott Kastner...,David Keith Cathy Moriarty,Larry McConkey,Terry Rawlings,Rick Fenn Nick Mason,,Cannon Films,9 May 1987 ( 1987-05-09 ) ( Cannes ) 20 May 19...,111 minutes,United Kingdom,English,$2.8 million,
4,261236,Robert van Ackeren,,,,Robert van Ackeren Dieter Geissler [ de ],Gudrun Landgrebe Mathieu Carrière Hanns Zischl...,Jürgen Jürges,Tanja Schmidbauer,Peer Raben,,,,106 minutes,West Germany,German,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81735,32468537,Anton Megerdichev,,,,,Denis Nikiforov Yelena Panova Andrey Panin Vic...,,,Aleksei Shelygin,,,,132 minutes,Russia,Russian / English,,
81736,35228177,Sid Bennett Director of Animation - Steve Gomez,,Charlie Foley Vaibhav Bhatt,,"Darlow Smithson, Tom Brisley",,,,,,Discovery Communications Animal Planet,,90 minutes,,English,,
81737,34980460,Ian Palmer,,,,Teddy Leifer Ian Palmer,,Michael Doyle Ian Palmer,Ollie Huddleston,Ilan Eshkeri,,,,96 minutes,Ireland,,,
81738,9971909,Bob Einstein,,,,Jonathan Haze Tom Smothers,Rich Little Herb Voland Bruce Kirby Diahn Will...,,,Bob Emenegger,,Fine Films,,66 min,United States,English,,
