In [92]:
import gzip
from tqdm.auto import tqdm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm, trange
import seaborn as sns
from collections import defaultdict
import json
import urllib.parse
import requests as rq

In [93]:
# Load the pre-processed character table (both date columns are parsed while
# reading) and, in the same expression, shorten the verbose column headers to
# snake_case names. Headers that are not present are silently left alone by
# DataFrame.rename, so re-running the cell is harmless.
char_processed = pd.read_csv(
    "../../data/MovieSummaries/character_processed.csv",
    parse_dates=["actor_date_of_birth", "movie_release_date"],
).rename(
    columns={
        'Wikipedia movie ID': "wikipedia_movie_id",
        'Freebase movie ID': "fb_movie_id",
        'Character name': "character_name",
        'Actor gender': "actor_gender",
        'Actor height (in meters)': "actor_height",
        'Actor ethnicity (Freebase ID)': "fb_actor_eth_id",
        'Actor name': "actor_name",
        'Freebase character/actor map ID': "fb_char_actor_map_id",
        'Freebase character ID': "fb_char_id",
        'Freebase actor ID': "fb_actor_id",
    }
)

char_processed

Unnamed: 0,wikipedia_movie_id,fb_movie_id,character_name,actor_gender,actor_height,fb_actor_eth_id,actor_name,fb_char_actor_map_id,fb_char_id,fb_actor_id,actor_date_of_birth,movie_release_date,ethn_name,race
0,975900,/m/03vyhn,Akooshay,F,1.620,,Wanda De Jesus,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,1958-08-26,2001-08-24,,
1,975900,/m/03vyhn,Lieutenant Melanie Ballard,F,1.780,/m/044038p,Natasha Henstridge,/m/0jys3m,/m/0bgchn4,/m/0346l4,1974-08-15,2001-08-24,,
2,975900,/m/03vyhn,Desolation Williams,M,1.727,/m/0x67,Ice Cube,/m/0jys3g,/m/0bgchn_,/m/01vw26l,1969-06-15,2001-08-24,African American,African
3,975900,/m/03vyhn,Sgt Jericho Butler,M,1.750,,Jason Statham,/m/02vchl6,/m/0bgchnq,/m/034hyc,1967-09-12,2001-08-24,,
4,975900,/m/03vyhn,Bashira Kincaid,F,1.650,,Clea DuVall,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,1977-09-25,2001-08-24,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334982,913762,/m/03pcrp,UN Spacy Commander,M,,,Sonny Byrkett,/m/0kr407w,/m/0kr407_,/m/0gn4bz,1954-01-01,1992-05-21,,
334983,913762,/m/03pcrp,Silvie Gena,F,,,Susan Byrkett,/m/0kr40b9,/m/0kr40bf,/m/0gn4nd,1958-01-01,1992-05-21,,
334984,913762,/m/03pcrp,Elensh,F,,,Dorothy Elias-Fahn,/m/0kr406c,/m/0kr406h,/m/0b_vcv,1970-05-01,1992-05-21,,
334985,913762,/m/03pcrp,Hibiki,M,,,Jonathan Fahn,/m/0kr405_,/m/0kr4090,/m/0bx7_j,1965-04-12,1992-05-21,,


In [3]:
# Convert every distinct Freebase movie id ('/m/xxx') into the subject form used
# by the RDF dump ('<http://rdf.freebase.com/ns/m.xxx>'). Non-string entries
# (e.g. NaN for a missing id) are skipped.
interesting_ids = {
    '<http://rdf.freebase.com/ns/' + movie_id.lstrip('/').replace('/', '.') + '>'
    for movie_id in set(char_processed.fb_movie_id)
    if isinstance(movie_id, str)
}

# Show how many ids there are, plus one example to eyeball the format.
len(interesting_ids), next(iter(interesting_ids))

(59015, '<http://rdf.freebase.com/ns/m.07mwqm>')

In [95]:
# subject -> list of [predicate, object-plus-terminator] pairs, restricted to the
# subjects listed in `interesting_ids`.
movies_data = defaultdict(list)

with gzip.open('extracted-all.gz', 'r') as fin:
    # `total` is a hard-coded count used only to scale the progress bar
    # (presumably the number of lines in the dump - TODO confirm).
    progress = tqdm(fin, total=289924448, mininterval=2.0)
    for raw_line in progress:
        # At most three tab-separated fields; the first one is the subject.
        subject, *rest = raw_line.decode('utf-8').split('\t', 2)
        if subject not in interesting_ids:
            continue
        # The description is updated before the append, so it shows the number
        # of distinct subjects collected so far.
        progress.set_description(f"{len(movies_data) = }", refresh=False)
        movies_data[subject].append(rest)

  0%|          | 0/289924448 [00:00<?, ?it/s]

In [78]:
def extract_movie_data(fb_id, data=None):
    """Collect the Wikipedia-related fields stored for one Freebase movie id.

    Parameters
    ----------
    fb_id : str
        Freebase id in slash form, e.g. ``'/m/03vyhn'``.
    data : mapping, optional
        Maps RDF subjects (``'<http://rdf.freebase.com/ns/m.03vyhn>'``) to an
        iterable of ``(predicate, value)`` pairs. When omitted, the
        notebook-level ``movies_data`` is used.

    Returns
    -------
    dict or None
        ``None`` when ``fb_id`` is not a string (e.g. NaN) or has no triples in
        ``data``. Otherwise a dict that always contains ``'fb_movie_id'`` and,
        when the matching triples exist, ``'wiki_en_page'``, ``'wiki_en_id'``,
        ``'wiki_en_title'``, ``'wiki_en_path'`` and ``'discription'``. If a
        field occurs several times the last occurrence wins.
    """
    if data is None:
        data = movies_data

    # Missing ids show up as NaN in the DataFrame column; treat them as unknown.
    if not isinstance(fb_id, str):
        return None

    dotted_id = fb_id.strip('/').replace('/', '.')
    subject = f"<http://rdf.freebase.com/ns/{dotted_id}>"

    if subject not in data:
        return None

    result = dict(
        fb_movie_id=fb_id,
    )

    lang_tag = '@en'
    for predicate, value in data[subject]:
        # Drop the trailing " ." statement terminator and surrounding whitespace.
        value = value.strip('.\t \n')

        if '<http://rdf.freebase.com/ns/common.topic.topic_equivalent_webpage>' in predicate and '<http://en.wikipedia.org/' in value:
            result['wiki_en_page'] = value.strip('<> \n')
        if '<http://rdf.freebase.com/key/wikipedia.en_id>' in predicate:
            result['wiki_en_id'] = value.strip('<> \n"')
        if '<http://rdf.freebase.com/key/wikipedia.en_title>' in predicate:
            result['wiki_en_title'] = value.strip('<> \n"')
        if '<http://rdf.freebase.com/ns/type.object.key>' in predicate and '/wikipedia/en' in value:
            result['wiki_en_path'] = value.strip('<> \n"')
        # Accept only literals tagged exactly "@en" and remove just that suffix,
        # so an "@en" inside the text (or a tag such as "@en-GB") cannot corrupt
        # the description.
        if '<http://rdf.freebase.com/ns/common.topic.description>' in predicate and value.endswith(lang_tag):
            # NOTE: the key is misspelled ("discription") but is kept as-is because
            # it is part of the saved JSON / DataFrame schema.
            result['discription'] = value[:-len(lang_tag)].strip('<> \n"')
    return result

In [79]:
# Spot-check the extractor on the movie id stored in the row labelled 1.
extract_movie_data(char_processed["fb_movie_id"][1])

{'fb_movie_id': '/m/03vyhn',
 'discription': "John Carpenter's Ghosts of Mars is a 2001 American science fiction action horror film composed, written, and directed by John Carpenter. The film stars Ice Cube, Natasha Henstridge, Jason Statham, Pam Grier, Clea DuVall, and Joanna Cassidy. The film had received negative reviews and was a box office bomb, scoring just a 21% rating on Rotten Tomatoes and earning $14 million at the box office, against a $28 million production budget.",
 'wiki_en_path': '/wikipedia/en/Ghosts_of_mars',
 'wiki_en_id': '975900',
 'wiki_en_title': 'Ghosts_of_Mars',
 'wiki_en_page': 'http://en.wikipedia.org/wiki/Ghosts_of_Mars'}

In [80]:
# Run the extractor over every distinct movie id and keep only the movies for
# which the dump contained triples (the extractor returns None otherwise).
result = [
    movie
    for movie in map(extract_movie_data, tqdm(char_processed.fb_movie_id.unique()))
    if movie is not None
]

  0%|          | 0/59015 [00:00<?, ?it/s]

In [87]:
# Page info returned by the Wikipedia API, keyed by page-id string. Kept in its
# own cell so the fetch loop can be re-run without losing what was already fetched.
extracted_titles = {}
# Number of page ids sent per API request.
batch = 50

In [91]:
# Resolve Wikipedia page ids to page info through the MediaWiki API, `batch` ids
# per request. `extracted_titles` doubles as a cache: ids already present are
# skipped, so the cell can be re-run after an interruption without re-fetching.
for start in trange(0, len(result), batch):
    pageids = [
        str(res['wiki_en_id'])
        for res in result[start:start + batch]
        if 'wiki_en_id' in res and str(res['wiki_en_id']) not in extracted_titles
    ]
    if not pageids:
        continue

    resp = rq.get(
        'https://en.wikipedia.org/w/api.php',
        # Let requests build and escape the query string.
        params={
            'action': 'query',
            'pageids': '|'.join(pageids),
            'format': 'json',
            'prop': 'info',
        },
        # Identify the client to the Wikimedia API (same name as the wikipediaapi client).
        headers={'User-Agent': 'MovieMetadataSelector'},
        # Without a timeout requests may block forever on a stalled connection.
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of silently leaving the batch unresolved.
    resp.raise_for_status()
    data = resp.json()
    if 'query' in data and 'pages' in data['query']:
        # Each entry is the full page-info dict for one page id.
        extracted_titles |= data['query']['pages']

  0%|          | 0/1179 [00:00<?, ?it/s]

In [97]:
# Attach the API answer to every movie that has a Wikipedia page id.
# Despite the key name, the stored value is whatever the API returned for that
# id (the full page-info dict), or None when the id was not resolved.
for movie in result:
    if 'wiki_en_id' not in movie:
        continue
    page_key = str(movie['wiki_en_id'])
    movie['wiki_api_title'] = extracted_titles.get(page_key)

In [98]:
# Persist the per-movie records as a single JSON array.
with open('movies_freebase.json', 'w') as fout:
    json.dump(result, fout)

In [99]:
# Tabular view of the per-movie records; one row per movie, one column per key.
# (pandas is already imported as `pd` in the import cell at the top.)
df = pd.DataFrame(result)
df

Unnamed: 0,fb_movie_id,discription,wiki_en_path,wiki_en_id,wiki_en_title,wiki_en_page,wiki_api_title
0,/m/03vyhn,John Carpenter's Ghosts of Mars is a 2001 Amer...,/wikipedia/en/Ghosts_of_mars,975900,Ghosts_of_Mars,http://en.wikipedia.org/wiki/Ghosts_of_Mars,"{'pageid': 975900, 'ns': 0, 'title': 'Ghosts o..."
1,/m/08yl5d,Getting Away with Murder: The JonBenet Ramsey ...,/wikipedia/en_title/Getting_Away_with_Murder$0...,3196793,Getting_Away_with_Murder$003A_The_JonBen$00E9t...,http://en.wikipedia.org/wiki/Getting_Away_with...,"{'pageid': 3196793, 'ns': 0, 'title': 'Getting..."
2,/m/0crgdbh,"Brun bitter, also called Hair of the Dog, is a...",/wikipedia/en_id/28463795,28463795,Brun_bitter,http://en.wikipedia.org/wiki/Brun_bitter,"{'pageid': 28463795, 'ns': 0, 'title': 'Brun b..."
3,/m/01mrr1,A Woman in Flames is a German drama film from ...,/wikipedia/en_id/261236,261236,A_Woman_in_Flames,http://en.wikipedia.org/wiki/index.html?curid=...,"{'pageid': 261236, 'ns': 0, 'title': 'A Woman ..."
4,/m/06yc6v,Me and You and Everyone We Know is a 2005 Amer...,/wikipedia/en/Me_and_You_and_Everyone_We_Know,2238856,Me_and_You_and_Everyone_We_Know,http://en.wikipedia.org/wiki/index.html?curid=...,"{'pageid': 2238856, 'ns': 0, 'title': 'Me and ..."
...,...,...,...,...,...,...,...
58927,/m/0kv0xs,The Heart Breaker is a 1925 American short com...,/wikipedia/en/The_Heart-Breaker,7223293,The_Heart_Breaker,http://en.wikipedia.org/wiki/The_Heart_Breaker,"{'pageid': 7223293, 'ns': 0, 'title': 'The Hea..."
58928,/m/02pygw1,Another Nice Mess is a 1972 comedy film writte...,/wikipedia/en_title/Another_Nice_Mess,9971909,Another_Nice_Mess,http://en.wikipedia.org/wiki/Another_Nice_Mess,"{'pageid': 9971909, 'ns': 0, 'title': 'Another..."
58929,/m/02w7zz8,Spliced is a 2002 horror film starring Ron Sil...,/wikipedia/en_title/Spliced_$0028film$0029,12476867,Spliced_$0028film$0029,http://en.wikipedia.org/wiki/Spliced_(film),"{'pageid': 12476867, 'ns': 0, 'title': 'Splice..."
58930,/m/03pcrp,Super Dimensional Fortress Macross II: Lovers ...,/wikipedia/en/Marduk_$0028Macross$0029,913762,Super_Dimensional_Fortress_Macross_II$003A_Lov...,http://en.wikipedia.org/wiki/Super_Dimensional...,"{'pageid': 913762, 'ns': 0, 'title': 'Super Di..."


In [101]:
# Number of movies for which no page info from the Wikipedia API is attached.
df["wiki_api_title"].isna().sum()

1223

In [48]:
import wikipediaapi

# English-language client of the wikipediaapi package, identified by a fixed user agent.
wiki = wikipediaapi.Wikipedia(language="en", user_agent="MovieMetadataSelector")

In [65]:
# Probe the wikipediaapi client with the Wikipedia page id of the second
# extracted movie and check whether the resulting page exists.
# NOTE(review): the recorded outputs (False, and a repr with "id: -1") suggest
# the numeric id is being treated as a page *title* rather than a page id -
# confirm against the wikipediaapi documentation before relying on this.
page = wiki.info(
    result[1]['wiki_en_id']
)
page.exists()

False

In [66]:
# Display the repr of the page object obtained in the previous cell.
page

3196793 (id: -1, ns: 0)

'{"batchcomplete":"","query":{"pages":{"3196793":{"pageid":3196793,"ns":0,"title":"Getting Away with Murder: The JonBenet Ramsey Story","contentmodel":"wikitext","pagelanguage":"en","pagelanguagehtmlcode":"en","pagelanguagedir":"ltr","touched":"2024-12-10T03:01:09Z","lastrevid":1256673340,"length":7196}}}}'