# Preliminary analysis of Movies Dataset

In [48]:
import os
# Walk up from the current working directory until we are inside the project
# root, so that the relative data paths used below resolve the same way no
# matter which sub-folder the notebook was started from.
while os.path.basename(os.getcwd()) != "ada-project-private":
    current_dir = os.getcwd()
    os.chdir('..')
    if os.getcwd() == current_dir:
        # At the filesystem root chdir('..') is a no-op; without this check
        # the loop would spin forever when the project folder is not an ancestor.
        raise FileNotFoundError('Could not find a parent directory named "ada-project-private"')

import pandas as pd
import json

DATA_FOLDER = './MovieSummaries/'


def _read_headerless_tsv(file_name, column_names):
    """Read a tab-separated file from DATA_FOLDER that has no header row, naming its columns."""
    return pd.read_csv(DATA_FOLDER + file_name, sep='\t', header=None, names=column_names)


# character.metadata.tsv: one row per (movie, actor/character) pair
character_metadata_cols = ["Wikipedia movie ID", "Freebase movie ID",
                           "Movie release date", "Character name",
                           "Actor date of birth", "Actor gender", "Actor height (in meters)", "Actor ethnicity (Freebase ID)",
                           "Actor name", "Actor age at movie release",
                           "Freebase character/actor map ID", "Freebase character ID", "Freebase actor ID"]
character_metadata_df = _read_headerless_tsv('character.metadata.tsv', character_metadata_cols)

# movie.metadata.tsv: one row per movie
movie_metadata_cols = ["Wikipedia movie ID", "Freebase movie ID", "Movie name", "Movie release date", "Movie box office revenue", "Movie runtime",
                       "Movie languages (Freebase ID:name tuples)", "Movie countries (Freebase ID:name tuples)",
                       "Movie genres (Freebase ID:name tuples)"]
movie_metadata_df = _read_headerless_tsv('movie.metadata.tsv', movie_metadata_cols)

# name.clusters.txt
name_clusters_cols = ["Character name", "Freebase movie ID"]
name_clusters_df = _read_headerless_tsv('name.clusters.txt', name_clusters_cols)

# plot_summaries.txt
plot_summaries_cols = ["Wikipedia movie ID", "Wikipedia plot"]
plot_summaries_df = _read_headerless_tsv('plot_summaries.txt', plot_summaries_cols)

# tvtropes.clusters.txt: the second column is a JSON object per row
tvtropes_clusters_cols = ["Character types", "details"]
tvtropes_clusters_df = _read_headerless_tsv('tvtropes.clusters.txt', tvtropes_clusters_cols)

# Unpack the JSON fields we need into regular columns, then discard the raw JSON text.
tvtropes_details = tvtropes_clusters_df["details"].apply(json.loads)
tvtropes_json_fields = [
    ("Character name", 'char'),
    ("Movie name", 'movie'),
    ("Freebase character/actor map ID", 'id'),
    ("Actor name", 'actor'),
]
for column_name, json_key in tvtropes_json_fields:
    tvtropes_clusters_df[column_name] = tvtropes_details.apply(lambda details, key=json_key: details.get(key))

tvtropes_clusters_df = tvtropes_clusters_df.drop(columns=["details"])

In [50]:
def print_missing_stats(df):
    """Print the number of rows in `df`, then the count of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Column labels may be of any type (not only strings).

    The output format is one line ``total len: <rows>`` followed by one line
    ``missing <column>: <count>`` for every column, in column order.
    """
    print("total len:", len(df))
    # One vectorised pass over the whole frame instead of a Python-level sum per column.
    missing_counts = df.isna().sum()
    for col, n_missing in missing_counts.items():
        # f-string formatting also works for non-string column labels (e.g. integers).
        print(f"missing {col}:", n_missing)

In [5]:
# Missing-value overview of the raw character metadata.
print_missing_stats(character_metadata_df)

print("")
# Sanity check: is there any row where the actor has a name but no Freebase ID?
actor_id_missing = character_metadata_df["Freebase actor ID"].isna()
actor_name_present = character_metadata_df["Actor name"].notna()
print("missing Freebase actor ID but not Actor name:", (actor_id_missing & actor_name_present).sum())
print("We drop all the rows that have NA in the column \"Freebase actor ID\"")
character_metadata_df = character_metadata_df[~actor_id_missing]

print("")
# Same overview again, after the drop.
print_missing_stats(character_metadata_df)

# Preview the cast rows of a single movie (Freebase ID /m/03vyhn).
is_selected_movie = character_metadata_df["Freebase movie ID"] == "/m/03vyhn"
character_metadata_df[is_selected_movie].head()

total len: 450669
missing Wikipedia movie ID: 0
missing Freebase movie ID: 0
missing Movie release date: 9995
missing Character name: 257875
missing Actor date of birth: 106145
missing Actor gender: 45609
missing Actor height (in meters): 295845
missing Actor ethnicity (Freebase ID): 344611
missing Actor name: 1228
missing Actor age at movie release: 158113
missing Freebase character/actor map ID: 0
missing Freebase character ID: 257865
missing Freebase actor ID: 815

missing Freebase actor ID but not Actor name: 0
We drop all the rows that have NA in the column "Freebase actor ID"

total len: 449854
missing Wikipedia movie ID: 0
missing Freebase movie ID: 0
missing Movie release date: 9980
missing Character name: 257389
missing Actor date of birth: 105330
missing Actor gender: 44794
missing Actor height (in meters): 295030
missing Actor ethnicity (Freebase ID): 343796
missing Actor name: 413
missing Actor age at movie release: 157298
missing Freebase character/actor map ID: 0
missing 

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [51]:
# Per-column missing-value counts for the movie metadata, followed by a
# preview of its first ten rows (the last expression is what the cell displays).
print_missing_stats(movie_metadata_df)

movie_metadata_df.head(10)

total len: 81741
missing Wikipedia movie ID: 0
missing Freebase movie ID: 0
missing Movie name: 0
missing Movie release date: 6902
missing Movie box office revenue: 73340
missing Movie runtime: 20450
missing Movie languages (Freebase ID:name tuples): 0
missing Movie countries (Freebase ID:name tuples): 0
missing Movie genres (Freebase ID:name tuples): 0


Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"
5,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen..."
6,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant..."
7,10408933,/m/02qc0j7,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/04t36"": ""Musical"", ""/m/01z4y"": ""Comedy"", ..."
8,9997961,/m/06_y2j7,Contigo y aquí,1974,,,"{""/m/06nm1"": ""Spanish Language""}","{""/m/0jgd"": ""Argentina""}","{""/m/04t36"": ""Musical"", ""/m/07s9rl0"": ""Drama"",..."
9,2345652,/m/075f66,City of the Dead,1960,,76.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/03npn"": ""Horror"", ""/m/0fdjb"": ""Supernatur..."


In [7]:
# Number of rows loaded from plot_summaries.txt.
print("total len:", len(plot_summaries_df))

# Preview disabled (left commented out in the original notebook).
# plot_summaries_df.head(10)

total len: 42303


In [8]:
import requests

def download_sparql_csv(query, file_path, timeout=90):
    """Run a SPARQL query against the Wikidata endpoint and save the CSV response.

    Parameters
    ----------
    query : str
        SPARQL query text, sent as the ``query`` URL parameter.
    file_path : str
        Destination file; it is only written when the request succeeds.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the calling loop indefinitely.

    Returns
    -------
    bool
        True if the server answered with status 200 and the file was written,
        False otherwise (the reason is printed).
    """
    url = 'https://query.wikidata.org/sparql'

    try:
        response = requests.get(url, params={'query': query}, headers={'Accept': 'text/csv'}, timeout=timeout)
    except requests.RequestException as error:
        # Connection errors and timeouts are reported like HTTP errors, so
        # callers can keep relying on the boolean result to decide on a retry.
        print(f"Error: Failed to retrieve data. {error}")
        return False

    if response.status_code != 200:
        print(f"Error: Failed to retrieve data. Status code {response.status_code}")
        return False

    with open(file_path, 'wb') as file:
        file.write(response.content)
    return True

In [7]:
# SPARQL template for the duration lookup: selects ?s, its label, ?freebaseID
# (matched through wdt:P646) and, when present, ?duration (wdt:P2047).
# The lone "|" line inside the VALUES block is a placeholder meant to be
# replaced with the quoted Freebase IDs, one per line, before sending the query.
generic_query = '''
SELECT ?s ?sLabel ?freebaseID ?duration
WHERE {
  VALUES ?freebaseID {
|
  }
  
  ?s wdt:P646 ?freebaseID .
  
  OPTIONAL { ?s wdt:P2047 ?duration }
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
'''

In [8]:
# n = len(movie_metadata_df)
# step = 100

# low = 0
# c = 1
# while low < n:
#     up = min(low + step, n)

#     ids_string = "\n".join(movie_metadata_df["Freebase movie ID"][low:up].astype(str).apply(lambda x: "\"" + x + "\""))
#     query = generic_query.replace("|",ids_string)
    
#     success = download_sparql_csv(query, f"AdditionalData/duration_{c:04d}.csv")
    
#     if success:
#         low = up
#         c += 1
    
#     print(f"{up}/{n} done")

# pd.concat([pd.read_csv(f"AdditionalData/duration_{i:04d}.csv") for i in range(1,c)], ignore_index=True).to_csv("movie_duration_from_wikidata.csv", index=False)

In [13]:
import re

# Durations previously downloaded from Wikidata.
durations_df = pd.read_csv("movie_duration_from_wikidata.csv")

# How many result rows each Freebase ID received.
counts_durations_df = durations_df["freebaseID"].value_counts()

# Exploratory peek at IDs that came back with more than one row. This is not
# the last expression of the cell, so its value is computed but not displayed.
has_multiple_rows = durations_df["freebaseID"].apply(lambda freebase_id: counts_durations_df[freebase_id] > 1)
durations_df[has_multiple_rows].head(20)

# A label that is just "Q" followed by digits (no human-readable text).
pattern = r'^Q\d+$'
def merge_labels(labels):
    """Pick one representative label from a group of labels.

    Parameters
    ----------
    labels : pandas.Series
        Labels of all rows that share one Freebase ID.

    Returns
    -------
    The first string label that does not look like a bare ``Q<digits>``
    identifier. If every string label is such an identifier, the last of them
    is returned. Missing (non-string) entries are skipped; if the group holds
    no string at all, its last element is returned unchanged.
    """
    # Default for a group without any string label (e.g. a label that pandas
    # parsed as NaN when reading the CSV).
    fallback = labels.iloc[-1]
    for label in labels:
        if not isinstance(label, str):
            # re.match would raise TypeError on NaN/None.
            continue
        if not re.match(pattern, label):
            return label
        fallback = label
    return fallback

def first(l):
    """Return the element at position 0 of the Series `l` (positional, index labels are ignored)."""
    return l.iat[0]

def max_skip_na(l):
    """Return the largest value of the Series `l` after discarding missing entries."""
    present_values = l.dropna()
    return present_values.max()

# Collapse the rows to one per Freebase ID: keep the first entity URI, a
# representative label and the largest non-missing duration, then turn the
# group key back into a regular column.
aggregations = {'s': first, 'sLabel': merge_labels, 'duration': max_skip_na}
result = durations_df.groupby('freebaseID').agg(aggregations).reset_index()

In [14]:
# Attach the aggregated Wikidata columns to the movie metadata, matching on the Freebase ID.
movies_df = pd.merge(movie_metadata_df, result, left_on="Freebase movie ID", right_on="freebaseID", how="outer")

# Prefer the runtime shipped with the dataset and fall back to the Wikidata
# duration only where it is missing. fillna does this column-wise, giving the
# same values as a row-by-row apply without the per-row Python overhead.
movies_df["true_duration"] = movies_df["Movie runtime"].fillna(movies_df["duration"])

print_missing_stats(movies_df)

81741 75376 81741


In [1]:
# Distinct ethnicity Freebase IDs that appear in the character metadata.
# Missing values are removed explicitly rather than by slicing off the first
# unique value, which is only correct when NaN happens to be the first one seen.
ethnicity_ids = character_metadata_df["Actor ethnicity (Freebase ID)"].dropna().unique()

# SPARQL template, filled in with str.format: literal braces are doubled and
# {ids_string} receives the quoted Freebase IDs, one per line.
basic_query = '''
SELECT ?s ?sLabel ?freebaseID
WHERE {{
  
  VALUES ?freebaseID {{ 
{ids_string}
  }}
  
  ?s wdt:P646 ?freebaseID .
  
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" . }}
  
}}
'''

ids_string = "\n".join(f'"{freebase_id}"' for freebase_id in ethnicity_ids.astype(str))

query = basic_query.format(ids_string = ids_string)

print(query)

NameError: name 'character_metadata_df' is not defined

In [10]:
# SPARQL template for the cast lookup. Note that this reassigns the name
# `generic_query` used earlier in the notebook for the duration query.
# For every Freebase ID it selects the matching entity ?s and, when present,
# each p:P161 statement with its ps:P161 value (?castMember) and the optional
# pq:P453 (?character) and pq:P4633 (?role) qualifiers.
# The lone "|" line inside the VALUES block is the placeholder that the
# download loop replaces with the quoted Freebase IDs, one per line.
generic_query = '''
SELECT ?s ?sLabel ?freebaseID ?castMember ?castMemberLabel ?character ?characterLabel ?roleLabel
WHERE {
  
  VALUES ?freebaseID { 
|
  }
  
  ?s wdt:P646 ?freebaseID .

  OPTIONAL { 
    ?s p:P161 ?castStatement .
    ?castStatement ps:P161 ?castMember .
    OPTIONAL { ?castStatement pq:P453 ?character }
    OPTIONAL { ?castStatement pq:P4633 ?role }
  }
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
  
}
'''

import time  # used only for the retry back-off in this cell

# Folder for the per-batch CSV files; created up front so the first write cannot fail.
OUTPUT_DIR = "tempData"
# Give up on a batch after this many failures in a row instead of retrying forever.
MAX_CONSECUTIVE_FAILURES = 10
os.makedirs(OUTPUT_DIR, exist_ok=True)

n = len(movie_metadata_df)
step = 100  # number of Freebase IDs sent per query

low = 0
c = 1  # 1-based index of the next batch file to write
consecutive_failures = 0
while low < n:
    up = min(low + step, n)

    batch_ids = movie_metadata_df["Freebase movie ID"].iloc[low:up].astype(str)
    ids_string = "\n".join(f'"{freebase_id}"' for freebase_id in batch_ids)
    query = generic_query.replace("|", ids_string)

    success = download_sparql_csv(query, f"{OUTPUT_DIR}/actors_roles_{c:04d}.csv")

    if success:
        low = up
        c += 1
        consecutive_failures = 0
        print(f"{up}/{n} done")
    else:
        consecutive_failures += 1
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            raise RuntimeError(f"Batch {low} - {up} failed {consecutive_failures} times in a row; aborting")
        print(f"{low} - {up} to redo")
        # Back off before retrying the same batch (the endpoint answers 429
        # when queried too fast); wait 2, 4, 8, ... seconds, capped at 60.
        time.sleep(min(2 ** consecutive_failures, 60))

# Stitch all batch files together into a single CSV.
pd.concat([pd.read_csv(f"{OUTPUT_DIR}/actors_roles_{i:04d}.csv") for i in range(1, c)], ignore_index=True).to_csv("actors_roles_from_wikidata.csv", index=False)

100/81741 done
200/81741 done
300/81741 done
400/81741 done
500/81741 done
600/81741 done
700/81741 done
800/81741 done
900/81741 done
1000/81741 done
1100/81741 done
1200/81741 done
1300/81741 done
1400/81741 done
1500/81741 done
1600/81741 done
1700/81741 done
1800/81741 done
1900/81741 done
2000/81741 done
2100/81741 done
2200/81741 done
2300/81741 done
2400/81741 done
2500/81741 done
2600/81741 done
2700/81741 done
2800/81741 done
2900/81741 done
3000/81741 done
3100/81741 done
3200/81741 done
3300/81741 done
3400/81741 done
Error: Failed to retrieve data. Status code 429
3400 - 3500 to redo
3500/81741 done
3600/81741 done
3700/81741 done
3800/81741 done
3900/81741 done
4000/81741 done
4100/81741 done
4200/81741 done
4300/81741 done
4400/81741 done
4500/81741 done
4600/81741 done
4700/81741 done
4800/81741 done
4900/81741 done
5000/81741 done
5100/81741 done
5200/81741 done
5300/81741 done
5400/81741 done
5500/81741 done
5600/81741 done
5700/81741 done
5800/81741 done
5900/81741 do

FileNotFoundError: [Errno 2] No such file or directory: 'AdditionalData/actors_roles_0001.csv'

In [35]:
from fuzzywuzzy import fuzz

def names_similarity(str1, str2):
    """Case-insensitive similarity of two strings.

    Both inputs are lower-cased, compared with `fuzz.ratio`, and the score is
    divided by 100 (presumably yielding a value between 0 and 1).
    """
    lowered_first, lowered_second = str1.lower(), str2.lower()
    raw_score = fuzz.ratio(lowered_first, lowered_second)
    return raw_score / 100.

In [42]:
def time_to_minutes(time_string):
    """Convert a runtime string such as ``"1h 46m"`` into a number of minutes.

    The hours are the first integer followed by an "h" and the minutes the
    first integer followed by an "m"; whitespace between the number and the
    unit and any longer unit spelling ("hr", "min", ...) are tolerated, and
    matching is case-insensitive. A component that is absent counts as 0, so
    a string without any recognisable component yields 0.

    Parameters
    ----------
    time_string : str
        Runtime text, e.g. "2h", "45m", "1h 46m", "1h30m" or "90 min".

    Returns
    -------
    int
        Total number of minutes.
    """
    import re  # local import keeps this helper usable regardless of cell execution order

    def _component(unit):
        # Number immediately (optionally after spaces) followed by the unit letter.
        match = re.search(r'(\d+)\s*' + unit, time_string, re.IGNORECASE)
        return int(match.group(1)) if match else 0

    hours = _component('h')
    minutes = _component('m')
    return hours * 60 + minutes

def safe_int_conversion(value, default):
    """Convert `value` to an int, returning `default` when that is not possible.

    Parameters
    ----------
    value
        Anything accepted by ``int()``.
    default
        Returned when ``int(value)`` fails.

    Returns
    -------
    ``int(value)`` or `default`.
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text such as "" or "abc";
        # TypeError: None or another non-convertible type;
        # OverflowError: infinite floats.
        return default

In [46]:
import requests
from bs4 import BeautifulSoup

def get_rotten_url_from_movie_name(movie_name, year = None, toll = 0.8):
    """Search Rotten Tomatoes for `movie_name` and return the link of the best-matching result.

    A search result is accepted only if its title similarity to `movie_name`
    is strictly greater than `toll` and, when both `year` and the result's
    release year are known, the two differ by at most one. Among accepted
    results the one with the highest similarity wins.

    Returns the link of the winning result, or None if nothing qualifies.
    """
    search_url = "https://www.rottentomatoes.com/search?search=" + requests.utils.quote(movie_name, safe='')
    response = requests.get(search_url)
    page = BeautifulSoup(response.content, 'html.parser')

    # Collect (title, release year, link) for every result row on the page.
    candidates = []
    for result_row in page.find_all('search-page-media-row'):
        title_tag = result_row.find('a', {'slot': 'title'})
        if title_tag:
            title, link = title_tag.get_text(strip=True), title_tag.get('href')
        else:
            title, link = '', ''
        release_year = safe_int_conversion(result_row.get('releaseyear', ''), None)
        candidates.append((title, release_year, link))

    # Keep the most similar candidate whose year is compatible.
    best_similarity = toll
    best_link = None
    for title, release_year, link in candidates:
        similarity = names_similarity(title, movie_name)
        if similarity <= best_similarity:
            continue
        year_compatible = year is None or release_year is None or abs(release_year - year) <= 1
        if year_compatible:
            best_similarity = similarity
            best_link = link
    return best_link

# Quick manual check with one title from the dataset (performs a live HTTP request).
print(get_rotten_url_from_movie_name("The Sorcerer's Apprentice", 2001))

https://www.rottentomatoes.com/m/1170191-sorcerers_apprentice


In [61]:

def _tag_text(tag):
    """Return the whitespace-stripped text of a BeautifulSoup tag, or None if the tag was not found."""
    return None if tag is None else tag.get_text(strip=True)


def _rotten_score(soup, slot):
    """Read the percentage shown inside the `rt-button` with the given slot; None if absent or not a number."""
    button = soup.find('rt-button', {'slot': slot})
    score_text = _tag_text(button.find('rt-text')) if button is not None else None
    if score_text is None:
        return None
    return safe_int_conversion(score_text.replace("%", ""), None)


def get_rotten_movie_metadata(url, toll = 0.95):
    """Scrape title, scores and a few info fields from a Rotten Tomatoes movie page.

    Parameters
    ----------
    url : str or None
        Address of the movie page. None yields an empty dict without any request.
    toll : float, optional
        A field label on the page is recognised when its similarity to the
        expected label is strictly greater than this threshold.

    Returns
    -------
    dict
        Empty when `url` is None. Otherwise always contains "title",
        "critics_score" and "audience_score" (each None when the page element
        is missing or, for scores, not numeric), plus "rating", "genre",
        "language" and "runtime" (in minutes) for every such field found.
    """
    movie_metadata = dict()

    if url is None:
        return movie_metadata
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")

    # Missing page elements produce None instead of an AttributeError, so one
    # incomplete page does not abort a whole scraping run.
    movie_metadata["title"] = _tag_text(soup.find('rt-text', {'slot': 'title'}))
    movie_metadata["critics_score"] = _rotten_score(soup, 'criticsScore')
    movie_metadata["audience_score"] = _rotten_score(soup, 'audienceScore')

    # Label shown on the page -> key used in the returned dict.
    labelled_fields = {
        "Rating": "rating",
        "Genre": "genre",
        "Original Language": "language",
        "Runtime": "runtime",
    }
    for wrap in soup.find_all('div', {'class': 'category-wrap', 'data-qa': 'item'}):
        wrap_title = _tag_text(wrap.find('rt-text', {'class': 'key'}))
        value = _tag_text(wrap.find(['rt-text', 'rt-link'], {'data-qa': 'item-value'}))
        if wrap_title is None or value is None:
            continue

        for label, key in labelled_fields.items():
            if names_similarity(wrap_title, label) > toll:
                # The runtime is stored as minutes; every other field as raw text.
                movie_metadata[key] = time_to_minutes(value) if key == "runtime" else value

    return movie_metadata

# Try the scraper on the first 50 movies of the dataset.
for i in range(50):
    row = movie_metadata_df.iloc[i]
    movie_title = row["Movie name"]

    # The release date is either missing or a string starting with the year.
    release_date = row["Movie release date"]
    year = safe_int_conversion(release_date.split('-')[0], None) if pd.notna(release_date) else None

    print(movie_title, year)
    rotten_url = get_rotten_url_from_movie_name(movie_title, year)
    print(get_rotten_movie_metadata(rotten_url))

Ghosts of Mars 2001
{}
Getting Away with Murder: The JonBenét Ramsey Mystery 2000
{}
Brun bitter 1988
{}
White Of The Eye 1987
{'title': 'White of the Eye', 'critics_score': 50, 'audience_score': 57, 'rating': 'R', 'genre': 'Mystery & Thriller', 'language': 'English', 'runtime': 110}
A Woman in Flames 1983
{'title': 'A Woman in Flames', 'critics_score': None, 'audience_score': 62, 'genre': 'Drama', 'language': 'German', 'runtime': 107}
The Gangsters 1913
{}
The Sorcerer's Apprentice 2002
{'title': "The Sorcerer's Apprentice", 'critics_score': None, 'audience_score': 32, 'genre': 'Fantasy', 'language': 'English', 'runtime': 89}
Alexander's Ragtime Band 1938
{'title': "Alexander's Ragtime Band", 'critics_score': 83, 'audience_score': 46, 'genre': 'Musical', 'language': 'English', 'runtime': 105}
Contigo y aquí 1974
{}
City of the Dead 1960
{}
Sarah and Son 1930
{'title': 'Sarah and Son', 'critics_score': None, 'audience_score': 22, 'genre': 'Drama', 'language': 'English', 'runtime': 86}
