In [1]:
import pandas as pd

# Relative paths to the three MovieSummaries data files.
plot_summaries_path = "MovieSummaries/plot_summaries.txt"
movie_metadata_path = "MovieSummaries/movie.metadata.tsv"
character_metadata_path = "MovieSummaries/character.metadata.tsv"

# Both TSV files are read with header=None, so the column names are
# supplied explicitly, in file order.
movie_columns = [
    "Wikipedia_movie_ID",
    "Freebase_movie_ID",
    "Movie_name",
    "Movie_release_date",
    "Movie_box_office_revenue",
    "Movie_runtime",
    "Movie_languages",
    "Movie_countries",
    "Movie_genres",
]

character_columns = [
    "Wikipedia_movie_ID",
    "Freebase_movie_ID",
    "Movie_release_date",
    "Character_name",
    "Actor_date_of_birth",
    "Actor_gender",
    "Actor_height",
    "Actor_ethnicity",
    "Actor_name",
    "Actor_age_at_movie_release",
    "Freebase_character_actor_map_ID",
    "Freebase_character_ID",
    "Freebase_actor_ID",
]

# One row per movie.
movie_metadata = pd.read_csv(
    movie_metadata_path, sep='\t', header=None, names=movie_columns
)

# One row per (movie, character/actor) entry.
character_metadata = pd.read_csv(
    character_metadata_path, sep='\t', header=None, names=character_columns
)

In [2]:
# Parse the release-date column into datetimes using the '%Y-%m-%d' format.
# errors='coerce' turns every value that fails to parse into NaT instead of raising.
# NOTE(review): partial dates (e.g. year-only) may end up as NaT here - confirm
# that this is intended for the dataset.
release_dates_raw = movie_metadata['Movie_release_date']
movie_metadata['Movie_release_date'] = pd.to_datetime(
    release_dates_raw, format='%Y-%m-%d', errors='coerce'
)

**Crawling revenue information from websites**

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

base_url = 'https://www.boxofficemojo.com/year/world/'

request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}

start_year = 1977
end_year = 2017

# Seconds to wait for a single response; without a timeout one stalled
# connection would block the whole crawl indefinitely.
request_timeout = 30


def _parse_first_table(html, year):
    """Turn the first <table> of a page into a DataFrame tagged with its year.

    Parameters
    ----------
    html : str
        Raw HTML of one yearly page.
    year : int
        Year the page belongs to; stored in an added ``Year`` column.

    Returns
    -------
    pandas.DataFrame or None
        One row per table row that contains ``<td>`` cells, with the ``<th>``
        texts as column names; ``None`` when the page has no table or the
        table has no data rows.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if not table:
        return None

    column_headers = [header.text.strip() for header in table.find_all('th')]

    rows = []
    # The first <tr> is skipped, as in the original loop (header row).
    for row in table.find_all('tr')[1:]:
        cols = [ele.text.strip() for ele in row.find_all('td')]
        if cols:
            rows.append(cols)

    if not rows:
        return None

    df = pd.DataFrame(rows, columns=column_headers)
    df['Year'] = year
    return df


all_data = []

for year in range(start_year, end_year + 1):
    url = f"{base_url}{year}/"
    print(year)

    try:
        response = requests.get(url, headers=request_headers, timeout=request_timeout)
    except requests.RequestException as exc:
        # Best effort: report the failed year and carry on with the next one.
        print(f"Request failed for {year}: {exc}")
        response = None

    if response is not None:
        if response.status_code == 200:
            df = _parse_first_table(response.text, year)
            if df is not None:
                all_data.append(df)
        else:
            # Previously skipped silently; make the gap visible.
            print(f"Skipping {year}: HTTP {response.status_code}")

    # Pause between requests to avoid hammering the site.
    time.sleep(1)

if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    # File name follows the configured year range
    # (box_office_1977_to_2017.csv for the values above).
    final_df.to_csv(f'box_office_{start_year}_to_{end_year}.csv', index=False)
else:
    # pd.concat raises on an empty list; keep the cell runnable and avoid
    # overwriting an existing CSV with nothing.
    final_df = pd.DataFrame()
    print("No box office data was scraped; CSV not written.")

**Using the TMDB API to fill NaN values**

Due to regulations, we neither display nor store any of the data we acquire; we use it only for analysis.

In [7]:
import os

# TMDB API key. Read from the TMDB_API_KEY environment variable so the key
# never has to be typed into (and saved with) the notebook. Falls back to the
# original empty placeholder when the variable is not set.
API_KEY = os.environ.get('TMDB_API_KEY', '')

This is for all data

In [17]:
def get_movie_id(movie_name):
    """Search TMDB for a title and return the id of the first hit.

    Parameters
    ----------
    movie_name : str
        Title to search for. Non-string or blank values (for example NaN
        coming from a pandas column) are reported as not found without
        calling the API.

    Returns
    -------
    The ``id`` field of the first search result, or ``None`` when the title
    is unusable, the search returns no results, or the response carries no
    ``results`` list (e.g. an API error payload).
    """
    if not isinstance(movie_name, str) or not movie_name.strip():
        print(f"Movie not found: {movie_name}")
        return None

    search_url = 'https://api.themoviedb.org/3/search/movie'
    params = {
        'api_key': API_KEY,
        'query': movie_name,
        'language': 'en-US'
    }
    response = requests.get(search_url, params=params)
    data = response.json()

    # .get(): an error payload has no 'results' key and must not raise KeyError.
    results = data.get('results')
    if results:
        return results[0]['id']

    print(f"Movie not found: {movie_name}")
    return None

def get_movie_details(movie_id):
    """Fetch the TMDB movie record for ``movie_id`` and return the decoded JSON.

    The request asks for English (``en-US``) text and authenticates with the
    notebook-level ``API_KEY``. The response body is returned as-is, without
    checking the HTTP status.
    """
    query = dict(api_key=API_KEY, language='en-US')
    endpoint = f'https://api.themoviedb.org/3/movie/{movie_id}'
    return requests.get(endpoint, params=query).json()

def get_movie_keywords(movie_id):
    """Fetch the TMDB keywords payload for ``movie_id`` and return the decoded JSON.

    Authenticates with the notebook-level ``API_KEY``; the response body is
    returned as-is, without checking the HTTP status.
    """
    endpoint = f'https://api.themoviedb.org/3/movie/{movie_id}/keywords'
    return requests.get(endpoint, params=dict(api_key=API_KEY)).json()

def get_movie_certification(movie_id, country_code='US'):
    """Return the certifications TMDB lists for a movie in one country.

    Parameters
    ----------
    movie_id
        TMDB movie id, interpolated into the ``release_dates`` endpoint URL.
    country_code : str, optional
        Value matched against each result's ``iso_3166_1`` field
        (default ``'US'``).

    Returns
    -------
    list of str
        Unique, non-empty certification strings for ``country_code`` in
        sorted order; empty when there is no matching entry or the response
        has no ``results`` (e.g. an API error payload).
    """
    release_dates_url = f'https://api.themoviedb.org/3/movie/{movie_id}/release_dates'
    params = {
        'api_key': API_KEY
    }
    response = requests.get(release_dates_url, params=params)
    data = response.json()

    certifications = set()
    for result in data.get('results', []):
        if result.get('iso_3166_1') != country_code:
            continue
        for release in result.get('release_dates', []):
            certification = release.get('certification')
            # Releases without a rating carry an empty/missing certification.
            if certification:
                certifications.add(certification)

    # Return the unique certifications. Sorting makes the order stable:
    # iterating a set of strings can differ from one interpreter run to the next.
    return sorted(certifications)

def extract_movie_info(movie_data, keywords, certification):
    """Flatten the API payloads for one movie into a single record.

    Parameters
    ----------
    movie_data : dict
        Movie details payload; missing scalar fields come back as ``None``
        and a missing ``genres`` list is treated as empty.
    keywords : dict
        Keywords payload; a missing ``keywords`` list is treated as empty.
    certification
        Stored unchanged under the ``certification`` key.

    Returns
    -------
    dict
        Keys, in order: ``title``, ``rating``, ``vote_count``, ``genres``,
        ``budget``, ``revenue``, ``keywords``, ``certification``.
    """
    genre_names = []
    for entry in movie_data.get('genres', []):
        genre_names.append(entry['name'])

    keyword_names = []
    for entry in keywords.get('keywords', []):
        keyword_names.append(entry['name'])

    return dict(
        title=movie_data.get('title'),
        rating=movie_data.get('vote_average'),
        vote_count=movie_data.get('vote_count'),
        genres=genre_names,
        budget=movie_data.get('budget'),
        revenue=movie_data.get('revenue'),
        keywords=keyword_names,
        certification=certification,
    )

In [None]:
from tqdm import tqdm

# Titles to look up: every movie in the metadata table.
movie_names = movie_metadata['Movie_name']

# One info record per title that the search resolves; titles without a
# match are skipped.
movie_data_all = []
for name in tqdm(movie_names, desc='Processing movies', unit='movie'):
    movie_id = get_movie_id(name)
    if not movie_id:
        continue

    # Three further API calls per resolved movie.
    details = get_movie_details(movie_id)
    keywords = get_movie_keywords(movie_id)
    certification = get_movie_certification(movie_id)

    info = extract_movie_info(details, keywords, certification)
    movie_data_all.append(info)


In [None]:
# Print a plain-text summary of every collected record, separated by a rule.
for movie in movie_data_all:
    summary_lines = (
        f"Title: {movie['title']}",
        f"Genres: {', '.join(movie['genres'])}",
        f"Budget: ${movie['budget']}",
        f"Revenue: ${movie['revenue']}",
        f"Average Rating: {movie['rating']} (based on {movie['vote_count']} votes)",
        f"Keywords: {', '.join(movie['keywords'])}",
        f"Certification: {', '.join(movie['certification'])}",
        '-----------------------',
    )
    print(*summary_lines, sep='\n')

In [21]:
def get_movie_id(movie_name):
    """Search TMDB for a title and return the id of the first hit.

    Parameters
    ----------
    movie_name : str
        Title to search for. Non-string or blank values (for example NaN
        coming from a pandas column) are reported as not found without
        calling the API.

    Returns
    -------
    The ``id`` field of the first search result, or ``None`` when the title
    is unusable, the search returns no results, or the response carries no
    ``results`` list (e.g. an API error payload).
    """
    if not isinstance(movie_name, str) or not movie_name.strip():
        print(f"Movie not found: {movie_name}")
        return None

    search_url = 'https://api.themoviedb.org/3/search/movie'
    params = {
        'api_key': API_KEY,
        'query': movie_name,
        'language': 'en-US'
    }
    response = requests.get(search_url, params=params)
    data = response.json()

    # .get(): an error payload has no 'results' key and must not raise KeyError.
    results = data.get('results')
    if results:
        return results[0]['id']

    print(f"Movie not found: {movie_name}")
    return None

def get_movie_details(movie_id):
    """Return the decoded JSON of the TMDB movie record for ``movie_id``.

    English (``en-US``) text is requested and the notebook-level ``API_KEY``
    is sent as the ``api_key`` parameter. The HTTP status is not inspected.
    """
    reply = requests.get(
        f'https://api.themoviedb.org/3/movie/{movie_id}',
        params={'api_key': API_KEY, 'language': 'en-US'},
    )
    payload = reply.json()
    return payload

def extract_movie_info(movie_data, keywords=None, certification=None):
    """Build a flat record from TMDB payloads for one movie.

    This definition replaces an earlier three-argument ``extract_movie_info``
    in the same notebook. Accepting the two extra arguments as optional keeps
    the earlier call style working even after this cell has been run, instead
    of failing with a ``TypeError``.

    Parameters
    ----------
    movie_data : dict
        Movie details payload; missing fields come back as ``None``.
    keywords : dict, optional
        Keywords payload. When given, the full record is returned.
    certification : list, optional
        Certifications for the movie. When given, the full record is returned.

    Returns
    -------
    dict
        With only ``movie_data``: ``title``, ``rating``, ``vote_count``.
        With ``keywords`` and/or ``certification``: those three keys followed
        by ``genres``, ``budget``, ``revenue``, ``keywords``, ``certification``.
    """
    info = {
        'title': movie_data.get('title'),
        'rating': movie_data.get('vote_average'),
        'vote_count': movie_data.get('vote_count'),
    }

    # Ratings-only call: exactly the original three-key record.
    if keywords is None and certification is None:
        return info

    info['genres'] = [genre['name'] for genre in movie_data.get('genres', [])]
    info['budget'] = movie_data.get('budget')
    info['revenue'] = movie_data.get('revenue')
    info['keywords'] = [
        keyword['name'] for keyword in (keywords or {}).get('keywords', [])
    ]
    # Keep this a list so ', '.join(...) over it keeps working.
    info['certification'] = certification if certification is not None else []
    return info

In [22]:
from tqdm import tqdm

# Trial run on the first five titles only.
movie_names = movie_metadata['Movie_name'][:5]

# One ratings record per title that the search resolves; the rest are skipped.
movie_data_ratings = []
for name in tqdm(movie_names, desc='Processing movies', unit='movie'):
    movie_id = get_movie_id(name)
    if not movie_id:
        continue
    details = get_movie_details(movie_id)
    info = extract_movie_info(details)
    movie_data_ratings.append(info)

Processing movies: 100%|██████████| 5/5 [00:02<00:00,  1.74movie/s]


**Graph**

In [3]:
import numpy as np
import networkx as nx
import plotly.graph_objs as go

# TODO: replace this placeholder with real ratings (see the commented merge below).
# Until then, draw uniform random ratings in [0.0, 5.0) from a seeded generator
# so that every run of the notebook produces the same values.
ratings_rng = np.random.default_rng(42)
movie_metadata['ratings'] = ratings_rng.uniform(0.0, 5.0, size=len(movie_metadata))

# ratings_df = pd.read_csv('ratings.csv')
# movie_metadata = pd.merge(movie_metadata, ratings_df, on='Movie_name', how='right')

In [4]:
# Attach the movie-level columns to every character row; rows whose
# Wikipedia_movie_ID has no match on the other side are dropped (inner join).
merged_data = character_metadata.merge(
    movie_metadata, on='Wikipedia_movie_ID', how='inner'
)

# Keep only rows with a usable actor name: neither missing nor an empty string.
actor_names = merged_data['Actor_name']
merged_data = merged_data[actor_names.notna() & (actor_names != '')]


In [25]:
from itertools import combinations

# Tunable thresholds (previously inline literals).
TOP_N_ACTORS = 60           # number of highest-degree actors kept for the plot
MIN_RATINGS_PER_ACTOR = 5   # an actor needs at least this many rating entries
TOP_RATINGS_PER_ACTOR = 10  # average over at most this many of the best ratings

# Co-appearance graph: one node per actor name, one edge per pair of actors
# that share at least one movie.
G = nx.Graph()

movie_actor_group = merged_data.groupby('Wikipedia_movie_ID')['Actor_name'].apply(list)

for actors in movie_actor_group:
    # De-duplicate the cast, then sort: iteration order of a set of strings
    # can change between interpreter runs, which would change node insertion
    # order and therefore which actors win degree ties at the top-N cut.
    unique_actors = sorted(set(actors))
    G.add_edges_from(combinations(unique_actors, 2))

degree_dict = dict(G.degree())
nx.set_node_attributes(G, degree_dict, 'degree')

# Highest-degree actors in the full graph.
top_actors = sorted(degree_dict, key=degree_dict.get, reverse=True)[:TOP_N_ACTORS]

G_sub = G.subgraph(top_actors).copy()

actor_movie_ratings = merged_data.groupby('Actor_name')['ratings'].apply(list)

# Average of each actor's best ratings; actors with too few entries get none.
actor_avg_ratings = {}
for actor, ratings in actor_movie_ratings.items():
    if len(ratings) >= MIN_RATINGS_PER_ACTOR:
        best_ratings = sorted(ratings, reverse=True)[:TOP_RATINGS_PER_ACTOR]
        actor_avg_ratings[actor] = np.mean(best_ratings)

for node in G_sub.nodes():
    G_sub.nodes[node]['avg_rating'] = actor_avg_ratings.get(node, None)
    # Overwrites the full-graph degree with the degree inside the subgraph,
    # measured before unrated nodes are removed below.
    G_sub.nodes[node]['degree'] = G_sub.degree[node]

# Drop actors for which no average rating could be computed.
nodes_to_remove = [node for node, attr in G_sub.nodes(data=True) if attr['avg_rating'] is None]
G_sub.remove_nodes_from(nodes_to_remove)

# Seeded spring layout; `pos` maps each node to its 2-D coordinates.
pos = nx.spring_layout(G_sub, k=0.5, iterations=50, seed=42)

# Per-node plotting attributes, all in G_sub node-iteration order.
node_x, node_y = [], []
node_text, node_hovertext = [], []
node_size, node_color = [], []

for node, attrs in G_sub.nodes(data=True):
    x, y = pos[node]
    degree = attrs['degree']
    avg_rating = attrs['avg_rating']

    node_x.append(x)
    node_y.append(y)
    # Marker size grows with log(degree + 1), scaled by 20.
    node_size.append(np.log(degree + 1) * 20)
    node_color.append(avg_rating)
    node_text.append(f"{node}")
    node_hovertext.append(f"Degree: {degree}<br>Avg Rating: {avg_rating:.2f}")

# Edge segments as flat coordinate lists; the None after each pair breaks
# the line so consecutive edges are not joined.
edge_x, edge_y = [], []
for source, target in G_sub.edges():
    (x0, y0), (x1, y1) = pos[source], pos[target]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

# All edges drawn as one line trace; hover is disabled for edges.
edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=1, color='#888'),
    hoverinfo='none',
    mode='lines'
)

# Nodes: marker size encodes degree, marker colour encodes average rating.
node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers+text',
    text=node_text,
    hovertext=node_hovertext,
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='Inferno',
        reversescale=True,
        color=node_color,
        size=node_size,
        colorbar=dict(
            thickness=15,
            # Nested title dict replaces the deprecated `titleside` attribute.
            title=dict(text='Average Rating', side='right'),
            xanchor='left'
        ),
        line_width=2
    ),
    textposition='middle center',
    textfont_size=10
)

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    # The title previously said "Top 20" although a different
                    # number of actors is selected; derive it from the selection.
                    # title.font replaces the deprecated `titlefont_size`.
                    title=dict(
                        text=f'Top {len(top_actors)} Actors Network Graph',
                        font=dict(size=16)
                    ),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40),
                    annotations=[dict(
                        text="Size: degree，colour: avg rating",
                        showarrow=False,
                        xref="paper", yref="paper",
                        x=0.005, y=-0.002
                    )],
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
                )
)


In [26]:
# Display the figure with Plotly's 'browser' renderer, overriding the default renderer.
fig.show(renderer='browser')