In [1]:
#standard library imports
import os
import re

#imports for data + manipulation
import imdb
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

#imports for website dev/plotting
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly
import plotly.express as px
import plotly.graph_objs as go

## Get Data
Use the IMDbPY API (the `imdb` package) to fetch Rachel Weisz's profile and filmography from IMDb.

In [2]:
# IMDbPY access object; every IMDb lookup in this notebook goes through `ia`.
ia = imdb.IMDb()

In [3]:
# Presumably the names of the info sets IMDbPY can fetch for movies and for people.
# NOTE(review): neither variable is read again in the visible notebook — these
# look like exploratory leftovers; confirm before removing.
movie_cols = ia.get_movie_infoset()
star_cols = ia.get_person_infoset()

In [4]:
# Use the top search hit as "the" Rachel Weisz. Fail with a clear message if the
# search comes back empty (e.g. IMDb unreachable or the page layout changed)
# instead of an unexplained IndexError from `[0]`.
rachel_matches = ia.search_person('Rachel Weisz')
if not rachel_matches:
    raise LookupError("IMDb search returned no people for 'Rachel Weisz'")
rachel_id = rachel_matches[0].personID

In [5]:
# Fetch the full person record for the id taken from the search result.
rachel = ia.get_person(rachel_id)

In [6]:
# Acting credits come from the 'actress' entry of the first filmography item;
# keep the credit objects and, separately, just their IMDb ids.
rachel_movies = rachel['filmography'][0]['actress']
rachel_movie_ids = [credit.movieID for credit in rachel_movies]

## Data Cleaning
Pull the relevant fields for each movie in Rachel Weisz's filmography and collect them in a DataFrame, which is saved to a CSV so that the Dash app can load it easily. A helper function scrapes Box Office Mojo to get each movie's box office gross.

In [28]:
#movie.infoset2keys (to see the info pulled)
#vars(movie) (to see the attributes of Movie/Person objects)
movies = []
important_keys = ['original title', 'genres', 'runtimes', 'countries', 'country codes', 'votes',
                 'year', 'rating', 'original air date', 'kind', 'plot']

# One network fetch per credit; each row holds the values for `important_keys`
# in order, with NaN standing in for any field the title does not have.
for movie_id in rachel_movie_ids:
    movie = ia.get_movie(movie_id)
    movie_info = []
    for key in important_keys:
        try:
            movie_info.append(movie[key])
        except KeyError:
            # Only a missing field is expected here; anything else (including a
            # kernel interrupt) should surface instead of being turned into NaN.
            movie_info.append(np.nan)

    movies.append(movie_info)

In [40]:
# One row per credit, one column per requested key, plus the IMDb id of the title.
df = pd.DataFrame(movies, columns=important_keys).assign(id=rachel_movie_ids)
df.head()

Unnamed: 0,original title,genres,runtimes,countries,country codes,votes,year,rating,original air date,kind,plot,id
0,A Special Relationship,[Drama],,"[United Kingdom, Australia]","[gb, au]",,,,,,[The story of how Hollywood icon Elizabeth Tay...,11204050
1,Black Widow (2020),"[Action, Adventure, Sci-Fi]",,[United States],[us],,2020.0,,01 May 2020 (USA),movie,[A film about Natasha Romanoff in her quests b...,3480822
2,The Favourite (2018),"[Biography, Drama, History]",[119],"[Ireland, United Kingdom, United States]","[ie, gb, us]",137100.0,2018.0,7.6,21 Dec 2018 (USA),movie,"[In early 18th century England, a frail Queen ...",5083738
3,The Mercy (2018),"[Adventure, Biography, Drama, Mystery]",[112],[United Kingdom],[gb],6100.0,2018.0,6.0,30 Nov 2018 (USA),movie,[The incredible story of amateur sailor Donald...,3319730
4,Disobedience (2017),"[Drama, Romance]",[114],"[Ireland, United Kingdom, United States]","[ie, gb, us]",20359.0,2017.0,6.6,27 Apr 2018 (USA),movie,[A woman returns to her Orthodox Jewish commun...,6108178


In [30]:
def box_office(movie_id, timeout=10):
    """Scrape Box Office Mojo for a title's domestic and international gross.

    Parameters
    ----------
    movie_id : str or int
        IMDb title id without the ``tt`` prefix (it is added here).
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up, so one stalled
        connection cannot hang a whole ``.apply`` run.

    Returns
    -------
    str
        ``'<domestic>,<international>'`` with ``$`` and thousands separators
        removed, or ``'nan,nan'`` when a page cannot be fetched or does not
        have the expected layout.

    Notes
    -----
    Only request failures and "element not found" style errors are treated as
    missing data. Other errors (a missing ``lxml`` parser, a kernel interrupt)
    propagate instead of silently producing NaN for every title.
    """
    base = 'https://www.boxofficemojo.com'
    try:
        # Title page: locate the table headed "Release Group" and follow the
        # link in its second row (presumably the first release under the header).
        title_page = requests.get(base + '/title/tt' + str(movie_id), timeout=timeout)
        title_page.raise_for_status()
        soup = BeautifulSoup(title_page.text, 'lxml')
        table = soup.find_all('th', text=re.compile(r'Release Group'))[0].parent.parent
        group = table.find_all('tr', recursive=False)[1].find('a').get('href')

        # Release-group page: the "Rollout" section's container holds the totals;
        # its 2nd and 3rd <div> are read as domestic and international
        # (assumed from the page layout — TODO confirm if the site changes).
        release_page = requests.get(base + group, timeout=timeout)
        release_page.raise_for_status()
        soup = BeautifulSoup(release_page.text, 'lxml')
        earnings = soup.find_all('h2', text=re.compile(r'Rollout'))[0].parent.parent.find_all('div')
        domestic = earnings[1].find('span', {'class': 'money'}).get_text()
        international = earnings[2].find('span', {'class': 'money'}).get_text()

        # Reduce "$1,234,567" to "1234567" so the caller can cast to float.
        domestic = domestic.replace('$', '').replace(',', '')
        international = international.replace('$', '').replace(',', '')
    except (requests.RequestException, IndexError, AttributeError, TypeError):
        # Best effort: request failed, or an expected element was absent
        # (empty result list, `find` returned None, href missing).
        domestic = np.nan
        international = np.nan

    return str(domestic) + ',' + str(international)

In [41]:
# Scrape the gross for every row whose kind is 'movie'; a single .loc lookup
# selects the rows and the id column together.
gross = df.loc[df['kind'] == 'movie', 'id'].apply(box_office)

In [42]:
# Split each "domestic,international" string into two columns, turn the literal
# 'nan' placeholders into real NaN, and make both columns numeric.
gross_re = gross.str.split(',', expand=True)
gross_re.columns = ['domestic', 'international']
gross_re = gross_re.replace('nan', np.nan).astype({'domestic': float, 'international': float})

In [43]:
# Rows whose kind is 'movie', with the gross columns attached side by side
# (concat along axis 1 aligns the two frames on their index).
rachel_df = pd.concat([df[df['kind'].eq('movie')], gross_re], axis=1)

In [44]:
def _first_or_same(value):
    """Return the first element of a list/tuple, or the value itself otherwise.

    An empty sequence becomes NaN. Anything that is not a list/tuple (NaN, or a
    string that was already unwrapped) is returned unchanged, so re-running the
    cell does not chop an unwrapped string down to its first character.
    """
    if isinstance(value, (list, tuple)):
        return value[0] if value else np.nan
    return value


# IMDb returns runtimes and plots as lists; keep only the first entry of each.
rachel_df['runtimes'] = rachel_df['runtimes'].apply(_first_or_same)
rachel_df['plot'] = rachel_df['plot'].apply(_first_or_same)
# NaN in either gross column makes the total NaN as well.
rachel_df['total'] = rachel_df['domestic'] + rachel_df['international']

In [46]:
# Renumber the rows from 0 and write the table to rachel.csv in the working
# directory. With pandas' default settings the row index is written too, as the
# first (unnamed) column.
rachel_df.reset_index(drop = True).to_csv('rachel.csv')

In [None]:
# Summing the per-movie country lists within a year concatenates them, giving
# one combined list of countries per year.
countries = rachel_df.groupby('year')['countries'].sum().reset_index()

# One row per year, one column per country; each cell counts how often the
# country appears in that year's list (0 when it does not appear).
yearly = countries['countries'].apply(lambda names: pd.Series(names).value_counts()).fillna(0)
yearly['year'] = countries['year']

# Long format (Year, Country, Movie Count), zero rows dropped, ordered by year.
country_df = (
    pd.melt(yearly, id_vars='year')
    .rename(columns={'variable': 'Country', 'value': 'Movie Count', 'year': 'Year'})
    .loc[lambda frame: frame['Movie Count'] > 0]
    .sort_values('Year')
    .reset_index(drop=True)
)

In [None]:
# Animated world map: one frame per year, each country shaded by its movie count.
px.choropleth(
    country_df,
    locations='Country',
    locationmode='country names',
    color='Movie Count',
    animation_frame='Year',
)

In [None]:
#TODO: network graph, something about financials and being gay

In [None]:
# Only movies with both gross figures can be sized by their earnings.
dropped = rachel_df.dropna(subset=['domestic', 'international'])[
    ['original title', 'domestic', 'international', 'total']
]

# Every movie hangs off a single root node; the root itself has no parent.
root = 'Rachel Weisz Movies'
labels = dropped['original title'].tolist() + [root]
parents = [root] * len(dropped) + ['']

# Size each tile by total gross (previously the gross columns were selected but
# never passed to the trace, so every tile was the same size). The root gets 0
# because, with the default branchvalues='remainder', a parent's own value is
# counted in addition to its children.
values = dropped['total'].tolist() + [0]

fig = go.Figure()
fig.add_trace(go.Treemap(
    labels=labels,
    parents=parents,
    values=values,
))
fig.show()

In [None]:
# Show the plotly version this notebook was run with (plotly is imported in the
# import cell at the top of the notebook).
plotly.__version__