# Importing movie data and first glimpsse

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [41]:
# import data sets (column names are based on readme file of data set publisher)
character_metadata = pd.read_csv('./data/character.metadata.tsv', sep='\t', names=[
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie release date',
    'Character name',
    'Actor date of birth',
    'Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'Freebase character ID',
    'Freebase actor ID',
])
movie_metadata = pd.read_csv('./data/movie.metadata.tsv', sep='\t', names=[
    'Wikipedia movie ID', 
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Movie box office revenue',
    'Movie runtime',
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)'
])
name_cluster = pd.read_csv('./data/name.clusters.txt', sep='\t', names=['Name', 'Freebase ID']) # maybe wrong column names?
plot_summaries = pd.read_csv('./data/plot_summaries.txt', sep='\t', names=['Wikipedia movie ID', 'Summary'])
tvtropes = pd.read_csv('./data/tvtropes.clusters.txt', sep='\t', names=['Type', 'Freebase character/actor map ID']) # maybe wrong column names?

# variableto be able to iterate over all data sets
data_sets = [
    {'name' : 'character_metadata', 'data' : character_metadata},
    {'name' : 'movie_metadata', 'data' : movie_metadata}, 
    {'name' : 'name_cluster', 'data' : name_cluster},
    {'name' : 'plot_summaries', 'data' : plot_summaries},
    {'name' : 'tvtropes_cluster', 'data' : tvtropes}
]

In [42]:
# first glimpse of each data set and soome basic stats
for data_set in data_sets:
    print('data set:', data_set['name'])
    print('shape:', data_set['data'].shape)
    print('first five rows:\n', data_set['data'].head())
    print('description:\n', data_set['data'].describe(include='all'))
    print('\n\n')

data set: character_metadata
shape: (450669, 13)
first five rows:
    Wikipedia movie ID Freebase movie ID Movie release date  \
0              975900         /m/03vyhn         2001-08-24   
1              975900         /m/03vyhn         2001-08-24   
2              975900         /m/03vyhn         2001-08-24   
3              975900         /m/03vyhn         2001-08-24   
4              975900         /m/03vyhn         2001-08-24   

               Character name Actor date of birth Actor gender  \
0                    Akooshay          1958-08-26            F   
1  Lieutenant Melanie Ballard          1974-08-15            F   
2         Desolation Williams          1969-06-15            M   
3          Sgt Jericho Butler          1967-09-12            M   
4             Bashira Kincaid          1977-09-25            F   

   Actor height (in meters) Actor ethnicity (Freebase ID)          Actor name  \
0                     1.620                           NaN      Wanda De Jesus   
1

# Pre-processsing

In [43]:
# standardizing release date columns so that is only showing the year (essential since many entries only reveal the year and some a complete date)
movie_metadata['Movie release date'] = movie_metadata['Movie release date'].str.slice(0, 4)

# NaNs values
print('percentage of NaNs in "Movie release date" column:', movie_metadata['Movie release date'].isna().sum()/movie_metadata['Movie release date'].size)
movie_metadata.head()

percentage of NaNs in "Movie release date" column: 0.08443743042047443


Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


# Analysis of "Movie release date"

In [44]:
# basic stats about movie release year column
movie_metadata['Movie release date'].describe(include='all')

# visualise amount of movies per year
movie_metadata_release_date_analysis = movie_metadata.dropna(subset=['Movie release date']) # erase rows with NaNs
movie_metadata_release_date_analysis['Movie release date'] = movie_metadata_release_date_analysis['Movie release date'].astype(int) #essential for plotting
print('oldest movie(s) from:', movie_metadata_release_date_analysis['Movie release date'].min())
print('newest movie(s) from:', movie_metadata_release_date_analysis['Movie release date'].max())

boxplot_movies_over_time = movie_metadata_release_date_analysis.boxplot(column='Movie release date')
movies_per_year = movie_metadata_release_date_analysis.hist(column='Movie release date')
print('histogram', movies_per_year)
print('boxplot', boxplot_movies_over_time) #--> reveals faulty values due to existance of extreme outliers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_metadata_release_date_analysis['Movie release date'] = movie_metadata_release_date_analysis['Movie release date'].astype(int) #essential for plotting


oldest movie(s) from: 1010
newest movie(s) from: 2016
histogram [[<AxesSubplot:title={'center':'Movie release date'}>]]
boxplot AxesSubplot(0.125,0.11;0.775x0.77)


In [51]:
# erasing outliers which are faulty entries

## top 10 oldest movies in given data
movie_metadata['Movie release date'].sort_values().head(10)

## since the the oldest movie is "Roundhay Garden Scene" aired in 1888 all movies before this date are faulty entries
movie_metadata[movie_metadata['Movie release date'] < "1888"]["Movie release date"] = "nan" #TODO make it work
movie_metadata[movie_metadata['Movie name'] == "Hunting Season"]

## number of movies with NAN as movie release date
movie_metadata[movie_metadata['Movie release date'].isnull()].shape

## due to high number of movies with unknown movie release years, websrape move release years from imdb
    ### TODO

## count movies per decade
movie_metadata["decade"] = movie_metadata['Movie release date'].str.slice(0, 3) + "0"
movies_grouped_by_decade = movie_metadata.groupby("decade")["Wikipedia movie ID"].count()
## TODO visualise movies per decade

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_metadata[movie_metadata['Movie release date'] < "1888"]["Movie release date"] = "nan" #TODO make it work


decade
1010        1
1880        2
1890      149
1900      193
1910     2669
1920     2883
1930     4800
1940     4466
1950     5723
1960     5875
1970     6716
1980     7397
1990     9469
2000    19268
2010     5228
Name: Wikipedia movie ID, dtype: int64

# Look at TV Tropes dataset

In [6]:
tvtropes.describe()

Unnamed: 0,Type,Freebase character/actor map ID
count,501,501
unique,72,447
top,crazy_jealous_guy,"{""char"": ""Captain Jack Sparrow"", ""movie"": ""Pir..."
freq,25,5


In [7]:
tvtropes.head()

Unnamed: 0,Type,Freebase character/actor map ID
0,absent_minded_professor,"{""char"": ""Professor Philip Brainard"", ""movie"":..."
1,absent_minded_professor,"{""char"": ""Professor Keenbean"", ""movie"": ""Richi..."
2,absent_minded_professor,"{""char"": ""Dr. Reinhardt Lane"", ""movie"": ""The S..."
3,absent_minded_professor,"{""char"": ""Dr. Harold Medford"", ""movie"": ""Them!..."
4,absent_minded_professor,"{""char"": ""Daniel Jackson"", ""movie"": ""Stargate""..."


In [8]:
hmm = tvtropes.rename(columns={'Freebase character/actor map ID': "JSON"})["JSON"]
l = []
for elem in hmm:
    l.append(elem)

def write_to_file():
    with open("data/tropes_col2.json", "w") as fp:
        fp.write("[\n")
        for elem in l:
            fp.write("%s,\n" % elem)
        fp.write("]")

In [9]:
tropes_col2 = pd.read_json("tropes_col2.json")
tropes_col2 # It works! 

Unnamed: 0,char,movie,id,actor
0,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,Daniel Jackson,Stargate,/m/0k3rhh,James Spader
...,...,...,...,...
496,Morgan Earp,Tombstone,/m/0k776f,Bill Paxton
497,Colorado Ryan,Rio Bravo,/m/0k2kqg,Ricky Nelson
498,Tom Sawyer,The League of Extraordinary Gentlemen,/m/0k5nsh,Shane West
499,William H. 'Billy the Kid' Bonney,Young Guns II,/m/03lrjk0,Emilio Estevez


In [10]:
tvtropes = tvtropes.join(tropes_col2)
tvtropes.drop(["Freebase character/actor map ID"], axis=1, inplace=True)
tvtropes

Unnamed: 0,Type,char,movie,id,actor
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader
...,...,...,...,...,...
496,young_gun,Morgan Earp,Tombstone,/m/0k776f,Bill Paxton
497,young_gun,Colorado Ryan,Rio Bravo,/m/0k2kqg,Ricky Nelson
498,young_gun,Tom Sawyer,The League of Extraordinary Gentlemen,/m/0k5nsh,Shane West
499,young_gun,William H. 'Billy the Kid' Bonney,Young Guns II,/m/03lrjk0,Emilio Estevez


Now that our dataframe contains clean columns, we can use it to advance our data story. Next step is to find the ethnicity of every actor, either through wikipedia or imdb.

On wikipedia, format is the following:
- https://en.wikipedia.org/wiki/{FirstName}_{LastName}
- example: https://en.wikipedia.org/wiki/Ricky_Nelson
  
The birth information is contained in an html table (tr (=table row) th (=table header) td (=table data)), more specifically in a td with "class=infobox-data"

Even better would be to use Wikidata instead of Wikipedia. But the URL isn't as obvious as that of Wikipedia. As an example, for the actor Ricky Nelson, the Wikidata URL is the following: https://www.wikidata.org/wiki/Q303207. 
Thankfully, in the wikipedia article, on the left part of the tools page under the Tools section there's a link to the Wikidata item. In HTML, the link is basically inside a "li" tag with "id=t-wikibase" and "class=mw-list-item" which contains an href (link).
On wikidata itself, the Citizenship info we care about is stored inside an "a" tag with title="Q30" and href="/wiki/Q30". This seems to be the case for all actors.

Unfortunately, on imdb, the URL aren't as clean. So for Ricky Nelson as well, the URL looks like this:
- https://www.imdb.com/name/nm0625699/

Which doesn't give us much to work with. We could set up a script that simply enters the actor name in the imdb search bar and select the top/best matching result in the "People" category.
To make it worse, imdb doesn't have a clearcut view of the actor's ethnicity. It only has a text description; there doesn't seem to be an easy way to do this automatically.

With the prior analysis in mind, I vote to favor Wikipedia to extract ethnicity
Note: We mentioned ethnicity, but technically what we're finding is citizenship. This could lead us to mistakes, how do you think we should adapt?

In [11]:
# If you get errors when running this cell, uncomment these 2 lines
# %pip install pywikibot
# %pip install "wikitextparser>=0.47.5"

import pywikibot
def country_scraper(actor_name: str) -> str:
    """
    Takes in actor name {firstName_lastName} and returns the queried country of origin.
    """
    site = pywikibot.Site("en", "wikipedia")
    page = pywikibot.Page(site, actor_name) 
    item = pywikibot.ItemPage.fromPage(page)
    json = item.toJSON()
    country_id = "Q" + str(json["claims"]["P27"][0]["mainsnak"]["datavalue"]["value"]["numeric-id"])
    site = pywikibot.Site("wikidata", "wikidata")
    repo = site.data_repository()
    item = pywikibot.ItemPage(repo, country_id)

    item_dict = item.get()
    country = item_dict["labels"]["en-gb"]
    return country

In [12]:
# Only set this to true if we want to redo the scraping procedure. It takes about 15minutes. We shouldn't need to do so.
SCRAPE = False

from collections import defaultdict

actors = tvtropes["actor"]
countries = defaultdict(set) # In case actor name appears multiple times
mishandled = 0

if SCRAPE:
    for i, actor in enumerate(actors):
        try:
            country = country_scraper(actor)
            countries[actor].add(country)
            print("Actor", i, ":", actor,"--- Country:", country)
        except:
            print("---------------------Issue handling actor---------------", actor)
            # For actors that were mishandled, make sure that we add them to the list still so that we can still easily augment our dataset with NA values
            countries[actor].add("NA")
            mishandled += 1

    print("In total, we mishandled", mishandled, "actors out of", len(countries), "unique actors")
    

Actor 0 : Robin Williams --- Country: United States of America
---------------------Issue handling actor--------------- Michael McShane
Actor 2 : Ian McKellen --- Country: United Kingdom
Actor 3 : Edmund Gwenn --- Country: United Kingdom
Actor 4 : James Spader --- Country: United States of America
Actor 5 : Harrison Ford --- Country: United States of America
Actor 6 : Harrison Ford --- Country: United States of America
Actor 7 : Harrison Ford --- Country: United States of America
Actor 8 : Rachel Weisz --- Country: United Kingdom
Actor 9 : Shih Kien --- Country: People's Republic of China
Actor 10 : William Zabka --- Country: United States of America
Actor 11 : Zhang Ziyi --- Country: People's Republic of China
---------------------Issue handling actor--------------- Chia Hui Liu
Actor 13 : Carl Weathers --- Country: United States of America
Actor 14 : Mr. T --- Country: United States of America
---------------------Issue handling actor--------------- Lee Byung-Hun
Actor 16 : Clancy Br

ERROR: An error occurred for uri https://www.wikidata.org/w/api.php?ids=Q30&action=wbgetentities&maxlag=5&format=json
ERROR: Traceback (most recent call last):
  File "/Users/tchanee/Library/Python/3.9/lib/python/site-packages/pywikibot/data/api/_requests.py", line 679, in _http_request
    response = http.request(self.site, uri=uri,
  File "/Users/tchanee/Library/Python/3.9/lib/python/site-packages/pywikibot/comms/http.py", line 232, in request
    r = fetch(baseuri, headers=headers, **kwargs)
  File "/Users/tchanee/Library/Python/3.9/lib/python/site-packages/pywikibot/comms/http.py", line 393, in fetch
    callback(response)
  File "/Users/tchanee/Library/Python/3.9/lib/python/site-packages/pywikibot/comms/http.py", line 283, in error_handling_callback
    raise response from None
  File "/Users/tchanee/Library/Python/3.9/lib/python/site-packages/pywikibot/comms/http.py", line 384, in fetch
    response = session.request(method, uri,
  File "/Users/tchanee/Library/Python/3.9/lib/pyth

Actor 243 : John Wayne --- Country: United States of America
Actor 244 : Russell Crowe --- Country: New Zealand
Actor 245 : Russell Crowe --- Country: New Zealand
Actor 246 : Martin Sheen --- Country: United States of America
Actor 247 : Günter Lamprecht --- Country: Germany
Actor 248 : Naseeruddin Shah --- Country: India
Actor 249 : Elias Koteas --- Country: Canada
Actor 250 : Martin Sheen --- Country: United States of America
Actor 251 : Ken Watanabe --- Country: Japan
Actor 252 : Denzel Washington --- Country: United States of America
Actor 253 : Bruce Greenwood --- Country: Canada
Actor 254 : Nicholas Bell --- Country: United Kingdom
---------------------Issue handling actor--------------- John Lynch
Actor 256 : Christopher Eccleston --- Country: United Kingdom
Actor 257 : Heather Langenkamp --- Country: United States of America
Actor 258 : Neve Campbell --- Country: Canada
Actor 259 : Sigourney Weaver --- Country: United States of America
Actor 260 : Linda Hamilton --- Country: Un

In [19]:
if SCRAPE:
    countries2 = defaultdict(list) # can't dump to json with a dic of sets, so converting to lists instead
    for country in countries:
        countries2[country] = list(countries[country])

In [20]:
if SCRAPE:
    import json
    with open("actor_country.json", "w") as write_file:
        json.dump(countries2, write_file, indent=4)

As there aren't too many mishandled actors, we will fill the blanks manually.