# Importing movie data and first glimpse

In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [37]:
# Load the raw files. Column labels are taken from the readme file of the
# data set publisher and are supplied explicitly through `names`.
character_columns = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie release date',
    'Character name',
    'Actor date of birth',
    'Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'Freebase character ID',
    'Freebase actor ID',
]
movie_columns = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie name',
    'Movie release date',
    'Movie box office revenue',
    'Movie runtime',
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)',
]

character_metadata = pd.read_csv('./data/character.metadata.tsv', sep='\t', names=character_columns)
movie_metadata = pd.read_csv('./data/movie.metadata.tsv', sep='\t', names=movie_columns)

# NOTE(review): the column labels chosen for name_cluster and tvtropes are
# unverified and may be wrong - confirm against the publisher's readme.
name_cluster = pd.read_csv('./data/name.clusters.txt', sep='\t', names=['Name', 'Freebase ID'])
plot_summaries = pd.read_csv('./data/plot_summaries.txt', sep='\t', names=['Wikipedia movie ID', 'Summary'])
tvtropes = pd.read_csv('./data/tvtropes.clusters.txt', sep='\t', names=['Type', 'Freebase character/actor map ID'])

# Registry of all frames so that later cells can iterate over every data set.
data_sets = [
    {'name': label, 'data': frame}
    for label, frame in (
        ('character_metadata', character_metadata),
        ('movie_metadata', movie_metadata),
        ('name_cluster', name_cluster),
        ('plot_summaries', plot_summaries),
        ('tvtropes_cluster', tvtropes),
    )
]

In [None]:
# first glimpse of each data set and some basic stats
for label, frame in ((item['name'], item['data']) for item in data_sets):
    print('data set:', label)
    print('shape:', frame.shape)
    print('first five rows:\n', frame.head())
    # include='all' also summarises the non-numeric columns
    print('description:\n', frame.describe(include='all'))
    print('\n\n')

# Pre-processing

In [None]:
# Standardize the release date column so that it only shows the year: many
# entries hold just a year while others hold a complete date, so only the
# first four characters of every entry are kept.
movie_metadata['Movie release date'] = movie_metadata['Movie release date'].str.slice(0, 4)

# Share of missing release dates. isna().mean() is the fraction of NaNs; it is
# multiplied by 100 so that the printed number matches its "percentage" label
# (previously the raw 0-1 fraction was printed under that label).
nan_percentage = movie_metadata['Movie release date'].isna().mean() * 100
print('percentage of NaNs in "Movie release date" column:', nan_percentage)
movie_metadata.head()

# Analysis of "Movie release date"

In [None]:
# basic stats about the movie release year column
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and then thrown away, only the last expression of a cell is displayed.
print('description:\n', movie_metadata['Movie release date'].describe())

# visualise amount of movies per year
# dropna() and astype() each return a new frame, so the year conversion is not
# a column assignment on the result of a filter (which can raise pandas'
# SettingWithCopyWarning) and movie_metadata itself is left unchanged.
movie_metadata_release_date_analysis = (
    movie_metadata
    .dropna(subset=['Movie release date'])  # erase rows with NaNs
    .astype({'Movie release date': int})  # integer years are essential for plotting
)
print('oldest movie(s) from:', movie_metadata_release_date_analysis['Movie release date'].min())
print('newest movie(s) from:', movie_metadata_release_date_analysis['Movie release date'].max())

boxplot_movies_over_time = movie_metadata_release_date_analysis.boxplot(column='Movie release date')
movies_per_year = movie_metadata_release_date_analysis.hist(column='Movie release date')
print('histogram', movies_per_year)
print('boxplot', boxplot_movies_over_time)  # --> reveals faulty values due to the existence of extreme outliers

# Look at TV Tropes dataset

In [17]:
# Summary statistics (count / unique / top / freq) of the tvtropes columns.
tvtropes.describe()

Unnamed: 0,Type,Freebase character/actor map ID
count,501,501
unique,72,447
top,crazy_jealous_guy,"{""char"": ""Captain Jack Sparrow"", ""movie"": ""Pir..."
freq,25,5


In [40]:
# Peek at the first rows of the tvtropes frame.
tvtropes.head()

Unnamed: 0,Type,Freebase character/actor map ID
0,absent_minded_professor,"{""char"": ""Professor Philip Brainard"", ""movie"":..."
1,absent_minded_professor,"{""char"": ""Professor Keenbean"", ""movie"": ""Richi..."
2,absent_minded_professor,"{""char"": ""Dr. Reinhardt Lane"", ""movie"": ""The S..."
3,absent_minded_professor,"{""char"": ""Dr. Harold Medford"", ""movie"": ""Them!..."
4,absent_minded_professor,"{""char"": ""Daniel Jackson"", ""movie"": ""Stargate""..."


In [56]:
# Pull the second tvtropes column out as a Series named "JSON" and copy its
# entries, in order, into a plain Python list.
hmm = tvtropes['Freebase character/actor map ID'].rename("JSON")
l = list(hmm)

def write_to_file(path="data/tropes_col2.json", records=None):
    """Write JSON object strings to a file as one valid JSON array.

    Every element is written on its own line. Elements are separated by
    commas, with no comma after the last one: a trailing comma would make the
    array invalid JSON and unreadable for strict parsers.

    Parameters
    ----------
    path : str, optional
        Target file. Defaults to "data/tropes_col2.json".
    records : iterable, optional
        Elements to write; each one is formatted with "%s". Defaults to the
        module-level list ``l``.

    Notes
    -----
    The function only writes when it is called explicitly. An empty
    ``records`` produces an empty JSON array.
    """
    if records is None:
        records = l  # fall back to the list built at module level
    # explicit UTF-8 so the output does not depend on the platform default
    with open(path, "w", encoding="utf-8") as fp:
        fp.write("[\n")
        fp.write(",\n".join("%s" % elem for elem in records))
        fp.write("\n]")

In [58]:
# Parse every JSON string of the tvtropes column directly in memory; each
# parsed object becomes one row of the new frame. No intermediate
# "tropes_col2.json" file has to exist on disk, so the cell also runs on a
# fresh kernel.
# NOTE: this needs tvtropes to still hold the raw
# 'Freebase character/actor map ID' column, i.e. run it before that column is
# dropped.
tropes_col2 = pd.DataFrame(
    [json.loads(raw) for raw in tvtropes['Freebase character/actor map ID']]
)
tropes_col2

Unnamed: 0,char,movie,id,actor
0,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,Daniel Jackson,Stargate,/m/0k3rhh,James Spader
...,...,...,...,...
496,Morgan Earp,Tombstone,/m/0k776f,Bill Paxton
497,Colorado Ryan,Rio Bravo,/m/0k2kqg,Ricky Nelson
498,Tom Sawyer,The League of Extraordinary Gentlemen,/m/0k5nsh,Shane West
499,William H. 'Billy the Kid' Bonney,Young Guns II,/m/03lrjk0,Emilio Estevez


In [67]:
# Attach the parsed columns (join matches rows on the index), then discard the
# raw column they were parsed from.
tvtropes = tvtropes.join(tropes_col2).drop(columns=["Freebase character/actor map ID"])
tvtropes

Unnamed: 0,Type,char,movie,id,actor
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader
...,...,...,...,...,...
496,young_gun,Morgan Earp,Tombstone,/m/0k776f,Bill Paxton
497,young_gun,Colorado Ryan,Rio Bravo,/m/0k2kqg,Ricky Nelson
498,young_gun,Tom Sawyer,The League of Extraordinary Gentlemen,/m/0k5nsh,Shane West
499,young_gun,William H. 'Billy the Kid' Bonney,Young Guns II,/m/03lrjk0,Emilio Estevez


Now that our dataframe contains clean columns, we can use it to advance our data story. The next step is to find the ethnicity of every actor, either through Wikipedia or IMDb.