In [48]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

# Data loading

## Movie data

In [49]:
# Load films
movies_header = ['wikipedia_id', 'freebase_id', 'name', 'release_date', 'box_office_revenue', 'runtime', 'languages', 'countries', 'genres']
movies = pd.read_csv('data/movie.metadata.tsv', sep='\t', header=None, names=movies_header)
movies.head(5)
print(movies.shape)
#movies['runtime'].hist()


(81741, 9)


Observed issues: 

The date format is not constant, sometimes it's the year only, sometimes it's yyyy-mm-dd

Box office revenue is not always available: NaN + Should take into account inflation

For language, countries and genres there's a code in addition to the information (can we use it??)

## Character data

In [50]:
# Load characters
characters_header = ['wikipedia_movie_id','freebase_movie_id','movie_release_date','character_name','actor_birthdate','actor_gender','actor_height_meters','actor_ethnicity_freebase_id','actor_name','actor_age_at_movie_release','freebase_character_actor_map_id','freebase_character_id','freebase_actor_id']
characters = pd.read_csv('data/character.metadata.tsv', sep='\t', header=None, names=characters_header)
characters.sample(5)

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_release_date,character_name,actor_birthdate,actor_gender,actor_height_meters,actor_ethnicity_freebase_id,actor_name,actor_age_at_movie_release,freebase_character_actor_map_id,freebase_character_id,freebase_actor_id
74088,29831261,/m/08yjm1,1945,,1922-11-21,F,,,María Casares,22.0,/m/02vcg0q,,/m/0ksjnb
107322,23167491,/m/064nsx2,2008,,,,,,Lucia Walters,,/m/0gc35jd,,/m/0gc35jj
437973,3329045,/m/05_lhx,1991-10-16,,,M,,,Daniel Buain,,/m/0bqscx5,,/m/0bqscx3
348943,32191033,/m/0crszls,1985,,1953,M,,/m/0dryh9k,Surendra Pal,32.0,/m/0gc74hz,,/m/04ldqtp
270015,931949,/m/03r58n,1985-06-21,,1973-08-24,M,,,Barret Oliver,11.0,/m/04d5l06,,/m/033ws8


freebase_character_actor_map_id ?
movies are referred to by wikipedia id or freebase id
lots of data missing especially for: ethnicity, charcter name (maybe some characters aren't named in movies) , 
different date formats for release date 

## IMDB data

In [25]:
imdb_top_1000 = pd.read_csv('data/imdb/imdb_top_1000.csv', thousands=',')

In [26]:
imdb_top_1000.head(5)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


## Cleaning & Pre-processing of data

In [51]:
movies.describe()

Unnamed: 0,wikipedia_id,box_office_revenue,runtime
count,81741.0,8401.0,61291.0
mean,17407840.0,47993630.0,111.8192
std,10987910.0,112175300.0,4360.07
min,330.0,10000.0,0.0
25%,7323695.0,2083193.0,81.0
50%,17778990.0,10639690.0,93.0
75%,27155730.0,40716960.0,106.0
max,37501920.0,2782275000.0,1079281.0


In [52]:
characters.describe()

Unnamed: 0,wikipedia_movie_id,actor_height_meters,actor_age_at_movie_release
count,450669.0,154824.0,292556.0
mean,13969750.0,1.788893,37.788523
std,10796620.0,4.37994,20.58787
min,330.0,0.61,-7896.0
25%,3759292.0,1.6764,28.0
50%,11890650.0,1.75,36.0
75%,23665010.0,1.83,47.0
max,37501920.0,510.0,103.0


In [53]:
# Function to remove encoding: 
remove_encoding = lambda x: np.nan if x == '{}' else \
    [w.replace(' Language', '').replace(' language', '') for w in re.findall(r'"(.*?)"', x)[1::2]]

In [54]:
movies['genres'] = movies['genres'].apply(remove_encoding)
movies['languages'] = movies['languages'].apply(remove_encoding)
movies['countries'] = movies['countries'].apply(remove_encoding)
movies.head()

Unnamed: 0,wikipedia_id,freebase_id,name,release_date,box_office_revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,[Norwegian],[Norway],"[Crime Fiction, Drama]"
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,[English],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,[German],[Germany],[Drama]


In [55]:
movies['countries'].apply(np.shape).min()

()

`<<<<<<< HEAD`

## Clean release dates

`=======`

### IMDB

In [27]:
imdb_top_1000['Runtime'].str.contains(r'\d+ min').value_counts()

True    1000
Name: Runtime, dtype: int64

In [28]:
imdb_top_1000['Runtime'] = imdb_top_1000['Runtime'].str.extract(r'(\d+) min', expand=False).astype(int)

In [31]:
imdb_top_1000.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [32]:
imdb_top_1000.describe()

Unnamed: 0,Runtime,IMDB_Rating,Meta_score,No_of_Votes,Gross
count,1000.0,1000.0,843.0,1000.0,831.0
mean,122.891,7.9493,77.97153,273692.9,68034750.0
std,28.093671,0.275491,12.376099,327372.7,109750000.0
min,45.0,7.6,28.0,25088.0,1305.0
25%,103.0,7.7,70.0,55526.25,3253559.0
50%,119.0,7.9,79.0,138548.5,23530890.0
75%,137.0,8.1,87.0,374161.2,80750890.0
max,321.0,9.3,100.0,2343110.0,936662200.0


In [33]:
imdb_top_1000.dtypes

Poster_Link       object
Series_Title      object
Released_Year     object
Certificate       object
Runtime            int64
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross            float64
dtype: object

## Exploratory Data Analysis

`>>>>>>> dc00059a036a785f065cb51d9e0c8ab2e533cb54`

In [56]:
# Replace string 'release_date' column with 3 float columns ['release_year','release_month','release_day']

movies['release_date'] = movies['release_date'].astype('str') # Convert to string all dates
release_date = pd.DataFrame(movies['release_date'].str.split('-', expand=True).values, columns=['release_year','release_month','release_day']) # Split between days, months and year
movies[['release_year','release_month','release_day']] = release_date
movies = movies.drop('release_date',axis=1)


In [57]:
movies.head()

Unnamed: 0,wikipedia_id,freebase_id,name,box_office_revenue,runtime,languages,countries,genres,release_year,release_month,release_day
0,975900,/m/03vyhn,Ghosts of Mars,14010832.0,98.0,[English],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",2001,8.0,24.0
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,,95.0,[English],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",2000,2.0,16.0
2,28463795,/m/0crgdbh,Brun bitter,,83.0,[Norwegian],[Norway],"[Crime Fiction, Drama]",1988,,
3,9363483,/m/0285_cd,White Of The Eye,,110.0,[English],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",1987,,
4,261236,/m/01mrr1,A Woman in Flames,,106.0,[German],[Germany],[Drama],1983,,


# Exploratory Data Analysis

In [None]:
movies['genres'].hist()