# Data exploration
This notebook contains some tests about data import and some data viz tools

In [2]:
DATA_FOLDER = './data/'
MOVIES_FOLDER = DATA_FOLDER + 'movies_summaries/'
PLOT_SUMMARY_FOLDER = DATA_FOLDER + 'corenlp_plot_summaries'

REPORT_FOLDER = './gen/reports/'
ETHNICITY_FILE = './gen/ethnicities.tsv'

CHARACTERS_FILE = MOVIES_FOLDER + 'character.metadata.tsv'
MOVIES_FILE = MOVIES_FOLDER + 'movie.metadata.tsv'
PLOT_SUMMARIES_FILE = MOVIES_FOLDER + 'plot_summaries.txt'
TROPES_FILE = MOVIES_FOLDER + 'tvtropes.clusters.txt'

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import re

from pandas_profiling import ProfileReport
%matplotlib inline

# Movies

In [4]:
movies_columns = ['wiki_movie_id', 'freebase_movie_id', 'name', 'release_date', 'box_office_revenue', 'runtime', 'languages', 'countries', 'genres']
movies = pd.read_csv(MOVIES_FILE, sep='\t', names=movies_columns)

print(movies.shape)
movies.head()

(81741, 9)


Unnamed: 0,wiki_movie_id,freebase_movie_id,name,release_date,box_office_revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [5]:
def replace_unknown(df, label):
    df[label] = df[label].replace("{}", "{\"\": \"Unknown\"}")

replace_unknown(movies, 'countries')
#replace_unknown(movies, 'languages')
#replace_unknown(movies, 'genres')

movies.loc[200]

wiki_movie_id                            30693170
freebase_movie_id                      /m/0g9z2v0
name                                       Harumi
release_date                                  NaN
box_office_revenue                            NaN
runtime                                       NaN
languages                                      {}
countries                         {"": "Unknown"}
genres                {"/m/0jtdp": "Documentary"}
Name: 200, dtype: object

In [6]:
def cleanup_countries(str):
    split_elements = list(map(lambda x: x.split(':'), str.split(',')))
    cleaned_elements = list(map(lambda x: re.sub('["}]', '', x[1][1:]), split_elements))
    return cleaned_elements


movies["countries"] = movies["countries"].apply(cleanup_countries)

In [7]:
target_countries = ['United States of America', 'France']
def filter_with_countries(df, target_countries, mode):
    if mode == 'all':
        return df.where(df["countries"].apply(lambda x: all(country in x for country in target_countries))).dropna()
    elif mode == 'any':
        return df.where(df["countries"].apply(lambda x: any(country in x for country in target_countries))).dropna()
    elif mode == 'only':
        return df.where(df["countries"].apply(lambda x: set(x) == set(target_countries))).dropna()
    else:
        raise ValueError('mode must be one of [all, any, only]')
    

filter_with_countries(movies, target_countries, mode='only')

Unnamed: 0,wiki_movie_id,freebase_movie_id,name,release_date,box_office_revenue,runtime,languages,countries,genres
2001,25871790.0,/m/09v42sf,Kaboom,2010-05-15,539957.0,83.0,"{""/m/02h40lc"": ""English Language""}","[United States of America, France]","{""/m/01jfsb"": ""Thriller"", ""/m/0hn10"": ""LGBT"", ..."
2454,10915095.0,/m/02qtx93,My Father the Hero,1994,25479558.0,87.0,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","[France, United States of America]","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hqxf"": ""F..."
2666,28763446.0,/m/0ddg_bs,Colombiana,2011-07-27,60965854.0,107.0,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...","[France, United States of America]","{""/m/03btsm8"": ""Action/Adventure"", ""/m/01jfsb""..."
3049,22751.0,/m/05ppy,Original Sin,2001-07-11,36402320.0,118.0,"{""/m/03_9r"": ""Japanese Language"", ""/m/06nm1"": ...","[United States of America, France]","{""/m/01jfsb"": ""Thriller"", ""/m/02n4kr"": ""Myster..."
3828,5800622.0,/m/0f5hd8,The Bear,1988-10-19,31753898.0,96.0,"{""/m/02h40lc"": ""English Language""}","[United States of America, France]","{""/m/0hqxf"": ""Family Film"", ""/m/0bj8m2"": ""Chil..."
...,...,...,...,...,...,...,...,...,...
79420,213188.0,/m/01f73r,DuckTales the Movie: Treasure of the Lost Lamp,1990-08-03,18115724.0,74.0,"{""/m/02h40lc"": ""English Language""}","[United States of America, France]","{""/m/0bj8m2"": ""Children's"", ""/m/01hmnh"": ""Fant..."
81186,28084941.0,/m/0cmdmfw,Rubber,2010-05-15,100370.0,82.0,"{""/m/064_8sq"": ""French Language"", ""/m/02h40lc""...","[France, United States of America]","{""/m/06n90"": ""Science Fiction"", ""/m/03npn"": ""H..."
81285,972958.0,/m/03vnxd,The Transporter,2002-10-02,43928932.0,94.0,"{""/m/064_8sq"": ""French Language"", ""/m/0653m"": ...","[France, United States of America]","{""/m/01jfsb"": ""Thriller"", ""/m/0cq22f9"": ""Actio..."
81659,25920477.0,/m/0b6lqyd,Source Code,2011-03-11,147332697.0,93.0,"{""/m/02h40lc"": ""English Language""}","[France, United States of America]","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."


# Characters

In [8]:
character_columns = ['wiki_movie_id', 'freebase_movie_id', 'm_release_date', 'name', 'a_dob', 'a_gender', 'a_height', 'a_ethnicity_freebase_id', 'a_name', 'a_age_at_release', 'freebase_char/a_map', 'freebase_char_id', 'freebase_a_id']
characters = pd.read_csv(CHARACTERS_FILE, sep='\t', names=character_columns, index_col=False)

characters['m_release_date'] = pd.to_datetime(characters['m_release_date'], format='%Y-%m-%d', errors='coerce')

print(characters.shape)
characters

(450669, 13)


Unnamed: 0,wiki_movie_id,freebase_movie_id,m_release_date,name,a_dob,a_gender,a_height,a_ethnicity_freebase_id,a_name,a_age_at_release,freebase_char/a_map,freebase_char_id,freebase_a_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.780,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450664,913762,/m/03pcrp,1992-05-21,Elensh,1970-05,F,,,Dorothy Elias-Fahn,,/m/0kr406c,/m/0kr406h,/m/0b_vcv
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0kr4090,/m/0bx7_j
450666,28308153,/m/0cp05t9,1957-01-01,,1941-11-18,M,1.730,/m/02w7gg,David Hemmings,15.0,/m/0g8ngmc,,/m/022g44
450667,28308153,/m/0cp05t9,1957-01-01,,,,,,Roberta Paterson,,/m/0g8ngmj,,/m/0g8ngmm


In [9]:
characters = characters.join(movies[["freebase_movie_id", "countries"]].set_index("freebase_movie_id"), on="freebase_movie_id")
characters

Unnamed: 0,wiki_movie_id,freebase_movie_id,m_release_date,name,a_dob,a_gender,a_height,a_ethnicity_freebase_id,a_name,a_age_at_release,freebase_char/a_map,freebase_char_id,freebase_a_id,countries
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,[United States of America]
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.780,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,[United States of America]
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,[United States of America]
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,[United States of America]
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,[United States of America]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450664,913762,/m/03pcrp,1992-05-21,Elensh,1970-05,F,,,Dorothy Elias-Fahn,,/m/0kr406c,/m/0kr406h,/m/0b_vcv,[Japan]
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0kr4090,/m/0bx7_j,[Japan]
450666,28308153,/m/0cp05t9,1957-01-01,,1941-11-18,M,1.730,/m/02w7gg,David Hemmings,15.0,/m/0g8ngmc,,/m/022g44,[United Kingdom]
450667,28308153,/m/0cp05t9,1957-01-01,,,,,,Roberta Paterson,,/m/0g8ngmj,,/m/0g8ngmm,[United Kingdom]


In [15]:
target_countries = ['Serbia']
filter_with_countries(characters, target_countries, mode='only')

Unnamed: 0,wiki_movie_id,freebase_movie_id,m_release_date,name,a_dob,a_gender,a_height,a_ethnicity_freebase_id,a_name,a_age_at_release,freebase_char/a_map,freebase_char_id,freebase_a_id,countries
60332,30320975.0,/m/0g5qxtx,2010-10-06,Mina,1981-11-17,F,1.78,/m/04kbvpz,Bojana Novakovic,28.0,/m/0gw3nfy,/m/0h37y2c,/m/02v_6qy,[Serbia]
124895,14882264.0,/m/03h0b44,2008-01-30,Arsa 'Kralj carlstona',1978-12-01,M,1.93,/m/02ctzb,Stefan Kapicic,29.0,/m/04tnnf1,/m/0hngk1g,/m/04tnnf4,[Serbia]
435377,34470730.0,/m/0h34t39,2012-03-13,Sef tuzilastva,1946-07-27,M,1.91,/m/09vmyh,Rade Serbedzija,65.0,/m/0h34t2_,/m/0h34t32,/m/034jgf,[Serbia]


In [16]:
target_countries = ['Serbia']
filter_with_countries(characters, target_countries, mode='any')

Unnamed: 0,wiki_movie_id,freebase_movie_id,m_release_date,name,a_dob,a_gender,a_height,a_ethnicity_freebase_id,a_name,a_age_at_release,freebase_char/a_map,freebase_char_id,freebase_a_id,countries
60332,30320975.0,/m/0g5qxtx,2010-10-06,Mina,1981-11-17,F,1.78,/m/04kbvpz,Bojana Novakovic,28.0,/m/0gw3nfy,/m/0h37y2c,/m/02v_6qy,[Serbia]
63757,13598172.0,/m/03cbh45,1982-01-01,Doktor Grujic,1946-07-27,M,1.91,/m/09vmyh,Rade Serbedzija,35.0,/m/040l3fw,/m/0h34zkm,/m/034jgf,"[Serbia, Yugoslavia]"
124895,14882264.0,/m/03h0b44,2008-01-30,Arsa 'Kralj carlstona',1978-12-01,M,1.93,/m/02ctzb,Stefan Kapicic,29.0,/m/04tnnf1,/m/0hngk1g,/m/04tnnf4,[Serbia]
298431,16133349.0,/m/03wb067,2006-01-01,Brewster,1943-03-31,M,1.83,/m/01qhm_,Christopher Walken,62.0,/m/04dfw24,/m/0gy8p22,/m/016fjj,"[Italy, Serbia, United Kingdom]"
298432,16133349.0,/m/03wb067,2006-01-01,Tommaso Moreno,1979-12-29,M,1.78,/m/09k5jvk,Diego Luna,26.0,/m/04dfw29,/m/0h34tg5,/m/037kqv,"[Italy, Serbia, United Kingdom]"
298433,16133349.0,/m/03wb067,2006-01-01,Lea Padovani,1976-01-02,F,1.67,/m/03ttfc,Paz Vega,29.0,/m/04dfw2g,/m/05kc0k7,/m/04mdbq,"[Italy, Serbia, United Kingdom]"
309854,28135570.0,/m/0cmcv5z,2010-07-23,Lucija,1955-09-07,F,1.676,/m/041rx,Mira Furlan,54.0,/m/0cmxv87,/m/0j4_lhq,/m/0prs1,"[Belgium, Bosnia and Herzegovina, Slovenia, Fr..."
435377,34470730.0,/m/0h34t39,2012-03-13,Sef tuzilastva,1946-07-27,M,1.91,/m/09vmyh,Rade Serbedzija,65.0,/m/0h34t2_,/m/0h34t32,/m/034jgf,[Serbia]
443170,27128043.0,/m/0bwhwdx,2006-09-28,Marina,1981-11-17,F,1.78,/m/04kbvpz,Bojana Novakovic,24.0,/m/0g9hd4k,/m/0h37y30,/m/02v_6qy,"[Switzerland, Spain, Serbia, Monaco]"
