In [1]:
from __future__ import division, print_function, unicode_literals

import numpy as np
import os

# Seed NumPy's global random generator so repeated runs give the same numbers.
np.random.seed(42)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and for x/y tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ./images/proyecto.

    Parameters
    ----------
    fig_id : str
        File name without extension; the file is written as
        ``<fig_id>.<fig_extension>``.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    # Build the directory from components: the old literal ".\images\proyecto"
    # relied on invalid escape sequences and only resolved on Windows.
    images_dir = os.path.join(".", "images", "proyecto")
    path = os.path.join(images_dir, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    # Create the directory the file is actually written to. The previous code
    # used IMAGES_PATH, which is not defined in the visible notebook.
    os.makedirs(images_dir, exist_ok=True)
    plt.savefig(path, format=fig_extension, dpi=resolution)

import warnings
# Ignore any warning whose message starts with "internal gelsd".
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
import pandas as pd

def load_data(path, file):
    """Read the CSV named `file` located in directory `path` into a DataFrame."""
    return pd.read_csv(os.path.join(path, file))

In [3]:
# Build the data directory portably: the old literal ".\data" contained the
# invalid escape "\d" and only resolved to ./data on Windows.
train_raw = load_data(os.path.join(".", "data"), "train.csv")

In [4]:
# First five rows of the training data.
train_raw.head(n=5)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


In [5]:
# Build the data directory portably: the old literal ".\data" contained the
# invalid escape "\d" and only resolved to ./data on Windows.
test_raw = load_data(os.path.join(".", "data"), "test.csv")

In [6]:
# First five rows of the test data.
test_raw.head(n=5)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew
0,3001,"[{'id': 34055, 'name': 'Pokémon Collection', '...",0,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...",http://www.pokemon.com/us/movies/movie-pokemon...,tt1226251,ja,ディアルガVSパルキアVSダークライ,Ash and friends (this time accompanied by newc...,3.851534,...,"[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",7/14/07,90.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Somewhere Between Time & Space... A Legend Is ...,Pokémon: The Rise of Darkrai,"[{'id': 11451, 'name': 'pok√©mon'}, {'id': 115...","[{'cast_id': 3, 'character': 'Tonio', 'credit_...","[{'credit_id': '52fe44e7c3a368484e03d683', 'de..."
1,3002,,88000,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",,tt0051380,en,Attack of the 50 Foot Woman,When an abused wife grows to giant size becaus...,3.559789,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5/19/58,65.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A titanic beauty spreads a macabre wave of hor...,Attack of the 50 Foot Woman,"[{'id': 9748, 'name': 'revenge'}, {'id': 9951,...","[{'cast_id': 2, 'character': 'Nancy Fowler Arc...","[{'credit_id': '55807805c3a3685b1300060b', 'de..."
2,3003,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,tt0118556,en,Addicted to Love,Good-natured astronomer Sam is devastated when...,8.085194,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5/23/97,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A Comedy About Lost Loves And Last Laughs,Addicted to Love,"[{'id': 931, 'name': 'jealousy'}, {'id': 9673,...","[{'cast_id': 11, 'character': 'Maggie', 'credi...","[{'credit_id': '52fe4330c3a36847f8041367', 'de..."
3,3004,,6800000,"[{'id': 18, 'name': 'Drama'}, {'id': 10752, 'n...",http://www.sonyclassics.com/incendies/,tt1255953,fr,Incendies,A mother's last wishes send twins Jeanne and S...,8.596012,...,"[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",9/4/10,130.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,The search began at the opening of their mothe...,Incendies,"[{'id': 378, 'name': 'prison'}, {'id': 539, 'n...","[{'cast_id': 6, 'character': 'Nawal', 'credit_...","[{'credit_id': '56478092c3a36826140043af', 'de..."
4,3005,,2000000,"[{'id': 36, 'name': 'History'}, {'id': 99, 'na...",,tt0418753,en,Inside Deep Throat,"In 1972, a seemingly typical shoestring budget...",3.21768,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2/11/05,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It was filmed in 6 days for 25 thousand dollar...,Inside Deep Throat,"[{'id': 279, 'name': 'usa'}, {'id': 1228, 'nam...","[{'cast_id': 1, 'character': 'Narrator (voice)...","[{'credit_id': '52fe44ce9251416c75041967', 'de..."


In [7]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (train_raw, test_raw))

((3000, 23), (4398, 22))

In [8]:
# Print the first five raw `belongs_to_collection` cells in full.
for row_no, raw_value in enumerate(train_raw['belongs_to_collection'].iloc[:5]):
    print(row_no, raw_value)

0 [{'id': 313576, 'name': 'Hot Tub Time Machine Collection', 'poster_path': '/iEhb00TGPucF0b4joM1ieyY026U.jpg', 'backdrop_path': '/noeTVcgpBiD48fDjFVic1Vz7ope.jpg'}]
1 [{'id': 107674, 'name': 'The Princess Diaries Collection', 'poster_path': '/wt5AMbxPTS4Kfjx7Fgm149qPfZl.jpg', 'backdrop_path': '/zSEtYD77pKRJlUPx34BJgUG9v1c.jpg'}]
2 nan
3 nan
4 nan


In [9]:
# 0 where the cell is NaN (a float), 1 where collection data is present.
(
    train_raw['belongs_to_collection']
    .apply(lambda entry: 0 if isinstance(entry, float) else 1)
    .value_counts()
)

0    2396
1     604
Name: belongs_to_collection, dtype: int64

In [10]:
import ast

In [11]:
# Derive the same two columns on both frames from `belongs_to_collection`,
# a stringified list of dicts that is NaN (a float) when absent:
#   collection_name - 'name' of the first entry, or 0 when absent
#   has_collection  - number of entries in the list, or 0 when absent
for frame in (train_raw, test_raw):
    frame['collection_name'] = frame['belongs_to_collection'].apply(
        lambda raw: 0 if isinstance(raw, float) else ast.literal_eval(raw)[0]['name']
    )
    frame['has_collection'] = frame['belongs_to_collection'].apply(
        lambda raw: 0 if isinstance(raw, float) else len(ast.literal_eval(raw))
    )

# The raw column is no longer needed once the features exist.
train_raw = train_raw.drop(columns=['belongs_to_collection'])
test_raw = test_raw.drop(columns=['belongs_to_collection'])

In [12]:
# First five rows after adding the collection features.
train_raw.head(n=5)

Unnamed: 0,id,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,...,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue,collection_name,has_collection
0,1,14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,Hot Tub Time Machine Collection,1
1,2,40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435,The Princess Diaries Collection,1
2,3,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000,0,0
3,4,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,0,0
4,5,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,...,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,0,0


In [13]:
# Print the first five raw `genres` cells in full.
for row_no, raw_value in enumerate(train_raw['genres'].iloc[:5]):
    print(row_no, raw_value)

0 [{'id': 35, 'name': 'Comedy'}]
1 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 10749, 'name': 'Romance'}]
2 [{'id': 18, 'name': 'Drama'}]
3 [{'id': 53, 'name': 'Thriller'}, {'id': 18, 'name': 'Drama'}]
4 [{'id': 28, 'name': 'Action'}, {'id': 53, 'name': 'Thriller'}]


In [14]:
# Parse every raw `genres` cell once per frame. A cell is a stringified list
# of dicts with a 'name' key, or NaN (a float), which becomes an empty list.
genres_train, genres_test = (
    frame['genres'].apply(
        lambda raw: [] if isinstance(raw, float) else [item['name'] for item in ast.literal_eval(raw)]
    )
    for frame in (train_raw, test_raw)
)

# Same values as before under the original names, in case later cells read them.
list_of_genres = list(genres_train.values)
list_of_genres_t = list(genres_test.values)

# Genres ordered by frequency, learned from the training set only.
top_genres = pd.DataFrame([i for j in list_of_genres for i in j])[0].value_counts().index.tolist()

for frame, names_per_film in ((train_raw, genres_train), (test_raw, genres_test)):
    # Number of genres per film.
    frame['num_genres'] = names_per_film.apply(len)
    # All genres of a film as one sorted, space-separated string.
    frame['all_genres'] = names_per_film.apply(lambda names: ' '.join(sorted(names)))
    # Both frames get the SAME 15 indicator columns. The test loop previously
    # iterated over every genre, giving test columns that train did not have.
    # Membership is checked against the parsed list, not by substring search.
    for g in top_genres[:15]:
        frame['genre_' + g] = names_per_film.apply(lambda names: 1 if g in names else 0)

train_raw = train_raw.drop(['genres'], axis=1)
test_raw = test_raw.drop(['genres'], axis=1)


In [15]:
# Print the first five raw `production_companies` cells in full.
for row_no, raw_value in enumerate(train_raw['production_companies'].iloc[:5]):
    print(row_no, raw_value)

0 [{'name': 'Paramount Pictures', 'id': 4}, {'name': 'United Artists', 'id': 60}, {'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8411}]
1 [{'name': 'Walt Disney Pictures', 'id': 2}]
2 [{'name': 'Bold Films', 'id': 2266}, {'name': 'Blumhouse Productions', 'id': 3172}, {'name': 'Right of Way Films', 'id': 32157}]
3 nan
4 nan


In [16]:
print('Number of production companies in films')
# NaN cells (floats) count as zero companies; other cells are parsed and measured.
(
    train_raw['production_companies']
    .apply(lambda raw: 0 if isinstance(raw, float) else len(ast.literal_eval(raw)))
    .value_counts()
)

Number of production companies in films


1     775
2     734
3     582
4     312
5     166
0     156
6     118
7      62
8      42
9      29
11      7
10      7
12      3
16      2
15      2
14      1
13      1
17      1
Name: production_companies, dtype: int64

In [17]:
# Films credited to more than 11 production companies.
train_raw.loc[
    train_raw['production_companies']
    .apply(lambda raw: 0 if isinstance(raw, float) else len(ast.literal_eval(raw)))
    .gt(11)
]

Unnamed: 0,id,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,...,genre_Crime,genre_Adventure,genre_Horror,genre_Science Fiction,genre_Family,genre_Fantasy,genre_Mystery,genre_Animation,genre_History,genre_Music
31,32,0,http://www.cache-derfilm.at,tt0387898,fr,Caché,A married couple is terrorized by a series of ...,5.69586,/i1Zl8S4DgM3IDLW5dhZzBnIdCOe.jpg,"[{'name': 'Les Films du Losange', 'id': 223}, ...",...,0,0,0,0,0,0,1,0,0,0
116,117,0,,tt2113822,zh,一九四二,"In 1942, Henan Province was devastated by the ...",1.678013,/xxz2gi8vijqqJySGO3kQy2i8mv.jpg,"[{'name': 'Emperor Motion Pictures', 'id': 272...",...,0,0,0,0,0,0,0,0,0,0
363,364,15400000,,tt2053425,fr,De rouille et d'os,"Put in charge of his young son, Ali leaves Bel...",8.400049,/cHCwW8xPl8yPKQwpNzKVinwvirT.jpg,"[{'name': 'France 2 Cinéma', 'id': 83}, {'name...",...,0,0,0,0,0,0,0,0,0,0
392,393,0,,tt5072406,fr,Moka,Diane Kramer is led by one obsession: to find ...,2.404466,/5VKVaTJJsyDeOzY6fLcyTo1RA9g.jpg,"[{'name': 'Canal+', 'id': 5358}, {'name': 'Cin...",...,0,0,0,0,0,0,0,0,0,0
449,450,80000000,http://asoundofthunder.warnerbros.com/,tt0318081,en,A Sound of Thunder,When a hunter sent back to the prehistoric era...,4.980191,/gsqOX1ReJ5lcmTuDdkhOXLug8Ug.jpg,"[{'name': 'Epsilon Motion Pictures', 'id': 117...",...,0,1,0,1,0,0,0,0,0,0
554,555,14500000,,tt0293416,ja,Metropolis,Duke Red has overseen the construction of a ma...,9.298092,/1EK7mGCpRKYmSg25FaojvnS9opm.jpg,"[{'name': 'Bandai Visual Company', 'id': 528},...",...,0,0,0,1,0,0,0,1,0,0
1079,1080,7000000,,tt2737050,fr,"Deux jours, une nuit",Sandra is a young woman who has only one weeke...,6.868022,/1mYAejpMskvskGr0J0SaBvdjmrH.jpg,"[{'name': 'BIM Distribuzione', 'id': 225}, {'n...",...,0,0,0,0,0,0,0,0,0,0
2170,2171,4000000,http://tickets.picturehouseentertainment.co.uk...,tt3464902,en,The Lobster,"In a dystopian near future, single people, acc...",11.223033,/yR60EqMGS9hHq9I5Pkq2hG984TP.jpg,"[{'name': 'Haut et Court', 'id': 726}, {'name'...",...,0,0,0,1,0,0,0,0,0,0
2395,2396,0,,tt4082068,fr,Dheepan,Dheepan is a Sri Lankan Tamil warrior who flee...,5.752416,/lgONuekbRlM0eMvMHJBEDL6MsBP.jpg,"[{'name': 'France 2 Cinéma', 'id': 83}, {'name...",...,1,0,0,0,0,0,0,0,0,0
2517,2518,97250400,http://www.asterixauxjeuxolympiques.com/index.php,tt0463872,fr,Astérix aux Jeux Olympiques,Ast√©rix and Ob√©lix have to win the Olympic G...,9.671944,/tKL0RJOeuccc1rrpcDKg8qhedIz.jpg,"[{'name': 'Constantin Film', 'id': 47}, {'name...",...,0,1,0,0,1,1,0,0,0,0


In [18]:
# First five raw `production_companies` cells.
train_raw['production_companies'].head(n=5)

0    [{'name': 'Paramount Pictures', 'id': 4}, {'na...
1          [{'name': 'Walt Disney Pictures', 'id': 2}]
2    [{'name': 'Bold Films', 'id': 2266}, {'name': ...
3                                                  NaN
4                                                  NaN
Name: production_companies, dtype: object

In [19]:
# Parse every raw `production_companies` cell once per frame. A cell is a
# stringified list of dicts with a 'name' key, or NaN (a float) -> empty list.
companies_train, companies_test = (
    frame['production_companies'].apply(
        lambda raw: [] if isinstance(raw, float) else [item['name'] for item in ast.literal_eval(raw)]
    )
    for frame in (train_raw, test_raw)
)

# Companies ordered by frequency, learned from the training set only.
list_of_companies = list(companies_train.values)
top_companies = pd.DataFrame([i for j in list_of_companies for i in j])[0].value_counts().index.tolist()

for frame, names_per_film in ((train_raw, companies_train), (test_raw, companies_test)):
    # Number of production companies per film.
    frame['num_companies'] = names_per_film.apply(len)
    # One 0/1 indicator per top-30 company. Membership is tested against the
    # parsed list of names; the previous substring test on a space-joined
    # string also fired when one company's name was contained in another's.
    for g in top_companies[:30]:
        frame['production_company_' + g] = names_per_film.apply(lambda names: 1 if g in names else 0)

# The temporary joined-string column is no longer created, so only the raw
# column has to be dropped; the resulting frames have the same columns as before.
train_raw = train_raw.drop(['production_companies'], axis=1)
test_raw = test_raw.drop(['production_companies'], axis=1)

In [20]:
# Print the first five raw `production_countries` cells in full.
for row_no, raw_value in enumerate(train_raw['production_countries'].iloc[:5]):
    print(row_no, raw_value)

0 [{'iso_3166_1': 'US', 'name': 'United States of America'}]
1 [{'iso_3166_1': 'US', 'name': 'United States of America'}]
2 [{'iso_3166_1': 'US', 'name': 'United States of America'}]
3 [{'iso_3166_1': 'IN', 'name': 'India'}]
4 [{'iso_3166_1': 'KR', 'name': 'South Korea'}]


In [21]:
print('Number of production countries in films')
# NaN cells (floats) count as zero countries; other cells are parsed and measured.
(
    train_raw['production_countries']
    .apply(lambda raw: 0 if isinstance(raw, float) else len(ast.literal_eval(raw)))
    .value_counts()
)

Number of production countries in films


1    2222
2     525
3     116
4      57
0      55
5      21
6       3
8       1
Name: production_countries, dtype: int64

In [22]:
# Parse every raw `production_countries` cell once per frame. A cell is a
# stringified list of dicts with a 'name' key, or NaN (a float) -> empty list.
countries_train, countries_test = (
    frame['production_countries'].apply(
        lambda raw: [] if isinstance(raw, float) else [item['name'] for item in ast.literal_eval(raw)]
    )
    for frame in (train_raw, test_raw)
)

# Countries ordered by frequency, learned from the training set only.
list_of_countries = list(countries_train.values)
top_countries = pd.DataFrame([i for j in list_of_countries for i in j])[0].value_counts().index.tolist()

for frame, names_per_film in ((train_raw, countries_train), (test_raw, countries_test)):
    # Number of production countries per film.
    frame['num_countries'] = names_per_film.apply(len)
    # One 0/1 indicator per top-25 country, tested against the parsed list of
    # names instead of by substring search in a space-joined string.
    # The 'production_countrie_' prefix is kept verbatim so existing column
    # names do not change.
    for g in top_countries[:25]:
        frame['production_countrie_' + g] = names_per_film.apply(lambda names: 1 if g in names else 0)

# The temporary joined-string column is no longer created, so only the raw
# column has to be dropped; the resulting frames have the same columns as before.
train_raw = train_raw.drop(['production_countries'], axis=1)
test_raw = test_raw.drop(['production_countries'], axis=1)



In [23]:
# Print the first five raw `spoken_languages` cells in full.
for row_no, raw_value in enumerate(train_raw['spoken_languages'].iloc[:5]):
    print(row_no, raw_value)

0 [{'iso_639_1': 'en', 'name': 'English'}]
1 [{'iso_639_1': 'en', 'name': 'English'}]
2 [{'iso_639_1': 'en', 'name': 'English'}]
3 [{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'hi', 'name': 'हिन्दी'}]
4 [{'iso_639_1': 'ko', 'name': '한국어/조선말'}]


In [24]:
print('Number of sponken languages in films')
# NaN cells (floats) count as zero languages; other cells are parsed and measured.
(
    train_raw['spoken_languages']
    .apply(lambda raw: 0 if isinstance(raw, float) else len(ast.literal_eval(raw)))
    .value_counts()
)

Number of sponken languages in films


1    2105
2     549
3     216
4      72
5      23
0      20
7       6
6       6
8       2
9       1
Name: spoken_languages, dtype: int64

In [25]:
# Parse every raw `spoken_languages` cell once per frame. A cell is a
# stringified list of dicts with a 'name' key, or NaN (a float) -> empty list.
languages_train, languages_test = (
    frame['spoken_languages'].apply(
        lambda raw: [] if isinstance(raw, float) else [item['name'] for item in ast.literal_eval(raw)]
    )
    for frame in (train_raw, test_raw)
)

# Languages ordered by frequency, learned from the training set only.
list_of_languages = list(languages_train.values)
top_languages = pd.DataFrame([i for j in list_of_languages for i in j])[0].value_counts().index.tolist()

for frame, names_per_film in ((train_raw, languages_train), (test_raw, languages_test)):
    # Number of spoken languages per film.
    frame['num_languages'] = names_per_film.apply(len)
    # One 0/1 indicator per top-15 language, tested against the parsed list of
    # names. The previous substring test on a space-joined string matched
    # every row for an empty language name and any name contained in another.
    for g in top_languages[:15]:
        frame['language_' + g] = names_per_film.apply(lambda names: 1 if g in names else 0)

# The temporary joined-string column is no longer created, so only the raw
# column has to be dropped; the resulting frames have the same columns as before.
train_raw = train_raw.drop(['spoken_languages'], axis=1)
test_raw = test_raw.drop(['spoken_languages'], axis=1)

In [26]:
# Print the first five raw `Keywords` cells in full.
for row_no, raw_value in enumerate(train_raw['Keywords'].iloc[:5]):
    print(row_no, raw_value)

0 [{'id': 4379, 'name': 'time travel'}, {'id': 9663, 'name': 'sequel'}, {'id': 11830, 'name': 'hot tub'}, {'id': 179431, 'name': 'duringcreditsstinger'}]
1 [{'id': 2505, 'name': 'coronation'}, {'id': 4263, 'name': 'duty'}, {'id': 6038, 'name': 'marriage'}, {'id': 13072, 'name': 'falling in love'}]
2 [{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'name': 'obsession'}, {'id': 1640, 'name': 'conservatory'}, {'id': 2176, 'name': 'music teacher'}, {'id': 14512, 'name': 'new york city'}, {'id': 14819, 'name': 'violence'}, {'id': 33896, 'name': 'montage'}, {'id': 156823, 'name': 'drummer'}, {'id': 170418, 'name': 'public humiliation'}, {'id': 176095, 'name': 'jazz band'}, {'id': 206298, 'name': 'young adult'}, {'id': 207739, 'name': 'music school'}]
3 [{'id': 10092, 'name': 'mystery'}, {'id': 10540, 'name': 'bollywood'}, {'id': 11734, 'name': 'police corruption'}, {'id': 14536, 'name': 'crime'}, {'id': 14636, 'name': 'india'}, {'id': 208364, 'name': 'missing husband'}, {'id': 220935, 'name': 'ne

In [27]:
print('Number of keywords in films')
# NaN cells (floats) count as zero keywords; other cells are parsed and measured.
(
    train_raw['Keywords']
    .apply(lambda raw: 0 if isinstance(raw, float) else len(ast.literal_eval(raw)))
    .value_counts()
)

Number of keywords in films


5      293
0      276
4      248
3      228
6      227
2      207
7      192
1      187
8      161
9      134
11     132
10     125
12     100
13      85
14      59
15      59
16      58
17      45
18      29
20      24
21      23
19      22
22      17
23      15
24      12
27      12
25       6
26       5
32       2
28       2
30       2
31       2
29       2
37       2
43       1
40       1
38       1
33       1
97       1
39       1
149      1
Name: Keywords, dtype: int64

In [29]:
# Parse every raw `Keywords` cell once per frame. A cell is a stringified
# list of dicts with a 'name' key, or NaN (a float) -> empty list.
keywords_train, keywords_test = (
    frame['Keywords'].apply(
        lambda raw: [] if isinstance(raw, float) else [item['name'] for item in ast.literal_eval(raw)]
    )
    for frame in (train_raw, test_raw)
)

# Keywords ordered by frequency, learned from the training set only.
list_of_keywords = list(keywords_train.values)
top_keywords = pd.DataFrame([i for j in list_of_keywords for i in j])[0].value_counts().index.tolist()

for frame, names_per_film in ((train_raw, keywords_train), (test_raw, keywords_test)):
    # Number of keywords per film.
    frame['num_Keywords'] = names_per_film.apply(len)
    # One 0/1 indicator per top-30 keyword, named 'Keyword_<name>' on BOTH
    # frames. The test frame previously used 'Keywords<name>', so its columns
    # did not line up with the training columns. Membership is tested against
    # the parsed list rather than by substring search in a joined string.
    for g in top_keywords[:30]:
        frame['Keyword_' + g] = names_per_film.apply(lambda names: 1 if g in names else 0)

# The temporary joined-string column is no longer created, so only the raw
# column has to be dropped.
train_raw = train_raw.drop(['Keywords'], axis=1)
test_raw = test_raw.drop(['Keywords'], axis=1)

In [31]:
# Print the first raw `cast` cell in full.
for row_no, raw_value in enumerate(train_raw['cast'].iloc[:1]):
    print(row_no, raw_value)

0 [{'cast_id': 4, 'character': 'Lou', 'credit_id': '52fe4ee7c3a36847f82afae7', 'gender': 2, 'id': 52997, 'name': 'Rob Corddry', 'order': 0, 'profile_path': '/k2zJL0V1nEZuFT08xUdOd3ucfXz.jpg'}, {'cast_id': 5, 'character': 'Nick', 'credit_id': '52fe4ee7c3a36847f82afaeb', 'gender': 2, 'id': 64342, 'name': 'Craig Robinson', 'order': 1, 'profile_path': '/tVaRMkJXOEVhYxtnnFuhqW0Rjzz.jpg'}, {'cast_id': 6, 'character': 'Jacob', 'credit_id': '52fe4ee7c3a36847f82afaef', 'gender': 2, 'id': 54729, 'name': 'Clark Duke', 'order': 2, 'profile_path': '/oNzK0umwm5Wn0wyEbOy6TVJCSBn.jpg'}, {'cast_id': 7, 'character': 'Adam Jr.', 'credit_id': '52fe4ee7c3a36847f82afaf3', 'gender': 2, 'id': 36801, 'name': 'Adam Scott', 'order': 3, 'profile_path': '/5gb65xz8bzd42yjMAl4zwo4cvKw.jpg'}, {'cast_id': 8, 'character': 'Hot Tub Repairman', 'credit_id': '52fe4ee7c3a36847f82afaf7', 'gender': 2, 'id': 54812, 'name': 'Chevy Chase', 'order': 4, 'profile_path': '/svjpyYtPwtjvRxX9IZnOmOkhDOt.jpg'}, {'cast_id': 9, 'characte

In [33]:
print('Number of actors')
# Ten most common cast sizes; NaN cells (floats) count as zero cast members.
(
    train_raw['cast']
    .apply(lambda raw: 0 if isinstance(raw, float) else len(ast.literal_eval(raw)))
    .value_counts()
    .head(10)
)

Number of actors


15    212
16    165
10    135
13    129
12    124
11    122
9     118
17    118
18    115
14    110
Name: cast, dtype: int64

In [39]:
# Parse the training `cast` column once (NaN, a float, becomes an empty list)
# instead of re-running ast.literal_eval over every row for each field.
cast_parsed = train_raw['cast'].apply(lambda raw: [] if isinstance(raw, float) else ast.literal_eval(raw))

# For each field: per-film lists of values, then the distinct values ordered
# by how often they occur across the whole training set.
list_of_cast_names = [[member['name'] for member in members] for members in cast_parsed]
top_cast = pd.DataFrame([i for j in list_of_cast_names for i in j])[0].value_counts().index.tolist()

list_of_cast_genders = [[member['gender'] for member in members] for members in cast_parsed]
top_genders = pd.DataFrame([i for j in list_of_cast_genders for i in j])[0].value_counts().index.tolist()

list_of_cast_character = [[member['character'] for member in members] for members in cast_parsed]
top_character = pd.DataFrame([i for j in list_of_cast_character for i in j])[0].value_counts().index.tolist()


In [41]:
# Most frequent cast names, all gender codes, and most frequent characters.
for preview in (top_cast[:3], top_genders, top_character[:5]):
    print(preview)

['Samuel L. Jackson', 'Robert De Niro', 'Morgan Freeman']
[2, 0, 1]
['', 'Himself', 'Herself', 'Dancer', 'Additional Voices (voice)']


In [None]:
# Parse the `cast` column once per frame (NaN, a float, becomes an empty
# list) and derive every feature from the parsed credits. The previous code
# re-ran ast.literal_eval over the whole column for each new column.
cast_train, cast_test = (
    frame['cast'].apply(lambda raw: [] if isinstance(raw, float) else ast.literal_eval(raw))
    for frame in (train_raw, test_raw)
)

for frame, credits in ((train_raw, cast_train), (test_raw, cast_test)):
    # Number of cast members per film.
    frame['num_cast'] = credits.apply(len)

    # 1 when one of the 15 most frequent actors is in the cast. Compared
    # against the 'name' field exactly; `g in str(x)` used to search the
    # repr of the whole credit list (names, characters, ids and paths alike).
    for g in top_cast[:15]:
        frame['cast_' + g] = credits.apply(lambda members: 1 if any(m['name'] == g for m in members) else 0)

    # Number of cast members carrying each gender code.
    for gender in (0, 1, 2):
        frame['genders_' + str(gender) + '_cast'] = credits.apply(
            lambda members: sum(1 for m in members if m['gender'] == gender)
        )

    # 1 when one of the 15 most frequent character names appears. Exact match
    # on 'character': the empty string is a character name in this list and
    # matched every row under the old substring test. The column prefix stays
    # 'cast_' (shared with the actor columns) so column names are unchanged.
    for g in top_character[:15]:
        frame['cast_' + g] = credits.apply(lambda members: 1 if any(m['character'] == g for m in members) else 0)

train_raw = train_raw.drop(['cast'], axis=1)
test_raw = test_raw.drop(['cast'], axis=1)

In [55]:
# Print the first raw `crew` cell in full.
for row_no, raw_value in enumerate(train_raw['crew'].iloc[:1]):
    print(row_no, raw_value)

0 [{'credit_id': '59ac067c92514107af02c8c8', 'department': 'Directing', 'gender': 0, 'id': 1449071, 'job': 'First Assistant Director', 'name': 'Kelly Cantley', 'profile_path': None}, {'credit_id': '52fe4ee7c3a36847f82afad7', 'department': 'Directing', 'gender': 2, 'id': 3227, 'job': 'Director', 'name': 'Steve Pink', 'profile_path': '/myHOgo8mQSCiCAZNGMRdHVr03jr.jpg'}, {'credit_id': '5524ed25c3a3687ded000d88', 'department': 'Writing', 'gender': 2, 'id': 347335, 'job': 'Writer', 'name': 'Josh Heald', 'profile_path': '/pwXJIenrDMrG7t3zNfLvr8w1RGU.jpg'}, {'credit_id': '5524ed2d925141720c001128', 'department': 'Writing', 'gender': 2, 'id': 347335, 'job': 'Characters', 'name': 'Josh Heald', 'profile_path': '/pwXJIenrDMrG7t3zNfLvr8w1RGU.jpg'}, {'credit_id': '5524ed3d92514166c1004a5d', 'department': 'Production', 'gender': 2, 'id': 57822, 'job': 'Producer', 'name': 'Andrew Panay', 'profile_path': None}, {'credit_id': '5524ed4bc3a3687df3000dd2', 'department': 'Production', 'gender': 0, 'id': 14

In [56]:
print('Number of crew')
# Ten most common crew sizes; NaN cells (floats) count as zero crew members.
(
    train_raw['crew']
    .apply(lambda raw: 0 if isinstance(raw, float) else len(ast.literal_eval(raw)))
    .value_counts()
    .head(10)
)

Number of crew


2     179
11    127
10    126
3     126
12    110
9     109
8     109
14    104
4     101
7      94
Name: crew, dtype: int64

In [58]:
# Parse the training `crew` column once (NaN, a float, becomes an empty list)
# instead of re-running ast.literal_eval over every row for each field.
crew_parsed = train_raw['crew'].apply(lambda raw: [] if isinstance(raw, float) else ast.literal_eval(raw))

# For each field: per-film lists of values, then the distinct values ordered
# by how often they occur across the whole training set.
list_of_crew_names = [[member['name'] for member in members] for members in crew_parsed]
top_crew_names = pd.DataFrame([i for j in list_of_crew_names for i in j])[0].value_counts().index.tolist()

list_of_crew_genders = [[member['gender'] for member in members] for members in crew_parsed]
top_crew_genders = pd.DataFrame([i for j in list_of_crew_genders for i in j])[0].value_counts().index.tolist()

list_of_crew_jobs = [[member['job'] for member in members] for members in crew_parsed]
top_crew_jobs = pd.DataFrame([i for j in list_of_crew_jobs for i in j])[0].value_counts().index.tolist()

list_of_crew_department = [[member['department'] for member in members] for members in crew_parsed]
top_crew_department = pd.DataFrame([i for j in list_of_crew_department for i in j])[0].value_counts().index.tolist()

In [59]:
# Most frequent crew names, all gender codes, top jobs and top departments.
for preview in (top_crew_names[:3], top_crew_genders, top_crew_jobs[:5], top_crew_department[:5]):
    print(preview)

['Avy Kaufman', 'Robert Rodriguez', 'Deborah Aquila']
[0, 2, 1]
['Producer', 'Executive Producer', 'Director', 'Screenplay', 'Editor']
['Production', 'Sound', 'Art', 'Crew', 'Writing']


In [61]:
# Parse the `crew` column once per frame (NaN, a float, becomes an empty
# list) and derive every feature from the parsed credits. The previous code
# re-ran ast.literal_eval over the whole column for each new column.
crew_train, crew_test = (
    frame['crew'].apply(lambda raw: [] if isinstance(raw, float) else ast.literal_eval(raw))
    for frame in (train_raw, test_raw)
)

for frame, credits in ((train_raw, crew_train), (test_raw, crew_test)):
    # Number of crew members per film.
    frame['num_crew'] = credits.apply(len)

    # 1 when one of the 15 most frequent crew members worked on the film.
    # Compared against the 'name' field exactly; `g in str(x)` used to search
    # the repr of the whole credit list, so any field could trigger a match.
    for g in top_crew_names[:15]:
        frame['crew_name_' + g] = credits.apply(lambda members: 1 if any(m['name'] == g for m in members) else 0)

    # Number of crew members carrying each gender code.
    for gender in (0, 1, 2):
        frame['genders_' + str(gender) + '_crew'] = credits.apply(
            lambda members: sum(1 for m in members if m['gender'] == gender)
        )

    # Number of crew members holding each of the 15 most frequent jobs.
    for g in top_crew_jobs[:15]:
        frame['crew_jobs_' + g] = credits.apply(lambda members: sum(1 for m in members if m['job'] == g))

    # Number of crew members in each of the (up to) 15 most frequent departments.
    for g in top_crew_department[:15]:
        frame['crew_departments_' + g] = credits.apply(lambda members: sum(1 for m in members if m['department'] == g))

train_raw = train_raw.drop(['crew'], axis=1)
test_raw = test_raw.drop(['crew'], axis=1)