In [1]:
import ast

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

%matplotlib inline
np.random.seed(42)

In [2]:
# Load data
df = pd.read_csv('./data/train.csv', sep=',')
df.set_index('id', inplace=True)
orig_len = len(df)
orig_columns = df.columns

In [3]:
# Convert json columns to lists of dictionaries
json_columns = ['genres', 'belongs_to_collection', 'production_companies', 'production_countries', 'spoken_languages', 'keywords', 'cast', 'crew']
def str_to_dict(str_dict):
    if type(str_dict) == type('str'):
        return ast.literal_eval(str_dict)
    elif type(str_dict) == type({'key': 'value'}):
        return str_dict
    return []

for col in json_columns:
    df[col] = df[col].map(str_to_dict)

In [23]:
languages = {}
def get_languages(row):
    for item in row:
        languages[item['iso_639_1']] = item['name']
df.spoken_languages.map(get_languages)
print(languages)

{'en': 'English', 'hi': 'हिन्दी', 'ko': '한국어/조선말', 'ar': 'العربية', 'ru': 'Pусский', 'sv': 'svenska', 'sr': 'Srpski', 'de': 'Deutsch', 'fr': 'Français', 'it': 'Italiano', 'ja': '日本語', 'he': 'עִבְרִית', 'pt': 'Português', 'la': 'Latin', 'zh': '普通话', 'es': 'Español', 'nl': 'Nederlands', 'cn': '广州话 / 廣州話', 'qu': '', 'cs': 'Český', 'ta': 'தமிழ்', 'te': 'తెలుగు', 'pl': 'Polski', 'tr': 'Türkçe', 'gd': '', 'hu': 'Magyar', 'el': 'ελληνικά', 'ga': 'Gaeilge', 'fa': 'فارسی', 'th': 'ภาษาไทย', 'ca': 'Català', 'tl': '', 'da': 'Dansk', 'bn': 'বাংলা', 'kk': 'қазақ', 'sh': '', 'yi': '', 'af': 'Afrikaans', 'hy': '', 'pa': 'ਪੰਜਾਬੀ', 'bg': 'български език', 'sw': 'Kiswahili', 'no': 'Norsk', 'mr': '', 'bo': '', 'xx': 'No Language', 'fi': 'suomi', 'ur': 'اردو', 'sq': 'shqip', 'ro': 'Română', 'ln': '', 'my': '', 'id': 'Bahasa indonesia', 'vi': 'Tiếng Việt', 'am': '', 'sk': 'Slovenčina', 'uk': 'Український', 'eo': 'Esperanto', 'eu': 'euskera', 'sa': '', 'ny': '', 'st': '', 'xh': '', 'zu': 'isiZulu', 'mi': '',

In [24]:
df.status.unique()

array(['Released', 'Rumored'], dtype=object)

In [4]:
#Drop rows where budget or runtime are zero, or genres are empty
cleaned_df = df.drop(df[(df.budget == 0) | (df.runtime == 0) | (df.genres == {})].index)
print(f'Rows dropped: {orig_len - len(cleaned_df)}')

Rows dropped: 823


In [5]:
# Create dummy variables for genres
dummy_genres = []

def encode_genres(row):
    new_row = {'id': row.name}
    for item in row['genres']:
        new_row[f'genre_{item["name"]}'.replace(' ', '_').lower()] = 1
    dummy_genres.append(new_row)
                
df[orig_columns].apply(encode_genres, axis=1)
dummy_genres = pd.DataFrame(dummy_genres)
dummy_genres.set_index('id', inplace=True)
dummy_genres.fillna(0, inplace=True)
dummy_genres = dummy_genres.astype('int64')

In [6]:
# Create categorical variable for genres
cat_genres = []

def generate_genres_cat(row):
    new_row = {'id': row.name}
    genres = []
    for item in row['genres']:
        genres.append(item["name"])
    new_row['genres_cat'] = ','.join(genres)
    cat_genres.append(new_row)
    
df[orig_columns].apply(generate_genres_cat, axis=1)
cat_genres = pd.DataFrame(cat_genres)
cat_genres.set_index('id', inplace=True)

In [7]:
# Create feature vector for genres
genres_vectors = []
                
def vectorize_genres(row):
    new_row = {'id': row.name}
    new_row['genre_vector'] = row.values.astype('int64')
    genres_vectors.append(new_row)
    
dummy_genres.apply(vectorize_genres, axis=1)
genres_vectors = pd.DataFrame(genres_vectors)
genres_vectors.set_index('id', inplace=True)

genres_df = dummy_genres.join(genres_vectors).join(cat_genres)

In [77]:
cleaned_df = df.join(genres_df)

In [78]:
# Create dummy variable for belongs_to_collection

belongs_to_collection_bool = []

def create_collection_bool(row):
    new_row = {'id': row.name}
    if len(row['belongs_to_collection']) > 0:
        new_row['collection_bool'] = 1
    else:
        new_row['collection_bool'] = 0
    belongs_to_collection_bool.append(new_row)
        
df.apply(create_collection_bool, axis=1)
belongs_to_collection_bool = pd.DataFrame(belongs_to_collection_bool)
belongs_to_collection_bool.set_index('id', inplace=True)
belongs_to_collection_bool = belongs_to_collection_bool.astype('int64')

In [79]:
# Create catagorical variable for belongs_to_collection

collection_cat = []

def generate_collection_cat(row):
    new_row = {'id': row.name}
    new_row['collection_cat'] = row['belongs_to_collection'][0]['name'] if len(row['belongs_to_collection']) > 0 else 'no_collection'
    collection_cat.append(new_row)
    
df.apply(generate_collection_cat, axis=1)
collection_cat = pd.DataFrame(collection_cat)
collection_cat.set_index('id', inplace=True)        

In [80]:
cleaned_df = cleaned_df.join(belongs_to_collection_bool).join(collection_cat)

In [93]:
# Create dummy variables for production companies

dummy_prod = []

def encode_prod(row):
    new_row = {'id': row.name}
    for item in row['production_companies']:
        new_row[f'prod_{item["name"]}'.replace(' ', '_').lower()] = 1
    dummy_prod.append(new_row)
                
df[orig_columns].apply(encode_prod, axis=1)
dummy_prod = pd.DataFrame(dummy_prod)
dummy_prod.set_index('id', inplace=True)
dummy_prod.fillna(0, inplace=True)
dummy_prod = dummy_prod.astype('int64')

In [86]:
cleaned_df = cleaned_df.join(dummy_prod)

In [87]:
cleaned_df.head(5)

Unnamed: 0_level_0,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,...,prod_инвада_фильм,prod_интерфест,prod_квартал-95,prod_кинокомпания_«lunapark»,prod_леополис,prod_реал-дакота,prod_роскинопрокат,prod_ств,prod_флагман-трейд,prod_фокс
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,...,0,0,0,0,0,0,0,0,0,0
2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,...,0,0,0,0,0,0,0,0,0,0
3,[],3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,...,0,0,0,0,0,0,0,0,0,0
4,[],1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,...,0,0,0,0,0,0,0,0,0,0
5,[],0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,...,0,0,0,0,0,0,0,0,0,0


In [82]:
cleaned_df.to_csv(path_or_buf='data/cleaned-train.csv')