In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
# Load the raw movie table.
test_data = pd.read_csv('full_data.csv', delimiter=',')

# Drop the columns that are not carried forward. `columns=` is used because
# DataFrame.drop no longer accepts `axis` as a positional argument in
# current pandas releases (the old `drop("Unnamed: 0", 1)` raises TypeError).
test_data = test_data.drop(
    columns=['Unnamed: 0', 'vote_count', 'vote_average', 'metacritic']
)

# A rating_count of 0 is treated as missing; keep only rows whose
# rating_count is finite. `.copy()` makes the filtered frame independent so
# the column assignments below do not trigger SettingWithCopyWarning.
test_data['rating_count'] = test_data['rating_count'].replace(0, np.nan)
test_data = test_data[np.isfinite(test_data['rating_count'])].copy()

# The literal string '[]' in cast/crew is treated as missing; rows lacking
# either column are removed.
test_data['cast'] = test_data['cast'].replace('[]', np.nan)
test_data['crew'] = test_data['crew'].replace('[]', np.nan)
test_data = test_data.dropna(subset=['cast', 'crew'])

# Renumber rows 0..n-1 so later positional list assignments line up.
test_data = test_data.reset_index(drop=True)

# Get director, editor, producer, writer

In [3]:
# For every movie, record the id of the first crew entry holding each of the
# four jobs below; 0 is recorded when the movie has no entry for that job.
first_id_by_job = {
    'Director': [],
    'Producer': [],
    'Editor': [],
    'Writer': [],
}

for crew_literal in test_data['crew']:
    # The crew column stores a Python-literal list of dicts as text.
    first_seen = {}
    for member in ast.literal_eval(crew_literal):
        job = member['job']
        # Only the first match per job counts; later entries are ignored.
        if job in first_id_by_job and job not in first_seen:
            first_seen[job] = member['id']

    for job, ids in first_id_by_job.items():
        ids.append(first_seen.get(job, 0))

directors = first_id_by_job['Director']
producers = first_id_by_job['Producer']
editors = first_id_by_job['Editor']
writers = first_id_by_job['Writer']

test_data['director'] = directors
test_data['producer'] = producers
test_data['editor'] = editors
test_data['writer'] = writers


# Calculate genre bitwise

In [4]:
# Each genre name owns one bit; a movie's genre set is stored as the OR of
# the bits of its genres. Bit positions follow the order of this tuple
# (Action = 1, Adventure = 2, ..., Western = 524288).
genre_names = (
    'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign',
    'History', 'Horror', 'Music', 'Mystery', 'Romance',
    'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western',
)
gen_dict = {name: 1 << bit for bit, name in enumerate(genre_names)}

genres = []
for genres_literal in test_data['genres']:
    # The genres column stores a Python-literal list of dicts as text.
    genre_value = 0
    for genre in ast.literal_eval(genres_literal):
        # OR rather than add: a genre listed twice in one row must not
        # carry into a neighbouring genre's bit. A name missing from
        # gen_dict still raises KeyError so new genres are not lost silently.
        genre_value |= gen_dict[genre['name']]
    genres.append(genre_value)

test_data['genre_byte'] = genres

# Get award win/nomination per director

In [6]:
# Load the award records; the cells below read its `director_name` and
# `outcome` columns.
award_data = pd.read_csv('220k_awards_by_directors.csv', sep=',')

In [12]:
# Map each director name to a two-slot tally: slot 0 counts rows whose
# outcome is "Won", slot 1 counts every other row for that director.
director_award_dict = {}
for _, award in award_data.iterrows():
    tally = director_award_dict.setdefault(award['director_name'], [0, 0])
    slot = 0 if award['outcome'] == "Won" else 1
    tally[slot] += 1

In [19]:
# For every movie, add up the award tallies of all crew entries whose job is
# 'Director' and whose name appears in director_award_dict. Movies with no
# such entry get 0 for both totals.
victory_list = []
nomination_list = []

for crew_literal in test_data['crew']:
    director_tallies = [
        director_award_dict[member['name']]
        for member in ast.literal_eval(crew_literal)
        if member['job'] == 'Director' and member['name'] in director_award_dict
    ]
    victory_list.append(sum(tally[0] for tally in director_tallies))
    nomination_list.append(sum(tally[1] for tally in director_tallies))

test_data['director_awards_won'] = victory_list
test_data['director_awards_nominated'] = nomination_list

# Save to file

In [22]:
test_data.to_csv('added_rows.csv', sep=',', encoding='utf-8',index=False)