In [1]:
import ast
import numbers

import pandas as pd


In [2]:
# Read the raw movies CSV into df_movies; the 'popularity' column is forced
# to object dtype instead of being type-inferred.
df_movies = pd.read_csv(
    filepath_or_buffer='data/movies_dataset.csv',
    dtype={'popularity': object},
)

In [3]:
# Delete movies without title: the comparison against '' is False for both
# empty and missing (NaN) titles, so those rows are dropped.
mask = df_movies['title'] > ''
# .copy() makes df_movies an independent frame, so the column assignments in
# the following cells do not trigger SettingWithCopyWarning on pandas
# versions that run without Copy-on-Write.
df_movies = df_movies.loc[mask].copy()

In [4]:
# Replace missing revenue with 0. The result is assigned back to the column
# rather than calling fillna(inplace=True) on the df_movies['revenue']
# selection: under pandas Copy-on-Write that selection is a temporary object,
# so an in-place fill on it would never reach df_movies.
df_movies['revenue'] = df_movies['revenue'].fillna(0)

# Convert budget to float and replace missing values with 0
df_movies['budget'] = df_movies['budget'].astype(float).fillna(0)


In [5]:
# Extract collection name from belongs_to_collection string dict 

def collection_to_string(row):
    """Return the collection name stored in ``row.belongs_to_collection``.

    The attribute is expected to hold the text of a Python dict literal that
    has a ``'name'`` key.

    Args:
        row: Any object exposing a ``belongs_to_collection`` attribute (a
            DataFrame row when called through ``DataFrame.apply(axis=1)``).

    Returns:
        The value stored under ``'name'``, or ``None`` when the attribute is
        not a string, cannot be parsed as a literal, does not evaluate to a
        dict, or the dict has no ``'name'`` key.
    """
    raw = row.belongs_to_collection
    if not isinstance(raw, str):
        # Anything that is not text (e.g. a missing value) has no collection.
        return None

    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Text that is not a valid Python literal is treated as "no collection"
        # instead of aborting the whole apply().
        return None

    if not isinstance(parsed, dict):
        # A literal such as '0.065' parses fine but cannot carry a name.
        return None
    return parsed.get('name')

# Overwrite the raw dict text with the value collection_to_string returns for each row
df_movies['belongs_to_collection'] = df_movies.apply(func=collection_to_string, axis='columns')

In [6]:
# Extract countries names from production_countries string dict

def countries_to_list(row):
    """Return the country names stored in ``row.production_countries``.

    The attribute is expected to hold the text of a Python list literal whose
    items are dicts with a ``'name'`` key.

    Args:
        row: Any object exposing a ``production_countries`` attribute (a
            DataFrame row when called through ``DataFrame.apply(axis=1)``).

    Returns:
        A list with the ``'name'`` of every entry. An empty list is returned
        when the attribute is not a string, cannot be parsed as a literal, or
        does not evaluate to a list/tuple, so the result is always a list
        (never the string ``'[]'``).
    """
    raw = row.production_countries
    if not isinstance(raw, str):
        # Non-text cells (e.g. missing values) carry no countries.
        return []

    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Unparseable text is treated the same as "no countries".
        return []

    if not isinstance(parsed, (list, tuple)):
        return []
    # Skip entries that are not dicts or that lack a name.
    return [entry['name'] for entry in parsed
            if isinstance(entry, dict) and 'name' in entry]

# Overwrite the raw text with the value countries_to_list returns for each row
df_movies['production_countries'] = df_movies.apply(func=countries_to_list, axis='columns')

In [7]:
# Extract companies names from production_companies string dict
def companies_to_list(row):
    """Return the company names stored in ``row.production_companies``.

    The attribute is expected to hold the text of a Python list literal whose
    items are dicts with a ``'name'`` key.

    Args:
        row: Any object exposing a ``production_companies`` attribute (a
            DataFrame row when called through ``DataFrame.apply(axis=1)``).

    Returns:
        A list with the ``'name'`` of every entry. An empty list is returned
        when the attribute is not a string, cannot be parsed as a literal, or
        does not evaluate to a list/tuple, so the result is always a list
        (never the string ``'[]'``).
    """
    raw = row.production_companies
    if not isinstance(raw, str):
        # Non-text cells (e.g. missing values) carry no companies.
        return []

    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Unparseable text is treated the same as "no companies".
        return []

    if not isinstance(parsed, (list, tuple)):
        return []
    # Skip entries that are not dicts or that lack a name.
    return [entry['name'] for entry in parsed
            if isinstance(entry, dict) and 'name' in entry]

# Overwrite the raw text with the value companies_to_list returns for each row
df_movies['production_companies'] = df_movies.apply(func=companies_to_list, axis='columns')

In [8]:
# Delete rows with empty release date. The result is assigned back instead of
# using inplace=True, so df_movies is rebound to a fresh frame.
df_movies = df_movies.dropna(subset=['release_date'])

In [9]:
# Build release_year from the first four characters of every release_date value
df_movies['release_year'] = [date_text[:4] for date_text in df_movies['release_date']]

In [10]:
# Calculate return

def calculo_return(row):
    """Return revenue divided by budget, rounded to two decimals.

    Args:
        row: Any object exposing ``revenue`` and ``budget`` attributes (a
            DataFrame row when called through ``DataFrame.apply(axis=1)``).

    Returns:
        ``round(revenue / budget, 2)`` when both values are real numbers and
        ``budget`` is greater than zero; ``0`` otherwise.
    """
    revenue, budget = row.revenue, row.budget

    for value in (revenue, budget):
        # An exact ``type(value) == float`` test rejects numpy.float64 and
        # integer values, which silently turned valid rows into 0. Accept any
        # real number instead; bool is excluded because it is not a quantity.
        if isinstance(value, bool) or not isinstance(value, numbers.Real):
            return 0

    # A NaN budget also fails this comparison and falls through to 0.
    if budget > 0:
        return round(revenue / budget, 2)
    return 0

# Store the value calculo_return yields for each row in the 'return' column
df_movies['return'] = df_movies.apply(func=calculo_return, axis='columns')


In [11]:
# add director from credits
def extract_director(row):
    """Return the director's name found in ``row.crew``.

    The attribute is expected to hold the text of a Python list literal whose
    items are dicts with a ``'name'`` key.
    NOTE(review): this assumes TMDB-style crew entries that also carry a
    ``'job'`` key - confirm against credits.csv.

    Args:
        row: Any object exposing a ``crew`` attribute (a DataFrame row when
            called through ``DataFrame.apply(axis=1)``).

    Returns:
        The ``'name'`` of the first entry whose ``'job'`` equals
        ``'Director'``. If no entry has a ``'job'`` key at all, the name of
        the first entry is returned. An empty string is returned when the
        attribute is not a string, the crew list is empty, or the entries
        have jobs but none of them is ``'Director'``.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when
            the text is not a valid Python literal.
    """
    raw = row.crew
    if not isinstance(raw, str):
        return ''

    crew = ast.literal_eval(raw)
    if not crew:
        return ''

    # Pick the crew member by role; the first list entry is not guaranteed
    # to be the director.
    for member in crew:
        if member.get('job') == 'Director':
            return member['name']

    if any('job' in member for member in crew):
        # Roles are present but nobody is credited as director.
        return ''

    # No role information available: fall back to the first listed member.
    return crew[0]['name']

# Load the credits file and derive one director value per credits row
df_credits = pd.read_csv('data/credits.csv')
df_credits['director'] = df_credits.apply(extract_director, axis=1)
df_credits = df_credits.drop(columns=['crew', 'cast'])
# Keep a single credits row per id: a repeated id on this side of the merge
# would otherwise duplicate every matching movie row.
df_credits = df_credits.drop_duplicates(subset='id')

# Make ids the same type on both sides of the merge
df_movies['id'] = df_movies['id'].astype(int)
# Add the director column to df_movies; the inner join keeps only movies
# whose id also appears in df_credits.
df_movies = pd.merge(df_movies, df_credits, on='id', how='inner')

In [12]:
# Remove the columns that are not kept in the cleaned dataset
df_movies = df_movies.drop(
    columns=[
        'video',
        'imdb_id',
        'adult',
        'original_title',
        'poster_path',
        'homepage',
    ],
)

In [13]:
# Write the cleaned frame to disk (the index is written as well, since
# to_csv is called with its defaults), then release both frames.
df_movies.to_csv(path_or_buf='data/movies_clean.csv')
del df_movies, df_credits