Funções de mapeamento

In [19]:
import ast
from pathlib import Path

import pandas as pd
from pandas import DataFrame

def map_genre_ids_to_names(genres_ids_col, genres_dict):
    """Turn each cell's stringified list of genre ids into a list of genre names.

    Args:
        genres_ids_col: pandas Series whose cells are either missing or a string
            holding a Python-literal list of genre ids (e.g. ``"[28, 12]"``).
        genres_dict: mapping from integer genre id to genre name.

    Returns:
        A Series with, for every cell, ``None`` when the cell was missing and
        otherwise the list of names looked up in ``genres_dict``.

    Raises:
        KeyError: if an id is not present in ``genres_dict``.
    """
    def to_names(raw_ids):
        # Missing cells are passed through as None instead of being parsed.
        if pd.isna(raw_ids):
            return None
        parsed_ids = ast.literal_eval(raw_ids)
        return [genres_dict[int(genre_id)] for genre_id in parsed_ids]

    return genres_ids_col.map(to_names)


def map_keywords(keywords_col):
    """Reduce each cell's stringified list of keyword dicts to a list of keyword names.

    Args:
        keywords_col: pandas Series whose cells are strings holding a
            Python-literal list of dicts, each with a ``'name'`` key.

    Returns:
        A Series of lists containing the ``'name'`` value of every keyword.
    """
    def keyword_names(raw_keywords):
        parsed_keywords = ast.literal_eval(raw_keywords)
        return [entry['name'] for entry in parsed_keywords]

    return keywords_col.map(keyword_names)

def map_spoken_languages(spoken_languages_col):
    """Reduce each cell's stringified list of language dicts to a list of English names.

    Args:
        spoken_languages_col: pandas Series whose cells are strings holding a
            Python-literal list of dicts, each with an ``'english_name'`` key.

    Returns:
        A Series of lists containing the ``'english_name'`` of every language.
    """
    def english_names(raw_languages):
        parsed_languages = ast.literal_eval(raw_languages)
        return [language['english_name'] for language in parsed_languages]

    return spoken_languages_col.map(english_names)

def get_country(country_iso, countries_dict):
    """Look up a country name by its code, treating an empty code as "no country".

    Args:
        country_iso: country code used as key into ``countries_dict``; the empty
            string means the country is not specified.
        countries_dict: mapping from country code to country name.

    Returns:
        ``None`` for an empty code, otherwise ``countries_dict[country_iso]``.

    Raises:
        KeyError: if a non-empty code is not present in ``countries_dict``.
    """
    return None if country_iso == '' else countries_dict[country_iso]

def map_production_companies(production_companies_col, countries_dict):
    """Simplify each cell's stringified list of production-company dicts.

    Args:
        production_companies_col: pandas Series whose cells are strings holding
            a Python-literal list of dicts with ``'name'`` and
            ``'origin_country'`` keys.
        countries_dict: mapping from country code to country name, forwarded to
            ``get_country``.

    Returns:
        A Series of lists of ``{'name': ..., 'origin_country': ...}`` dicts,
        where ``origin_country`` is the value returned by ``get_country``.
    """
    def simplify_companies(raw_companies):
        simplified = []
        for company in ast.literal_eval(raw_companies):
            simplified.append({
                'name': company['name'],
                'origin_country': get_country(company['origin_country'], countries_dict),
            })
        return simplified

    return production_companies_col.map(simplify_companies)

def map_production_countries(production_countries_col):
    """Reduce each cell's stringified list of country dicts to a list of country names.

    Args:
        production_countries_col: pandas Series whose cells are strings holding
            a Python-literal list of dicts, each with a ``'name'`` key.

    Returns:
        A Series of lists containing the ``'name'`` of every production country.
    """
    def country_names(raw_countries):
        parsed_countries = ast.literal_eval(raw_countries)
        return [country['name'] for country in parsed_countries]

    return production_countries_col.map(country_names)

def remove_dict_props(dict, props_list):
    """Delete the given keys from a dictionary in place and return that dictionary.

    Args:
        dict: dictionary to strip; it is mutated, not copied.
        props_list: iterable of keys to remove; keys that are absent are ignored.

    Returns:
        The same dictionary object that was passed in.
    """
    for unwanted_key in props_list:
        # Absent keys are skipped silently rather than raising KeyError.
        if unwanted_key in dict:
            del dict[unwanted_key]
    return dict

def map_cast(cast_col):
    """Simplify each cell's stringified list of person dicts (cast or crew).

    For every person the bookkeeping fields listed in ``discarded_fields`` are
    removed and the numeric ``'gender'`` code is replaced by its text label.

    Args:
        cast_col: pandas Series whose cells are strings holding a Python-literal
            list of dicts, each with a ``'gender'`` key whose value is 0-3.

    Returns:
        A Series of lists of the trimmed person dicts.

    Raises:
        KeyError: if a person has no ``'gender'`` key or a code outside 0-3.
    """
    gender_labels = {
        0: "Not set/not specified",
        1: "Female",
        2: "Male",
        3: "Non-binary"
    }
    discarded_fields = ['adult', 'id', 'original_name', 'popularity', 'profile_path', 'cast_id', 'credit_id']

    def simplify_person(person):
        # Strip the unwanted fields first, then swap the gender code for its label.
        trimmed = remove_dict_props(person, discarded_fields)
        trimmed['gender'] = gender_labels[trimmed['gender']]
        return trimmed

    def simplify_people(raw_people):
        return [simplify_person(person) for person in ast.literal_eval(raw_people)]

    return cast_col.map(simplify_people)

def remove_zero_entries(df, columns_to_check=None):
    """Drop rows that have a missing or zero value in any required column.

    Args:
        df: DataFrame to filter; it is not modified.
        columns_to_check: optional iterable of column names that must be
            present and non-zero. When omitted, the default set of movie
            columns listed below is used.

    Returns:
        A new, independent DataFrame containing only the rows where every
        checked column is neither missing nor equal to 0.

    Raises:
        KeyError: if any checked column does not exist in ``df``.
    """
    if columns_to_check is None:
        columns_to_check = [
            'id',
            'original_language',
            'original_title',
            'overview',
            'release_date',
            'title',
            'vote_average',
            'vote_count',
            'budget',
            'genres',
            'production_companies',
            'production_countries',
            'revenue',
            'runtime',
            'spoken_languages',
            'cast',
            'crew',
            'keywords'
        ]
    else:
        # Materialise so any iterable (tuple, generator, Index) can be reused below.
        columns_to_check = list(columns_to_check)

    # Step 1: discard rows with a missing value in any checked column.
    df_cleaned = df.dropna(subset=columns_to_check)

    # Step 2: discard rows where any checked column is exactly 0.
    zero_columns_condition = (df_cleaned[columns_to_check] == 0).any(axis=1)

    # Return an explicit copy so callers can add or overwrite columns on the
    # result without pandas flagging it as a slice of another frame.
    return df_cleaned[~zero_columns_condition].copy()
    

Código que chama as funções para cada ano

In [24]:
def prepare_data(year):
    """Load one year's raw TMDB dump, clean it and return the prepared DataFrame.

    Reads the genre and country lookup tables plus the ``{year}`` movie dump
    from ``raw_data/``, removes rows with missing or zero required values,
    converts the stringified list columns into plain Python lists, drops the
    columns that are not needed and removes duplicated rows.

    Args:
        year: year used to build the raw dump file name
            (``raw_data/tmdb_dump-{year}.csv``).

    Returns:
        The prepared DataFrame for that year.
    """
    raw_data_common_name = 'raw_data/tmdb_dump'

    genres_df = pd.read_csv(f'{raw_data_common_name}-genres.csv')
    genres = genres_df.set_index('id')['name'].to_dict()

    # na_filter=False is required because the country code of Namibia ("NA")
    # would otherwise be interpreted as NaN.
    countries_df = pd.read_csv(f'{raw_data_common_name}-countries.csv', na_filter=False)
    countries = countries_df.set_index('iso_3166_1')['english_name'].to_dict()

    file_path = f'{raw_data_common_name}-{year}.csv'
    raw_movies_df = pd.read_csv(file_path, encoding='utf-8', lineterminator='\n')

    filtered_df = remove_zero_entries(raw_movies_df)

    columns_to_remove = ['adult', 'backdrop_path', 'genre_ids', 'poster_path', 'imdb_id', 'video', 'homepage', 'status', 'popularity']

    # assign/drop build a new frame instead of writing into the filtered one,
    # which avoids SettingWithCopyWarning and keeps the function free of
    # in-place mutation. Existing columns keep their position when replaced.
    movies_df = filtered_df.assign(
        genres=map_genre_ids_to_names(filtered_df['genre_ids'], genres),
        keywords=map_keywords(filtered_df['keywords']),
        spoken_languages=map_spoken_languages(filtered_df['spoken_languages']),
        production_companies=map_production_companies(filtered_df['production_companies'], countries),
        production_countries=map_production_countries(filtered_df['production_countries']),
        cast=map_cast(filtered_df['cast']),
        # Crew entries are simplified with the same routine as cast entries.
        crew=map_cast(filtered_df['crew']),
    ).drop(columns=columns_to_remove)

    # Cells now hold lists/dicts, which are unhashable, so duplicates are
    # detected on the string representation of every row.
    duplicated_movies_mask = movies_df.astype(str).duplicated()
    return movies_df[~duplicated_movies_mask]




In [28]:
# Inclusive range of years to process; one raw dump file is read per year.
start_year = 2013
end_year = 2023
prepared_data_common_name = 'prepared_data/tmdb_dump'

# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before the first file is written.
Path(prepared_data_common_name).parent.mkdir(parents=True, exist_ok=True)

for year in range(start_year, end_year+1):
    prepared_df = prepare_data(year)
    prepared_df.to_csv(f'{prepared_data_common_name}-{year}.csv', index=False, encoding='utf-8', header=True)
