In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

start_year = 2013
end_year = 2023

movies_data_by_year = {}

for year in range(start_year, end_year+1):
    file_path = f'raw_data/tmdb_dump-{year}.csv'
    df = pd.read_csv(file_path, encoding='utf-8', lineterminator='\n')
    movies_data_by_year[year] = df
    print(year, len(df))


2013 2690
2014 2794
2015 2961
2016 3060
2017 3331
2018 3287
2019 3223
2020 2466
2021 2456
2022 2435
2023 1912


In [4]:
for year in range(start_year, end_year+1):
    df = movies_data_by_year[year]
    print(year)
    print(df.isnull().sum())
    print("\n")

2013
adult                       0
backdrop_path             375
genre_ids                   0
id                          0
original_language           0
original_title              0
overview                   29
popularity                  0
poster_path                18
release_date                0
title                       0
video                       0
vote_average                0
vote_count                  0
belongs_to_collection    2375
budget                      1
genres                      1
homepage                 1894
imdb_id                    13
production_companies        1
production_countries        1
revenue                     1
runtime                     1
spoken_languages            1
status                      1
tagline                  1494
cast                        0
crew                        0
keywords                    3
dtype: int64


2014
adult                       0
backdrop_path             358
genre_ids                   0
id             

In [5]:
for year in range(start_year, end_year+1):
    df = movies_data_by_year[year]
    print(year)
    print(df[df == 0].count())
    print("\n")

2013
adult                    2690
backdrop_path               0
genre_ids                   0
id                          0
original_language           0
original_title              0
overview                    0
popularity                  0
poster_path                 0
release_date                0
title                       0
video                    2690
vote_average                0
vote_count                  0
belongs_to_collection       0
budget                   2079
genres                      0
homepage                    0
imdb_id                     0
production_companies        0
production_countries        0
revenue                  2174
runtime                    42
spoken_languages            0
status                      0
tagline                     0
cast                        0
crew                        0
keywords                    0
dtype: int64


2014
adult                    2794
backdrop_path               0
genre_ids                   0
id             

In [6]:
for year in range(start_year, end_year+1):
    df = movies_data_by_year[year]
    print(year)
    print(df[df == ""].value_counts())
    print("\n")

2013
Series([], Name: count, dtype: int64)


2014
Series([], Name: count, dtype: int64)


2015
Series([], Name: count, dtype: int64)


2016
Series([], Name: count, dtype: int64)


2017
Series([], Name: count, dtype: int64)


2018
Series([], Name: count, dtype: int64)


2019
Series([], Name: count, dtype: int64)


2020
Series([], Name: count, dtype: int64)


2021
Series([], Name: count, dtype: int64)


2022
Series([], Name: count, dtype: int64)


2023
Series([], Name: count, dtype: int64)




In [9]:
for year in range(start_year, end_year+1):
    df = movies_data_by_year[year]
    print(year, df["budget"].value_counts(), df["revenue"].value_counts())
    print("\n")

2013 budget
0.0           2079
5000000.0       24
1000000.0       24
2000000.0       24
4000000.0       15
              ... 
21000000.0       1
4800000.0        1
1900000.0        1
2900000.0        1
58000000.0       1
Name: count, Length: 249, dtype: int64 revenue
0.0            2174
3900000.0         3
1600000.0         3
5200000.0         2
4463059.0         2
               ... 
12247.0           1
32935319.0        1
371900000.0       1
10900434.0        1
44000000.0        1
Name: count, Length: 499, dtype: int64


2014 budget
0.0            2219
5000000.0        24
2000000.0        22
1000000.0        21
3000000.0        20
               ... 
12250000.0        1
120000000.0       1
7.0               1
2121476.0         1
41000000.0        1
Name: count, Length: 246, dtype: int64 revenue
0.0            2271
3200000.0         3
100.0             3
2800000.0         2
4404.0            2
               ... 
83700000.0        1
529100000.0       1
369300000.0       1
542300000.0 

In [53]:
import ast

def remove_zero_entries(df):
    columns_to_check = [
        'id', 
        'original_language', 
        'original_title', 
        'overview',
        'release_date', 
        'title', 
        'vote_average', 
        'vote_count', 
        'genres', 
        'production_companies', 
        'production_countries', 
        'runtime', 
        'spoken_languages', 
        'cast', 
        'crew', 
        'keywords'
    ] 
    
    df_cleaned = df.dropna(subset=columns_to_check)
    
    zero_columns_condition = (df_cleaned[columns_to_check] == 0).any(axis=1)
    df_cleaned = df_cleaned[~zero_columns_condition]
    return df_cleaned

def map_production_countries(production_countries_col):
    return production_countries_col.map(lambda production_countries: [pd_dict['name'] for pd_dict in production_countries])

for year in range(start_year, end_year+1):
    df = movies_data_by_year[year]
    df = remove_zero_entries(df)
    df['production_countries'] = df['production_countries'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['production_countries'] = map_production_countries(df['production_countries'])


    df_countries = df.explode('production_countries', ignore_index=True)
    country_counts = df_countries['production_countries'].value_counts().reset_index()
    country_counts.columns = ['production_country', 'count']
    
    # print(year, country_counts[country_counts["production_country"] == "Brazil"])
    total = df_countries[df_countries["production_countries"] == "Brazil"]["original_title"].count()
    without_budget = (df_countries[df_countries["production_countries"] == "Brazil"]["budget"] == 0).sum()
    
    print("Brasil:")
    print(year, total)
    # print(year, df_countries[df_countries["production_countries"] == "Brazil"]["original_title"].count(), "\n")
    # print(year, (df_countries[df_countries["production_countries"] == "Brazil"]["budget"] == 0).sum(), "\n")
    
    # print("EUA:")
    
    # total = df_countries[df_countries["production_countries"] == "United States of America"]["original_title"].count()
    # without_budget = (df_countries[df_countries["production_countries"] == "United States of America"]["budget"] == 0).sum()
    
    # print(year, total - without_budget)
    
    # print(len(df.index))


    
    


Brasil:
2013 41
Brasil:
2014 55
Brasil:
2015 45
Brasil:
2016 56
Brasil:
2017 90
Brasil:
2018 69
Brasil:
2019 70
Brasil:
2020 42
Brasil:
2021 44
Brasil:
2022 32
Brasil:
2023 34
