Código Geral

In [342]:
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt
import ast
from collections import Counter
from pandas import DataFrame


start_year = 2013
end_year = 2023

movies_data_by_year = {}


In [343]:
def get_file_path(year):
    data_common_name = 'prepared_data/tmdb_dump'
    return f'{data_common_name}-{year}.csv'

def read_data_set(year):
    file_path = get_file_path(year)
    return pd.read_csv(file_path, encoding='utf-8', lineterminator='\n')

In [344]:
for year in range(start_year, end_year+1):
    data = read_data_set(year)
    movies_data_by_year[year] = data
    if 'all' in movies_data_by_year:
        movies_data_by_year['all'] = pd.concat([movies_data_by_year['all'], data])
    else:
        movies_data_by_year['all'] = data

Análise dos Dados numéricos (Quantidade e Média)

In [None]:
def movies_count(year):
    df =  movies_data_by_year[year]
    return len(df.index)

def budget_mean(year):
    df: DataFrame = movies_data_by_year[year]
    return df.loc[:, 'budget'].mean()

def revenue_mean(year):
    df: DataFrame = movies_data_by_year[year]
    return df.loc[:, 'revenue'].mean()

def runtime_mean(year):
    df: DataFrame = movies_data_by_year[year]
    return df.loc[:, 'runtime'].mean()

def vote_count_mean(year):
    df: DataFrame = movies_data_by_year[year]
    return df.loc[:, 'vote_count'].mean()

def vote_mean_by_year(year):
    df: DataFrame = movies_data_by_year[year]
    return vote_mean(df)

def vote_mean(df):
    df["weight"] = df["vote_count"] * df["vote_average"]
    return df["weight"].sum() / df["vote_count"].sum()
    
def vote_deviation(df):
    mean = vote_mean(df)
    df["weight"] = df["vote_count"] * df["vote_average"]    
    df["deviation"] = ((df["vote_average"] - mean)**2) * df["weight"]
    
    upper_eq_part = df["deviation"].sum()
    
    weight_sum = df["weight"].sum()
    n = df["weight"].count()
    bottom_eq_part = weight_sum * (n - 1) / n
    
    return math.sqrt(upper_eq_part/bottom_eq_part)

Calculo do desvio padrão amostral dos dados numéricos

In [None]:
df_concat = movies_data_by_year[start_year]
for year in range(start_year+1, end_year+1):
    df_concat = pd.concat([df_concat, movies_data_by_year[year]])
    
numeric_columns = ["budget", "revenue", "runtime", "vote_count"]
print(df_concat[numeric_columns].std())
print("Deviation: ", vote_deviation(df_concat))


budget        5.004297e+07
revenue       2.163519e+08
runtime       2.107148e+01
vote_count    3.680442e+03
dtype: float64
Deviation:  0.7151867182957711


In [None]:
for year in range(start_year, end_year+1):
    values = year, movies_count(year), budget_mean(year), revenue_mean(year), runtime_mean(year), vote_count_mean(year), vote_mean_by_year(year)
    for value in values:
        print("{:.2f}".format(value), " & " , end="")
    print("")


2013.00  & 333.00  & 29133697.35  & 83963872.83  & 111.97  & 2356.57  & 6.79  & 
2014.00  & 317.00  & 27739095.79  & 88809630.21  & 109.73  & 2624.77  & 7.01  & 
2015.00  & 301.00  & 28413424.29  & 96225236.56  & 112.00  & 2407.86  & 6.81  & 
2016.00  & 339.00  & 30959884.17  & 92686166.47  & 111.99  & 2464.51  & 6.83  & 
2017.00  & 307.00  & 29728204.99  & 103942389.76  & 112.44  & 2593.88  & 6.96  & 
2018.00  & 275.00  & 30404534.53  & 109654092.13  & 112.82  & 2334.57  & 7.00  & 
2019.00  & 251.00  & 32862991.07  & 119087300.21  & 111.95  & 2376.76  & 7.14  & 
2020.00  & 132.00  & 26047200.65  & 39661077.16  & 105.89  & 1539.45  & 7.04  & 
2021.00  & 158.00  & 45537524.30  & 93868823.13  & 115.16  & 2124.64  & 7.21  & 
2022.00  & 173.00  & 39368203.98  & 105105561.14  & 115.13  & 1499.87  & 7.11  & 
2023.00  & 207.00  & 44851926.51  & 106455189.39  & 117.57  & 1097.42  & 7.14  & 


Histograma das variáveis categóricas

In [None]:
def plot_histogram(data, title, xlabel):
    plt.figure(figsize=(10, 12))
    sns.histplot(y=data, stat="count", bins=(len(data.unique())))
    # plt.xscale('log')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequência")
    plt.show()
    
df = movies_data_by_year[2013]

# Histograma para production_countries
df['production_countries'] = df['production_countries'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
# df_countries = df.explode('production_countries', ignore_index=True)["production_countries"]
# plot_histogram(df_countries, 'Distribuição de Production Countries', 'Países Produtores')

# # Histograma para genres
# df['genres'] = df['genres'].apply(lambda x: x if isinstance(x, list) else [])
# df_genres = df.explode('genres')['genres']
# plot_histogram(df_genres, 'Distribuição de Genres', 'Gêneros')

# # Histograma para keywords
# df['keywords'] = df['keywords'].apply(lambda x: x if isinstance(x, list) else [])
# df_keywords = df.explode('keywords')['keywords']
# plot_histogram(df_keywords, 'Distribuição de Keywords', 'Palavras-Chave')

# # Histograma para spoken_languages
# df['spoken_languages'] = df['spoken_languages'].apply(lambda x: x if isinstance(x, list) else [])
# df_languages = df.explode('spoken_languages')['spoken_languages']
# plot_histogram(df_languages, 'Distribuição de Spoken Languages', 'Línguas Faladas')






# for year in range(start_year, end_year+1):
#     df = movies_data_by_year[year]
#     all_countries = [country for sublist in df['production_countries'] for country in ast.literal_eval(sublist)]
#     # # Count the occurrences of each country
#     country_counts = Counter(all_countries)

#     # Convert to DataFrame for better visualization (optional)
#     country_counts_df = pd.DataFrame.from_dict(country_counts, orient='index', columns=['count']).reset_index()
#     country_counts_df.rename(columns={'index': 'country'}, inplace=True)
#     print(year)
#     print(country_counts_df.sort_values(by="count", ascending=False).head(15))
#     print()

In [None]:
import numpy as np
import folium
import geopandas as gpd
from folium import Choropleth

for year in range (start_year, end_year+1):
    df = movies_data_by_year[year]
    df['production_countries'] = df['production_countries'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    df_countries = df.explode('production_countries', ignore_index=True)
    country_counts = df_countries['production_countries'].value_counts().reset_index()
    country_counts.columns = ['production_country', 'count']
    country_counts['count'] = np.log(country_counts['count'])

    # Carrega o arquivo GeoJSON com as fronteiras dos países
    geo_json_data = gpd.read_file('https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json')

    # Renomeia as colunas para corresponder aos nomes do GeoJSON
    geo_json_data = geo_json_data.rename(columns={"name": "production_country"})

    geo_countries_list = geo_json_data["production_country"].tolist()
   
    country_mapping = {
        'Hong Kong': 'China',  # Considera Hong Kong como parte da China
        'Serbia': 'Republic of Serbia',
        'Aruba': 'Netherlands',  # Considera Aruba como parte dos Países Baixos
        'Singapore': 'Malaysia',
        'Congo': 'Democratic Republic of the Congo',
        'Bahamas': 'The Bahamas',
        'Guadaloupe': 'France' # Considera Guadalupe como parte da frança
    }

    country_counts.replace(country_mapping, inplace=True)
    
    df_countries_list = country_counts['production_country'].tolist()
    missing_countries = [country for country in df_countries_list if country not in geo_countries_list]

    # Exibe os países que não têm correspondência
    print("Países que não correspondem no GeoJSON:")
    print(missing_countries)
    

    # Inicializa o mapa
    m = folium.Map(location=[45, -90], zoom_start=1.5)

    # Adiciona o choropleth map ao mapa
    Choropleth(
        geo_data=geo_json_data,
        name='choropleth',
        data=country_counts,
        columns=['production_country', 'count'],
        key_on='feature.properties.production_country',
        fill_color='YlGn',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name='Log(Número de Filmes por País)',
    ).add_to(m)

    # Adiciona os controles de camada
    folium.LayerControl().add_to(m)

    # Exibe o mapa
    m.save(f'choropleth/choropleth_paises_produtores-{year}.html')
    m


Países que não correspondem no GeoJSON:
[]
