In [241]:
# findspark.init() is called before `import pyspark`; presumably it makes the
# Spark installation below importable from this kernel.
# NOTE(review): the Spark path is machine-specific (absolute local path).
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession, functions as F
# Shared Spark session; read_csv() further down reads CSV files through it.
spark = SparkSession.builder.appName('Iteration4').getOrCreate()



import enum
from typing import List, Dict, Tuple, Optional, Union
import pandas as pd
from matplotlib import pyplot as plt

from dataclasses import dataclass


# Directory prefix used to build the CSV paths in DataFramesCSV.
databases_path = '../datasets/'
# Directory prefix under which every generated PNG image is saved.
PATH_IMAGES = '../tex/iterations/iteration_4/images/'


## Utilities

Helper functions and enums used to:

- Produce the images that document each step of the process
- Execute repetitive tasks

This makes the project workflow easier to follow and automates the generation of artifacts.

In [247]:
class DataFramesCSV(enum.Enum):
    """Source CSV files of the project.

    Each member's value is the path of one CSV file, built by prefixing the
    file name with ``databases_path``. ``read_csv`` takes a member and loads
    the file at ``member.value``; members are also used as the keys of
    ``COLUMN_RENAME_BY_DATASET``.
    """
    ALCOHOL_CONSUMPTION_CSV = f"{databases_path}4_total-alcohol-consumption-per-capita-litres-of-pure-alcohol.csv"
    COUNTRY_MASTER_CSV = f"{databases_path}0_master_country_codes.csv"
    WHO_OBESITY_CSV = f"{databases_path}1_who_obesity.csv"
    MEAT_CONSUMPTION_CSV = f"{databases_path}2_meat_consumption.csv"
    HUNGER_CSV = f"{databases_path}5_global_hunger_index.csv"
    SMOKING_CSV = f"{databases_path}6_share-of-adults-who-smoke.csv"
    HAPPINESS_REPORT_CSV = f"{databases_path}3_happiness_report.csv"

class DataFramePreviousFieldNameOptions(enum.Enum):
    """Names of the per-column statistics that can be compared with a previous run.

    The values match keys of the ``info_object`` dictionary built in
    ``capture_get_dataframe_info_image`` ('isnull', 'dtypes', 'count').
    """
    IS_NULL = 'isnull'
    D_TYPES = 'dtypes'
    COUNT = 'count'

# Per-dataset column rename maps: {DataFramesCSV member: {original column name: new name}}.
# The inner dictionaries are passed as `map_columns` to rename_columns().
# Some entries map a name to itself (e.g. 'year': 'year'); those leave the
# column name unchanged.
COLUMN_RENAME_BY_DATASET = {
    DataFramesCSV.WHO_OBESITY_CSV: {
        'Numeric': 'percentage_obesity',
        'Countries, territories and areas': 'country',
        'WHO region': 'region',
        'Year': 'year',
    },
    DataFramesCSV.HAPPINESS_REPORT_CSV: {
        'year': 'year',
        'Country name': 'country',
        "Life Ladder": 'life_ladder',
        "Social support": 'social_support',
        "Freedom to make life choices": "freedom_to_make_life_choices",
        "Generosity": "generosity",
        "Perceptions of corruption": "perceptions_of_corruption",
        "Positive affect": "positive_affect",
        "Negative affect": "negative_affect",
    },
    DataFramesCSV.MEAT_CONSUMPTION_CSV: {
        'Code': 'country_code',
        'Year': 'year',
        "Meat, poultry | 00002734 || Food available for consumption | 0645pc || kilograms per year per capita": "poultry",
        "Meat, beef | 00002731 || Food available for consumption | 0645pc || kilograms per year per capita": "beef",
        "Meat, sheep and goat | 00002732 || Food available for consumption | 0645pc || kilograms per year per capita": "sheep_and_goat",
        "Meat, pig | 00002733 || Food available for consumption | 0645pc || kilograms per year per capita": "pig",
        "Fish and seafood | 00002960 || Food available for consumption | 0645pc || kilograms per year per capita": "fish_and_seafood",
    },
    # Same mapping that CRISPManager.__post_init__ applies to the country master.
    DataFramesCSV.COUNTRY_MASTER_CSV: {
        'alpha-3': 'country_code',
        'name': 'country'
    },
    DataFramesCSV.HUNGER_CSV: {
        'Entity': 'country',
        'Year': 'year',
        'Global Hunger Index (2021)': 'hunger_index',
    },
    DataFramesCSV.SMOKING_CSV: {
        'Entity': 'country',
        'Year': 'year',
        'Prevalence of current tobacco use (% of adults)': 'prevalence_smoking',
    },
    DataFramesCSV.ALCOHOL_CONSUMPTION_CSV: {
        'Entity': 'country',
        'Year': 'year',
        'liters_of_pure_alcohol_per_capita': 'liters_of_pure_alcohol_per_capita',
    },
}


def capture_get_dataframe_info_image(
        table_name: str,
        name_file: str,
        df_spark: pyspark.sql.dataframe.DataFrame,
        previous_data: list = None,
        previous_data_name: str = None,
        figure_size_height=10.0,
        figure_size_width=5.0,
):
    """Save a PNG table summarising the columns of a Spark DataFrame.

    One table row is produced per column, showing the column name (truncated
    to 30 characters), its Spark dtype, the non-null count and the null count.
    When ``previous_data`` is given, two more columns are appended: the old
    values and the difference ``old - current``; non-zero differences are
    highlighted in red.

    Args:
        table_name: Title printed above the table.
        name_file: File name (without extension) written under ``PATH_IMAGES``.
        df_spark: DataFrame to describe.
        previous_data: Optional per-column values from an earlier run, in the
            same order as ``df_spark.columns``.
        previous_data_name: Which statistic ``previous_data`` holds; a key of
            the info table ('count', 'isnull', ...) or an enum member whose
            ``value`` is such a key (e.g. ``DataFramePreviousFieldNameOptions``).
        figure_size_height: Figure height in inches.
        figure_size_width: Figure width in inches.

    Raises:
        KeyError: If ``previous_data`` is given and ``previous_data_name`` is
            not one of the statistics in the table.
    """
    # Collect required statistics from PySpark DataFrame
    column_names = [col[:30] for col in df_spark.columns]
    column_types = [dtype for _, dtype in df_spark.dtypes]
    row_count = df_spark.count()
    null_counts_row = df_spark.agg(
        *[F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in df_spark.columns]
    ).collect()[0]
    # SUM over zero rows yields NULL; treat that as "no nulls" so the
    # subtraction below does not fail on an empty DataFrame.
    column_null_counts = [null_count or 0 for null_count in null_counts_row]
    column_non_null_counts = [row_count - null_count for null_count in column_null_counts]

    info_object = {
        'columns': column_names,
        'dtypes': column_types,
        'count': column_non_null_counts,
        'isnull': column_null_counts
    }

    if previous_data is not None:
        # Accept both the plain key and an enum member carrying the key.
        field_name = getattr(previous_data_name, 'value', previous_data_name)
        current_data = info_object[field_name]
        info_object[f'old {field_name}'] = previous_data
        info_object['change'] = [prev - curr for prev, curr in zip(previous_data, current_data)]

    dataframe_info = list(zip(*info_object.values()))

    # Plotting the table image
    fig, ax = plt.subplots(figsize=(figure_size_width, figure_size_height))
    ax.axis('off')
    ax.axis('tight')
    # Widths for up to six columns; only as many as are actually shown are used.
    col_widths = [0.35, 0.15, 0.15, 0.15, 0.15, 0.15][:len(info_object)]
    data_table = ax.table(
        cellText=dataframe_info,
        colLabels=[' '.join(col.split('_')) for col in info_object.keys()],
        colWidths=col_widths,
        loc='center'
    )

    if previous_data is not None:
        # Highlight every non-zero entry of the 'change' column. Table row 0
        # is the header, hence the +1 offset on the data row index.
        change_column = list(info_object).index('change')
        for row_index, delta in enumerate(info_object['change'][:len(dataframe_info)]):
            if delta != 0:
                cell = data_table[(row_index + 1, change_column)]
                cell.set_facecolor("red")
                cell.set_text_props(color='white', weight='bold')

    # NOTE(review): the title reports count() - 1; DataFrame.count() already
    # excludes the CSV header, so this looks like an off-by-one. Kept as-is
    # until the intended meaning is confirmed.
    num_rows = row_count - 1
    data_table.auto_set_font_size(False)
    data_table.set_fontsize(6)
    plt.title(f'{table_name} (Records: {num_rows})')
    plt.tight_layout()
    plt.savefig(f"{PATH_IMAGES}{name_file}.png", dpi=200, bbox_inches='tight')
    plt.close(fig)

def capture_summary_dataset_to_image(
        name_file: str,
        df_spark: pyspark.sql.dataframe.DataFrame,
        dataset_name: str,
        figure_size_height=3,
        font_size=7,
):
    """Save a PNG table with the descriptive statistics of a Spark DataFrame.

    The DataFrame is converted to pandas, summarised with ``describe()``
    (rounded to two decimals) and rendered with one row per described column
    and one table column per statistic.

    Args:
        name_file: File name (without extension) written under ``PATH_IMAGES``.
        df_spark: DataFrame to summarise.
        dataset_name: Name appended to the figure title.
        figure_size_height: Figure height in inches (the width is fixed at 7).
        font_size: Font size used for the table cells.
    """
    statistics_order = ['min', '25%', '50%', '75%', 'max', 'mean', 'count', 'std']

    # Transposed so that every described column becomes one table row.
    summary = df_spark.toPandas().describe().round(2).T

    figure, axes = plt.subplots(figsize=(7, figure_size_height))
    summary = summary[statistics_order]

    # Only the table should be visible, not the axes.
    axes.axis('off')
    axes.axis('tight')

    row_labels = [label[:30] for label in summary.index]
    summary_table = axes.table(
        cellText=summary.values,
        colLabels=summary.columns,
        rowLabels=row_labels,
        cellLoc='center',
        loc='center',
        colWidths=[0.08] * len(statistics_order),
    )
    summary_table.auto_set_font_size(False)
    summary_table.set_fontsize(font_size)
    figure.tight_layout()

    plt.title(f"Descriptive Statistics {dataset_name}")
    plt.savefig(f"{PATH_IMAGES}{name_file}.png", dpi=200, bbox_inches='tight')
    plt.close()

    
def du_data_exploration_basics(
        df_spark: pyspark.sql.dataframe.DataFrame,
        metric_name_plot: str,
        metric_label: str,
        dataset_name: str,
        country_label: str = 'country',
        prefix_file_name: str = 'du',
        year_label: str = 'year',
):
    """Generate the basic exploration plots of one metric and save them as PNGs.

    Four images are written under ``PATH_IMAGES``, all sharing the base name
    ``{prefix_file_name}_{dataset_name}_{abbreviated metric_label}``:

    * ``*_y_trend.png``: per-year box plot plus the yearly average line.
    * ``*_freq.png``: histogram of the metric.
    * ``*_cou_t_l_20.png``: box plots for the 20 countries with the highest
      and the 20 with the lowest average.
    * ``*_cou_t_l_20_v2.png``: horizontal bar chart of those averages.

    The pandas ``info()`` of the whole DataFrame is also printed.

    Args:
        df_spark: DataFrame holding the year, country and metric columns.
        metric_name_plot: Human-readable metric name used in titles and labels.
        metric_label: Name of the metric column.
        dataset_name: Dataset identifier used in the output file names.
        country_label: Name of the country column.
        prefix_file_name: Prefix of the output file names.
        year_label: Name of the year column.
    """
    # Abbreviate the metric column name: first two letters of every word.
    metric_name_file = '_'.join([word[0:2] for word in metric_label.split('_')])
    base_name_file = f'{prefix_file_name}_{dataset_name}_{metric_name_file}'
    base_name_file_with_path = f'{PATH_IMAGES}{base_name_file}'

    df_spark.toPandas().info()

    # --- Yearly spread (box plot) and yearly average (line) ---------------
    fig, (ax1, ax2) = plt.subplots(figsize=(20, 10), nrows=2, ncols=1)
    # groupBy gives no ordering guarantee, so sort explicitly; otherwise the
    # boxes appear along the x axis in arbitrary year order.
    values_per_year = (
        df_spark.groupBy(year_label)
        .agg(F.collect_list(metric_label).alias("values"))
        .orderBy(year_label)
        .collect()
    )

    years = [row[year_label] for row in values_per_year]
    data_to_plot = [row['values'] for row in values_per_year]

    ax1.boxplot(data_to_plot, vert=True, patch_artist=True, labels=years)
    ax1.set_title(f'Yearly Spread of {metric_name_plot}')
    ax1.set_xlabel('Year')
    ax1.set_ylabel(metric_name_plot)

    average_per_year = (
        df_spark.groupBy(year_label)
        .agg(F.avg(metric_label).alias('avg_metric'))
        .orderBy(year_label)
        .collect()
    )

    years = [row[year_label] for row in average_per_year]
    avg_values = [row['avg_metric'] for row in average_per_year]

    ax2.plot(years, avg_values, linestyle='-', marker='o', color='b', label=f'Average {metric_name_plot}')
    ax2.set_title(f'Average {metric_name_plot} Over Years')
    ax2.set_ylabel(metric_name_plot)
    ax2.set_xlabel('Year')

    plt.suptitle('')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'{base_name_file_with_path}_y_trend.png', bbox_inches='tight', dpi=80)
    plt.close()

    # --- Distribution of the metric ----------------------------------------
    # Null metric values come back as None, which plt.hist cannot bin, so
    # they are left out of the histogram.
    metric_values = [
        row[0] for row in df_spark.select(metric_label).collect() if row[0] is not None
    ]
    plt.figure(figsize=(10, 3))
    plt.hist(metric_values, bins=30, edgecolor='black', alpha=0.7)
    plt.title(f'Distribution of {metric_name_plot}')
    plt.xlabel(metric_name_plot)
    plt.ylabel('Frequency')
    plt.savefig(f'{base_name_file_with_path}_freq.png', bbox_inches='tight', dpi=120)
    plt.close()

    # --- Top 20 / lowest 20 countries by average -----------------------------
    average_by_country = df_spark.groupBy(country_label).agg(F.avg(metric_label).alias("avg_metric"))
    top_countries = average_by_country.orderBy(F.desc("avg_metric")).limit(20)
    bottom_countries = average_by_country.orderBy("avg_metric").limit(20)

    top_countries_list = [row[country_label] for row in top_countries.collect()]
    bottom_countries_list = [row[country_label] for row in bottom_countries.collect()]

    top_countries_pd = df_spark.filter(df_spark[country_label].isin(top_countries_list)).toPandas()
    bottom_countries_pd = df_spark.filter(df_spark[country_label].isin(bottom_countries_list)).toPandas()

    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18, 12))
    fig.tight_layout(pad=10.0)

    top_countries_pd.boxplot(column=metric_label, by=country_label, ax=ax1, rot=80)
    ax1.set_title(f'{metric_name_plot} for Top 20 Countries')
    ax1.set_xlabel('Country')
    ax1.set_ylabel(metric_name_plot)

    bottom_countries_pd.boxplot(column=metric_label, by=country_label, ax=ax2, rot=80)
    ax2.set_title(f'{metric_name_plot} for Lowest 20 Countries')
    ax2.set_xlabel('Country')
    ax2.set_ylabel(metric_name_plot)

    # pandas adds its own "grouped by" super-title; blank it out.
    plt.suptitle('')
    plt.savefig(f'{base_name_file_with_path}_cou_t_l_20.png', bbox_inches='tight', dpi=80)
    plt.close()

    # --- Averages of the selected countries as a bar chart -------------------
    merged_pd = pd.concat([top_countries_pd, bottom_countries_pd])
    avg_values_by_country = merged_pd.groupby(country_label)[metric_label].mean().sort_values()

    plt.figure(figsize=(20, 15))
    avg_values_by_country.plot(kind='barh', color='skyblue')
    plt.title(f'Average {metric_name_plot} by Country 20 top and 20 lowest')
    plt.xlabel(metric_name_plot)
    plt.ylabel('Country')
    plt.savefig(f'{base_name_file_with_path}_cou_t_l_20_v2.png', bbox_inches='tight', dpi=80)
    plt.close()

def capture_table_dataframe_image(
        table_name: str,
        name_file: str,
        col_widths: List[float],
        df_spark: pyspark.sql.dataframe.DataFrame,
        figure_size_height=10.0,
        figure_size_width=5.0,
        font_size=4,
        head=None,
        show_records=True,
):
    """Save the rows of a Spark DataFrame as a PNG table.

    Args:
        table_name: Title printed above the table.
        name_file: File name (without extension) written under ``PATH_IMAGES``.
        col_widths: Relative width of every table column.
        df_spark: DataFrame whose rows are rendered.
        figure_size_height: Figure height in inches.
        figure_size_width: Figure width in inches.
        font_size: Font size used for the table cells.
        head: If truthy, only the first ``head`` rows are rendered; otherwise
            every row is collected.
        show_records: Whether the record count is appended to the title.
    """
    fig, ax = plt.subplots(figsize=(figure_size_width, figure_size_height))
    ax.axis('off')
    ax.axis('tight')

    # Convert the Spark DataFrame to a list of lists and retrieve column names
    collected_rows = df_spark.limit(head).collect() if head else df_spark.collect()
    columns = df_spark.columns
    data_values = [list(row) for row in collected_rows]
    if not data_values:
        # ax.table() derives the column count from the first row and fails on
        # an empty list; render a single blank row below the header instead.
        data_values = [[''] * len(columns)]

    # Create table with the collected data and columns
    data_table = ax.table(
        cellText=data_values,
        colLabels=[' '.join(col.split('_')) for col in columns],
        colWidths=col_widths,
        loc='center'
    )

    if show_records:
        # Only trigger the Spark count job when the number is actually shown.
        # NOTE(review): count() - 1 looks like an off-by-one (count() already
        # excludes the CSV header); kept as-is until the intent is confirmed.
        num_rows = df_spark.count() - 1
        records = f'(Records: {num_rows})'
    else:
        records = ''
    plt.title(f'{table_name} {records}')

    data_table.auto_set_font_size(False)
    data_table.set_fontsize(font_size)
    fig.tight_layout()

    plt.savefig(f"{PATH_IMAGES}{name_file}.png", dpi=200, bbox_inches='tight')
    plt.close(fig)
    
def rename_columns(
    df: pyspark.sql.dataframe.DataFrame,
    map_columns: Dict[str,str]
) -> pyspark.sql.dataframe.DataFrame:
    """Return ``df`` with its columns renamed according to ``map_columns``.

    Args:
        df: DataFrame whose columns are renamed.
        map_columns: Mapping ``{current column name: new column name}``; the
            renames are applied one after another in the mapping's order.

    Returns:
        The DataFrame obtained after applying every rename.
    """
    renamed = df
    for current_name in map_columns:
        renamed = renamed.withColumnRenamed(current_name, map_columns[current_name])
    return renamed

def read_csv(file_path_enum: DataFramesCSV, sep=",") -> pyspark.sql.dataframe.DataFrame:
    """Load the CSV file referenced by ``file_path_enum`` into a Spark DataFrame.

    The first line is read as the header and column types are inferred.

    Args:
        file_path_enum: Enum member whose ``value`` is the path of the file.
        sep: Field separator of the file.
    """
    csv_path = file_path_enum.value
    reader = spark.read
    return reader.csv(csv_path, header=True, inferSchema=True, sep=sep)
    

## Crisp Process Manager

I will create a manager class that makes it easier for the reviewer to follow each step.

In [282]:
@dataclass
class CRISPManager:
    """Drives the steps of the analysis and the generation of its images.

    On construction the country master CSV is loaded and its ``alpha-3`` and
    ``name`` columns are renamed to ``country_code`` and ``country``. Every
    ``_du_02_*`` method loads one dataset, optionally renders its images and
    records in ``missing_countries`` the countries that have no match in the
    country master.

    Attributes:
        missing_countries: Spark DataFrame with the columns ``missing``,
            ``dataset``, ``value`` and ``replacement``; ``None`` until the
            first dataset has been processed.
        country_master: Spark DataFrame with the master list of countries.
        integrated_dataset: Not assigned anywhere in this class.
        generate_images_du_02: When ``False`` the image helpers do nothing
            unless they are called with ``force_save_image=True``.
    """

    # These three attributes are deliberately left un-annotated: @dataclass
    # only turns annotated names into __init__ parameters, so the constructor
    # keeps `generate_images_du_02` as its single argument.
    missing_countries = None
    country_master = None
    integrated_dataset = None
    generate_images_du_02: bool = False

    def __post_init__(self):
        """Load the country master and normalise its column names."""
        self.country_master = rename_columns(
            df=read_csv(DataFramesCSV.COUNTRY_MASTER_CSV),
            map_columns=COLUMN_RENAME_BY_DATASET.get(DataFramesCSV.COUNTRY_MASTER_CSV),
        )

    # ------------------------------------------------------------------
    # Image helpers: thin wrappers that honour `generate_images_du_02`
    # ------------------------------------------------------------------

    def _capture_get_dataframe_info_image(
            self,
            table_name: str,
            name_file: str,
            df_spark: pyspark.sql.dataframe.DataFrame,
            figure_size_height=10.0,
            figure_size_width=5.0,
            force_save_image=False,
            previous_data: pd.Series = None,
            previous_data_name: DataFramePreviousFieldNameOptions = None,
    ):
        """Call ``capture_get_dataframe_info_image`` if image generation is enabled."""
        if not (self.generate_images_du_02 or force_save_image):
            return
        # The module-level function looks the statistic up by its string key,
        # so hand it the enum's value rather than the enum member itself.
        if isinstance(previous_data_name, DataFramePreviousFieldNameOptions):
            previous_data_name = previous_data_name.value
        capture_get_dataframe_info_image(
            table_name=table_name,
            name_file=name_file,
            df_spark=df_spark,
            figure_size_height=figure_size_height,
            figure_size_width=figure_size_width,
            previous_data_name=previous_data_name,
            previous_data=previous_data,
        )

    def _capture_summary_dataset_to_image(
            self,
            name_file: str,
            df_spark: pyspark.sql.dataframe.DataFrame,
            dataset_name: str,
            figure_size_height=3,
            font_size=7,
            force_save_image=False,
    ):
        """Call ``capture_summary_dataset_to_image`` if image generation is enabled."""
        if not (self.generate_images_du_02 or force_save_image):
            return
        capture_summary_dataset_to_image(
            name_file=name_file,
            df_spark=df_spark,
            dataset_name=dataset_name,
            figure_size_height=figure_size_height,
            font_size=font_size,
        )

    def _capture_table_dataframe_image(
            self,
            table_name: str,
            name_file: str,
            col_widths: List[float],
            df_spark: pyspark.sql.dataframe.DataFrame,
            figure_size_height=10.0,
            figure_size_width=5.0,
            font_size=4,
            head=None,
            force_save_image=False,
    ):
        """Call ``capture_table_dataframe_image`` if image generation is enabled."""
        if not (self.generate_images_du_02 or force_save_image):
            return
        capture_table_dataframe_image(
            table_name=table_name,
            name_file=name_file,
            col_widths=col_widths,
            df_spark=df_spark,
            figure_size_width=figure_size_width,
            figure_size_height=figure_size_height,
            font_size=font_size,
            head=head
        )

    def _du_data_exploration_basics(
            self,
            df_spark: pyspark.sql.dataframe.DataFrame,
            metric_name_plot: str,
            metric_label: str,
            dataset_name: str,
            country_label: str = 'country',
            prefix_file_name: str = 'du',
            year_label: str = 'year',
            force_save_image=False,
    ):
        """Call ``du_data_exploration_basics`` if image generation is enabled."""
        if not (self.generate_images_du_02 or force_save_image):
            return
        du_data_exploration_basics(
            dataset_name=dataset_name,
            df_spark=df_spark,
            metric_label=metric_label,
            metric_name_plot=metric_name_plot,
            country_label=country_label,
            prefix_file_name=prefix_file_name,
            year_label=year_label,
        )

    # ------------------------------------------------------------------
    # Matching datasets against the country master
    # ------------------------------------------------------------------

    def _register_missing_countries(
            self,
            dataset_name: str,
            target_dataset: pyspark.sql.dataframe.DataFrame,
            join_column: str,
    ):
        """Record the ``join_column`` values of a dataset absent from the country master.

        The dataset is left-joined with the country master on ``join_column``;
        rows whose master column ``country-code`` is null had no match. Their
        distinct keys are stored in ``missing_countries`` together with the
        dataset name, a constant ``value`` of 1 and an empty ``replacement``.
        """
        joined = target_dataset.join(self.country_master, join_column, "left")
        not_found = (
            joined
            .filter(F.col("country-code").isNull())
            .select(join_column)
            .distinct()
            .withColumn("dataset", F.lit(dataset_name))
            .withColumn("value", F.lit(1))
            .withColumn("replacement", F.lit(None))
        )
        not_found = rename_columns(
            df=not_found,
            map_columns={join_column: 'missing'}
        )
        if self.missing_countries is None:
            self.missing_countries = not_found
        else:
            self.missing_countries = not_found.union(self.missing_countries)

    def _merge_by_country_code(self, dataset_name, target_dataset: pyspark.sql.dataframe.DataFrame):
        """Record the country codes of ``target_dataset`` missing from the country master."""
        self._register_missing_countries(
            dataset_name=dataset_name,
            target_dataset=target_dataset,
            join_column="country_code",
        )

    def _merge_by_country_name(self, dataset_name, target_dataset: pyspark.sql.dataframe.DataFrame):
        """Record the country names of ``target_dataset`` missing from the country master."""
        self._register_missing_countries(
            dataset_name=dataset_name,
            target_dataset=target_dataset,
            join_column="country",
        )

    # ------------------------------------------------------------------
    # Data understanding: one method per dataset
    # ------------------------------------------------------------------

    def _du_02_country_master(self):
        """Render the column overview of the raw country master CSV."""
        country_master = read_csv(DataFramesCSV.COUNTRY_MASTER_CSV)
        self._capture_get_dataframe_info_image(
            table_name='Countries Dataset',
            name_file='du_country_dataset',
            df_spark=country_master,
            figure_size_height=2.5
        )

    def _du_02_meat_consumption(self):
        """Explore the meat consumption dataset and register its unmatched country codes."""
        meat_consumption = read_csv(DataFramesCSV.MEAT_CONSUMPTION_CSV)
        self._capture_get_dataframe_info_image(
            table_name='Meat Consumption Dataset',
            name_file='du_meat_consumption_dataset',
            df_spark=meat_consumption,
            figure_size_height=2
        )
        self._capture_summary_dataset_to_image(
            df_spark=meat_consumption,
            dataset_name='Meat Consumption',
            name_file='du_meat_consumption_summary',
            figure_size_height=2
        )

        meat_consumption = rename_columns(
            df=meat_consumption,
            map_columns=COLUMN_RENAME_BY_DATASET.get(DataFramesCSV.MEAT_CONSUMPTION_CSV)
        )
        self._merge_by_country_code(dataset_name='meat_consumption', target_dataset=meat_consumption)

        # (metric column, dataset name for the file names, name in the plot
        # title, whether nulls are replaced by 0 before plotting)
        explorations = (
            ('beef', 'meat_beef', 'Beef', False),
            ('poultry', 'meat_poultry', 'Poultry', False),
            ('sheep_and_goat', 'meat_sheep', 'Sheep and Goat', False),
            ('pig', 'meat_pig', 'Pig', True),
            ('fish_and_seafood', 'meat_fish_seafood', 'Fish and Seafood', False),
        )
        for metric_label, dataset_name, display_name, fill_nulls in explorations:
            self._du_data_exploration_basics(
                df_spark=meat_consumption.fillna(0) if fill_nulls else meat_consumption,
                metric_name_plot=f'Kg./Year per Capita - {display_name} consumption',
                metric_label=metric_label,
                dataset_name=dataset_name,
                country_label='country_code',
            )

    def _du_02_hunger(self):
        """Explore the hunger index dataset and register its unmatched countries."""
        hunger = read_csv(DataFramesCSV.HUNGER_CSV)
        self._capture_get_dataframe_info_image(
            table_name='Hunger Dataset',
            name_file='du_hunger_dataset',
            df_spark=hunger,
            figure_size_height=1.5
        )

        self._capture_summary_dataset_to_image(
            df_spark=hunger,
            dataset_name='Hunger',
            name_file='du_hunger_summary',
            figure_size_height=1
        )
        hunger = hunger.select("Entity", "Year", "Global Hunger Index (2021)")
        hunger = rename_columns(
            df=hunger,
            map_columns=COLUMN_RENAME_BY_DATASET.get(DataFramesCSV.HUNGER_CSV)
        )
        self._du_data_exploration_basics(
            df_spark=hunger,
            metric_name_plot='Global Hunger Index',
            metric_label='hunger_index',
            dataset_name='hunger',
            country_label='country',
        )
        self._merge_by_country_name(dataset_name='hunger', target_dataset=hunger)

    def _du_02_smoking(self):
        """Explore the smoking dataset and register its unmatched countries."""
        smoking = read_csv(DataFramesCSV.SMOKING_CSV)
        self._capture_get_dataframe_info_image(
            table_name='Smoking Dataset',
            name_file='du_smoking_dataset',
            df_spark=smoking,
            figure_size_height=1
        )

        self._capture_summary_dataset_to_image(
            df_spark=smoking,
            dataset_name='Smoking',
            name_file='du_smoking_summary',
            figure_size_height=1
        )
        smoking = smoking.drop("Code")
        smoking = rename_columns(
            df=smoking,
            map_columns=COLUMN_RENAME_BY_DATASET.get(DataFramesCSV.SMOKING_CSV)
        )
        self._du_data_exploration_basics(
            df_spark=smoking,
            metric_name_plot='Percentage Prevalence Tobacco use Adults',
            metric_label='prevalence_smoking',
            dataset_name='smoking',
        )
        self._merge_by_country_name(dataset_name='smoking', target_dataset=smoking)

    def _du_02_alcohol_consumption(self):
        """Explore the alcohol consumption dataset and register its unmatched countries."""
        alcohol_consumption = read_csv(DataFramesCSV.ALCOHOL_CONSUMPTION_CSV)
        self._capture_get_dataframe_info_image(
            table_name='Alcohol Consumption Dataset',
            name_file='du_alcohol_consumption_dataset',
            df_spark=alcohol_consumption,
            figure_size_height=1.2
        )

        self._capture_summary_dataset_to_image(
            df_spark=alcohol_consumption,
            dataset_name='Alcohol Consumption',
            name_file='du_alcohol_summary',
            figure_size_height=1
        )
        alcohol_consumption = alcohol_consumption.drop("Code")
        alcohol_consumption = rename_columns(
            df=alcohol_consumption,
            map_columns=COLUMN_RENAME_BY_DATASET.get(DataFramesCSV.ALCOHOL_CONSUMPTION_CSV)
        )
        self._du_data_exploration_basics(
            df_spark=alcohol_consumption,
            metric_name_plot='Liters of Pure Alcohol per Capita',
            metric_label='liters_of_pure_alcohol_per_capita',
            dataset_name='alcohol',
        )
        self._merge_by_country_name(dataset_name='alcohol_consumption', target_dataset=alcohol_consumption)

    def _du_02_obesity(self):
        """Explore the WHO obesity dataset and register its unmatched countries."""
        obesity_dataset = read_csv(DataFramesCSV.WHO_OBESITY_CSV)

        self._capture_summary_dataset_to_image(
            df_spark=obesity_dataset,
            dataset_name='obesity',
            name_file='du_obesity_summary'
        )

        self._capture_get_dataframe_info_image(
            table_name='Obesity Dataset',
            name_file='du_obesity_dataset',
            df_spark=obesity_dataset,
            figure_size_height=3.3
        )

        # Keep only the rows whose 'Sex' column is "Both sexes"
        obesity_dataset = obesity_dataset.filter(obesity_dataset.Sex == "Both sexes")

        # Select specific columns
        obesity_dataset = obesity_dataset.select("Numeric", "Countries, territories and areas", "WHO region", "Year")

        obesity_dataset = rename_columns(
            df=obesity_dataset,
            map_columns=COLUMN_RENAME_BY_DATASET.get(DataFramesCSV.WHO_OBESITY_CSV)
        )
        self._merge_by_country_name(dataset_name='obesity', target_dataset=obesity_dataset)

        self._du_data_exploration_basics(
            df_spark=obesity_dataset.fillna(0),
            metric_name_plot='Percentage Obesity',
            metric_label='percentage_obesity',
            dataset_name='obesity',
        )

    def _du_02_happiness(self):
        """Explore the happiness report dataset and register its unmatched countries."""
        happiness_record = read_csv(DataFramesCSV.HAPPINESS_REPORT_CSV)

        self._capture_get_dataframe_info_image(
            table_name='Happiness Report Dataset',
            name_file='du_happiness_dataset',
            df_spark=happiness_record,
            figure_size_height=2.5
        )

        self._capture_summary_dataset_to_image(
            df_spark=happiness_record,
            dataset_name='Happiness',
            name_file='du_happiness_summary',
            figure_size_height=2.3
        )
        # Keep only the text after the last of (at most) two comma splits of
        # every column name, stripped of surrounding whitespace.
        map_columns_happiness = {col: col.split(',', 2)[-1].strip() for col in happiness_record.columns}

        happiness_record = rename_columns(
            df=happiness_record,
            map_columns=map_columns_happiness
        )

        happiness_record = rename_columns(
            df=happiness_record,
            map_columns=COLUMN_RENAME_BY_DATASET.get(DataFramesCSV.HAPPINESS_REPORT_CSV)
        )

        self._merge_by_country_name(dataset_name='happiness', target_dataset=happiness_record)

        # (name in the plot title, metric column, whether nulls are replaced
        # by 0 before plotting)
        explorations = (
            ('Life Ladder', 'life_ladder', False),
            ('Social Support', 'social_support', True),
            ('Freedom to Make Life Choices', 'freedom_to_make_life_choices', True),
            ('Generosity', 'generosity', True),
            ('Perceptions of Corruption', 'perceptions_of_corruption', True),
            ('Positive Affect', 'positive_affect', True),
            ('Negative affect', 'negative_affect', True),
        )
        for metric_name_plot, metric_label, fill_nulls in explorations:
            self._du_data_exploration_basics(
                df_spark=happiness_record.fillna(0) if fill_nulls else happiness_record,
                metric_name_plot=metric_name_plot,
                metric_label=metric_label,
                dataset_name='happiness',
                country_label='country',
            )

    def _function_mapper_du_02(self, dataset: DataFramesCSV):
        """Return the ``_du_02_*`` method for ``dataset`` (``None`` if there is none)."""
        mapper = {
            DataFramesCSV.MEAT_CONSUMPTION_CSV: self._du_02_meat_consumption,
            DataFramesCSV.WHO_OBESITY_CSV: self._du_02_obesity,
            DataFramesCSV.HAPPINESS_REPORT_CSV: self._du_02_happiness,
            DataFramesCSV.HUNGER_CSV: self._du_02_hunger,
            DataFramesCSV.SMOKING_CSV: self._du_02_smoking,
            DataFramesCSV.ALCOHOL_CONSUMPTION_CSV: self._du_02_alcohol_consumption,
        }

        return mapper.get(dataset)

    def get_crosstab_missing_countries(
            self,
            without_replacement=False,
            save_table=False,
            head=20,
    ) -> pyspark.sql.dataframe.DataFrame:
        """Cross-tabulate the unmatched countries against the datasets they come from.

        Args:
            without_replacement: If true, only entries whose ``replacement``
                is null are counted.
            save_table: If true, the crosstab is also rendered as an image
                (subject to ``generate_images_du_02``).
            head: Number of rows rendered in the image; a falsy value renders
                every row.

        Returns:
            Spark DataFrame produced by ``crosstab("missing", "dataset")``.
        """
        if without_replacement:
            filtered_dataframe = self.missing_countries.filter(self.missing_countries["replacement"].isNull())
        else:
            filtered_dataframe = self.missing_countries

        # Compute the crosstab
        crosstab_result = filtered_dataframe.crosstab("missing", "dataset")

        # Optionally save the table
        if save_table:
            if head:
                file_name = f'du_missing_countries_per_dataset_head_{head}'
                # The title reflects how many rows are actually rendered.
                table_name = f'Missing countries by datasets ({head} firsts)'
            else:
                file_name = 'du_missing_countries_per_dataset'
                table_name = 'Missing countries by datasets'
            self._capture_table_dataframe_image(
                table_name=table_name,
                df_spark=crosstab_result,
                col_widths=[0.35, 0.2, 0.1, 0.1, 0.2, 0.1, 0.1, 0.07],
                name_file=file_name,
                head=head,
                font_size=5,
                figure_size_height=4.5,
            )

        return crosstab_result

    def _du_02_run_processes(self):
        """Run the data understanding step of every dataset, in a fixed order."""
        datasets_in_order = (
            DataFramesCSV.MEAT_CONSUMPTION_CSV,
            DataFramesCSV.WHO_OBESITY_CSV,
            DataFramesCSV.HAPPINESS_REPORT_CSV,
            DataFramesCSV.HUNGER_CSV,
            DataFramesCSV.SMOKING_CSV,
            DataFramesCSV.ALCOHOL_CONSUMPTION_CSV,
        )
        for dataset in datasets_in_order:
            self._function_mapper_du_02(dataset=dataset)()

    def du_02(self):
        """Run the whole data understanding step and the missing-countries crosstab."""
        self._du_02_country_master()
        self._du_02_run_processes()

        self.get_crosstab_missing_countries(save_table=True)

## CRISP Steps

In [283]:
# generate_images_du_02=False: the manager's image helpers skip rendering
# unless they are called with force_save_image=True.
manager = CRISPManager(
    generate_images_du_02=False
)

### Data Understanding

The following method primarily focuses on generating visuals based on various data analyses.

In [284]:
# Runs the country master overview, every per-dataset step and finally the
# crosstab of countries missing from the country master.
manager.du_02()

metric_name_plot  Negative affect
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   country                           2199 non-null   object 
 1   year                              2199 non-null   int32  
 2   life_ladder                       2199 non-null   float64
 3   Log GDP per capita                2199 non-null   float64
 4   social_support                    2199 non-null   float64
 5   Healthy life expectancy at birth  2199 non-null   float64
 6   freedom_to_make_life_choices      2199 non-null   float64
 7   generosity                        2199 non-null   float64
 8   perceptions_of_corruption         2199 non-null   float64
 9   positive_affect                   2199 non-null   float64
 10  negative_affect                   2199 non-null   float64
dtypes: float64(9), int32(1), object(1)
