In [131]:
import os
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession, functions as F, window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
# Create (or reuse) the Spark session named 'Iteration4_DM' used by the rest of the notebook.
spark = SparkSession.builder.appName('Iteration4_DM').getOrCreate()
# Set the SQL optimizer's maxIterations option to 300 for this session.
spark.conf.set("spark.sql.optimizer.maxIterations", 300)
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier, LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


import numpy as np
from matplotlib import pyplot as plt
import enum
from dataclasses import dataclass
from typing import List, Dict, Any

import pandas as pd
from pyspark import ml

# Output locations, written as relative paths.
# PATH_IMAGES is concatenated directly with file names, so its trailing slash is required.
PATH_MODELS = '../models/'
PATH_IMAGES = '../tex/iterations/iteration_4/images/'




In [132]:
# Load the input CSV as a Spark DataFrame: the first row is the header and
# column types are inferred by Spark rather than declared.
DATASET = spark.read.csv("../datasets/GENERATED/DP_03_M6.csv", header=True, inferSchema=True)

In [168]:
def capture_confusion_matrix_image(
        table_name: str,
        name_file: str,
        dataframe: pd.DataFrame,
        figure_size_height=10.0,
        figure_size_width=5.0,
        font_size=4,
        head=None,
        accuracy: float = None,
        precision: float = None,
        recall: float = None,
        f1_score: float = None,
        best_params=None,
        best_training_accuracy=None,
        best_test_accuracy=None,
        tuning_scoring='precision',
):
    """Render a confusion matrix (plus optional metrics) to a PNG file.

    The figure has two stacked axes: the upper one shows ``dataframe`` as a
    table, the lower one shows the optional ``best_params`` table and the
    metric lines. The image is written to ``{PATH_IMAGES}{name_file}.png``.

    Args:
        table_name: Title drawn above the confusion-matrix table.
        name_file: Output file name without extension.
        dataframe: Confusion matrix. Its index is rendered as the first
            table column, so it must have exactly two data columns to match
            the three fixed column widths. The frame is not modified.
        figure_size_height: Figure height in inches.
        figure_size_width: Figure width in inches.
        font_size: Font size of table cells; metric lines use ``font_size + 1``.
        head: If not None, only the first ``head`` rows of each table are shown.
        accuracy: Fraction in [0, 1] shown as a percentage; omitted when None.
        precision: Same as ``accuracy``, for precision.
        recall: Same as ``accuracy``, for recall.
        f1_score: Same as ``accuracy``, for the F1 score.
        best_params: Optional mapping of parameter name -> best value, shown
            as a second table.
        best_training_accuracy: Accepted for compatibility; currently unused.
        best_test_accuracy: Accepted for compatibility; currently unused.
        tuning_scoring: Metric name used in the title of the ``best_params`` table.
    """
    fig, (ax, ax2) = plt.subplots(figsize=(figure_size_width, figure_size_height), nrows=2)
    try:
        ax.axis('off')
        ax.axis('tight')

        # Work on a copy: an in-place reset_index would add an extra column to
        # the caller's frame every time this function is called on it.
        table_df = dataframe.reset_index()
        if head is not None:
            table_df = table_df.head(head)
        df_cm = ax.table(
            cellText=table_df.values,
            colLabels=[' '.join(col.split('_')) for col in table_df.columns],
            colWidths=[0.2, 0.25, 0.25],
            loc='center'
        )
        ax.set_title(table_name)
        df_cm.auto_set_font_size(False)
        df_cm.set_fontsize(font_size)

        ax2.axis('off')
        ax2.axis('tight')
        if best_params:
            df_best_parameters = pd.DataFrame(list(best_params.items()), columns=['parameter', 'best'])
            if head is not None:
                df_best_parameters = df_best_parameters.head(head)
            data_table_best_params = ax2.table(
                cellText=df_best_parameters.values,
                colLabels=[' '.join(col.split('_')) for col in df_best_parameters.columns],
                colWidths=[0.5, 0.3],
                loc='center',
            )
            ax2.set_title(f"Tuning Process for {tuning_scoring}:")
            data_table_best_params.auto_set_font_size(False)
            data_table_best_params.set_fontsize(font_size)

        increase_lines = - 0.13
        current_line = -0.2
        font_size_text = font_size + 1
        text_h_pos_m_r = 0.4

        metric_lines = (
            ('Accuracy', accuracy),
            ('Precision', precision),
            ('Recall', recall),
            ('F1-score', f1_score),
        )
        for metric_label, metric_value in metric_lines:
            # Every metric owns a fixed slot, so the layout does not shift
            # when one of them is missing.
            current_line = current_line + increase_lines
            if metric_value is None:
                continue
            # ax2 is the last axes created by plt.subplots, i.e. the one
            # pyplot treats as current; addressing it explicitly avoids
            # relying on that global state.
            ax2.text(
                text_h_pos_m_r, current_line, f'{metric_label}: {round(metric_value * 100, 2)}%',
                ha='right',
                va='center',
                transform=ax2.transAxes,
                fontsize=font_size_text,
            )

        fig.savefig(f"{PATH_IMAGES}{name_file}.png", dpi=200, bbox_inches='tight')
    finally:
        # Release the figure even when saving fails (e.g. missing directory).
        plt.close(fig)

def capture_table_dataframe_image(
        table_name: str,
        name_file: str,
        col_widths: List[float],
        dataframe: pd.DataFrame,
        figure_size_height=10.0,
        figure_size_width=5.0,
        font_size=4,
        head=None,
        show_records=True,
):
    """Render a pandas DataFrame as a table image.

    The image is written to ``{PATH_IMAGES}{name_file}.png``.

    Args:
        table_name: Title drawn above the table.
        name_file: Output file name without extension.
        col_widths: One relative width per rendered column. The frame's index
            is rendered as the first column, so this needs one more entry
            than ``dataframe`` has columns.
        dataframe: Data to render. The frame is not modified.
        figure_size_height: Figure height in inches.
        figure_size_width: Figure width in inches.
        font_size: Font size of the table cells.
        head: If not None, only the first ``head`` rows are shown.
        show_records: When True, a record count is appended to the title.
    """
    fig, ax = plt.subplots(figsize=(figure_size_width, figure_size_height))
    try:
        ax.axis('off')
        ax.axis('tight')

        # Work on a copy: an in-place reset_index would add an extra column to
        # the caller's frame every time this function is called on it.
        table_df = dataframe.reset_index()
        shown_df = table_df.head(head) if head is not None else table_df
        data_table = ax.table(
            cellText=shown_df.values,
            colLabels=[' '.join(col.split('_')) for col in table_df.columns],
            colWidths=col_widths,
            loc='center'
        )
        # NOTE(review): the count is deliberately kept as rows - 1 to preserve
        # existing output; confirm whether one row is meant to be excluded.
        num_rows = table_df.shape[0] - 1
        records = f'(Records: {num_rows})' if show_records else ''
        ax.set_title(f'{table_name} {records}')
        data_table.auto_set_font_size(False)
        data_table.set_fontsize(font_size)
        fig.tight_layout()

        fig.savefig(f"{PATH_IMAGES}{name_file}.png", dpi=200, bbox_inches='tight')
    finally:
        # Release the figure even when saving fails (e.g. missing directory).
        plt.close(fig)

In [173]:
class AvailableModels(enum.Enum):
    """Classifier families the notebook can fit.

    Each member's value is the human-readable model name; it is used in
    progress messages, image titles and to derive file names.
    """
    RANDOM_FOREST = 'Random Forest'
    DECISION_TREE = 'Decision Tree'
    LOGISTIC_REGRESSION = 'Logistic Regression'
    GRADIENT_BOOSTING = 'Gradient Boosting'


@dataclass
class ModelFitter:
    """Fit Spark ML classifiers on ``dataset`` and export report images.

    For every model listed in ``models_to_fit`` the fitter splits the data
    70/30, trains the classifier, stores the test-set predictions in
    ``predictions_models`` and writes a confusion-matrix image and a
    feature-importance image through the module-level ``capture_*`` helpers.

    The label column is ``expected_obesity_rate``; the confusion matrix
    assumes it holds the classes 0 and 1.
    """
    # Spark DataFrame holding the feature columns and the label column.
    dataset: pyspark.sql.dataframe.DataFrame
    # Models that fit_models() should train, in order.
    models_to_fit: List[AvailableModels]
    # Previously saved models found on disk by load_models(); {} when None is given.
    models: Dict[AvailableModels, ml.Model] = None
    # Test-set predictions (Spark DataFrames) keyed by model; {} when None is given.
    predictions_models: Dict[AvailableModels, pyspark.sql.dataframe.DataFrame] = None
    # Kept for API compatibility; not read by any method of this class.
    tuning_scoring: str = 'accuracy'
    # Seed for the train/test split; None keeps Spark's random default.
    seed: int = None

    _LABEL_COL = "expected_obesity_rate"
    _PREDICTION_COL = "prediction"

    def __post_init__(self):
        """Initialise the registries, load saved models and build evaluators."""
        self.load_models()
        self.acc_evaluator = MulticlassClassificationEvaluator(
            labelCol=self._LABEL_COL, predictionCol=self._PREDICTION_COL, metricName="accuracy")
        # NOTE(review): the *ByLabel metrics are computed for the evaluator's
        # default metricLabel; confirm that this is the class of interest.
        self.rcl_evaluator = MulticlassClassificationEvaluator(
            labelCol=self._LABEL_COL, predictionCol=self._PREDICTION_COL, metricName="recallByLabel")
        self.prc_evaluator = MulticlassClassificationEvaluator(
            labelCol=self._LABEL_COL, predictionCol=self._PREDICTION_COL, metricName="precisionByLabel")

    def get_classifier(self, enum_model: AvailableModels):
        """Return the loaded model for ``enum_model``; KeyError if absent."""
        return self.models[enum_model]

    @property
    def random_forest(self):
        """Loaded random forest model; KeyError if none was loaded."""
        return self.models[AvailableModels.RANDOM_FOREST]

    @property
    def decision_tree(self):
        """Loaded decision tree model; KeyError if none was loaded."""
        return self.models[AvailableModels.DECISION_TREE]

    @property
    def logistic_regresion(self):
        """Loaded logistic regression model; KeyError if none was loaded."""
        return self.models[AvailableModels.LOGISTIC_REGRESSION]

    @property
    def gradient_boosting(self):
        """Loaded gradient boosting model; KeyError if none was loaded."""
        return self.models[AvailableModels.GRADIENT_BOOSTING]

    @property
    def random_forest_predictions(self):
        """Random forest test predictions, or None if not fitted yet."""
        return self.predictions_models.get(AvailableModels.RANDOM_FOREST, None)

    @property
    def decision_tree_predictions(self):
        """Decision tree test predictions, or None if not fitted yet."""
        return self.predictions_models.get(AvailableModels.DECISION_TREE, None)

    @property
    def logistic_regresion_predictions(self):
        """Logistic regression test predictions, or None if not fitted yet."""
        return self.predictions_models.get(AvailableModels.LOGISTIC_REGRESSION, None)

    @property
    def gradient_boosting_predictions(self):
        """Gradient boosting test predictions, or None if not fitted yet."""
        return self.predictions_models.get(AvailableModels.GRADIENT_BOOSTING, None)

    @staticmethod
    def _model_path(path_enum: AvailableModels) -> str:
        """Return the storage path of a model, e.g. ``../models/random_forest``."""
        # Join the *words* of the display name; joining the raw string would
        # put an underscore between every single character.
        return os.path.join(PATH_MODELS, '_'.join(path_enum.value.lower().split()))

    def load_models(self):
        """Load every model that has a saved copy under ``PATH_MODELS``.

        Also makes sure both registries are dictionaries, because the
        dataclass defaults are None. Existing predictions are kept.
        """
        if self.models is None:
            self.models = {}
        if self.predictions_models is None:
            self.predictions_models = {}

        # Saved artefacts are fitted models, so each one is read back with the
        # corresponding *Model class.
        loaders = {
            AvailableModels.RANDOM_FOREST: ml.classification.RandomForestClassificationModel,
            AvailableModels.LOGISTIC_REGRESSION: ml.classification.LogisticRegressionModel,
            AvailableModels.GRADIENT_BOOSTING: ml.classification.GBTClassificationModel,
            AvailableModels.DECISION_TREE: ml.classification.DecisionTreeClassificationModel,
        }
        for path_enum in AvailableModels:
            loader = loaders.get(path_enum)
            path_file = self._model_path(path_enum)
            if loader is not None and os.path.exists(path_file):
                self.models[path_enum] = loader.load(path_file)

    def save_model(self, classifiers: ml, path_enum: AvailableModels):
        """Save ``classifiers`` unless a model for ``path_enum`` was loaded.

        Args:
            classifiers: Object exposing ``save(path)``.
            path_enum: Model family, used to derive the target path.
        """
        if self.models.get(path_enum, None) is None:
            classifiers.save(self._model_path(path_enum))

    def _get_features_columns_names(self):
        """Return the dataset columns used as model features."""
        return [
            "prevalence_smoking",
            "liters_of_pure_alcohol_per_capita",
            "beef",
            "poultry",
            "sheep_and_goat",
            "pig",
            "fish_and_seafood",
            "perceptions_of_corruption",
            "negative_affect",
            "life_ladder",
            "positive_affect",
            "freedom_to_make_life_choices",
            "generosity",
            "social_support",
        ]

    def _build_assembler(self):
        """Return the VectorAssembler that packs the features into ``features``."""
        return VectorAssembler(
            inputCols=self._get_features_columns_names(),
            outputCol="features",
            handleInvalid="keep"
        )

    def _generate_split_dataset(self):
        """Assemble the feature vector and split the data 70/30.

        Returns:
            Tuple ``(train_data, test_data, assembler)`` where both frames
            contain only the ``features`` and label columns.
        """
        assembler = self._build_assembler()
        output = assembler.transform(self.dataset)
        final_data = output.select("features", self._LABEL_COL)
        train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=self.seed)

        return train_data, test_data, assembler

    def fit_models(self) -> None:
        """Fit every model listed in ``models_to_fit``.

        Entries without a registered fit function are skipped silently.
        """
        fit_function_mapper = {
            AvailableModels.LOGISTIC_REGRESSION: self._fit_logistic_regression,
            AvailableModels.DECISION_TREE: self._fit_decision_tree,
            AvailableModels.RANDOM_FOREST: self._fit_random_forest,
            AvailableModels.GRADIENT_BOOSTING: self._fit_gradient_boosting,
        }

        for model_enum in self.models_to_fit:
            function_to_run = fit_function_mapper.get(model_enum)
            if function_to_run:
                print(f"Fitting : {model_enum.value}...")
                function_to_run()

    def generate_confusion_matrix_spark(self, enum_model):
        """Write the confusion-matrix image for an already fitted model.

        Rows of the matrix are the actual class, columns the predicted class.
        Accuracy, weighted recall, weighted precision and F1 are printed
        below the table.

        Args:
            enum_model: Model whose stored predictions are evaluated.

        Raises:
            KeyError: If no predictions are stored for ``enum_model``.
        """
        predictions = self.predictions_models[enum_model]
        actual = F.col(self._LABEL_COL)
        predicted = F.col(self._PREDICTION_COL)

        def count_cell(actual_class, predicted_class):
            # Number of test rows falling into one cell of the matrix.
            return predictions.where(
                (actual == actual_class) & (predicted == predicted_class)
            ).count()

        true_pos = count_cell(1, 1)
        true_neg = count_cell(0, 0)
        false_pos = count_cell(0, 1)
        false_neg = count_cell(1, 0)

        evaluator = MulticlassClassificationEvaluator(
            labelCol=self._LABEL_COL, predictionCol=self._PREDICTION_COL)
        accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
        recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
        precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
        f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

        # Each inner list is one actual class, so the columns are the
        # predicted classes and are labelled as such. Four numbers do not
        # need a round trip through Spark, hence the plain pandas frame.
        df_cm = pd.DataFrame(
            [[true_neg, false_pos], [false_neg, true_pos]],
            index=["Actual: 0", "Actual: 1"],
            columns=["Predicted: 0", "Predicted: 1"],
        )

        capture_confusion_matrix_image(
            table_name=f'Confusion Matrix: {enum_model.value}',
            dataframe=df_cm,
            font_size=6,
            name_file=f"dm_confu_mat_{'_'.join([word.lower()[0:4] for word in enum_model.value.split(' ')])}",
            figure_size_width=4,
            figure_size_height=3,
            accuracy=accuracy,
            precision=precision,
            recall=recall,
            f1_score=f1,
        )

    def get_metrics(self, model_enum: AvailableModels):
        """Return accuracy, recall and precision for a fitted model.

        Raises:
            KeyError: If no predictions are stored for ``model_enum``.
        """
        predictions = self.predictions_models[model_enum]
        return {
            "accuracy": self.acc_evaluator.evaluate(predictions),
            "recall": self.rcl_evaluator.evaluate(predictions),
            "precision": self.prc_evaluator.evaluate(predictions),
        }

    def _save_feature_importance_table(self, feature_importance, table_name, name_file):
        """Write a feature-importance table image, highest importance first.

        Args:
            feature_importance: Mapping of feature name -> importance.
            table_name: Title of the table.
            name_file: Output file name without extension.
        """
        feature_importance_df = pd.DataFrame({
            'Feature': list(feature_importance.keys()),
            'Importance': list(feature_importance.values()),
        })
        feature_importance_df = feature_importance_df.round(5)
        feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
        capture_table_dataframe_image(
            dataframe=feature_importance_df,
            table_name=table_name,
            font_size=5,
            name_file=name_file,
            col_widths=[0.2, 1.2, 0.6],
            figure_size_height=3.5,
            figure_size_width=8,
            show_records=False
        )

    def _predictor_resume_logistic_regression(self, feature_importance):
        """Write the feature-importance image for logistic regression."""
        self._save_feature_importance_table(
            feature_importance,
            table_name='Feature Importance: Logistic Regresion',
            name_file='dm_featu_imp_logi_regr',
        )

    def _predictor_resume_gradient_boosting(self, feature_importance):
        """Write the feature-importance image for gradient boosting."""
        self._save_feature_importance_table(
            feature_importance,
            table_name='Feature Importance: Gradient Boosting',
            name_file='dm_featu_imp_grad_boost',
        )

    def _predictor_resume_decision_tree(self, feature_importance):
        """Write the feature-importance image for the decision tree."""
        # No extra savefig here: the table helper has already closed its
        # figure, so saving again would only write an empty image and leave
        # an empty figure open.
        self._save_feature_importance_table(
            feature_importance,
            table_name='Feature Importance: Decision Tree',
            name_file='dm_featu_imp_deci_tree',
        )

    def _predictor_resume_random_forest(self, feature_importance):
        """Write the feature-importance image for the random forest."""
        self._save_feature_importance_table(
            feature_importance,
            table_name='Feature Importance: Random Forest',
            name_file='dm_featu_imp_random_forest',
        )

    def _fit_tree_model(self, estimator, model_enum: AvailableModels):
        """Fit a tree-based classifier and store its test predictions.

        Args:
            estimator: Unfitted classifier whose fitted model exposes
                ``featureImportances``.
            model_enum: Key under which the predictions are stored.

        Returns:
            Dict of feature name -> importance rounded to 4 decimals,
            ordered from most to least important.
        """
        train_data, test_data, assembler = self._generate_split_dataset()
        fitted_model = estimator.fit(train_data)
        self.predictions_models[model_enum] = fitted_model.transform(test_data)

        ranked = sorted(
            zip(assembler.getInputCols(), fitted_model.featureImportances.toArray()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        # float() yields plain Python numbers, which print cleanly.
        return {feature: round(float(importance), 4) for feature, importance in ranked}

    def _fit_decision_tree(self) -> None:
        """Fit the decision tree and write its report images."""
        feature_importance = self._fit_tree_model(
            DecisionTreeClassifier(labelCol=self._LABEL_COL, featuresCol='features'),
            AvailableModels.DECISION_TREE,
        )
        print(feature_importance)

        self.generate_confusion_matrix_spark(
            enum_model=AvailableModels.DECISION_TREE
        )
        self._predictor_resume_decision_tree(feature_importance)

    def _fit_logistic_regression(self) -> None:
        """Fit logistic regression and write its report images.

        Unlike the tree models, rows containing nulls are dropped before the
        split and the assembler runs as a pipeline stage. The coefficients
        are reported as feature importance.
        """
        print("FIT LOGISTIC REGRESION")
        assembler = self._build_assembler()
        train_data_lg, test_data_lg = self.dataset.na.drop().randomSplit([0.7, .3], seed=self.seed)
        log_reg = LogisticRegression(featuresCol='features', labelCol=self._LABEL_COL)
        fit_model = Pipeline(stages=[assembler, log_reg]).fit(train_data_lg)

        self.predictions_models[AvailableModels.LOGISTIC_REGRESSION] = fit_model.transform(test_data_lg)

        # The last pipeline stage is the fitted logistic regression model.
        lr_model = fit_model.stages[-1]
        feature_importance = {
            feature: float(coefficient)
            for feature, coefficient in zip(self._get_features_columns_names(), lr_model.coefficients)
        }

        self._predictor_resume_logistic_regression(feature_importance)

        self.generate_confusion_matrix_spark(
            enum_model=AvailableModels.LOGISTIC_REGRESSION
        )

    def _fit_gradient_boosting(self) -> None:
        """Fit gradient-boosted trees and write their report images."""
        print("FIT GRADIENT BOOSTING")
        feature_importance = self._fit_tree_model(
            GBTClassifier(labelCol=self._LABEL_COL, featuresCol='features'),
            AvailableModels.GRADIENT_BOOSTING,
        )

        self.generate_confusion_matrix_spark(
            enum_model=AvailableModels.GRADIENT_BOOSTING
        )
        self._predictor_resume_gradient_boosting(feature_importance)

    def _fit_random_forest(self) -> None:
        """Fit the random forest and write its report images."""
        print("FIT RANDOM FOREST")
        feature_importance = self._fit_tree_model(
            RandomForestClassifier(labelCol=self._LABEL_COL, featuresCol='features'),
            AvailableModels.RANDOM_FOREST,
        )
        print(feature_importance)

        self._predictor_resume_random_forest(feature_importance)

        self.generate_confusion_matrix_spark(
            enum_model=AvailableModels.RANDOM_FOREST
        )

In [174]:
# Fit the four classifiers on DATASET in the order listed; each fit also
# writes its confusion-matrix and feature-importance images under PATH_IMAGES.
fitter = ModelFitter(
    models_to_fit=[
        AvailableModels.DECISION_TREE,
        AvailableModels.RANDOM_FOREST,
        AvailableModels.GRADIENT_BOOSTING,
        AvailableModels.LOGISTIC_REGRESSION,
    ],
    dataset=DATASET,
)
fitter.fit_models()

Fitting : Decision Tree...
{'pig': 0.3145, 'negative_affect': 0.1197, 'liters_of_pure_alcohol_per_capita': 0.1146, 'beef': 0.1015, 'fish_and_seafood': 0.0944, 'sheep_and_goat': 0.0751, 'poultry': 0.0651, 'social_support': 0.0622, 'prevalence_smoking': 0.0528, 'perceptions_of_corruption': 0.0, 'life_ladder': 0.0, 'positive_affect': 0.0, 'freedom_to_make_life_choices': 0.0, 'generosity': 0.0}
Fitting : Random Forest...
FIT RANDOM FOREST


AttributeError: 'NoneType' object has no attribute 'get'

<Figure size 432x288 with 0 Axes>