# Exploratory Data Analysis (EDA)
In questo notebook eseguiremo un'analisi esplorativa dei dati per capire meglio il dataset e le sue caratteristiche.

## Import delle librerie
Iniziamo importando le librerie necessarie per l'analisi dei dati.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

import sys
import os

# Add the directory two levels above the current working directory to the import path so that `src` can be imported
sys.path.append(os.getcwd() + os.sep + ".." + os.sep + "..")

import src.scripts.mapping_answers_dict as mp 

## Caricamento dei dati
Carichiamo il dataset e mostriamo le prime righe per esaminarne la struttura.

In [None]:
# Load the processed GYTS CSV (relative path) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="../../data/processed/GYTS_dataset.csv")
# Preview the first five rows; being the last expression of the cell, the notebook renders it.
dataset.head(n=5)

## Impostazione del tipo di colonna
Impostiamo il tipo di dato appropriato per le colonne categoriche e per quelle booleane.

In [None]:

# Columns to be stored with the pandas 'category' dtype
categorical_columns = [
    "State", "Gender", "Age", "SmokingFriends",
    "SeenSmokerInPublicPlace", "SeenSmokerInEnclosedPlace", "SeenSmokerInHome",
    "AttractiveSmoker", "HardQuitSmoke", "SmokerConfidentInCelebrations",
    "SchoolWarnings", "SeenHealthWarnings", "AntiTobaccoInEvents",
    "HarmfulPassiveSmoke",
]
for categorical_name in categorical_columns:
    dataset[categorical_name] = dataset[categorical_name].astype('category')

# Columns to be stored with the 'bool' dtype
boolean_columns = [
    "Smoke", "SeenSmokerInSchool", "ParentWarnings", "AntiTobaccoInMedia",
    "BanTobaccoOutdoors", "SmokingFather", "SmokingMother",
    "WorkingFather", "WorkingMother",
]
for boolean_name in boolean_columns:
    dataset[boolean_name] = dataset[boolean_name].astype('bool')

# All column names of the dataset, with the "Smoke" column moved to the first position
list_of_columns = list(dataset.columns)
list_of_columns.remove("Smoke")
list_of_columns.insert(0, "Smoke")

## Analisi univariata
Eseguiamo un'analisi univariata per analizzare la distribuzione delle variabili.

In [None]:
sns.set_theme(style="whitegrid", font_scale=1.2)

# Folder receiving one PNG per column; created up front so that savefig
# cannot fail on a missing directory.
univariate_dir = "../../data/processed/univariate_analysis"
os.makedirs(univariate_dir, exist_ok=True)

# Bar plots: one count plot per column, every bar annotated with its share of all rows
for column in list_of_columns:

    # When the mapping module lists this column, take the x-axis order from the
    # values of its "<name>_dict" attribute ("Smoke" is deliberately left unordered).
    # getattr replaces eval: same attribute lookup, without evaluating a built string.
    if column in mp.column_mappings_reverse and column != "Smoke":
        order = list(getattr(mp, f"{mp.column_mappings_reverse[column]}_dict").values())
    else:
        order = None

    plt.figure(figsize=(12, 8))
    ax = sns.countplot(x=column, data=dataset, palette='rainbow', zorder=10, order=order)

    # Label every container, not only the first one: depending on the seaborn
    # version, passing `palette` may split the bars across several containers.
    total_rows = len(dataset[column])
    for container in ax.containers:
        ax.bar_label(container, fmt=lambda count: f'{(count / total_rows) * 100:0.1f}%')

    # Tilt the tick labels of the two columns that get rotated x labels
    if column in ["SeenHealthWarnings", "Age"]:
        ax.set_xticklabels(ax.get_xticklabels(), rotation=25, ha="right")

    ax.set(title=f"{column} countplot")
    ax.title.set_fontsize(20)
    ax.title.set_fontname('Arial')
    ax.title.set_fontweight('bold')

    plt.tight_layout()
    plt.savefig(f"{univariate_dir}/{column}_countplot.png", dpi=600)
    plt.show()

## Analisi multivariata
Eseguiamo un'analisi multivariata per analizzare la relazione tra le variabili e la variabile target.

In [None]:
def _share_labels_by_category(ax):
    """Return, for each bar container of *ax*, the percentage labels of its bars.

    A bar's label is its height divided by the summed heights of all bars drawn
    at the same x category (one bar per hue level), formatted as a percentage.
    Bars are tied to their category through their position rather than through
    their height, so two groups with identical counts cannot receive each
    other's label, and combinations absent from the data cannot raise a lookup
    error.

    Returns:
        A list aligned with ``ax.containers``; each item is a list of label
        strings aligned with that container's patches. Bars with a NaN height
        or an empty category get an empty label.
    """
    # NOTE(review): assumes categories sit at integer x positions and dodged bars
    # stay within 0.5 of them (seaborn's default categorical axis) — confirm if
    # native_scale or a custom bar width is ever used.
    def category_index(patch):
        return round(patch.get_x() + patch.get_width() / 2)

    # First pass: total count per x category across all hue levels
    totals = {}
    for container in ax.containers:
        for patch in container.patches:
            height = patch.get_height()
            if pd.isna(height):
                continue
            index = category_index(patch)
            totals[index] = totals.get(index, 0) + height

    # Second pass: each bar's share of its category total
    labels_per_container = []
    for container in ax.containers:
        labels = []
        for patch in container.patches:
            height = patch.get_height()
            if pd.isna(height):
                labels.append("")
                continue
            total = totals.get(category_index(patch), 0)
            labels.append(f"{height / total:.1%}" if total else "")
        labels_per_container.append(labels)
    return labels_per_container


# Folder receiving one PNG per feature; created up front so that savefig
# cannot fail on a missing directory.
multivariate_dir = "../../data/processed/multivariate_analysis"
os.makedirs(multivariate_dir, exist_ok=True)

# Bar plots: one count plot per feature, split by the target column "Smoke"
for column in list_of_columns:
    if column == 'Smoke':
        continue
    feat = column
    hue = 'Smoke'

    # Relative and absolute frequencies of the target inside each feature group (printed for reference)
    normalized_proportions = dataset.groupby(feat, observed=True)[hue].value_counts(normalize=True)
    proportions = dataset.groupby(feat, observed=True)[hue].value_counts(normalize=False)
    print(normalized_proportions)
    print(proportions)

    # When the mapping module lists this feature, take the x-axis order from the
    # values of its "<name>_dict" attribute.
    # getattr replaces eval: same attribute lookup, without evaluating a built string.
    if feat in mp.column_mappings_reverse:
        order = list(getattr(mp, f"{mp.column_mappings_reverse[feat]}_dict").values())
    else:
        order = None

    plt.figure(figsize=(12, 8))
    ax = sns.countplot(x=feat, hue=hue, data=dataset, dodge=True, zorder=10, order=order)

    # Annotate every bar with its share within its feature group
    for container, labels in zip(ax.containers, _share_labels_by_category(ax)):
        print(labels)
        ax.bar_label(container, labels)

    ax.set(title=f"Countplot of smoker by {column}")
    # set title style
    ax.title.set_fontsize(20)
    ax.title.set_fontname('Arial')
    ax.title.set_fontweight('bold')

    # Tilt the tick labels of the two columns that get rotated x labels
    if column in ["SeenHealthWarnings", "Age"]:
        ax.set_xticklabels(ax.get_xticklabels(), rotation=25, ha="right")

    plt.tight_layout()
    plt.savefig(f"{multivariate_dir}/{column}_vs_Smoke_countplot.png", dpi=600)
    plt.show()


## Tabella delle associazioni
Creiamo una tabella delle associazioni per analizzare la relazione tra le variabili.

In [None]:
from scipy.stats.contingency import association

# Association score for every ordered pair of columns, computed from the pair's
# contingency table (pd.crosstab). Outer keys become the table's columns, inner keys its rows.
# NOTE(review): association() is called with its defaults — per the SciPy docs
# that is Cramér's V; confirm this is the intended statistic.
pairwise_scores = {
    col2: {
        col1: association(pd.crosstab(dataset[col1], dataset[col2]))
        for col1 in list_of_columns
    }
    for col2 in list_of_columns
}
association_table = pd.DataFrame(
    pairwise_scores, index=list_of_columns, columns=list_of_columns
).astype('float')

# Persist the raw scores next to the processed data
association_table.to_csv("../../data/processed/association_table.csv")

# Draw the scores as an annotated heatmap, save it and show it
plt.figure(figsize=(24, 18))
ax = sns.heatmap(data=association_table, annot=True, fmt=".2f")
plt.savefig("../../data/processed/association_table.png", dpi=600)
plt.show()

## Countplots delle associazioni >= 0.2
Visualizziamo i countplots delle coppie di variabili con un valore di associazione maggiore o uguale a 0.2 per analizzare le relazioni tra le variabili.

In [None]:
def _share_labels_by_category(ax):
    """Return, for each bar container of *ax*, the percentage labels of its bars.

    A bar's label is its height divided by the summed heights of all bars drawn
    at the same x category (one bar per hue level), formatted as a percentage.
    Bars are tied to their category through their position rather than through
    their height or their legend label, so groups with identical counts cannot
    receive each other's label, and combinations absent from the data (or hue
    levels whose label text differs from the stored value) cannot raise a
    lookup error.

    Returns:
        A list aligned with ``ax.containers``; each item is a list of label
        strings aligned with that container's patches. Bars with a NaN height
        or an empty category get an empty label.
    """
    # NOTE(review): assumes categories sit at integer x positions and dodged bars
    # stay within 0.5 of them (seaborn's default categorical axis) — confirm if
    # native_scale or a custom bar width is ever used.
    def category_index(patch):
        return round(patch.get_x() + patch.get_width() / 2)

    # First pass: total count per x category across all hue levels
    totals = {}
    for container in ax.containers:
        for patch in container.patches:
            height = patch.get_height()
            if pd.isna(height):
                continue
            index = category_index(patch)
            totals[index] = totals.get(index, 0) + height

    # Second pass: each bar's share of its category total
    labels_per_container = []
    for container in ax.containers:
        labels = []
        for patch in container.patches:
            height = patch.get_height()
            if pd.isna(height):
                labels.append("")
                continue
            total = totals.get(category_index(patch), 0)
            labels.append(f"{height / total:.1%}" if total else "")
        labels_per_container.append(labels)
    return labels_per_container


# Folder receiving one PNG per plotted pair; created up front so that savefig
# cannot fail on a missing directory.
associated_dir = "../../data/processed/multivariate_analysis/associated_variables"
os.makedirs(associated_dir, exist_ok=True)

# Visit each unordered pair of columns once (col2 always comes after col1).
# The pair indices are never reused inside the loop body, so the pair selection
# cannot be corrupted by inner loops.
for position, col1 in enumerate(list_of_columns):
    for col2 in list_of_columns[position + 1:]:
        # Keep only pairs with an association of at least 0.2 (NaN scores are skipped too)
        if not association_table.loc[col1, col2] >= 0.2:
            continue
        # Pairs involving the "Smoke" column are not plotted here
        if col1 == 'Smoke' or col2 == 'Smoke':
            continue
        feat = col1
        hue = col2

        # When the mapping module lists this feature, take the x-axis order from
        # the values of its "<name>_dict" attribute.
        # getattr replaces eval: same attribute lookup, without evaluating a built string.
        if feat in mp.column_mappings_reverse:
            order = list(getattr(mp, f"{mp.column_mappings_reverse[feat]}_dict").values())
        else:
            order = None

        plt.figure(figsize=(20, 8))
        ax = sns.countplot(x=feat, hue=hue, data=dataset, dodge=True, zorder=10, order=order)

        # Annotate every bar with its share within its feature group
        for container, labels in zip(ax.containers, _share_labels_by_category(ax)):
            ax.bar_label(container, labels)

        ax.set(title=f"Countplot of {col1} by {col2}")
        # set title style
        ax.title.set_fontsize(20)
        ax.title.set_fontname('Arial')
        ax.title.set_fontweight('bold')

        # Rotate the tick labels based on the feature actually on the x axis
        # (previously this tested a stale loop variable from another cell)
        if feat in ["SeenHealthWarnings", "Age"]:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=25, ha="right")
        # tight_layout() is used to avoid overlapping of the labels
        plt.tight_layout()
        plt.savefig(f"{associated_dir}/{col1}_vs_{col2}_countplot.png", dpi=600)
        plt.show()