# Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# FUNÇÕES DE AJUDA

# Identificar Var Numéricas e Categóricas

In [2]:
def get_feature_types(df, target_col='Tipo de Ataque'):
    """Split the columns of ``df`` into numeric and categorical feature names.

    Args:
        df: DataFrame whose columns are classified by dtype.
        target_col: Label of the target column; it is left out of both lists.

    Returns:
        A ``(numeric_features, categorical_features)`` tuple of lists. Numeric
        columns are those matching ``np.number``; every other column is
        reported as categorical.
    """
    dtype_selections = (
        df.select_dtypes(include=[np.number]),
        df.select_dtypes(exclude=[np.number]),
    )

    feature_lists = []
    for selection in dtype_selections:
        names = selection.columns.tolist()
        # The target is not a feature: drop it from whichever list holds it.
        try:
            names.remove(target_col)
        except ValueError:
            pass
        feature_lists.append(names)

    numeric_features, categorical_features = feature_lists
    return numeric_features, categorical_features

# Analisar Correlação Entre Features Numéricas

In [3]:
def correlation_analysis(df, numeric_features, threshold=0.85):
    """Plot a correlation heatmap and list the strongly correlated feature pairs.

    Args:
        df: DataFrame holding the columns named in ``numeric_features``.
        numeric_features: Column labels whose pairwise correlation is computed
            (with the defaults of ``DataFrame.corr``).
        threshold: Cut-off on the absolute correlation; only pairs strictly
            above it are reported. Defaults to 0.85 (strong correlation).

    Returns:
        A list of ``(feature_a, feature_b, correlation)`` tuples in row-major
        order of the correlation matrix. Each unordered pair appears once and
        self-correlations are excluded.
    """
    corr_matrix = df[numeric_features].corr()

    plt.figure(figsize=(20, 20))
    # ``linewidths`` is the keyword seaborn documents for the cell separators.
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, linewidths=0.5)

    plt.title('Correlação entre Features - Heatmap')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.show()

    rows, cols = np.where(np.abs(corr_matrix.to_numpy()) > threshold)

    # Keeping only the upper triangle (row < col) drops the diagonal and the
    # mirrored duplicate of every pair in one condition.
    return [
        (corr_matrix.index[row], corr_matrix.columns[col], corr_matrix.iloc[row, col])
        for row, col in zip(rows, cols)
        if row < col
    ]


# Análise de Homogeneidade da Variância (Teste de Levene)

In [4]:
def analyze_variance_homogen(df, numeric_features, target_col='Tipo de Ataque'):
    """Run Levene's test for homogeneity of variance on each numeric feature.

    For every feature, the rows of ``df`` are grouped by ``target_col`` and the
    per-class samples (missing values dropped) are compared with
    ``scipy.stats.levene``. Classes whose sample is empty or has zero variance
    are left out of the test; a feature with fewer than two usable classes is
    reported on stdout and skipped.

    Args:
        df: DataFrame containing the features and the target column.
        numeric_features: Column labels to test.
        target_col: Column that defines the groups being compared.

    Returns:
        A dict mapping each tested feature to
        ``{'Statistic': <Levene statistic>, 'p-value': <p-value>}``.
    """
    results_levene = {}

    # The grouping does not depend on the feature, so build it once.
    grouped = df.groupby(target_col)

    for feature in numeric_features:
        samples = (group[feature].dropna().values for _, group in grouped)

        # A strictly positive variance already implies the sample is not
        # constant (and so not all zeros); the length check only avoids
        # calling np.var on an empty array.
        groups = [
            values for values in samples
            if len(values) > 0 and np.var(values) > 0
        ]

        if len(groups) < 2:
            print(f"Não existem grupos válidos para fazer o teste de Levene para a feature: {feature}")
            continue

        stat_levene, p_value_levene = stats.levene(*groups)
        results_levene[feature] = {'Statistic': stat_levene, 'p-value': p_value_levene}

    return results_levene



# Análise da Importância das Features com Teste Kruskal-Wallis

In [5]:
def analyze_feature_importance(df, numeric_features, target_col='Tipo de Ataque'):
    """Rank numeric features by their Kruskal-Wallis H statistic across classes.

    For every feature, the rows of ``df`` are grouped by ``target_col`` and the
    per-class samples (missing values dropped) are passed to
    ``scipy.stats.kruskal``. Features for which the test raises ``ValueError``
    are reported on stdout and skipped, so one degenerate column does not
    abort the whole analysis. The scores are also shown as a bar chart.

    Args:
        df: DataFrame containing the features and the target column.
        numeric_features: Column labels to score.
        target_col: Column that defines the groups being compared.

    Returns:
        A DataFrame indexed by feature with columns ``'H-Statistic'`` and
        ``'p-value'``, sorted by ``'H-Statistic'`` in descending order. It is
        empty (but keeps both columns) when no feature could be scored.
    """
    h_scores = {}

    # The grouping does not depend on the feature, so build it once.
    grouped = df.groupby(target_col)

    for feature in numeric_features:
        groups = [group[feature].dropna().values for _, group in grouped]
        try:
            h_stat, p_value = stats.kruskal(*groups)
        except ValueError as err:
            print(f"Teste de Kruskal-Wallis ignorado para a feature {feature}: {err}")
            continue
        h_scores[feature] = {'H-Statistic': h_stat, 'p-value': p_value}

    # Naming the columns explicitly keeps the sort below valid even when
    # every feature was skipped and ``h_scores`` is empty.
    h_scores_df = pd.DataFrame.from_dict(
        h_scores, orient='index', columns=['H-Statistic', 'p-value']
    )
    h_scores_df = h_scores_df.sort_values('H-Statistic', ascending=False)

    plt.figure(figsize=(18, 10))
    plt.bar(range(len(h_scores_df)), h_scores_df['H-Statistic'], color='skyblue')
    plt.xticks(range(len(h_scores_df)), h_scores_df.index, rotation=90)
    plt.title('Importância de Features - Teste de Kruskal-Wallis')
    plt.xlabel('Features')
    plt.ylabel('H-Statistic')
    plt.show()

    return h_scores_df

# Análise da Importância das Features com Random Forest

In [6]:
def analyze_feature_importance_rf(df, numeric_features, target_col='Tipo de Ataque'):
    """Estimate feature importance with a Random Forest classifier.

    The data is split 70/30 (stratified on the target), a forest is fitted on
    the training part, 5-fold cross-validation scores are computed on the same
    training part, and the held-out part is used for the confusion matrix and
    the printed classification report. The importances are also shown as a
    bar chart.

    NOTE(review): ``train_test_split``, ``RandomForestClassifier``,
    ``cross_val_score``, ``confusion_matrix`` and ``classification_report``
    are expected to be in scope (presumably from scikit-learn); they are not
    imported in the visible part of this file - confirm.

    Args:
        df: DataFrame containing the features and the target column.
        numeric_features: Column labels used as model inputs.
        target_col: Column holding the class labels.

    Returns:
        A tuple ``(feature_importance_df, cm, rf_labels, cv_scores)``:
        the importances sorted in descending order (columns ``'Feature'`` and
        ``'Importance'``), the confusion matrix on the held-out split with
        rows/columns ordered as ``rf_labels``, the class labels of the fitted
        forest, and the cross-validation scores.
    """
    hyper_params = {
        'n_estimators': 150,  # number of trees
        'max_depth': 5,  # cap on tree depth
        'random_state': 42,  # reproducibility
        'n_jobs': -1  # use all available cores
    }

    X = df[numeric_features]
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=hyper_params['random_state'], stratify=y
    )

    rf = RandomForestClassifier(**hyper_params)
    rf.fit(X_train, y_train)

    cv_scores = cross_val_score(rf, X_train, y_train, cv=5, n_jobs=-1)
    print(f"Cross-Validation Scores: {np.mean(cv_scores):.4f} +/- {np.std(cv_scores):.4f}")

    y_pred = rf.predict(X_test)

    feature_importance_df = pd.DataFrame(
        {'Feature': numeric_features, 'Importance': rf.feature_importances_}
    )
    feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

    rf_labels = rf.classes_

    # Pin the label order so the matrix and the report always line up with the
    # returned ``rf_labels``, even if a class is absent from the held-out split.
    cm = confusion_matrix(y_test, y_pred, labels=rf_labels)

    # Display names must be strings; converting keeps non-string labels working.
    report = classification_report(
        y_test,
        y_pred,
        labels=rf_labels,
        target_names=[str(label) for label in rf_labels],
    )
    print("\nClassification Report:\n")
    print(report, end='\n\n')

    plt.figure(figsize=(18, 12))
    plt.bar(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
    plt.ylabel('Importância')
    plt.xlabel('Features')
    plt.title('Importância de Features - Random Forest')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

    return feature_importance_df, cm, rf_labels, cv_scores


# Cálculo da Percentagem de Outliers para Cada Feature com IQR

In [7]:
def calculate_outliers_percentage(df):
    """Compute the percentage of IQR outliers in each numeric column.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. Missing values never count as outliers, but they do
    count towards the row total used as the denominator.

    Args:
        df: DataFrame to inspect. Non-numeric columns are skipped because
            quantiles are not defined for them.

    Returns:
        A dict mapping each numeric column label to its outlier percentage
        (a float between 0 and 100). A frame with no rows yields 0.0 for
        every numeric column.
    """
    n_rows = len(df)
    outliers_percentage = {}

    for column in df.select_dtypes(include=[np.number]).columns:
        if n_rows == 0:
            # Avoid dividing by zero: no rows means no outliers.
            outliers_percentage[column] = 0.0
            continue

        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (values < lower_bound) | (values > upper_bound)
        outliers_percentage[column] = int(is_outlier.sum()) / n_rows * 100

    return outliers_percentage