In [1]:


#########################
### EDA SCRIPT ###
#########################

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from statsmodels.graphics.mosaicplot import mosaic
import sweetviz as sv

# ======================
# CONFIGURATION
# ======================
class EDACfg:
    """Static settings read by the EDA functions in this script.

    The class is never instantiated; every value is accessed as a class
    attribute (for example ``EDACfg.TARGET``).
    """

    # --- Data locations ---
    # Pickled DataFrame loaded by the script entry point.
    INPUT_PATH = r"C:\Users\soulf\Downloads\archive\HR_Employee_Cleaned.pkl"
    # Root folder that receives plots, test results and the HTML report.
    OUTPUT_DIR = r"C:\Users\soulf\Downloads\archive\EDA_Reports"

    # --- Column roles ---
    # Name of the column treated as the target variable.
    TARGET = "Attrition"

    # --- Plot parameters ---
    STYLE = "whitegrid"  # seaborn style passed to sns.set
    PALETTE = "viridis"  # seaborn palette passed to sns.set
    FIG_SIZE = (14, 8)   # default matplotlib figure size (width, height)

    # --- Analysis parameters ---
    ALPHA = 0.05          # p-value cutoff used to flag a test as significant
    CORR_THRESHOLD = 0.6  # not referenced by the code visible in this cell

# ======================
# HELPER FUNCTIONS
# ======================
def setup_environment():
    """Apply the plotting defaults and make sure the output folders exist.

    Sets the seaborn style/palette and the default matplotlib figure size
    from ``EDACfg``, then creates the ``plots`` and ``stats_tests``
    sub-folders under ``EDACfg.OUTPUT_DIR`` (no error if already present).
    """
    sns.set(style=EDACfg.STYLE, palette=EDACfg.PALETTE)
    plt.rcParams["figure.figsize"] = EDACfg.FIG_SIZE

    for subfolder in ("plots", "stats_tests"):
        os.makedirs(os.path.join(EDACfg.OUTPUT_DIR, subfolder), exist_ok=True)

def save_plot(name, tight_layout=True):
    """Write the current matplotlib figure to ``<OUTPUT_DIR>/plots/<name>.png``.

    Args:
        name: File stem of the image (``.png`` is appended).
        tight_layout: When true, ``plt.tight_layout()`` is applied before
            saving.

    The current figure is closed after it has been written.
    """
    filename = "{}.png".format(name)
    destination = os.path.join(EDACfg.OUTPUT_DIR, "plots", filename)

    if tight_layout:
        plt.tight_layout()

    plt.savefig(destination, dpi=300, bbox_inches="tight")
    plt.close()

def target_analysis(df):
    """Plot and print the class distribution of the target column.

    Args:
        df: DataFrame containing the column named by ``EDACfg.TARGET``.

    Saves a count plot as ``target_distribution.png`` and prints the
    normalized value counts of the target.
    """
    target_name = EDACfg.TARGET
    print("\n=== TARGET ANALYSIS ===")

    # Count plot with a percentage label on every bar.
    plt.figure()
    count_ax = sns.countplot(x=target_name, data=df)
    plt.title("Target Variable Distribution")
    add_percent_labels(count_ax)
    save_plot("target_distribution")

    # Relative class frequencies, to show class imbalance.
    class_shares = df[target_name].value_counts(normalize=True)
    print(f"Class Distribution:\n{class_shares.to_string()}")

def add_percent_labels(ax):
    """Annotate every bar of ``ax`` with its share of the total bar height.

    Args:
        ax: Object exposing ``patches`` (each with ``get_height``,
            ``get_x`` and ``get_width``) and an ``annotate`` method, e.g. the
            Axes returned by ``sns.countplot``.

    Bars whose height is NaN/inf are ignored both in the total and when
    labelling. If the remaining heights sum to zero nothing is annotated,
    which avoids a division by zero on plots with only empty bars.
    """
    heights = [patch.get_height() for patch in ax.patches]
    total = sum(h for h in heights if np.isfinite(h))
    if not total:
        # No bars, or every bar is empty: a percentage is undefined.
        return

    for patch, height in zip(ax.patches, heights):
        if not np.isfinite(height):
            continue
        percentage = f"{100 * height / total:.1f}%"
        ax.annotate(percentage,
                    (patch.get_x() + patch.get_width() / 2, height),
                    ha='center', va='center', xytext=(0, 5),
                    textcoords='offset points')

def numerical_analysis(df, col):
    """Plot and summarize a single numerical column.

    Args:
        df: DataFrame holding the data.
        col: Name of the numerical column to analyze.

    Saves ``num_<col>_dist.png`` (histogram with KDE next to a boxplot) and
    ``num_<col>_qq.png`` (probability plot), and prints ``describe()``
    extended with skew and kurtosis.
    """
    print(f"\nAnalyzing {col}")
    series = df[col]

    # Distribution plots
    _, axes = plt.subplots(1, 2, figsize=(14, 6))
    sns.histplot(series, kde=True, ax=axes[0])
    axes[0].set_title(f"Distribution of {col}")
    sns.boxplot(x=series, ax=axes[1])
    axes[1].set_title(f"Boxplot of {col}")
    save_plot(f"num_{col}_dist")

    # Statistical summary.
    # The local must NOT be called `stats`: that would shadow the
    # scipy.stats module needed for the QQ-plot below.
    summary = series.describe().to_frame().T
    summary["skew"] = series.skew()
    summary["kurtosis"] = series.kurtosis()
    print(summary)

    # QQ-Plot (missing values are dropped so they cannot distort the fit)
    plt.figure()
    stats.probplot(series.dropna(), plot=plt)
    plt.title(f"QQ-Plot for {col}")
    save_plot(f"num_{col}_qq")

def categorical_analysis(df, col):
    """Plot a categorical column and its relationship to the target.

    Args:
        df: DataFrame holding the data; must also contain the column named
            by ``EDACfg.TARGET``.
        col: Name of the categorical column to analyze.

    Saves ``cat_<col>_dist.png`` (count plot ordered by frequency, with
    percentage labels) and ``cat_<col>_vs_target.png`` (stacked bars of the
    row-normalized crosstab of ``col`` against the target).
    """
    print(f"\nAnalyzing {col}")

    # Distribution plot
    plt.figure()
    ax = sns.countplot(x=col, data=df, order=df[col].value_counts().index)
    plt.xticks(rotation=45)
    add_percent_labels(ax)
    plt.title(f"Distribution of {col}")
    save_plot(f"cat_{col}_dist")

    # Target relationship.
    # DataFrame.plot opens its own figure unless it is handed an Axes, so
    # draw onto an explicit Axes; otherwise an empty figure would be left
    # open for every column (save_plot only closes the current one).
    _, target_ax = plt.subplots()
    ct = pd.crosstab(df[col], df[EDACfg.TARGET], normalize='index')
    ct.plot(kind='bar', stacked=True, ax=target_ax)
    target_ax.set_title(f"{col} vs {EDACfg.TARGET}")
    target_ax.set_ylabel("Percentage")
    save_plot(f"cat_{col}_vs_target")

def multivariate_analysis(df):
    """Produce the correlation heatmap, a pairplot and interaction plots.

    Args:
        df: DataFrame holding the data, including the ``EDACfg.TARGET``
            column.

    Saves ``correlation_matrix.png``, ``pairplot_key_features.png`` (the up
    to five numeric features most correlated with the target, in absolute
    value) and one ``interaction_<var1>_<var2>.png`` per configured pair.
    Steps whose required columns are unavailable are skipped with a message
    instead of raising.
    """
    print("\n=== MULTIVARIATE ANALYSIS ===")
    target = EDACfg.TARGET

    # Correlation matrix (lower triangle only; the upper half is redundant)
    corr = df.corr(numeric_only=True)
    mask = np.triu(np.ones_like(corr, dtype=bool))
    plt.figure(figsize=(20, 16))
    sns.heatmap(corr.round(2), mask=mask, cmap="coolwarm", annot=True)
    plt.title("Feature Correlation Matrix")
    save_plot("correlation_matrix")

    # Pairplot for key features
    if target in corr.columns:
        # Drop the target's own entry: its self-correlation of 1.0 would
        # otherwise always occupy the first "key feature" slot.
        key_features = (
            corr[target]
            .drop(labels=target)
            .dropna()
            .abs()
            .sort_values(ascending=False)
            .index[:5]
            .tolist()
        )
        if key_features:
            sns.pairplot(df, vars=key_features, hue=target)
            save_plot("pairplot_key_features")
    else:
        print(f"Skipping pairplot: {target} has no numeric correlation column")

    # Interaction analysis
    interactions = [
        ('MonthlyIncome', 'JobLevel'),
        ('Age', 'YearsAtCompany')
    ]
    for var1, var2 in interactions:
        missing = [name for name in (var1, var2, target) if name not in df.columns]
        if missing:
            print(f"Skipping interaction {var1} vs {var2}: missing column(s) {missing}")
            continue
        # lmplot is figure-level and creates its own figure, so no
        # plt.figure() beforehand (that would leak an empty figure).
        sns.lmplot(x=var1, y=var2, hue=target, data=df, height=8)
        plt.title(f"{var1} vs {var2} by {target}")
        save_plot(f"interaction_{var1}_{var2}")

def statistical_testing(df):
    """Test every feature for association with the target and save a table.

    Args:
        df: DataFrame holding the data, including the ``EDACfg.TARGET``
            column. Numeric features are split on target values ``1`` and
            ``0``.

    Numeric columns are compared between the two target groups with a
    two-sided Mann-Whitney U test; all other columns use a chi-square test
    on their contingency table with the target. Missing values are dropped
    per feature, and a feature is skipped (with a message) when a group or
    the contingency table is empty. Results are printed and written to
    ``<OUTPUT_DIR>/stats_tests/significance_results.csv``.
    """
    print("\n=== STATISTICAL TESTING ===")
    target = df[EDACfg.TARGET]

    results = []
    for col in df.columns:
        if col == EDACfg.TARGET:
            continue

        # Numerical features
        if pd.api.types.is_numeric_dtype(df[col]):
            # Drop NaNs: the test's default NaN handling would otherwise
            # return nan for both the statistic and the p-value.
            group1 = df.loc[target == 1, col].dropna()
            group0 = df.loc[target == 0, col].dropna()
            if group1.empty or group0.empty:
                print(f"Skipping {col}: a target group has no observations")
                continue
            stat, p = stats.mannwhitneyu(group1, group0, alternative="two-sided")
            test_type = "Mann-Whitney U"

        # Categorical features
        else:
            contingency = pd.crosstab(df[col], target)
            if contingency.empty:
                print(f"Skipping {col}: empty contingency table")
                continue
            stat, p, _, _ = stats.chi2_contingency(contingency)
            test_type = "Chi-Square"

        results.append({
            "feature": col,
            "test": test_type,
            "statistic": stat,
            "p_value": p,
            "significant": bool(p < EDACfg.ALPHA)
        })

    # Save results (create the folder in case setup_environment was not run)
    out_dir = os.path.join(EDACfg.OUTPUT_DIR, "stats_tests")
    os.makedirs(out_dir, exist_ok=True)
    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(out_dir, "significance_results.csv"), index=False)
    print(results_df)

def generate_sweetviz_report(df):
    """Build a Sweetviz report for ``df`` and write it as HTML.

    Args:
        df: DataFrame to analyze; ``EDACfg.TARGET`` is passed to Sweetviz
            as the target feature.

    The report is written to ``<OUTPUT_DIR>/sweetviz_report.html``.
    """
    html_path = os.path.join(EDACfg.OUTPUT_DIR, "sweetviz_report.html")
    analysis = sv.analyze(df, target_feat=EDACfg.TARGET)
    analysis.show_html(html_path)

if __name__ == "__main__":
    setup_environment()
    # NOTE(security): read_pickle can execute arbitrary code while loading.
    # Only point INPUT_PATH at files produced by a trusted pipeline.
    df = pd.read_pickle(EDACfg.INPUT_PATH)

    # Automated Analysis
    target_analysis(df)

    # The target has its own analysis above, so keep it out of the feature
    # lists. Object and bool columns are analyzed as categorical as well,
    # not only columns with the pandas "category" dtype.
    numerical_cols = [
        col for col in df.select_dtypes(include=np.number).columns
        if col != EDACfg.TARGET
    ]
    categorical_cols = [
        col for col in df.select_dtypes(include=["category", "object", "bool"]).columns
        if col != EDACfg.TARGET
    ]

    for col in numerical_cols:
        numerical_analysis(df, col)

    for col in categorical_cols:
        categorical_analysis(df, col)

    multivariate_analysis(df)
    statistical_testing(df)
    generate_sweetviz_report(df)

    print(f"✅ EDA complete. Results saved to {EDACfg.OUTPUT_DIR}")

=== DATASET OVERVIEW ===
Shape: (699, 31)

Data Types:
 Age                         float64
Attrition                     int64
BusinessTravel                int64
DailyRate                   float64
Department                    int64
DistanceFromHome            float64
Education                   float64
EducationField                int64
EnvironmentSatisfaction     float64
Gender                        int64
HourlyRate                  float64
JobInvolvement              float64
JobLevel                    float64
JobRole                       int64
JobSatisfaction             float64
MaritalStatus                 int64
MonthlyIncome               float64
MonthlyRate                 float64
NumCompaniesWorked          float64
OverTime                      int64
PercentSalaryHike           float64
PerformanceRating           float64
RelationshipSatisfaction    float64
StockOptionLevel            float64
TotalWorkingYears           float64
TrainingTimesLastYear       float64
WorkLife


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x="Attrition", data=df, palette="coolwarm")



Attrition Percentage:
 Attrition
0    82.546495
1    17.453505
Name: proportion, dtype: float64

=== UNIVARIATE ANALYSIS ===

Age Statistics:
     count          mean       std       min       25%       50%       75%       max  skewness  kurtosis
Age  699.0  2.744586e-16  1.000716 -2.001782 -0.698505 -0.106106  0.604772  2.974367  0.598466  0.173061

Attrition Statistics:
           count      mean       std  min  25%  50%  75%  max  skewness  kurtosis
Attrition  699.0  0.174535  0.379841  0.0  0.0  0.0  0.0  1.0  1.714919  0.940947

BusinessTravel Statistics:
                count      mean       std  min  25%  50%  75%  max  skewness  kurtosis
BusinessTravel  699.0  1.615165  0.657205  0.0  1.0  2.0  2.0  2.0 -1.460169  0.789567

DailyRate Statistics:
           count          mean       std       min      25%       50%       75%      max  skewness  kurtosis
DailyRate  699.0  1.219816e-16  1.000716 -1.767233 -0.84438  0.010763  0.882206  1.73108 -0.042812 -1.175852

Department Stati


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



Age vs Attrition - t-stat: nan, p-value: nan

Attrition vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



BusinessTravel vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



DailyRate vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



Department vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



DistanceFromHome vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



Education vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



EducationField vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



EnvironmentSatisfaction vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



Gender vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



HourlyRate vs Attrition - t-stat: nan, p-value: nan

JobInvolvement vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)



JobLevel vs Attrition - t-stat: nan, p-value: nan

JobRole vs Attrition - t-stat: nan, p-value: nan



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)



JobSatisfaction vs Attrition - t-stat: nan, p-value: nan



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



MaritalStatus vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



MonthlyIncome vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



MonthlyRate vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



NumCompaniesWorked vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



OverTime vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)



PercentSalaryHike vs Attrition - t-stat: nan, p-value: nan

PerformanceRating vs Attrition - t-stat: nan, p-value: nan



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)



RelationshipSatisfaction vs Attrition - t-stat: nan, p-value: nan

StockOptionLevel vs Attrition - t-stat: nan, p-value: nan



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



TotalWorkingYears vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



TrainingTimesLastYear vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



WorkLifeBalance vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



YearsAtCompany vs Attrition - t-stat: nan, p-value: nan

YearsInCurrentRole vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")



YearsSinceLastPromotion vs Attrition - t-stat: nan, p-value: nan


  return f(*args, **kwargs)



YearsWithCurrManager vs Attrition - t-stat: nan, p-value: nan

=== MULTIVARIATE ANALYSIS ===

=== FEATURE ENGINEERING ===
Created feature: WorkLife_Score = WorkLifeBalance * JobSatisfaction

Enhanced dataset saved to: C:\Users\soulf\Downloads\archive\eda_results\HR_Employee_Enhanced_EDA.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=target, y=col, data=df, palette="coolwarm")
