In [1]:
from sklearn import svm
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
from tabulate import tabulate
import pandas_profiling
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import matplotlib.pylab as pylab
import warnings
from IPython.display import IFrame
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
%matplotlib inline
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation/convergence warnings raised by the libraries
# imported above -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Load the dataset from an Excel workbook; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_excel("../data/info_satisfaccion_trabajo.xlsx")

In [2]:
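# NOTE: the profiling code in this cell is commented out, so the progress output shown
# below it comes from an earlier run. Uncomment the lines to regenerate report.html.
# # Generate report using pandas-profiling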
# report = df.profile_report()

# # Save the report to an HTML file
# report.to_file("report.html")

# # View the report in the notebook
# IFrame(src='report.html', width=1000, height=600)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

The report shows that the dataset contains 1470 observations and 35 variables, with 15 being numeric, 3 boolean, and 17 categorical. No missing cells or duplicate rows were found in the dataset.

The report's Alerts section highlights some important characteristics of the dataset. It flags three constant variables (EmployeeCount, Over18, StandardHours), which carry no information for a model and can be dropped before training. It also reports high correlations between several numeric variables, such as Age with TotalWorkingYears, MonthlyIncome with TotalWorkingYears (and one other variable), and PercentSalaryHike with PerformanceRating, among others. High correlations are likewise reported between categorical variables, such as Department with EducationField and JobRole with Department.

Overall, the pandas_profiling report provides valuable information about the dataset characteristics, including the distribution and type of variables, as well as relationships and correlations between them. It also detects constant variables and missing or zero values. This information will be useful for developing the requested supervised learning models and for monitoring their performance in case they are deployed for recurrent use in the company.

In [3]:
def _iqr_outlier_rows(df, iqr_multiplier, min_fraction):
    """Return ``[column, "xx.xx%"]`` rows for numeric columns with many IQR outliers."""
    n_rows = len(df)
    rows = []
    if n_rows == 0:
        # No rows means no outlier share to compute (and avoids a 0/0 division).
        return rows
    for col in df.select_dtypes(include=np.number).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        # Caveat: when the IQR is 0 (e.g. a discrete column dominated by one value)
        # every value that differs from the quartiles is counted as an outlier.
        spread = iqr_multiplier * (q3 - q1)
        is_outlier = (df[col] < q1 - spread) | (df[col] > q3 + spread)
        fraction = is_outlier.sum() / n_rows
        if fraction > min_fraction:
            rows.append([col, f"{fraction:.2%}"])
    return rows


def _rare_level_rows(df, columns, distinct, rare_fraction):
    """Return ``[column, ""]`` rows for multi-level columns that contain a rare observed level."""
    rows = []
    for col in columns:
        if distinct[col] > 2:
            shares = df[col].value_counts(normalize=True)
            # For ``category`` dtype, value_counts() also lists declared-but-unobserved
            # categories with share 0; only levels that actually occur are considered.
            if (shares[shares > 0] < rare_fraction).any():
                rows.append([col, ""])
    return rows


def dataframe_summary(df, outlier_threshold=3, outlier_fraction=0.05, rare_level_fraction=0.05):
    """Print a data-quality overview of ``df`` as a two-column table.

    The table is rendered with ``tabulate`` and reports, in this order: the
    frame's shape, the total number of nulls, the number of duplicated rows,
    numeric columns with a large share of IQR outliers, multi-level categorical
    columns that contain a rare level, categorical columns with more than two
    levels, binary categorical columns, and columns with a single distinct value.

    Distinct values are counted with ``Series.nunique()`` everywhere, i.e. only
    observed, non-null values count. (``len(value_counts())`` would also count
    unobserved categories of a ``category`` column, so a constant categorical
    column with a spare declared category would be reported as binary.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    outlier_threshold : float, default 3
        A value is an outlier when it lies more than ``outlier_threshold`` times
        the IQR below the first quartile or above the third quartile.
    outlier_fraction : float, default 0.05
        A numeric column is listed when its share of outlier rows exceeds this.
    rare_level_fraction : float, default 0.05
        A categorical column with more than two levels is listed as imbalanced
        when any observed level has a relative frequency below this.

    Returns
    -------
    None
        The table is printed to stdout; nothing is returned.
    """
    n_rows, n_cols = df.shape
    summary = [["Number of rows and columns:", f"{n_rows} rows, {n_cols} columns"]]

    # Number of null values
    null_counts = df.isnull().sum()
    if null_counts.sum() > 0:
        null_cols = null_counts[null_counts > 0]
        summary.append(["Number of null values:", f"{null_counts.sum()} in total across {len(null_cols)} columns"])
    else:
        summary.append(["Number of null values:", "None"])

    # Number of duplicates
    num_duplicates = df.duplicated().sum()
    summary.append(["Number of duplicates:", f"{num_duplicates}" if num_duplicates > 0 else "None"])

    # Columns with many outliers
    outlier_rows = _iqr_outlier_rows(df, outlier_threshold, outlier_fraction)
    if outlier_rows:
        summary.append(["Columns with many outliers:", ""])
        summary.extend(outlier_rows)
        summary.append(["", ""])

    # Observed distinct values per column, computed once and reused by every check below.
    distinct = {col: df[col].nunique() for col in df.columns}
    cat_cols = df.select_dtypes(include=["object", "category"]).columns

    # Imbalanced columns
    imbalanced_rows = _rare_level_rows(df, cat_cols, distinct, rare_level_fraction)
    if imbalanced_rows:
        summary.append(["Imbalanced categorical columns:", ""])
        summary.extend(imbalanced_rows)
        summary.append(["", ""])

    # Categorical columns that need encoding (more than two levels)
    cat_cols_to_encode = [col for col in cat_cols if distinct[col] > 2]
    if cat_cols_to_encode:
        summary.append(["Categorical columns that need encoding for ML:", ""])
        summary.append([", ".join(cat_cols_to_encode), ""])
    else:
        summary.append(["All categorical columns are encoded for ML.", ""])
    summary.append(["", ""])

    # Binary columns that need encoding (exactly two levels, non-numeric dtype)
    binary_cols_to_encode = [
        col
        for col in df.columns
        if distinct[col] == 2 and (df[col].dtype == "object" or df[col].dtype.name == "category")
    ]
    if binary_cols_to_encode:
        summary.append(["Binary columns that need encoding for ML:", ""])
        summary.append([", ".join(binary_cols_to_encode), ""])
    else:
        summary.append(["All binary columns are encoded for ML.", ""])
    summary.append(["", ""])

    # Columns with only one unique value
    single_value_cols = [col for col in df.columns if distinct[col] == 1]
    if single_value_cols:
        summary.append(["Columns with only one unique value:", ""])
        summary.append([", ".join(single_value_cols), ""])

    # Print summary table
    print(tabulate(summary, headers=["Issue", "Details"], tablefmt="pretty"))

# Print the data-quality overview table for the dataset loaded into `df`.
dataframe_summary(df)

+--------------------------------------------------------------------+-----------------------+
|                               Issue                                |        Details        |
+--------------------------------------------------------------------+-----------------------+
|                    Number of rows and columns:                     | 1470 rows, 35 columns |
|                       Number of null values:                       |         None          |
|                       Number of duplicates:                        |         None          |
|                    Columns with many outliers:                     |                       |
|                         PerformanceRating                          |        15.37%         |
|                                                                    |                       |
|                  Imbalanced categorical columns:                   |                       |
|                             Department          