### 1. Import Libraries and Load Data

In [217]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import warnings
warnings.filterwarnings('ignore')

# Preprocessing libraries
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix

# Statistical libraries
from scipy import stats
from scipy.stats import zscore, skew

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

Libraries imported successfully!


In [218]:
file_path = 'https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/EasyVisa%20(1).csv'

### 2. EDA-Based Data Quality Assessment

In [219]:
def load_check_data_quality(file_path):
    """
    Loads dataset, performs basic data quality check
    
    Steps:
    1. Load CSV file.
    2. check missing value.
    3. check for duplicate.
    4. Check skewness for variables identified in EDA as right-skewed.
  
    """

    # === 1. Load dataset ===
    try:
        df = pd.read_csv(file_path)
        print(f"Dataset loaded successfully: {file_path}")
        print(f"Shape: {df.shape[0]} rows Ã— {df.shape[1]} columns\n")
    except Exception as e:
        print(f"Error loading file: {e}")
        return None

    # === 1. Missing Values ===
    print("\n=== MISSING VALUE PERCENTAGES ===")
    missing = df.isnull().mean() * 100
    print(missing[missing > 0].sort_values(ascending=False))

    # 2. Check for duplicates
    print("\n2. Duplicate Rows:")
    duplicates = df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicates}")
    if duplicates > 0:
        print(f"Percentage of duplicates: {(duplicates/len(df))*100:.2f}%")

    # 3. Check skewness for variables identified in EDA as right-skewed
    print("\n3. Skewness Analysis (EDA identified right-skewed variables):")
    skewed_vars = ['no_of_employees', 'yr_of_estab', 'prevailing_wage ']
    for var in skewed_vars:
         if var in df.columns:
            skewness = skew(df[var])
         print(f"{var}: skewness = {skewness:.3f} "
         f"({'strongly skewed' if abs(skewness) > 0.7 else 'moderately skewed' if abs(skewness) > 0.3 else 'approximately normal'})")

   

    return df


In [220]:
df = load_check_data_quality(file_path)

Dataset loaded successfully: https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/EasyVisa%20(1).csv
Shape: 25480 rows Ã— 12 columns


=== MISSING VALUE PERCENTAGES ===
Series([], dtype: float64)

2. Duplicate Rows:
Number of duplicate rows: 0

3. Skewness Analysis (EDA identified right-skewed variables):
no_of_employees: skewness = 12.265 (strongly skewed)
yr_of_estab: skewness = -2.037 (strongly skewed)
prevailing_wage : skewness = -2.037 (strongly skewed)


### 3. Data Cleaning

In [221]:
def clean_employee_data(df):
    """
    Data Cleaning
    ---------------------
    Handles duplicates, missing values, inconsistencies, and outliers.
    """
    # Remove duplicates
    df = df.drop_duplicates()

    # Handle missing values
    print("\n=== HANDLING MISSING VALUES ===")

    # lets iomport the imputer library
    from sklearn.impute import SimpleImputer
    
        # Prints and returns all categorical columns in the DataFrame.
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    print(f"Numerical Features:\n{numerical_features}")

    # Prints and returns all categorical columns in the DataFrame.
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    print(f"Categorical Features:\n{categorical_features}")

    # Lets create an instance of the imputer class using "median" as strategy for imputation
    num_imputer = SimpleImputer(strategy="median")

    # Lets create an instance of the imputer class using "most_frequent" as strategy for imputation
    cat_imputer = SimpleImputer(strategy="most_frequent")

        # Lets apply the defined instances above
    df[numerical_features] = num_imputer.fit_transform(df[numerical_features])
    df[categorical_features] = cat_imputer.fit_transform(df[categorical_features])
    
    # Correct inconsistencies
    text_cols = df.select_dtypes(include=['object']).columns
    for col in text_cols:
        df[col] = df[col].str.strip().str.title()

    # Handle outliers (IQR method)
    # Outlier treatment based on EDA recommendations
    print("=== OUTLIER TREATMENT (IQR-CAPPING METHOD) ===")
    print("EDA recommended IQR-capping for  to preserve data points")

    # Define numerical columns (excluding target)
    numerical_cols =  ['no_of_employees', 'yr_of_estab', 'prevailing_wage']
    if 'case_status' in numerical_cols:
        numerical_cols.remove('case_status')

    print(f"Treating outliers in {len(numerical_cols)} numerical features...")

    # Apply IQR-capping method
    outliers_capped = 0
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Count outliers before capping
        outliers_before = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
        
        if outliers_before > 0:
            # Cap outliers
            df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
            df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])
            outliers_capped += outliers_before
            print(f"âœ“ {col}: Capped {outliers_before} outliers")

    print(f"\nTotal outliers capped: {outliers_capped}")
    print(f"Dataset shape after outlier treatment: {df.shape}")

    print("Data cleaning complete.")

    return df


In [222]:
df = clean_employee_data(df)


=== HANDLING MISSING VALUES ===
Numerical Features:
['no_of_employees', 'yr_of_estab', 'prevailing_wage']
Categorical Features:
['case_id', 'continent', 'education_of_employee', 'has_job_experience', 'requires_job_training', 'region_of_employment', 'unit_of_wage', 'full_time_position', 'case_status']
=== OUTLIER TREATMENT (IQR-CAPPING METHOD) ===
EDA recommended IQR-capping for  to preserve data points
Treating outliers in 3 numerical features...
âœ“ no_of_employees: Capped 1556 outliers
âœ“ yr_of_estab: Capped 3260 outliers
âœ“ prevailing_wage: Capped 427 outliers

Total outliers capped: 5243
Dataset shape after outlier treatment: (25480, 12)
Data cleaning complete.


### 4. Encoding(like label encoding and one-hot encoding)

In [223]:
def transform_employee_data(df):
    """
    Step 2: Transformation (Encoding)
    ---------------------------------
    Converts categorical variables into numeric using Label & One-Hot encoding.
    """
   # Label Encoding (binary columns)
    label_map = {
        'has_job_experience': {'Y': 1, 'N': 0},
        'requires_job_training': {'Y': 1, 'N': 0},
        'full_time_position': {'Y': 1, 'N': 0},
        'case_status': {'Certified': 1, 'Denied': 0}
    }

    for col, mapping in label_map.items():
        if col in df.columns:
            df[col] = df[col].map(mapping)

    # One-hot Encoding (multi-category columns)
    onehot_cols = [
        'continent',
        'education_of_employee',
        'region_of_employment',
        'unit_of_wage'
    ]

    df = pd.get_dummies(df, columns=onehot_cols, drop_first=False, dtype=int)

    print("Transformation (encoding) complete.")
    return df


In [224]:
df_processed = transform_employee_data(df)
print(df_processed.head())

Transformation (encoding) complete.
  case_id  has_job_experience  requires_job_training  no_of_employees  \
0  Ezyv01                   0                      0           7227.0   
1  Ezyv02                   1                      0           2412.0   
2  Ezyv03                   0                      1           7227.0   
3  Ezyv04                   0                      0             98.0   
4  Ezyv05                   1                      0           1082.0   

   yr_of_estab  prevailing_wage  full_time_position  case_status  \
0       2007.0         592.2029                   1            0   
1       2002.0       83425.6500                   1            1   
2       2008.0      122996.8600                   1            0   
3       1932.5       83434.0300                   1            0   
4       2005.0      149907.3900                   1            1   

   continent_Africa  continent_Asia  ...  education_of_employee_Master'S  \
0                 0               1  ...

### 5. Log-Transform Skewed Variables (EDA Recommendation)

In [225]:
import numpy as np
from scipy.stats import skew

def normalize_employee_data(df_processed):
    """
    Step: Normalization
    ---------------------
    Safely applies log transformation to skewed numeric features.
    Handles zeros, negatives, and NaNs automatically.
    """
    print("=== LOG-TRANSFORMING SKEWED VARIABLES ===")
    print("EDA identified these variables as right-skewed and recommended log transformation:")

    # Variables identified as skewed from EDA
    skewed_vars = ['no_of_employees', 'yr_of_estab', 'prevailing_wage']

    for var in skewed_vars:
        if var in df_processed.columns:
            # Ensure numeric
            if not np.issubdtype(df_processed[var].dtype, np.number):
                print(f"Skipping {var}: non-numeric column.")
                continue

            # Handle NaN safely
            df[var] = df_processed[var].fillna(0)

            # Check for zeros/negatives
            min_val = df_processed[var].min()
            if min_val <= 0:
                shift = abs(min_val) + 1
                df_processed[f'{var}_log'] = np.log1p(df_processed[var] + shift)
                print(f"âœ“ {var}: Shifted by {shift:.2f} and applied log1p (min={min_val:.3f})")
            else:
                df_processed[f'{var}_log'] = np.log1p(df_processed[var])
                print(f"âœ“ {var}: Applied log1p transformation (all positive values)")

            # Compute skewness safely
            try:
                original_skew = skew(df_processed[var], nan_policy='omit')
                transformed_skew = skew(df_processed[f'{var}_log'], nan_policy='omit')
                print(f"  Original skewness: {original_skew:.3f} â†’ Transformed skewness: {transformed_skew:.3f}")
            except Exception as e:
                print(f"Skewness computation failed for {var}: {e}")

    print(f"\nDataset shape after log transformation: {df_processed.shape}")
    print("New log-transformed columns:", [col for col in df_processed.columns if '_log' in col])
    print("Normalization (log transform) complete.\n")

    return df_processed


In [226]:
df_processed = normalize_employee_data(df_processed)

=== LOG-TRANSFORMING SKEWED VARIABLES ===
EDA identified these variables as right-skewed and recommended log transformation:
âœ“ no_of_employees: Shifted by 27.00 and applied log1p (min=-26.000)
  Original skewness: 0.959 â†’ Transformed skewness: -1.145
âœ“ yr_of_estab: Applied log1p transformation (all positive values)
  Original skewness: -1.111 â†’ Transformed skewness: -1.124
âœ“ prevailing_wage: Applied log1p transformation (all positive values)
  Original skewness: 0.547 â†’ Transformed skewness: -2.136

Dataset shape after log transformation: (25480, 30)
New log-transformed columns: ['no_of_employees_log', 'yr_of_estab_log', 'prevailing_wage_log']
Normalization (log transform) complete.



### 6. Feature Engineering

In [227]:
def enhance_employee_data(df_processed):
    """
    Step 4: Feature Engineering
    ---------------------------
    Creates new, meaningful derived features.
    """
    df_processed['wage_per_employee'] = df_processed['prevailing_wage'] / (df_processed['no_of_employees'] + 1)
    df_processed['firm_age'] = 2025 - df_processed['yr_of_estab']
    df['experience_training'] = df['has_job_experience'] * df_processed['requires_job_training']

    print("Feature engineering complete.")
    print(f"\nDataset shape after feature engineering: {df_processed.shape}")
    print(f"New engineered features: {[col for col in df_processed.columns if col not in df.columns]}")

    return df_processed


In [228]:
df_processed = enhance_employee_data(df_processed)
#df.columns

Feature engineering complete.

Dataset shape after feature engineering: (25480, 32)
New engineered features: ['continent_Africa', 'continent_Asia', 'continent_Europe', 'continent_North America', 'continent_Oceania', 'continent_South America', "education_of_employee_Bachelor'S", 'education_of_employee_Doctorate', 'education_of_employee_High School', "education_of_employee_Master'S", 'region_of_employment_Island', 'region_of_employment_Midwest', 'region_of_employment_Northeast', 'region_of_employment_South', 'region_of_employment_West', 'unit_of_wage_Hour', 'unit_of_wage_Month', 'unit_of_wage_Week', 'unit_of_wage_Year', 'no_of_employees_log', 'yr_of_estab_log', 'prevailing_wage_log', 'wage_per_employee', 'firm_age']


### 7. Feature Selection

In [229]:
def feature_selection_by_significance(df, p_threshold=0.05):
    """
    Automatically keeps only statistically significant features (p < p_threshold)
    based on Mannâ€“Whitney U (numeric) and Chi-square (categorical) tests.
    Assumes 'case_status' is binary-encoded: 1 = Certified, 0 = Denied.
    """

    from scipy.stats import mannwhitneyu, chi2_contingency

    # Identify feature types
    numeric_features = df.select_dtypes(include=['float64', 'int64']).columns.drop('case_status', errors='ignore')
    categorical_features = df.select_dtypes(include=['object', 'category']).columns

    significant_features = []

    print("=== FEATURE SELECTION BASED ON STATISTICAL SIGNIFICANCE ===")

    # ---------- Numeric Features ----------
    print("\nðŸ”¹ Numeric Features (Mannâ€“Whitney U Test)")
    for feature in numeric_features:
        group_cert = df[df['case_status'] == 1][feature].dropna()
        group_denied = df[df['case_status'] == 0][feature].dropna()

        # Skip columns with constant values
        if group_cert.nunique() <= 1 or group_denied.nunique() <= 1:
            print(f"{feature}: Skipped (constant values)")
            continue

        stat, p = mannwhitneyu(group_cert, group_denied, alternative='two-sided')

        if p < p_threshold:
            significant_features.append(feature)
            print(f"{feature}: p = {p:.4f}  Significant")
        else:
            print(f"{feature}: p = {p:.4f} Not significant")

    # ---------- Categorical Features ----------
    print("\nðŸ”¹ Categorical Features (Chi-square Test)")
    for feature in categorical_features:
        contingency = pd.crosstab(df[feature], df['case_status'])
        if contingency.shape[0] < 2 or contingency.shape[1] < 2:
            print(f"{feature}: Skipped (insufficient variation)")
            continue

        chi2, p, _, _ = chi2_contingency(contingency)
        if p < p_threshold:
            significant_features.append(feature)
            print(f"{feature}: p = {p:.4f} Significant")
        else:
            print(f"{feature}: p = {p:.4f} Not significant")

    # ---------- Summary ----------
    print("\n=== SUMMARY ===")
    print(f"Significant features retained ({len(significant_features)}): {significant_features}")

    # Keep only significant columns + target
    selected_df = df[significant_features + ['case_status']].copy()

    print(f"Final dataset shape after feature selection: {selected_df.shape}")
    return selected_df


In [230]:
df_selected = feature_selection_by_significance(df)

=== FEATURE SELECTION BASED ON STATISTICAL SIGNIFICANCE ===

ðŸ”¹ Numeric Features (Mannâ€“Whitney U Test)


has_job_experience: p = 0.0000  Significant
requires_job_training: p = 0.1788 Not significant
no_of_employees: p = 0.0000  Significant
yr_of_estab: p = 0.0009  Significant
prevailing_wage: p = 0.0000  Significant
full_time_position: p = 0.0425  Significant
experience_training: p = 0.0000  Significant

ðŸ”¹ Categorical Features (Chi-square Test)
case_id: p = 0.4971 Not significant
continent: p = 0.0000 Significant
education_of_employee: p = 0.0000 Significant
region_of_employment: p = 0.0000 Significant
unit_of_wage: p = 0.0000 Significant

=== SUMMARY ===
Significant features retained (10): ['has_job_experience', 'no_of_employees', 'yr_of_estab', 'prevailing_wage', 'full_time_position', 'experience_training', 'continent', 'education_of_employee', 'region_of_employment', 'unit_of_wage']
Final dataset shape after feature selection: (25480, 11)


### 8. Data Splitting

In [None]:
# Stratified data splitting based on EDA findings about class imbalance
print("=== STRATIFIED DATA SPLITTING ===")
print("EDA identified class imbalance - using stratified splitting to preserve class distribution")

# Select the chosen features
X_selected = X[selected_features]
print(f"Selected features shape: {X_selected.shape}")

# First split: 80% train+val, 20% test
X_temp, X_test, y_temp, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

# Second split: 75% train, 25% validation (of the 80%)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

print(f"\nData split results:")
print(f"Training set: {X_train.shape} ({(X_train.shape[0]/len(X_selected))*100:.1f}%)")
print(f"Validation set: {X_val.shape} ({(X_val.shape[0]/len(X_selected))*100:.1f}%)")
print(f"Test set: {X_test.shape} ({(X_test.shape[0]/len(X_selected))*100:.1f}%)")

# Check class distribution in each set (should be similar due to stratification)
print(f"\nClass distribution verification:")
print("Training set Loan_Status distribution:")
print(y_train.value_counts().sort_index())
print("\nValidation set Loan_Status distribution:")
print(y_val.value_counts().sort_index())
print("\nTest set Loan_Status distribution:")
print(y_test.value_counts().sort_index())

In [231]:
df = optimize_employee_features(df, target_col='case_status')

Feature selection (significance testing) complete.


In [232]:
from sklearn.model_selection import train_test_split

def split_employee_data(df, target_col='case_status', test_size=0.2, random_state=42):
    """
    Step 6: Data Splitting
    ----------------------
    Splits the dataset into train and test sets.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    print("Data splitting complete.")
    return X_train, X_test, y_train, y_test


In [233]:
from sklearn.preprocessing import StandardScaler

def scale_employee_features(X_train, X_test):
    """
    Step 7: Feature Scaling
    -----------------------
    Standardizes numeric features for model training.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("Feature scaling complete.")
    return X_train_scaled, X_test_scaled, scaler


In [234]:
from sklearn.ensemble import RandomForestClassifier

def feature_importance_analysis(X_train_scaled, y_train, feature_names):
    """
    Step 8: Feature Importance Analysis
    -----------------------------------
    Uses Random Forest to determine feature importance.
    """
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train_scaled, y_train)

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': rf.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    print("Feature importance analysis complete.")
    return importance_df
