# Preprocessing for the Travel dataset

This notebook outlines the steps taken to preprocess the Travel dataset for analysis.

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
import os
from pathlib import Path
warnings.filterwarnings('ignore')

## Loading Datasets

In [2]:
dataset_dir = Path.cwd().joinpath("datasets")

train_dataset = "train.csv"
test_dataset = "test.csv"

train_df = pd.read_csv(dataset_dir.joinpath(train_dataset))
test_df = pd.read_csv(dataset_dir.joinpath(test_dataset))

print("Successfully loaded dataset:", train_dataset, ",", test_dataset)

Successfully loaded dataset: train.csv , test.csv


## Data Analysis

In [3]:
def perform_eda(df):
    """Perform basic exploratory data analysis"""

    # Dataset Info
    print("\nDataset Info:")
    print(df.info())

    # Shape
    print(f"\nDataset Shape: {df.shape}")

    # First few rows
    # print("\nFirst 5 rows:")
    # print(df.head())

    # Statistical summary
    print("\nStatistical Summary:")
    print(df.describe())

    # Missing values
    print("\nMissing Values:")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing_Count': missing,
        'Percentage': missing_pct
    })
    print(missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False))

    # Data types
    print("\nData Types:")
    print(df.dtypes.value_counts())

    # Duplicate rows
    duplicates = df.duplicated().sum()
    print(f"\nDuplicate Rows: {duplicates}")

    # Unique values for categorical columns
    print("\nUnique Values in Categorical Columns:")
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        print(f"{col}: {df[col].nunique()} unique values")

    return df

print("\n" + "="*50)
print("EXPLORATORY DATA ANALYSIS for train")
print("="*50)
perform_eda(train_df)
print("\n" + "="*50)
print("EXPLORATORY DATA ANALYSIS for test")
print("="*50)
perform_eda(test_df)


EXPLORATORY DATA ANALYSIS for train

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204277 entries, 0 to 204276
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ProfileID           204277 non-null  object 
 1   ApplicantYears      204277 non-null  int64  
 2   AnnualEarnings      204277 non-null  int64  
 3   RequestedSum        204277 non-null  int64  
 4   TrustMetric         204277 non-null  int64  
 5   WorkDuration        204277 non-null  int64  
 6   ActiveAccounts      204277 non-null  int64  
 7   OfferRate           204277 non-null  float64
 8   RepayPeriod         204277 non-null  int64  
 9   DebtFactor          204277 non-null  float64
 10  QualificationLevel  204277 non-null  object 
 11  WorkCategory        204277 non-null  object 
 12  RelationshipStatus  204277 non-null  object 
 13  OwnsProperty        204277 non-null  object 
 14  FamilyObligation    204277 non-n

Unnamed: 0,ProfileID,ApplicantYears,AnnualEarnings,RequestedSum,TrustMetric,WorkDuration,ActiveAccounts,OfferRate,RepayPeriod,DebtFactor,QualificationLevel,WorkCategory,RelationshipStatus,OwnsProperty,FamilyObligation,FundUseCase,JointApplicant
0,CKV34LU7V7,55,112656,92393,581,113,2,23.54,36,0.15,PhD,Self-employed,Single,Yes,Yes,Home,No
1,62KTYNH93J,56,91569,131575,641,54,1,15.19,12,0.43,High School,Part-time,Divorced,Yes,Yes,Education,Yes
2,JGFUSOIUH7,26,78169,75417,569,105,3,18.02,12,0.29,Master's,Part-time,Married,Yes,Yes,Education,Yes
3,4538THBHOX,26,63033,10804,326,118,1,14.71,24,0.41,High School,Part-time,Single,No,No,Business,Yes
4,DXLNA06JHR,24,29665,21182,662,102,3,15.02,60,0.69,PhD,Unemployed,Single,No,Yes,Business,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51065,DQRTA8KWGC,51,99473,170353,628,24,1,17.03,12,0.46,PhD,Self-employed,Divorced,Yes,Yes,Auto,Yes
51066,W0FDMPACG3,29,42016,111314,371,51,4,7.10,36,0.50,PhD,Self-employed,Married,No,No,Other,No
51067,MA0F4U8ORY,67,88507,142666,731,51,1,22.89,48,0.79,Bachelor's,Part-time,Divorced,No,No,Education,No
51068,6QUH04P7EJ,42,116649,190938,488,6,1,10.83,60,0.32,Bachelor's,Full-time,Married,No,Yes,Other,Yes


## Understanding the Dataset

Train and Test datasets each have a ton of missing values, most of them are less than 5% missing, but some columns have more than 50% missing values.

Two options, drop the values of those columns or impute them.

For now, we will drop datapoints for which columns have less than 10% missing values and for columns with more than 10% missing values those columns will be dropped.

In [4]:
def handle_missing_values(df, strategy='auto'):
    """
    Handle missing values using different strategies

    Parameters:
    - strategy: 'auto', 'mean', 'median', 'mode', 'drop', 'drop_column, or custom dict
    """

    df_copy = df.copy()

    # Identify numerical and categorical columns
    numerical_cols = df_copy.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df_copy.select_dtypes(include=['object', 'category']).columns.tolist()

    if strategy == 'auto':
        # For numerical: use median
        if numerical_cols:
            num_imputer = SimpleImputer(strategy='median')
            df_copy[numerical_cols] = num_imputer.fit_transform(df_copy[numerical_cols])
            print(f"Imputed {len(numerical_cols)} numerical columns with median")

        # For categorical: use mode
        if categorical_cols:
            for col in categorical_cols:
                if df_copy[col].isnull().sum() > 0:
                    mode_value = df_copy[col].mode()[0] if len(df_copy[col].mode()) > 0 else 'Unknown'
                    df_copy[col].fillna(mode_value, inplace=True)
            print(f"Imputed {len(categorical_cols)} categorical columns with mode")

    elif strategy == 'mean':
        num_imputer = SimpleImputer(strategy='mean')
        df_copy[numerical_cols] = num_imputer.fit_transform(df_copy[numerical_cols])

    elif strategy == 'median':
        num_imputer = SimpleImputer(strategy='median')
        df_copy[numerical_cols] = num_imputer.fit_transform(df_copy[numerical_cols])

    elif strategy == 'drop':
        # 1. Calculate the percentage of missing values for each column
        null_percentages = df_copy.isnull().mean()

        # 2. Identify columns where missing values are less than 10% (0.1)
        # We also ensure > 0 to avoid checking columns that are already full
        cols_to_check = null_percentages[
            (null_percentages < 0.10) & (null_percentages > 0)
        ].index

        # 3. Drop rows only if they have NaN in those specific columns
        df_copy = df_copy.dropna(subset=cols_to_check)
        print(f"Dropped rows with missing values. New shape: {df_copy.shape}")
    
    elif strategy == 'drop_column':
        # Drop columns with more than 10% missing values
        df_copy = df_copy.drop(columns=df_copy.columns[df_copy.isnull().sum() > 0.1 * len(df_copy)])

    print(f"\nMissing values after imputation: {df_copy.isnull().sum().sum()}")

    return df_copy

print("\n" + "="*50)
print("HANDLING MISSING VALUES for train and test datasets")
print("="*50)
train_df = handle_missing_values(handle_missing_values(train_df, strategy='drop'), strategy='drop_column')
print("\nDoing for test now\n")
test_df = handle_missing_values(handle_missing_values(test_df, strategy='drop'), strategy='drop_column')


HANDLING MISSING VALUES for train and test datasets
Dropped rows with missing values. New shape: (204277, 18)

Missing values after imputation: 0

Missing values after imputation: 0

Doing for test now

Dropped rows with missing values. New shape: (51070, 17)

Missing values after imputation: 0

Missing values after imputation: 0


## Detecting Outliers and handling them

We're using the IQR method to detect outliers in the dataset.

In [5]:
def detect_outliers_iqr(df, columns=None, threshold=1.5):

    if columns is None:
        columns = df.select_dtypes(include=['int64', 'float64']).columns

    outlier_indices = []

    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)].index
        outlier_indices.extend(outliers)

        print(f"{col}: {len(outliers)} outliers detected")

    return list(set(outlier_indices))

"""Detect outliers using IQR method"""
print("\n" + "="*50)
print("OUTLIER DETECTION for train dataset")
print("="*50)
# indices of outlers in the dataset
outliers_train = detect_outliers_iqr(train_df)

print("\n" + "="*50)
print("OUTLIER DETECTION for test dataset")
print("="*50)
# indices of outlers in the dataset
outliers_test = detect_outliers_iqr(test_df)


OUTLIER DETECTION for train dataset
ApplicantYears: 0 outliers detected
AnnualEarnings: 0 outliers detected
RequestedSum: 0 outliers detected
TrustMetric: 0 outliers detected
WorkDuration: 0 outliers detected
ActiveAccounts: 0 outliers detected
OfferRate: 0 outliers detected
RepayPeriod: 0 outliers detected
DebtFactor: 0 outliers detected
RiskFlag: 23753 outliers detected

OUTLIER DETECTION for test dataset
ApplicantYears: 0 outliers detected
AnnualEarnings: 0 outliers detected
RequestedSum: 0 outliers detected
TrustMetric: 0 outliers detected
WorkDuration: 0 outliers detected
ActiveAccounts: 0 outliers detected
OfferRate: 0 outliers detected
RepayPeriod: 0 outliers detected
DebtFactor: 0 outliers detected


In [6]:
save_dir = dataset_dir.joinpath("preprocessed")
os.makedirs(save_dir, exist_ok=True)
train_df.to_csv(save_dir.joinpath("train_preprocessed.csv"), index=False)
test_df.to_csv(save_dir.joinpath("test_preprocessed.csv"), index=False)
print(f"\nPreprocessed datasets saved to {save_dir}")


Preprocessed datasets saved to c:\Users\gathi\projects\ML-Project-2\Binary-Class Problem\datasets\preprocessed


## Things left to do

Do the following preprocessing steps based on your requirements.

1) Feature Engineering
2) Feature Encoding
4) Encoding Categorical Variables
3) Feature Scaling
4) Feature Selection
5) Data Validation