# Preprocessing for the Travel dataset

This notebook outlines the steps taken to preprocess the Travel dataset for analysis.

## Importing Libraries

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
import os
from pathlib import Path
import json
warnings.filterwarnings('ignore')

## Loading Datasets

In [30]:
dataset_dir = Path.cwd().joinpath("datasets")

train_dataset = "train.csv"
test_dataset = "test.csv"

train_df = pd.read_csv(dataset_dir.joinpath(train_dataset))
test_df = pd.read_csv(dataset_dir.joinpath(test_dataset))

print("Successfully loaded dataset:", train_dataset, ",", test_dataset)

Successfully loaded dataset: train.csv , test.csv


## Data Analysis

In [31]:
def perform_eda(df, save=True, config_file="categorical_config.json"):
    """
    Perform basic exploratory data analysis.
    Saves low-cardinality categorical values to a JSON config file.
    """

    # Dataset Info
    print("\nDataset Info:")
    print(df.info())

    # Shape
    print(f"\nDataset Shape: {df.shape}")

    # Statistical summary
    print("\nStatistical Summary:")
    print(df.describe())

    # Missing values
    print("\nMissing Values:")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing_Count': missing,
        'Percentage': missing_pct
    })
    print(missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False))

    # Data types
    print("\nData Types:")
    print(df.dtypes.value_counts())

    # Duplicate rows
    duplicates = df.duplicated().sum()
    print(f"\nDuplicate Rows: {duplicates}")

    # --- MODIFIED SECTION START ---
    print("\nUnique Values in Categorical Columns:")
    
    # Included 'category' dtype as well as object
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    
    # Dictionary to store values for the config file
    low_cardinality_map = {}

    for col in categorical_cols:
        unique_count = df[col].nunique()
        print(f"{col}: {unique_count} unique values")
        
        # Logic: If less than 20 unique values
        if unique_count < 20:
            # Get unique values and convert to a standard Python list
            # We convert to str to ensure it is JSON serializable (handles np.nan objects)
            unique_vals = [str(x) for x in df[col].unique().tolist()]
            
            # 1. Print them
            print(f"   -> [Low Cardinality] Values: {unique_vals}")
            
            # 2. Add to dictionary for saving
            low_cardinality_map[col] = unique_vals

    # Save to config file if we found any matching columns
    if low_cardinality_map and save:
        try:
            with open(config_file, 'w') as f:
                json.dump(low_cardinality_map, f, indent=4)
            print(f"\n[SUCCESS] Low cardinality categories saved to '{config_file}'")
        except Exception as e:
            print(f"\n[ERROR] Could not save config file: {e}")
    # --- MODIFIED SECTION END ---

    return df

print("\n" + "="*50)
print("EXPLORATORY DATA ANALYSIS for train")
print("="*50)
d = perform_eda(train_df, False)
print("\n" + "="*50)
print("EXPLORATORY DATA ANALYSIS for test")
print("="*50)
d = perform_eda(test_df, False)


EXPLORATORY DATA ANALYSIS for train

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12654 entries, 0 to 12653
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   trip_id                      12654 non-null  object 
 1   country                      12424 non-null  object 
 2   age_group                    12646 non-null  object 
 3   travel_companions            11917 non-null  object 
 4   num_females                  12652 non-null  float64
 5   num_males                    12650 non-null  float64
 6   main_activity                12526 non-null  object 
 7   visit_purpose                12654 non-null  object 
 8   is_first_visit               12555 non-null  object 
 9   mainland_stay_nights         12654 non-null  int64  
 10  island_stay_nights           12654 non-null  int64  
 11  tour_type                    12654 non-null  object 
 12  intl_transport_include

## Understanding the Dataset

Train and Test datasets each have a ton of missing values, most of them are less than 5% missing, but some columns have more than 50% missing values.

Two options, drop the values of those columns or impute them.

For now, we will drop datapoints for which columns have less than 10% missing values and for columns with more than 10% missing values those columns will be dropped.

In [32]:
def handle_missing_values(df, strategy='auto'):
    """
    Handle missing values using different strategies

    Parameters:
    - strategy: 'auto', 'mean', 'median', 'mode', 'drop', 'drop_column, or custom dict
    """

    df_copy = df.copy()

    # Identify numerical and categorical columns
    numerical_cols = df_copy.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df_copy.select_dtypes(include=['object', 'category']).columns.tolist()

    if strategy == 'auto':
        # For numerical: use median
        if numerical_cols:
            num_imputer = SimpleImputer(strategy='median')
            df_copy[numerical_cols] = num_imputer.fit_transform(df_copy[numerical_cols])
            print(f"Imputed {len(numerical_cols)} numerical columns with median")

        # For categorical: use mode
        if categorical_cols:
            for col in categorical_cols:
                if df_copy[col].isnull().sum() > 0:
                    mode_value = df_copy[col].mode()[0] if len(df_copy[col].mode()) > 0 else 'Unknown'
                    df_copy[col].fillna(mode_value, inplace=True)
            print(f"Imputed {len(categorical_cols)} categorical columns with mode")

    elif strategy == 'mean':
        num_imputer = SimpleImputer(strategy='mean')
        df_copy[numerical_cols] = num_imputer.fit_transform(df_copy[numerical_cols])

    elif strategy == 'median':
        num_imputer = SimpleImputer(strategy='median')
        df_copy[numerical_cols] = num_imputer.fit_transform(df_copy[numerical_cols])

    elif strategy == 'drop':
        # 1. Calculate the percentage of missing values for each column
        null_percentages = df_copy.isnull().mean()

        # 2. Identify columns where missing values are less than 10% (0.1)
        # We also ensure > 0 to avoid checking columns that are already full
        cols_to_check = null_percentages[
            (null_percentages < 0.10) & (null_percentages > 0)
        ].index

        # 3. Drop rows only if they have NaN in those specific columns
        df_copy = df_copy.dropna(subset=cols_to_check)
        print(f"Dropped rows with missing values. New shape: {df_copy.shape}")
    
    elif strategy == 'drop_column':
        # Drop columns with more than 10% missing values
        df_copy = df_copy.drop(columns=df_copy.columns[df_copy.isnull().sum() > 0.1 * len(df_copy)])

    print(f"\nMissing values after imputation: {df_copy.isnull().sum().sum()}")

    return df_copy

print("\n" + "="*50)
print("HANDLING MISSING VALUES (Corrected)")
print("="*50)

# --- TRAIN DATA ---
# It is okay to drop rows in training if that's your preferred strategy
print("Processing TRAIN data...")
train_df = handle_missing_values(train_df, strategy='drop')         # Drop rows with missing values
train_df = handle_missing_values(train_df, strategy='drop_column')  # Drop columns with too many missing values

# --- TEST DATA ---
# NEVER drop rows in test data. Use imputation ('auto') instead.
print("\nProcessing TEST data...")
test_df = handle_missing_values(test_df, strategy='auto')          # <--- CHANGED: Impute instead of drop
test_df = handle_missing_values(test_df, strategy='drop_column')   # Drop columns (risky* see note below)

# *NOTE ON DROPPING COLUMNS: 
# If 'drop_column' removes a column in Train but keeps it in Test (or vice versa), 
# your model will crash due to mismatched features. 
# A safer way is to align them at the end:
train_cols = train_df.columns.tolist()
# Ensure test only has columns that are in train (excluding target 'spend_category')
cols_to_keep = [c for c in train_cols if c != 'spend_category']
test_df = test_df[cols_to_keep]


HANDLING MISSING VALUES (Corrected)
Processing TRAIN data...
Dropped rows with missing values. New shape: (10492, 25)

Missing values after imputation: 10850

Missing values after imputation: 0

Processing TEST data...
Imputed 4 numerical columns with median
Imputed 20 categorical columns with mode

Missing values after imputation: 0

Missing values after imputation: 0


## Detecting Outliers

We're using the IQR method to detect outliers in the dataset.

In [33]:
def detect_outliers_iqr(df, columns=None, threshold=1.5):

    if columns is None:
        columns = df.select_dtypes(include=['int64', 'float64']).columns

    outlier_indices = []

    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)].index
        outlier_indices.extend(outliers)

        print(f"{col}: {len(outliers)} outliers detected")

    return list(set(outlier_indices))

"""Detect outliers using IQR method"""
print("\n" + "="*50)
print("OUTLIER DETECTION for train dataset")
print("="*50)
# indices of outlers in the dataset
outliers_train = detect_outliers_iqr(train_df)

print("\n" + "="*50)
print("OUTLIER DETECTION for test dataset")
print("="*50)
# indices of outlers in the dataset
outliers_test = detect_outliers_iqr(test_df)


OUTLIER DETECTION for train dataset
num_females: 568 outliers detected
num_males: 3764 outliers detected
mainland_stay_nights: 612 outliers detected
island_stay_nights: 321 outliers detected
spend_category: 0 outliers detected

OUTLIER DETECTION for test dataset
num_females: 302 outliers detected
num_males: 2131 outliers detected
mainland_stay_nights: 464 outliers detected
island_stay_nights: 240 outliers detected


In [34]:
save_dir = dataset_dir.joinpath("preprocessed")
os.makedirs(save_dir, exist_ok=True)
train_df.to_csv(save_dir.joinpath("train_preprocessed.csv"), index=False)
test_df.to_csv(save_dir.joinpath("test_preprocessed.csv"), index=False)
print(f"\nPreprocessed datasets saved to {save_dir}")


Preprocessed datasets saved to c:\Users\gathi\projects\ML-Project-2\Multi-class Problem\datasets\preprocessed


## Feature Engineering

Creating new features based on existing ones to enhance model performance. Do check if you need more features and change code as neccessary.

New features created:
 - age_group split into categogories

In [35]:
print("\n" + "="*50)
print("EXPLORATORY DATA ANALYSIS for train")
print("="*50)
d = perform_eda(train_df)
print("\n" + "="*50)
print("EXPLORATORY DATA ANALYSIS for test")
print("="*50)
d = perform_eda(test_df)


EXPLORATORY DATA ANALYSIS for train

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 10492 entries, 0 to 12652
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   trip_id                      10492 non-null  object 
 1   country                      10492 non-null  object 
 2   age_group                    10492 non-null  object 
 3   travel_companions            10492 non-null  object 
 4   num_females                  10492 non-null  float64
 5   num_males                    10492 non-null  float64
 6   main_activity                10492 non-null  object 
 7   visit_purpose                10492 non-null  object 
 8   is_first_visit               10492 non-null  object 
 9   mainland_stay_nights         10492 non-null  int64  
 10  island_stay_nights           10492 non-null  int64  
 11  tour_type                    10492 non-null  object 
 12  intl_transport_included    

## Things left to do

Do the following preprocessing steps based on your requirements.

1) Feature Engineering
2) Feature Encoding
4) Encoding Categorical Variables
3) Feature Scaling
4) Feature Selection
5) Data Validation