# Preprocessing for the Travel dataset

This notebook outlines the steps taken to preprocess the Travel dataset for analysis.

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
import os
from pathlib import Path
warnings.filterwarnings('ignore')

## Loading Datasets

In [2]:
# The raw CSV files are read from a "datasets" folder inside the current working directory.
dataset_dir = Path.cwd() / "datasets"

train_dataset, test_dataset = "train.csv", "test.csv"

# Load both splits up front so every later cell works on in-memory frames.
train_df = pd.read_csv(dataset_dir / train_dataset)
test_df = pd.read_csv(dataset_dir / test_dataset)

print("Successfully loaded dataset:", train_dataset, ",", test_dataset)

Successfully loaded dataset: train.csv , test.csv


## Data Analysis

In [7]:
def perform_eda(df):
    """Print a basic exploratory summary of ``df`` and return it unchanged.

    The report is written to stdout and covers, in order: ``df.info()``,
    the shape, ``df.describe()``, missing-value counts and percentages (only
    for columns that have at least one missing value), the number of columns
    per dtype, the number of duplicated rows, and the number of unique values
    in each object/category column.

    Parameters:
    - df: pandas DataFrame to summarise. It is not modified.

    Returns:
    - The same DataFrame object that was passed in.
    """

    # Dataset Info
    print("\nDataset Info:")
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() would emit a stray "None" line after the report.
    df.info()

    # Shape
    print(f"\nDataset Shape: {df.shape}")

    # Statistical summary
    print("\nStatistical Summary:")
    print(df.describe())

    # Missing values
    print("\nMissing Values:")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing_Count': missing,
        'Percentage': missing_pct
    })
    print(missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False))

    # Data types
    print("\nData Types:")
    print(df.dtypes.value_counts())

    # Duplicate rows
    duplicates = df.duplicated().sum()
    print(f"\nDuplicate Rows: {duplicates}")

    # Unique values for categorical columns
    print("\nUnique Values in Categorical Columns:")
    # 'category' is included alongside 'object' so that columns already cast to
    # a categorical dtype are reported as well.
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in categorical_cols:
        print(f"{col}: {df[col].nunique()} unique values")

    return df

# Run the same EDA report on both splits, each under its own banner.
print(f"\n{'=' * 50}")
print("EXPLORATORY DATA ANALYSIS for train")
print('=' * 50)
perform_eda(train_df)

print(f"\n{'=' * 50}")
print("EXPLORATORY DATA ANALYSIS for test")
print('=' * 50)
# Kept as the final expression so the notebook still renders the returned frame.
perform_eda(test_df)


EXPLORATORY DATA ANALYSIS for train

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12654 entries, 0 to 12653
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   trip_id                      12654 non-null  object 
 1   country                      12424 non-null  object 
 2   age_group                    12646 non-null  object 
 3   travel_companions            11917 non-null  object 
 4   num_females                  12652 non-null  float64
 5   num_males                    12650 non-null  float64
 6   main_activity                12526 non-null  object 
 7   visit_purpose                12654 non-null  object 
 8   is_first_visit               12555 non-null  object 
 9   mainland_stay_nights         12654 non-null  int64  
 10  island_stay_nights           12654 non-null  int64  
 11  tour_type                    12654 non-null  object 
 12  intl_transport_include

Unnamed: 0,trip_id,country,age_group,travel_companions,num_females,num_males,main_activity,visit_purpose,is_first_visit,mainland_stay_nights,...,accomodation_included,food_included,domestic_transport_included,sightseeing_included,guide_included,insurance_included,days_booked_before_trip,arrival_weather,total_trip_days,has_special_requirements
0,tour_id8gzpck76,CONGO,25-44,Alone,0.0,1.0,Widlife Tourism,Business,No,14,...,No,No,No,No,No,No,15-30,"sunny,",15-30,"wheelchair,"
1,tour_idow1zxkou,SWIZERLAND,45-64,With Spouse,1.0,1.0,Widlife Tourism,Leisure and Holidays,Yes,8,...,Yes,Yes,No,No,Yes,No,61-90,Stormy,1-6,
2,tour_idue7esfqz,MEXICO,45-64,With Other Friends/Relatives,2.0,0.0,Cultural Tourism,Leisure and Holidays,Yes,3,...,Yes,Yes,Yes,Yes,Yes,No,1-7,Stormy,30+,none
3,tour_idnj3mjzpb,JAPAN,25-44,With Other Friends/Relatives,1.0,1.0,Widlife Tourism,Leisure and Holidays,Yes,5,...,Yes,Yes,Yes,Yes,Yes,No,90+,"sunny,",7-14,
4,tour_ida3us5yk2,SPAIN,25-44,With Other Friends/Relatives,2.0,0.0,Wildlife Tourism,Leisure and Holidays,Yes,0,...,Yes,Yes,No,Yes,No,Yes,61-90,Stormy,30+,"dietary needs,"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5847,tour_id0bnf9xcp,ZIMBABWE,25-44,Alone,0.0,1.0,Beach Tourism,Business,Yes,3,...,No,No,No,No,No,No,,other,1-6,
5848,tour_idqfcw3991,UNITED STATES OF AMERICA,25-44,With Spouse,1.0,1.0,Widlife Tourism,Leisure and Holidays,Yes,9,...,Yes,Yes,Yes,Yes,Yes,No,31-60,"sunny,",30+,
5849,tour_idrarc7v2s,SWEDEN,25-44,With Spouse and Children,2.0,1.0,Beach Tourism,Leisure and Holidays,Yes,0,...,No,No,No,No,No,No,61-90,,30+,
5850,tour_idvuu6dqrf,PAKISTAN,25-44,Alone,0.0,1.0,Hunting Tourism,Visiting Friends and Relatives,Yes,2,...,No,No,No,No,No,No,61-90,"cloudy,",7-14,


## Understanding the Dataset

Both the train and test datasets contain many missing values. Most affected columns are missing less than 5% of their values, but some columns are missing more than 50%.

There are two options: drop the missing values (the affected rows or columns) or impute them.

For now, we drop the rows that have a missing value in any column where less than 10% of the values are missing, and we drop entirely every column where more than 10% of the values are missing.

In [13]:
def _impute_numeric(df, columns, statistic):
    """Fill NaNs in the numeric ``columns`` of ``df`` with the per-column mean or median (modifies ``df``)."""
    if not columns:
        # Nothing to impute; avoids failing on frames without numeric columns.
        return
    subset = df[columns]
    fill_values = subset.median() if statistic == 'median' else subset.mean()
    # The float64 cast keeps the result dtype identical to the array-based
    # imputer used previously, which always produced float columns.
    # Columns that are entirely NaN have a NaN statistic and are left as they are.
    df[columns] = subset.fillna(fill_values).astype('float64')


def _impute_mode(df, columns, fallback=None):
    """Fill NaNs in ``columns`` of ``df`` with each column's most frequent value (modifies ``df``)."""
    for col in columns:
        if not df[col].isnull().any():
            continue
        modes = df[col].mode()
        if len(modes) > 0:
            fill_value = modes.iloc[0]
        elif fallback is not None:
            # Column has no non-null value at all, so there is no mode to use.
            fill_value = fallback
        else:
            continue
        # Assign the result back instead of using fillna(inplace=True) on the
        # selected column: the chained in-place form does not update the frame
        # when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(fill_value)


def handle_missing_values(df, strategy='auto', missing_threshold=0.10):
    """
    Handle missing values using different strategies

    Works on a copy; the input DataFrame is never modified. Only columns of
    dtype int64/float64 are treated as numerical and only object/category
    columns as categorical; other dtypes are left untouched by the imputing
    strategies.

    Parameters:
    - df: pandas DataFrame to clean.
    - strategy: one of
        'auto'        - numerical columns get their median, categorical columns
                        their mode ('Unknown' if a column has no mode)
        'mean'        - numerical columns get their mean
        'median'      - numerical columns get their median
        'mode'        - numerical and categorical columns get their mode
        'drop'        - drop rows that have a missing value in any column whose
                        missing fraction is > 0 and < missing_threshold
        'drop_column' - drop columns whose missing fraction is > missing_threshold
      A column whose missing fraction equals missing_threshold exactly is
      affected by neither 'drop' nor 'drop_column'.
    - missing_threshold: fraction (0-1) used by 'drop' and 'drop_column'.
      Defaults to 0.10.

    Returns:
    - A new DataFrame with the strategy applied.

    Raises:
    - ValueError: if strategy is not one of the supported names.
    """

    df_copy = df.copy()

    # Identify numerical and categorical columns
    numerical_cols = df_copy.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df_copy.select_dtypes(include=['object', 'category']).columns.tolist()

    if strategy == 'auto':
        # For numerical: use median
        if numerical_cols:
            _impute_numeric(df_copy, numerical_cols, 'median')
            print(f"Imputed {len(numerical_cols)} numerical columns with median")

        # For categorical: use mode
        if categorical_cols:
            _impute_mode(df_copy, categorical_cols, fallback='Unknown')
            print(f"Imputed {len(categorical_cols)} categorical columns with mode")

    elif strategy in ('mean', 'median'):
        _impute_numeric(df_copy, numerical_cols, strategy)

    elif strategy == 'mode':
        _impute_mode(df_copy, numerical_cols)
        _impute_mode(df_copy, categorical_cols, fallback='Unknown')

    elif strategy == 'drop':
        # Fraction of missing values per column
        null_fraction = df_copy.isnull().mean()

        # Columns that are sparsely missing; > 0 skips columns that are already full
        cols_to_check = null_fraction[
            (null_fraction < missing_threshold) & (null_fraction > 0)
        ].index

        # Drop rows only if they have NaN in those specific columns
        df_copy = df_copy.dropna(subset=cols_to_check)
        print(f"Dropped rows with missing values. New shape: {df_copy.shape}")

    elif strategy == 'drop_column':
        # Drop columns whose missing fraction exceeds the threshold
        null_fraction = df_copy.isnull().mean()
        df_copy = df_copy.drop(columns=null_fraction[null_fraction > missing_threshold].index)

    else:
        # Previously an unknown strategy fell through silently and returned an
        # untouched copy, which hid typos in the strategy name.
        raise ValueError(
            f"Unsupported strategy {strategy!r}; expected one of "
            "'auto', 'mean', 'median', 'mode', 'drop', 'drop_column'"
        )

    print(f"\nMissing values after imputation: {df_copy.isnull().sum().sum()}")

    return df_copy

print(f"\n{'=' * 50}")
print("HANDLING MISSING VALUES for train and test datasets")
print('=' * 50)
# Two passes per split: first drop the rows affected by sparsely-missing
# columns, then drop the columns that are still heavily missing.
train_df = handle_missing_values(
    handle_missing_values(train_df, strategy='drop'),
    strategy='drop_column',
)
print("\nDoing for test now\n")
test_df = handle_missing_values(
    handle_missing_values(test_df, strategy='drop'),
    strategy='drop_column',
)


HANDLING MISSING VALUES for train and test datasets
Dropped rows with missing values. New shape: (10492, 22)

Missing values after imputation: 0

Missing values after imputation: 0

Doing for test now

Dropped rows with missing values. New shape: (4907, 21)

Missing values after imputation: 0

Missing values after imputation: 0


## Detecting Outliers

We're using the IQR method to detect outliers in the dataset.

In [None]:
def detect_outliers_iqr(df, columns=None, threshold=1.5):
    """Detect outliers using IQR method

    For every inspected column, a value is flagged when it lies below
    ``Q1 - threshold * IQR`` or above ``Q3 + threshold * IQR``, where Q1/Q3 are
    the 25th/75th percentiles of that column. A per-column count is printed.

    Parameters:
    - df: pandas DataFrame to inspect. It is not modified.
    - columns: iterable of column names to check. Defaults to all int64/float64
      columns of ``df``.
    - threshold: multiplier applied to the IQR when building the bounds.

    Returns:
    - List of index labels of rows flagged in at least one column, without
      duplicates, in the order in which they were first detected.
    """

    if columns is None:
        columns = df.select_dtypes(include=['int64', 'float64']).columns

    outlier_indices = []

    for col in columns:
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        # Filter the single column rather than the whole frame: only the index
        # labels of the flagged rows are needed.
        outliers = values[(values < lower_bound) | (values > upper_bound)].index
        outlier_indices.extend(outliers)

        print(f"{col}: {len(outliers)} outliers detected")

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible (a set gives no ordering guarantee).
    return list(dict.fromkeys(outlier_indices))
print(f"\n{'=' * 50}")
print("OUTLIER DETECTION for train dataset")
print('=' * 50)
# Index labels of the rows flagged as outliers in the train split
outliers_train = detect_outliers_iqr(train_df)

print(f"\n{'=' * 50}")
print("OUTLIER DETECTION for test dataset")
print('=' * 50)
# Index labels of the rows flagged as outliers in the test split
outliers_test = detect_outliers_iqr(test_df)


OUTLIER DETECTION for train dataset
num_females: 568 outliers detected
num_males: 3764 outliers detected
mainland_stay_nights: 612 outliers detected
island_stay_nights: 321 outliers detected
spend_category: 0 outliers detected

OUTLIER DETECTION for test dataset
num_females: 266 outliers detected
num_males: 1784 outliers detected
mainland_stay_nights: 377 outliers detected
island_stay_nights: 210 outliers detected


In [18]:
# Write both cleaned splits to a "preprocessed" sub-folder of the dataset directory.
save_dir = dataset_dir / "preprocessed"
save_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(save_dir / "train_preprocessed.csv", index=False)
test_df.to_csv(save_dir / "test_preprocessed.csv", index=False)

print(f"\nPreprocessed datasets saved to {save_dir}")


Preprocessed datasets saved to c:\Users\gathi\projects\ML-Project-2\Multi-class Problem\datasets\preprocessed


## Things left to do

Do the following preprocessing steps based on your requirements.

1) Feature Engineering
2) Feature Encoding
3) Encoding Categorical Variables
4) Feature Scaling
5) Feature Selection
6) Data Validation