In [1]:
# Import basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the three input CSVs (train features, train labels, test features) in one pass,
# in the same order as before: X_train, y_train, X_test.
X_train, y_train, X_test = (
    pd.read_csv(csv_name)
    for csv_name in ('X_train_no_miss.csv', 'Y_train.csv', 'X_test_no_miss.csv')
)

In [3]:
# Inspect the available training columns before choosing the interaction groups below
X_train.columns

Index(['ID', 'Gender', 'Customer_Type', 'Age', 'Type_Travel', 'Travel_Class',
       'Travel_Distance', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins',
       'Seat_Comfort', 'Seat_Class', 'Arrival_Time_Convenient', 'Catering',
       'Platform_Location', 'Onboard_Wifi_Service', 'Onboard_Entertainment',
       'Online_Support', 'Ease_of_Online_Booking', 'Onboard_Service',
       'Legroom', 'Baggage_Handling', 'CheckIn_Service', 'Cleanliness',
       'Online_Boarding'],
      dtype='object')

In [4]:
from sklearn.preprocessing import PolynomialFeatures

def create_interaction_features(df, selected_columns):
    """
    Append pairwise interaction (product) features for the selected columns.

    For every pair of distinct columns in ``selected_columns`` a new column
    holding their element-wise product is added. New columns are named by
    ``PolynomialFeatures.get_feature_names_out`` (the two source column names
    separated by a space, e.g. ``"Seat_Comfort Legroom"``).

    Args:
    df (pd.DataFrame): Input DataFrame containing the features. It is not modified.
    selected_columns (list): Column names for which to create interaction terms.
        Any iterable of names is accepted; a single string is treated as one column.

    Returns:
    pd.DataFrame: New DataFrame with the original columns followed by the
        interaction columns. If ``df`` already contains a column with the same
        name as a generated interaction (e.g. the function is re-applied to its
        own output), that stale column is replaced by the freshly computed one
        so the result never carries duplicate column names.

    Raises:
    KeyError: If any of ``selected_columns`` is missing from ``df`` (raised by pandas).
    """
    # Normalise the selection to a list: a tuple passed to df[...] would be
    # interpreted as a single key rather than a list of columns.
    if isinstance(selected_columns, str):
        selected_columns = [selected_columns]
    else:
        selected_columns = list(selected_columns)
    n_selected = len(selected_columns)

    # degree=2 + interaction_only=True -> products of distinct column pairs only
    # (no squared terms); include_bias=False -> no constant column.
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    interactions = poly.fit_transform(df[selected_columns])

    # The first n_selected outputs are the untouched input columns, which df
    # already holds, so keep only the names/values after them.
    interaction_features = poly.get_feature_names_out(selected_columns)[n_selected:]
    interaction_df = pd.DataFrame(
        interactions[:, n_selected:],
        columns=interaction_features,
        index=df.index,  # keep row alignment with df for the concat below
    )

    # Drop same-named columns already in df; otherwise concat would silently
    # create duplicate labels, which break df[name] lookups and CSV round-trips.
    stale = [name for name in interaction_features if name in df.columns]
    base_df = df.drop(columns=stale) if stale else df

    # Concatenate the original columns with the new interaction columns
    return pd.concat([base_df, interaction_df], axis=1)



In [5]:
# Group A: travel distance and the two delay measures.
# The same interaction columns are added to the train and test frames.
A = ['Travel_Distance', 'Departure_Delay_in_Mins', 'Arrival_Delay_in_Mins']
train_data_A, test_data_A = (
    create_interaction_features(frame, A) for frame in (X_train, X_test)
)

In [6]:
# Group B: online support/booking/boarding and onboard service columns.
# Built on top of the group A frames so the new columns accumulate.
B = ['Online_Support', 'Ease_of_Online_Booking', 'Onboard_Service', 'Online_Boarding']
train_data_AB, test_data_AB = (
    create_interaction_features(frame, B) for frame in (train_data_A, test_data_A)
)

In [7]:
# Inspect the training frame after adding the group A and group B interaction columns
train_data_AB

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,...,Online_Boarding,Travel_Distance Departure_Delay_in_Mins,Travel_Distance Arrival_Delay_in_Mins,Departure_Delay_in_Mins Arrival_Delay_in_Mins,Online_Support Ease_of_Online_Booking,Online_Support Onboard_Service,Online_Support Online_Boarding,Ease_of_Online_Booking Onboard_Service,Ease_of_Online_Booking Online_Boarding,Onboard_Service Online_Boarding
0,98800001,0.0,1.0,52.0,0.0,0,272,0.0,5.0,2.0,...,1.0,0.0,1360.0,0.0,6.0,6.0,3.0,4.0,2.0,2.0
1,98800002,1.0,1.0,48.0,1.0,1,2200,9.0,0.0,1.0,...,4.0,19800.0,0.0,0.0,16.0,20.0,16.0,20.0,16.0,20.0
2,98800003,0.0,1.0,43.0,0.0,0,1061,77.0,119.0,2.0,...,5.0,81697.0,126259.0,9163.0,25.0,25.0,25.0,25.0,25.0,25.0
3,98800004,0.0,1.0,44.0,0.0,0,780,13.0,18.0,3.0,...,3.0,10140.0,14040.0,234.0,9.0,9.0,9.0,9.0,9.0,9.0
4,98800005,0.0,1.0,50.0,0.0,0,1981,0.0,0.0,3.0,...,4.0,0.0,0.0,0.0,20.0,20.0,20.0,16.0,16.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94374,98894375,1.0,1.0,32.0,0.0,0,1357,83.0,125.0,1.0,...,1.0,112631.0,169625.0,10375.0,1.0,4.0,1.0,4.0,1.0,4.0
94375,98894376,1.0,1.0,44.0,0.0,0,592,5.0,11.0,4.0,...,4.0,2960.0,6512.0,55.0,15.0,15.0,20.0,9.0,12.0,12.0
94376,98894377,1.0,1.0,63.0,0.0,0,2794,0.0,0.0,2.0,...,3.0,0.0,0.0,0.0,16.0,16.0,12.0,16.0,12.0,12.0
94377,98894378,1.0,1.0,16.0,1.0,1,2744,0.0,0.0,2.0,...,4.0,0.0,0.0,0.0,16.0,12.0,16.0,12.0,16.0,12.0


In [8]:
# Group C: seat comfort, seat class and legroom.
# Built on top of the A+B frames.
C = ['Seat_Comfort', 'Seat_Class', 'Legroom']
train_data_ABC, test_data_ABC = (
    create_interaction_features(frame, C) for frame in (train_data_AB, test_data_AB)
)

In [9]:
# Inspect the training frame after adding the group C interaction columns
train_data_ABC

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,...,Departure_Delay_in_Mins Arrival_Delay_in_Mins,Online_Support Ease_of_Online_Booking,Online_Support Onboard_Service,Online_Support Online_Boarding,Ease_of_Online_Booking Onboard_Service,Ease_of_Online_Booking Online_Boarding,Onboard_Service Online_Boarding,Seat_Comfort Seat_Class,Seat_Comfort Legroom,Seat_Class Legroom
0,98800001,0.0,1.0,52.0,0.0,0,272,0.0,5.0,2.0,...,0.0,6.0,6.0,3.0,4.0,2.0,2.0,0.0,6.0,0.0
1,98800002,1.0,1.0,48.0,1.0,1,2200,9.0,0.0,1.0,...,0.0,16.0,20.0,16.0,20.0,16.0,20.0,1.0,2.0,2.0
2,98800003,0.0,1.0,43.0,0.0,0,1061,77.0,119.0,2.0,...,9163.0,25.0,25.0,25.0,25.0,25.0,25.0,0.0,10.0,0.0
3,98800004,0.0,1.0,44.0,0.0,0,780,13.0,18.0,3.0,...,234.0,9.0,9.0,9.0,9.0,9.0,9.0,3.0,9.0,3.0
4,98800005,0.0,1.0,50.0,0.0,0,1981,0.0,0.0,3.0,...,0.0,20.0,20.0,20.0,16.0,16.0,16.0,3.0,12.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94374,98894375,1.0,1.0,32.0,0.0,0,1357,83.0,125.0,1.0,...,10375.0,1.0,4.0,1.0,4.0,1.0,4.0,1.0,4.0,4.0
94375,98894376,1.0,1.0,44.0,0.0,0,592,5.0,11.0,4.0,...,55.0,15.0,15.0,20.0,9.0,12.0,12.0,4.0,12.0,3.0
94376,98894377,1.0,1.0,63.0,0.0,0,2794,0.0,0.0,2.0,...,0.0,16.0,16.0,12.0,16.0,12.0,12.0,0.0,8.0,0.0
94377,98894378,1.0,1.0,16.0,1.0,1,2744,0.0,0.0,2.0,...,0.0,16.0,12.0,16.0,12.0,16.0,12.0,2.0,8.0,4.0


In [10]:
# Group D: onboard wifi and onboard entertainment (a single pairwise product).
# Built on top of the A+B+C frames.
D = ['Onboard_Wifi_Service', 'Onboard_Entertainment']
train_data_ABCD, test_data_ABCD = (
    create_interaction_features(frame, D) for frame in (train_data_ABC, test_data_ABC)
)

In [11]:
# Inspect the training frame after adding the group D interaction column
train_data_ABCD

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,...,Online_Support Ease_of_Online_Booking,Online_Support Onboard_Service,Online_Support Online_Boarding,Ease_of_Online_Booking Onboard_Service,Ease_of_Online_Booking Online_Boarding,Onboard_Service Online_Boarding,Seat_Comfort Seat_Class,Seat_Comfort Legroom,Seat_Class Legroom,Onboard_Wifi_Service Onboard_Entertainment
0,98800001,0.0,1.0,52.0,0.0,0,272,0.0,5.0,2.0,...,6.0,6.0,3.0,4.0,2.0,2.0,0.0,6.0,0.0,8.0
1,98800002,1.0,1.0,48.0,1.0,1,2200,9.0,0.0,1.0,...,16.0,20.0,16.0,20.0,16.0,20.0,1.0,2.0,2.0,4.0
2,98800003,0.0,1.0,43.0,0.0,0,1061,77.0,119.0,2.0,...,25.0,25.0,25.0,25.0,25.0,25.0,0.0,10.0,0.0,8.0
3,98800004,0.0,1.0,44.0,0.0,0,780,13.0,18.0,3.0,...,9.0,9.0,9.0,9.0,9.0,9.0,3.0,9.0,3.0,6.0
4,98800005,0.0,1.0,50.0,0.0,0,1981,0.0,0.0,3.0,...,20.0,20.0,20.0,16.0,16.0,16.0,3.0,12.0,4.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94374,98894375,1.0,1.0,32.0,0.0,0,1357,83.0,125.0,1.0,...,1.0,4.0,1.0,4.0,1.0,4.0,1.0,4.0,4.0,1.0
94375,98894376,1.0,1.0,44.0,0.0,0,592,5.0,11.0,4.0,...,15.0,15.0,20.0,9.0,12.0,12.0,4.0,12.0,3.0,10.0
94376,98894377,1.0,1.0,63.0,0.0,0,2794,0.0,0.0,2.0,...,16.0,16.0,12.0,16.0,12.0,12.0,0.0,8.0,0.0,20.0
94377,98894378,1.0,1.0,16.0,1.0,1,2744,0.0,0.0,2.0,...,16.0,12.0,16.0,12.0,16.0,12.0,2.0,8.0,4.0,8.0


In [12]:
# Group E: baggage handling and check-in service (a single pairwise product).
# Built on top of the A+B+C+D frames.
E = ['Baggage_Handling', 'CheckIn_Service']
train_data_ABCDE, test_data_ABCDE = (
    create_interaction_features(frame, E) for frame in (train_data_ABCD, test_data_ABCD)
)

In [13]:
# Confirm the row and column counts after adding interaction groups A through E
train_data_ABCDE.shape

(94379, 38)

In [14]:
# Group F: passenger attributes (gender, customer type, age, type of travel).
# Built on top of the A+B+C+D+E frames; the shape is shown to confirm the added columns.
F = ['Gender', 'Customer_Type', 'Age', 'Type_Travel']
train_data_ABCDEF, test_data_ABCDEF = (
    create_interaction_features(frame, F) for frame in (train_data_ABCDE, test_data_ABCDE)
)
train_data_ABCDEF.shape

(94379, 44)

In [15]:
# Persist the fully engineered train/test frames; index=False keeps the row index out of the files
train_data_ABCDEF.to_csv(path_or_buf='X_train_wFE.csv', index=False)
test_data_ABCDEF.to_csv(path_or_buf='X_test_wFE.csv', index=False)

In [16]:
# Read the saved training file back to confirm it was written with the engineered columns
pd.read_csv('X_train_wFE.csv')

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,...,Seat_Comfort Legroom,Seat_Class Legroom,Onboard_Wifi_Service Onboard_Entertainment,Baggage_Handling CheckIn_Service,Gender Customer_Type,Gender Age,Gender Type_Travel,Customer_Type Age,Customer_Type Type_Travel,Age Type_Travel
0,98800001,0.0,1.0,52.0,0.0,0,272,0.0,5.0,2.0,...,6.0,0.0,8.0,8.0,0.0,0.0,0.0,52.0,0.0,0.0
1,98800002,1.0,1.0,48.0,1.0,1,2200,9.0,0.0,1.0,...,2.0,2.0,4.0,2.0,1.0,48.0,1.0,48.0,1.0,48.0
2,98800003,0.0,1.0,43.0,0.0,0,1061,77.0,119.0,2.0,...,10.0,0.0,8.0,20.0,0.0,0.0,0.0,43.0,0.0,0.0
3,98800004,0.0,1.0,44.0,0.0,0,780,13.0,18.0,3.0,...,9.0,3.0,6.0,12.0,0.0,0.0,0.0,44.0,0.0,0.0
4,98800005,0.0,1.0,50.0,0.0,0,1981,0.0,0.0,3.0,...,12.0,4.0,8.0,16.0,0.0,0.0,0.0,50.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94374,98894375,1.0,1.0,32.0,0.0,0,1357,83.0,125.0,1.0,...,4.0,4.0,1.0,8.0,1.0,32.0,0.0,32.0,0.0,0.0
94375,98894376,1.0,1.0,44.0,0.0,0,592,5.0,11.0,4.0,...,12.0,3.0,10.0,12.0,1.0,44.0,0.0,44.0,0.0,0.0
94376,98894377,1.0,1.0,63.0,0.0,0,2794,0.0,0.0,2.0,...,8.0,0.0,20.0,12.0,1.0,63.0,0.0,63.0,0.0,0.0
94377,98894378,1.0,1.0,16.0,1.0,1,2744,0.0,0.0,2.0,...,8.0,4.0,8.0,16.0,1.0,16.0,1.0,16.0,1.0,16.0
