### Import Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

### Read File

In [2]:
# Load the training set from disk and preview its first rows
# (DataFrame.head() shows 5 rows by default).
data = pd.read_csv("data/train.csv")
data.head()

Unnamed: 0,Customer Id,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,3JUN0VW6F043,34,Private Sector/Self Employed,Yes,1300000,6,0,Yes,No,No
1,VLHY2ABIR4QL,28,Private Sector/Self Employed,Yes,750000,7,0,Yes,No,No
2,6E3F7UNXYNFF,28,Private Sector/Self Employed,Yes,750000,6,0,Yes,No,No
3,JJ8R0ZRYWR31,32,Government Sector,Yes,800000,6,1,No,No,No
4,2WGFUEX6IEHM,34,Private Sector/Self Employed,Yes,700000,4,1,No,No,No


In [3]:
# Dimensions of the training frame as (rows, columns).
data.shape

(1590, 10)

In [4]:
# Per-column dtypes, to see which columns are non-numeric (`object`).
data.dtypes

Customer Id            object
Age                     int64
Employment Type        object
GraduateOrNot          object
AnnualIncome            int64
FamilyMembers           int64
ChronicDiseases         int64
FrequentFlyer          object
EverTravelledAbroad    object
TravelInsurance        object
dtype: object

### Preprocess Data

In [5]:
def preprocess_data(df):
    """Encode and scale the training frame and return the fitted transformers.

    The input frame is not modified: all work happens on a copy, so the raw
    data stays available to the caller and the function can safely be called
    again on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. Must contain the categorical columns 'Employment Type',
        'GraduateOrNot', 'FrequentFlyer' and 'EverTravelledAbroad', the target
        column 'TravelInsurance', and the numeric columns 'Age',
        'AnnualIncome' and 'FamilyMembers'. Any other column is passed
        through unchanged.

    Returns
    -------
    tuple
        ``(processed, label_encoders, target_le, scaler)`` where ``processed``
        is the transformed copy of ``df``, ``label_encoders`` maps each
        categorical column name to its fitted LabelEncoder, ``target_le`` is
        the LabelEncoder fitted on 'TravelInsurance', and ``scaler`` is the
        StandardScaler fitted on the three numeric columns.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    """
    categorical_columns = ['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad']
    target_column = 'TravelInsurance'
    # 'ChronicDiseases' is deliberately left out of scaling: it is a binary column.
    numeric_columns = ['Age', 'AnnualIncome', 'FamilyMembers']

    required = categorical_columns + [target_column] + numeric_columns
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"preprocess_data: missing required column(s): {missing}")

    # Work on a copy so the caller's frame keeps its raw values.
    df = df.copy()

    # Encode categorical variables, keeping one fitted encoder per column so
    # the same mapping can be re-applied to other data later.
    label_encoders = {}
    for column in categorical_columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

    # Encode target variable
    target_le = LabelEncoder()
    df[target_column] = target_le.fit_transform(df[target_column])

    # Scale numerical variables (excluding binary column 'ChronicDiseases')
    scaler = StandardScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

    return df, label_encoders, target_le, scaler

In [6]:
# Fit the encoders and scaler on the training frame and keep them so the same
# transformations can be applied to the test set later.
# NOTE(review): `data` is rebound to the processed frame, so re-running this cell
# without first reloading train.csv would refit the encoders and the scaler on
# already-transformed values.
data, label_encoders, target_le, scaler = preprocess_data(data)

In [7]:
# Target vector: the encoded 'TravelInsurance' label.
y = data['TravelInsurance']

# Feature matrix: every remaining column except the row identifier and the target.
X = data.drop(['Customer Id', 'TravelInsurance'], axis=1)

### Train Model

In [8]:
# Random forest fitted on the full training matrix; the fixed seed keeps runs
# repeatable. fit() returns the estimator itself, so it can be chained.
model_rf = RandomForestClassifier(random_state=42).fit(X, y)
model_rf

In [9]:
# Logistic regression fitted on the same training matrix, with a fixed seed.
# fit() returns the estimator itself, so it can be chained.
model_lr = LogisticRegression(random_state=42).fit(X, y)
model_lr

In [11]:
# Gradient-boosted trees (XGBoost) fitted on the same training matrix, using
# log-loss as the evaluation metric and a fixed seed.
# fit() returns the estimator itself, so it can be chained.
model_xg = xgb.XGBClassifier(random_state=42, eval_metric='logloss').fit(X, y)
model_xg

### Predict on Test Data

In [12]:
# Load the test file that the fitted models will score.
test_data = pd.read_csv('data/test.csv')

In [13]:
def preprocess_test_data(df, label_encoders, scaler):
    """Apply already-fitted encoders and scaler to a new frame.

    Nothing is refitted here: each categorical column is mapped with the
    encoder stored under its name, and the numeric columns are scaled with
    the supplied scaler. The input frame is not modified; a transformed copy
    is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to transform. Must contain 'Employment Type', 'GraduateOrNot',
        'FrequentFlyer', 'EverTravelledAbroad', 'Age', 'AnnualIncome' and
        'FamilyMembers'. Any other column is passed through unchanged.
    label_encoders : dict
        Maps each categorical column name to a fitted encoder exposing
        ``transform``.
    scaler : object
        Fitted scaler exposing ``transform`` for the three numeric columns.

    Returns
    -------
    pandas.DataFrame
        Transformed copy of ``df``.

    Raises
    ------
    KeyError
        If a required column is missing from ``df`` or an encoder is missing
        from ``label_encoders``.
    """
    categorical_columns = ['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad']
    numeric_columns = ['Age', 'AnnualIncome', 'FamilyMembers']

    missing = [name for name in categorical_columns + numeric_columns if name not in df.columns]
    if missing:
        raise KeyError(f"preprocess_test_data: missing required column(s): {missing}")

    # Work on a copy so the caller's frame keeps its raw values and the
    # function can be called again on the same input.
    df = df.copy()

    for column in categorical_columns:
        le = label_encoders[column]
        # Errors raised by the encoder (e.g. for values it was not fitted on)
        # propagate to the caller unchanged.
        df[column] = le.transform(df[column])

    df[numeric_columns] = scaler.transform(df[numeric_columns])

    return df

In [14]:
# Transform the test frame with the encoders and scaler returned by preprocess_data.
# NOTE(review): `test_data` is rebound to the transformed frame, so re-running this
# cell without first reloading test.csv would pass already-encoded values back
# into the encoders.
test_data = preprocess_test_data(test_data, label_encoders, scaler)

In [15]:
# Build the test matrix from exactly the columns the models were fitted on, in
# the same order, so extra or reordered columns in test.csv cannot mis-align
# the features. A KeyError here means test.csv lacks a training feature.
X_test = test_data[list(X.columns)]

# Column 1 of predict_proba is the probability of the class encoded as 1
# (presumably 'Yes', given the sorted class order — confirm with target_le.classes_).
test_rf_predictions = model_rf.predict_proba(X_test)[:, 1]
test_lr_predictions = model_lr.predict_proba(X_test)[:, 1]
test_xg_predictions = model_xg.predict_proba(X_test)[:, 1]

In [16]:
def _write_predictions(customer_ids, predictions, path):
    """Pair customer ids with predicted probabilities, save them as CSV, and return the frame."""
    frame = pd.DataFrame({
        'Customer Id': customer_ids,
        'prediction': predictions
    })
    # index=False keeps the row index out of the written file.
    frame.to_csv(path, index=False)
    return frame


# One output file per model, written in the same order as the models were trained.
output_rf = _write_predictions(test_data['Customer Id'], test_rf_predictions, 'output_rf.csv')
output_lr = _write_predictions(test_data['Customer Id'], test_lr_predictions, 'output_lr.csv')
output_xg = _write_predictions(test_data['Customer Id'], test_xg_predictions, 'output_xg.csv')