In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Using Vanilla XGBoost Algorithm

In [2]:
df = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')

In [3]:
df

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['Surname'] = label_encoder.fit_transform(df['Surname'])
df['Geography'] = label_encoder.fit_transform(df['Geography'])
df['Gender'] = label_encoder.fit_transform(df['Gender'])

In [5]:
X = df.drop(['Exited'], axis=1)
y = df['Exited']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
model = xgb.XGBClassifier()

In [8]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.865937528403066


In [9]:
df_test = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')

In [10]:
df_test['Surname'] = label_encoder.fit_transform(df_test['Surname'])
df_test['Geography'] = label_encoder.fit_transform(df_test['Geography'])
df_test['Gender'] = label_encoder.fit_transform(df_test['Gender'])

In [11]:
test_pred = model.predict(df_test)

In [12]:
test_pred

array([0, 1, 0, ..., 0, 0, 0])

In [13]:
df_test['Exited'] = test_pred

In [14]:
df_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,165034,15773898,1482,586,0,0,23.0,2,0.0,2,0.0,1.0,160976.75,0
1,165035,15782418,1812,683,0,0,46.0,2,0.0,1,1.0,0.0,72549.27,1
2,165036,15807120,1246,656,0,0,34.0,7,0.0,2,1.0,0.0,138882.09,0
3,165037,15808905,1832,681,0,1,36.0,8,0.0,1,1.0,0.0,113931.57,0
4,165038,15607314,1079,752,1,1,38.0,10,121263.62,1,1.0,0.0,139431.0,0


In [15]:
selected_columns = df_test.loc[:, ['id', 'Exited']]
selected_columns.head()

Unnamed: 0,id,Exited
0,165034,0
1,165035,1
2,165036,0
3,165037,0
4,165038,0


In [16]:
selected_columns.to_csv('bank_churn.csv', index=False)

# Using Catboost pipeline

In [17]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [18]:
train_df = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')

In [19]:
X = train_df.drop(['Exited'], axis=1)
y = train_df['Exited']

In [20]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [22]:
numeric_transformer_xgb = StandardScaler()

preprocessor_xgb = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_xgb, numerical_features),
    ])
categorical_transformer_cat = OneHotEncoder(handle_unknown='ignore')

In [23]:
preprocessor_cat = ColumnTransformer(
transformers=[
    ('num', numeric_transformer_xgb, numerical_features),
    ('cat', categorical_transformer_cat, categorical_features)
])

In [24]:
cat_clf = Pipeline(steps=[('preprocessor', preprocessor_cat),
                         ('classifier', CatBoostClassifier(iterations=100, random_state=42, verbose=0))])

In [25]:
cat_clf.fit(X_train, y_train)

In [26]:
test_predictions_cat = cat_clf.predict_proba(test_df)[:,1]

In [27]:
submission_df = pd.DataFrame({'id': test_df['id'], 'Exited': test_predictions_cat})

In [28]:
submission_df.to_csv('submission.csv', index=False)