In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [15]:
df = pd.read_csv('train.csv')

In [16]:
# Checking the first few rows of the dataset
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [17]:
# Checking for missing values
df.isnull().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [18]:
# Dropping unnecessary columns (if any)
df = df.drop(['id', 'CustomerId', 'Surname'], axis=1)

In [19]:
# Convert categorical variables to dummy/indicator variables if needed
df = pd.get_dummies(df, columns=['Geography', 'Gender'], drop_first=True)

In [43]:
df.head(20)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,668,33.0,3,0.0,2,1.0,0.0,181449.97,0,False,False,True
1,627,33.0,1,0.0,2,1.0,1.0,49503.5,0,False,False,True
2,678,40.0,10,0.0,2,1.0,0.0,184866.69,0,False,False,True
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0,False,False,True
4,716,33.0,5,0.0,2,1.0,1.0,15068.83,0,False,True,True
5,588,36.0,4,131778.58,1,1.0,0.0,136024.31,1,True,False,True
6,593,30.0,8,144772.69,1,1.0,0.0,29792.11,0,False,False,False
7,678,37.0,1,138476.41,1,1.0,0.0,106851.6,0,False,True,True
8,676,43.0,4,0.0,2,1.0,0.0,142917.13,0,False,False,True
9,583,40.0,4,81274.33,1,1.0,1.0,170843.07,0,True,False,True


In [21]:
X = df.drop('Exited', axis=1)
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [70]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)



In [71]:
y_pred = model.predict(X_test)


In [72]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8589693095404005

Confusion Matrix:
 [[24569  1483]
 [ 3172  3783]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91     26052
           1       0.72      0.54      0.62      6955

    accuracy                           0.86     33007
   macro avg       0.80      0.74      0.77     33007
weighted avg       0.85      0.86      0.85     33007



In [73]:
df_test=pd.read_csv('test.csv')

In [74]:
df_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [75]:
df_test.drop(['id','Surname','CustomerId'],axis='columns',inplace=True)

In [76]:
df_test

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,586,France,Female,23.0,2,0.00,2,0.0,1.0,160976.75
1,683,France,Female,46.0,2,0.00,1,1.0,0.0,72549.27
2,656,France,Female,34.0,7,0.00,2,1.0,0.0,138882.09
3,681,France,Male,36.0,8,0.00,1,1.0,0.0,113931.57
4,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.00
...,...,...,...,...,...,...,...,...,...,...
110018,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62
110019,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68
110020,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38
110021,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58


In [77]:
df = pd.get_dummies(df_test, columns=['Geography', 'Gender'], drop_first=True)

In [78]:
df

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,586,23.0,2,0.00,2,0.0,1.0,160976.75,False,False,False
1,683,46.0,2,0.00,1,1.0,0.0,72549.27,False,False,False
2,656,34.0,7,0.00,2,1.0,0.0,138882.09,False,False,False
3,681,36.0,8,0.00,1,1.0,0.0,113931.57,False,False,True
4,752,38.0,10,121263.62,1,1.0,0.0,139431.00,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...
110018,570,29.0,7,116099.82,1,1.0,1.0,148087.62,False,True,True
110019,575,36.0,4,178032.53,1,1.0,1.0,42181.68,False,False,False
110020,712,31.0,2,0.00,2,1.0,0.0,16287.38,False,False,True
110021,709,32.0,3,0.00,1,1.0,1.0,158816.58,False,False,False


In [79]:
e_pred = model.predict(df)



In [80]:
e_pred

array([1, 0, 1, ..., 1, 0, 0])

In [81]:
import numpy as np
y_pred = np.array([e_pred]) 

# Convert the NumPy array to a Pandas DataFrame
results_df = pd.DataFrame({'Predicted': e_pred})

# Export to CSV
results_df.to_csv('results.csv', index=False)

