In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import os
import pickle


In [52]:
# Read the raw churn dataset from the working directory, report its
# dimensions, and preview the first rows.
df = pd.read_csv(
    r"Churn_Modelling.csv"
)
print(df.shape)
df.head()


(10000, 14)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Wrangle Function

In [53]:
def wrangle(dataframe, encoder_path="ohe_preprocessor.pkl"):
    """Clean the churn dataframe and encode its categorical columns.

    Steps, in order:

    1. Drop the identifier columns ``RowNumber``, ``CustomerId`` and
       ``Surname`` when they are present.
    2. Map ``Gender`` to integers (``"Male"`` -> 1, ``"Female"`` -> 0) unless
       the column is missing or already numeric. Values outside that mapping
       become NaN.
    3. One-hot encode ``Geography`` with a ``ColumnTransformer``. A fitted
       transformer is loaded from ``encoder_path`` when that file exists;
       otherwise a new one is fitted on ``dataframe`` and pickled to
       ``encoder_path``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    encoder_path : str
        Path of the pickled ``ColumnTransformer`` to load or create.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. When ``Geography`` was present, the frame is
        rebuilt from the transformer's array output: the one-hot columns come
        first, followed by the remaining columns in their original order, and
        the row index of the input frame is preserved.
    """
    # drop() returns a new object, so the caller's frame is left untouched.
    dataframe = dataframe.drop(
        columns=["RowNumber", "CustomerId", "Surname"],
        errors="ignore",
    )

    # Label-encode Gender. Checking "not numeric" (rather than dtype ==
    # "object") also covers pandas string dtypes, and keeps a second call on
    # an already-encoded frame from mapping the numbers to NaN.
    if "Gender" in dataframe.columns and not pd.api.types.is_numeric_dtype(
        dataframe["Gender"]
    ):
        dataframe["Gender"] = dataframe["Gender"].map({"Male": 1, "Female": 0})

    # Nothing left to do when Geography is absent (e.g. already encoded).
    if "Geography" not in dataframe.columns:
        return dataframe

    if os.path.exists(encoder_path):
        # SECURITY: pickle.load can execute arbitrary code; only point
        # encoder_path at files this project produced itself.
        with open(encoder_path, "rb") as file:
            preprocessor = pickle.load(file)
        X_transformed = preprocessor.transform(dataframe)
    else:
        preprocessor = ColumnTransformer(
            transformers=[
                (
                    "geo",
                    OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
                    ["Geography"],
                )
            ],
            remainder="passthrough",
        )
        X_transformed = preprocessor.fit_transform(dataframe)
        with open(encoder_path, "wb") as file:
            pickle.dump(preprocessor, file)

    geo_feature_names = (
        preprocessor.named_transformers_["geo"]
        .get_feature_names_out(["Geography"])
    )

    # The transformer emits the encoded columns first and passes the remaining
    # columns through afterwards, so the labels are rebuilt in that order.
    remaining_cols = [col for col in dataframe.columns if col != "Geography"]
    final_columns = list(geo_feature_names) + remaining_cols

    # Use the index of the frame being transformed, not a notebook global, so
    # the function works for any input frame.
    return pd.DataFrame(
        X_transformed,
        columns=final_columns,
        index=dataframe.index,
    )

In [10]:
# Drop only the identifier columns here so the exploratory cells below still
# see the raw Geography and Gender values. Calling wrangle() at this point
# would encode and remove Geography, and the next cell would raise KeyError on
# a fresh Restart & Run All; the full wrangle() call happens after the EDA.
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")

In [12]:
# Number of customers per country.
df["Geography"].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [13]:
# Number of customers per gender.
df["Gender"].value_counts()

Gender
Male      5457
Female    4543
Name: count, dtype: int64

In [14]:
# Count missing values in each column.
df.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [19]:
# Summary statistics, one row per numeric column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CreditScore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
Age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
Tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
Balance,10000.0,76485.889288,62397.405202,0.0,0.0,97198.54,127644.24,250898.09
NumOfProducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
HasCrCard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
IsActiveMember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,10000.0,100090.239881,57510.492818,11.58,51002.11,100193.915,149388.2475,199992.48
Exited,10000.0,0.2037,0.402769,0.0,0.0,0.0,0.0,1.0


In [54]:
# Run the full cleaning/encoding step and preview the encoded frame.
df = wrangle(df)
df.head()


Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1.0,0.0,0.0,619.0,0.0,42.0,2.0,0.0,1.0,1.0,1.0,101348.88,1.0
1,0.0,0.0,1.0,608.0,0.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,0.0
2,1.0,0.0,0.0,502.0,0.0,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57,1.0
3,1.0,0.0,0.0,699.0,0.0,39.0,1.0,0.0,2.0,0.0,0.0,93826.63,0.0
4,0.0,0.0,1.0,850.0,0.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.1,0.0


In [55]:
#Spliting the data into X and y
X=df.drop('Exited',axis=1)
y=df['Exited']

In [56]:
# Train test Split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=999)

In [57]:
# Scaling of Features
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [58]:
# Inspect the scaled training features (displayed as an array in the output).
X_train

array([[-1.0093773 , -0.58084006,  1.76468061, ...,  0.64821723,
        -1.02895228,  0.33375419],
       [ 0.99070982, -0.58084006, -0.56667478, ...,  0.64821723,
        -1.02895228,  1.14999591],
       [-1.0093773 , -0.58084006,  1.76468061, ..., -1.54269273,
        -1.02895228, -0.81799761],
       ...,
       [ 0.99070982, -0.58084006, -0.56667478, ...,  0.64821723,
        -1.02895228,  1.43992042],
       [ 0.99070982, -0.58084006, -0.56667478, ..., -1.54269273,
         0.97186237, -1.67593991],
       [ 0.99070982, -0.58084006, -0.56667478, ...,  0.64821723,
        -1.02895228,  1.04889782]])

In [59]:
# Persist the fitted scaler so the same scaling can be reused later.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)