---
Split Data into Training and Testing Sets

In [5]:
import pandas as pd
import numpy as np

# Load the feature-engineered Telco churn dataset from parquet into `df_model_ready`.
# NOTE(review): this is an absolute, machine-specific path; consider a path relative
# to a configurable data directory so the notebook runs on other machines.
parquet_fe_file_path = r"C:\Users\comat\GitProjects\customer-churn-ai\data\telco_churn_feature_engineered.parquet"
try:
    # Keep only the read inside the try so the handlers below report load failures only.
    df_model_ready = pd.read_parquet(parquet_fe_file_path)
except FileNotFoundError:
    print(f"Error: File not found at {parquet_fe_file_path}.\nPlease ensure it was saved correctly, check the path, and try again.")
    # Re-raise: later cells need `df_model_ready`; continuing would only fail
    # further down with a less helpful NameError.
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}\n")
    # Re-raise for the same reason: never continue without the DataFrame.
    raise
else:
    # Runs only when the load succeeded: confirm and summarise what was loaded.
    print("Featrure-engineered DataFram 'df_model_ready' loaded successfully.\nSensational!\n")
    print(f"Shape of the loaded DataFrame: {df_model_ready.shape}\n")
    df_model_ready.info()

Featrure-engineered DataFram 'df_model_ready' loaded successfully.
Sensational!

Shape of the loaded DataFrame: (7043, 33)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   MonthlyCharges                         7043 non-null   float64
 2   TotalCharges                           7043 non-null   float64
 3   Churn                                  7043 non-null   int64  
 4   HF_neg                                 7043 non-null   float32
 5   HF_nue                                 7043 non-null   float32
 6   HF_pos                                 7043 non-null   float32
 7   gender_Male                            7043 non-null   bool   
 8   Partner_Yes                            7043 non-null   bool   
 9   Dependents_Yes  

In [None]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets

# Guard first: without the 'Churn' target column there is nothing to split.
if 'Churn' not in df_model_ready.columns:
    print("Error: 'Churn' column not found in df_model_ready. Cannot proceed with splitting.")
else:
    # Step 1 - pull the target out, then treat every remaining column as a feature.
    y = df_model_ready['Churn']               # target Series
    X = df_model_ready.drop(columns='Churn')  # feature DataFrame (all columns but 'Churn')

    print("Features (X) and target (y) have been separated.")
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")

    # Step 2 - hold out 20% of the rows for testing.
    #   stratify=y      -> train and test keep similar class proportions
    #   random_state=42 -> the same split is produced on every run
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        random_state=42,
        test_size=0.2,
    )

    print("\nData successfully split into training and testing sets.")
    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of X_test: {X_test.shape}")
    print(f"Shape of y_train: {y_train.shape}")
    print(f"Shape of y_test: {y_test.shape}")

    # Sanity check: class proportions of the target in each partition.
    print("\nChurn proportion in y_train:")
    print(y_train.value_counts(normalize=True).round(3))
    print("\nChurn proportion in y_test:")
    print(y_test.value_counts(normalize=True).round(3))


Features (X) and target (y) have been separated.
Shape of X: (7043, 32)
Shape of y: (7043,)

Data successfully split into training and testing sets.
Shape of X_train: (5634, 32)
Shape of X_test: (1409, 32)
Shape of y_train: (5634,)
Shape of y_test: (1409,)

Churn proportion in y_train:
Churn
0    0.735
1    0.265
Name: proportion, dtype: float64

Churn proportion in y_test:
Churn
0    0.735
1    0.265
Name: proportion, dtype: float64
