---
Split Data into Training and Testing Sets

In [14]:
import pandas as pd
import numpy as np

# Load the feature-engineered dataset into `df_model_ready`.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs on other machines.
parquet_fe_file_path = r"C:\Users\comat\GitProjects\customer-churn-ai\data\telco_churn_feature_engineered.parquet"
try:
    df_model_ready = pd.read_parquet(parquet_fe_file_path)
except FileNotFoundError:
    print(f"Error: File not found at {parquet_fe_file_path}.\nPlease ensure it was saved correctly, check the path, and try again.")
    # Re-raise: every later cell needs `df_model_ready`, so continuing would only
    # surface a confusing NameError further down instead of the real cause.
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}\n")
    raise
else:
    # Reporting happens only after a successful read, so a failure while
    # displaying the summary is never reported as a loading failure.
    print("Featrure-engineered DataFram 'df_model_ready' loaded successfully.\nSensational!\n")
    print(f"Shape of the loaded DataFrame: {df_model_ready.shape}\n")
    df_model_ready.info()

Featrure-engineered DataFram 'df_model_ready' loaded successfully.
Sensational!

Shape of the loaded DataFrame: (7043, 33)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7043 non-null   int64  
 1   MonthlyCharges                         7043 non-null   float64
 2   TotalCharges                           7043 non-null   float64
 3   Churn                                  7043 non-null   int64  
 4   HF_neg                                 7043 non-null   float32
 5   HF_nue                                 7043 non-null   float32
 6   HF_pos                                 7043 non-null   float32
 7   gender_Male                            7043 non-null   bool   
 8   Partner_Yes                            7043 non-null   bool   
 9   Dependents_Yes  

In [15]:
from sklearn.model_selection import train_test_split
# Separate features from the target, then create a stratified train/test split.

# Guard first: without the target column there is nothing to split.
if 'Churn' not in df_model_ready.columns:
    print("Error: 'Churn' column not found in df_model_ready. Cannot proceed with splitting.")
else:
    # 1. Target (y) is the 'Churn' column; every remaining column is a feature (X).
    y = df_model_ready['Churn']
    X = df_model_ready.drop(columns='Churn')

    print("Features (X) and target (y) have been separated.")
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")

    # 2. 80/20 split with a fixed seed for reproducibility; stratifying on y keeps
    #    the churn / no-churn proportions similar in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("\nData successfully split into training and testing sets.")
    for label, part in (
        ("X_train", X_train),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_test", y_test),
    ):
        print(f"Shape of {label}: {part.shape}")

    # Confirm that stratification preserved the class balance in each partition.
    for label, target in (("y_train", y_train), ("y_test", y_test)):
        print(f"\nChurn proportion in {label}:")
        print(target.value_counts(normalize=True).round(3))


Features (X) and target (y) have been separated.
Shape of X: (7043, 32)
Shape of y: (7043,)

Data successfully split into training and testing sets.
Shape of X_train: (5634, 32)
Shape of X_test: (1409, 32)
Shape of y_train: (5634,)
Shape of y_test: (1409,)

Churn proportion in y_train:
Churn
0    0.735
1    0.265
Name: proportion, dtype: float64

Churn proportion in y_test:
Churn
0    0.735
1    0.265
Name: proportion, dtype: float64


---
#### Feature Scaling
- **Identify Numerical Columns** (aka Numerical Features, Columns with Continuous Data) to Scale
- **Choose a Scaler**: StandardScaler from scikit-learn is a common choice. It scales features to have zero mean and unit variance.
- **Fit on Training Data ONLY**: This is a critical rule. We calculate the mean and standard deviation (the "scaling parameters") only from the X_train data.
- **Transform Both Sets**: We then use these parameters learned from X_train to transform both X_train and X_test. This prevents any information from the test set (our "unseen exam") from leaking into the training process.

In [16]:
# Preview the first five rows of the full (unsplit) frame before choosing columns to scale.
df_model_ready.head(n=5)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,HF_neg,HF_nue,HF_pos,gender_Male,Partner_Yes,Dependents_Yes,...,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,HF_Label_Neutral,HF_Label_Positive,TenureGrp_13-24 Months,TenureGrp_25-36 Months,TenureGrp_37-48 Months,TenureGrp_49-60 Months,TenureGrp_61-72 Months
0,0,29.85,29.85,0,0.005294,0.021416,0.973291,False,True,False,...,False,True,False,False,True,False,False,False,False,False
1,0,56.95,1889.5,0,0.004702,0.012738,0.98256,True,False,False,...,False,False,True,False,True,False,True,False,False,False
2,0,53.85,108.15,1,0.906199,0.086496,0.007306,True,False,False,...,False,False,True,False,False,False,False,False,False,False
3,0,42.3,1840.75,0,0.004989,0.034239,0.960772,True,False,False,...,False,False,False,False,True,False,False,True,False,False
4,0,70.7,151.65,1,0.930403,0.06082,0.008777,False,False,False,...,False,True,False,False,False,False,False,False,False,False


In [17]:
# Continuous-valued features that will be standardized.
# Left out on purpose: SeniorCitizen and the boolean columns (already 0/1),
# and Churn (the target, which is not part of X).
# HF_neg / HF_nue / HF_pos are probabilities in [0, 1]; they are included anyway
# because the other features live on very different scales.
cols_to_scale = ['MonthlyCharges', 'TotalCharges', 'HF_neg', 'HF_nue', 'HF_pos']

# Report any requested column that X_train does not actually contain.
present_cols = set(X_train.columns)
missing_cols = [name for name in cols_to_scale if name not in present_cols]
if not missing_cols:
    print(f"Columns identified for scaling: {cols_to_scale}")
else:
    print(f"Warning: The following columns are missing from X_train: {missing_cols}")


Columns identified for scaling: ['MonthlyCharges', 'TotalCharges', 'HF_neg', 'HF_nue', 'HF_pos']


In [18]:
# Import StandardScaler for scaling
from sklearn.preprocessing import StandardScaler

# Standardize the selected columns: the scaler learns its parameters from the
# training split only and is then applied unchanged to the test split.
scaler = StandardScaler()

# Fit + transform on X_train. Fitting must never see X_test, otherwise
# information from the held-out data would leak into training.
train_numeric = X_train[cols_to_scale]
print(f"Fitting StandardScaler on X_train for columns: {cols_to_scale}")
X_train_scaled_cols = scaler.fit_transform(train_numeric)
print("\nX_train columns scaled successfully.")

# Transform only (no refit) on X_test, reusing the parameters learned above.
test_numeric = X_test[cols_to_scale]
X_test_scaled_cols = scaler.transform(test_numeric)
print("\nX_test columns scaled using the scaler fitted on X_train.")

# The scaler returns plain NumPy arrays; wrap them so that column names and
# the original row index line up with X_train / X_test again.
X_train_scaled_df = pd.DataFrame(
    X_train_scaled_cols,
    index=X_train.index,
    columns=cols_to_scale,
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled_cols,
    index=X_test.index,
    columns=cols_to_scale,
)

# Write the scaled values back column by column; all other columns stay untouched.
for frame, scaled in ((X_train, X_train_scaled_df), (X_test, X_test_scaled_df)):
    for col in cols_to_scale:
        frame[col] = scaled[col]

print("\nScaled numerical features in X_train and X_test have been updated.")

# Preview: the scaled columns plus up to two gender_/Contract_ columns for context.
print("\nHead of X_train after scaling (showing scaled columns and a few others):")

context_cols = [
    name for name in X_train.columns
    if name.startswith(('gender_', 'Contract_'))
][:2]  # keep at most the first two matches

# Drop any name that is not actually a column of X_train before indexing.
display_cols = [name for name in cols_to_scale + context_cols if name in X_train.columns]
print(X_train[display_cols].head())

Fitting StandardScaler on X_train for columns: ['MonthlyCharges', 'TotalCharges', 'HF_neg', 'HF_nue', 'HF_pos']

X_train columns scaled successfully.

X_test columns scaled using the scaler fitted on X_train.

Scaled numerical features in X_train and X_test have been updated.

Head of X_train after scaling (showing scaled columns and a few others):
      MonthlyCharges  TotalCharges    HF_neg    HF_nue    HF_pos  gender_Male  \
3738       -0.521976     -0.263871 -0.596856 -0.608280  0.631591         True   
3151        0.337478     -0.505423 -0.594176  0.155381  0.535430         True   
4860       -0.809013     -0.751850 -0.578528  2.236056  0.265643         True   
3867        0.284384     -0.174271 -0.599526 -0.571609  0.629585        False   
3810       -0.676279     -0.991514 -0.595537 -0.536079  0.621505         True   

      Contract_One year  
3738              False  
3151              False  
4860              False  
3867              False  
3810              False  


In [19]:
# Sanity check on the scaled training columns: mean ~ 0 and standard deviation ~ 1.
scaled_train = X_train[cols_to_scale]

print("Mean of scaled columns in X_train (should be close to 0):")
print(scaled_train.mean().round(5))

# pandas' .std() defaults to the sample estimate (ddof=1) whereas StandardScaler
# normalizes with the population estimate (ddof=0), so the values printed here
# come out as sqrt(n / (n - 1)) -- marginally above 1 -- rather than exactly 1.
print("\nStandard deviation of scaled columns in X_train (should be close to 1):")
print(scaled_train.std().round(5))

Mean of scaled columns in X_train (should be close to 0):
MonthlyCharges   -0.0
TotalCharges     -0.0
HF_neg            0.0
HF_nue            0.0
HF_pos            0.0
dtype: float64

Standard deviation of scaled columns in X_train (should be close to 1):
MonthlyCharges    1.00009
TotalCharges      1.00009
HF_neg            1.00009
HF_nue            1.00009
HF_pos            1.00009
dtype: float64


---
Save X_train, X_test (DataFrames) and y_train, y_test (Series) into separate Parquet files

In [26]:
import os # Import the os module
#  X_train, X_test, y_train, y_test are split datasets

# Persist the four split datasets as Parquet files under <project>/data/training_input.

# Define the base path components
# NOTE(review): absolute, machine-specific project path; consider making it configurable.
base_project_folder = r"C:\Users\comat\GitProjects\customer-churn-ai" # Raw string is fine here
data_subfolder = "data"
training_input_subfolder_name = "training_input"

# Construct the path to the 'training_input' directory robustly
training_input_path = os.path.join(base_project_folder, data_subfolder, training_input_subfolder_name)

# Ensure the target directory exists, create it if it doesn't
try:
    os.makedirs(training_input_path, exist_ok=True)
    print(f"Ensured directory exists: {training_input_path}")
except Exception as e:
    print(f"Error creating directory {training_input_path}: {e}")
    raise # Stop if directory can't be made

# Define full file paths using os.path.join()
x_train_path = os.path.join(training_input_path, "X_train.parquet")
x_test_path = os.path.join(training_input_path, "X_test.parquet")
y_train_path = os.path.join(training_input_path, "y_train.parquet")
y_test_path = os.path.join(training_input_path, "y_test.parquet")

try:
    # Save DataFrames (X_train, X_test).
    # index=False drops the original row labels; features and targets are written
    # in the same row order, so they still correspond to each other by position.
    X_train.to_parquet(x_train_path, index=False)
    print(f"X_train saved to {x_train_path}")
    X_test.to_parquet(x_test_path, index=False)
    print(f"X_test saved to {x_test_path}")


    # Save Series (y_train, y_test) by converting to DataFrame first
    y_train.to_frame(name='Churn').to_parquet(y_train_path, index=False)
    print(f"y_train saved to {y_train_path}")
    y_test.to_frame(name='Churn').to_parquet(y_test_path, index=False)
    print(f"y_test saved to {y_test_path}")

    print("\nAll split datasets saved successfully as Parquet files in the 'training_input' subfolder!")

except Exception as e:
    print(f"An error occurred while saving the data: {e}")
    # Re-raise, consistent with the directory-creation step above: a failure here
    # can leave only some of the four files written, and that must not pass silently.
    raise

Ensured directory exists: C:\Users\comat\GitProjects\customer-churn-ai\data\training_input
X_train saved to C:\Users\comat\GitProjects\customer-churn-ai\data\training_input\X_train.parquet
X_test saved to C:\Users\comat\GitProjects\customer-churn-ai\data\training_input\X_test.parquet
y_train saved to C:\Users\comat\GitProjects\customer-churn-ai\data\training_input\y_train.parquet
y_test saved to C:\Users\comat\GitProjects\customer-churn-ai\data\training_input\y_test.parquet

All split datasets saved successfully as Parquet files in the 'training_input' subfolder!
