In [64]:
# --- Cell 1: Load the preprocessed dataset and inspect target distribution ---
import pandas as pd

# Load the final preprocessed dataset
final_path = "/home/danial/Data Science/Churn Prediction/Data/Processed/final_preprocessed.csv"
df = pd.read_csv(final_path)

# Drop leaky column
df = df.drop(columns=['ChurnFlag'], errors='ignore')

# Confirm removal
print("Columns after removing ChurnFlag:", df.columns.tolist())


# Check the shape and class distribution
print("Dataset shape:", df.shape)
print("\nTarget distribution:")
print(df['Churn'].value_counts(normalize=True))


Columns after removing ChurnFlag: ['SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'ChargeDiff', 'ChargeDiffPerc', 'NumServices', 'HasInternet', 'InternetService_Fiber optic', 'InternetService_No', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check', 'Churn', 'tenure_group_12-24', 'tenure_group_24-48', 'tenure_group_48+']
Dataset shape: (7043, 30)

Target distribution:
Churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64


In [65]:
# --- Cell 2: First split into Train and Temp (Validation+Test) with stratification ---
from sklearn.model_selection import train_test_split

# Separate features and target
X = df.drop(columns=['Churn'])
y = df['Churn']

# Split: 80% Train, 20% Temp (Validation+Test), keeping class balance
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train size:", X_train.shape, "Temp size:", X_temp.shape)
print("Train target distribution:\n", y_train.value_counts(normalize=True))


Train size: (5634, 29) Temp size: (1409, 29)
Train target distribution:
 Churn
0    0.734647
1    0.265353
Name: proportion, dtype: float64


In [66]:
# --- Cell 3: Split Temp into Validation and Test sets ---
# Now split the 20% Temp equally into Validation and Test (10% each of the original dataset)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print("Validation size:", X_val.shape, "Test size:", X_test.shape)
print("Validation target distribution:\n", y_val.value_counts(normalize=True))
print("Test target distribution:\n", y_test.value_counts(normalize=True))


Validation size: (704, 29) Test size: (705, 29)
Validation target distribution:
 Churn
0    0.734375
1    0.265625
Name: proportion, dtype: float64
Test target distribution:
 Churn
0    0.734752
1    0.265248
Name: proportion, dtype: float64


In [67]:
# --- Cell 4: Check for potential data leakage (duplicate rows across splits) ---
# Concatenate indices for each split and check overlaps
train_idx = set(X_train.index)
val_idx = set(X_val.index)
test_idx = set(X_test.index)

overlap_train_val = train_idx.intersection(val_idx)
overlap_train_test = train_idx.intersection(test_idx)
overlap_val_test = val_idx.intersection(test_idx)

print("Overlap between Train and Validation:", len(overlap_train_val))
print("Overlap between Train and Test:", len(overlap_train_test))
print("Overlap between Validation and Test:", len(overlap_val_test))


Overlap between Train and Validation: 0
Overlap between Train and Test: 0
Overlap between Validation and Test: 0


In [69]:
# --- Cell 5: Save the splits for later use ---
train_path = "/home/danial/Data Science/Churn Prediction/Data/Splitted/train.csv"
val_path = "/home/danial/Data Science/Churn Prediction/Data/Splitted/val.csv"
test_path = "/home/danial/Data Science/Churn Prediction/Data/Splitted/test.csv"

X_train.assign(Churn=y_train).to_csv(train_path, index=False)
X_val.assign(Churn=y_val).to_csv(val_path, index=False)
X_test.assign(Churn=y_test).to_csv(test_path, index=False)

print("Datasets saved successfully:")
print(train_path, val_path, test_path)


Datasets saved successfully:
/home/danial/Data Science/Churn Prediction/Data/Splitted/train.csv /home/danial/Data Science/Churn Prediction/Data/Splitted/val.csv /home/danial/Data Science/Churn Prediction/Data/Splitted/test.csv
