# Dataset Exploration
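This notebook performs an initial exploration of the Telco Customer Churn dataset, converts the `TotalCharges` column to a numeric type, drops the rows that fail that conversion, and saves both the cleaned data and a stratified train/test split.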

In [5]:
# Import necessary libraries
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# NOTE(review): with no category or module argument this filter hides every warning
# raised for the rest of the session, including deprecation notices from the
# libraries used below — consider narrowing it to the specific warnings that are noisy.
warnings.filterwarnings('ignore')

In [6]:
# Load the dataset
# The path is written as a raw string: it contains backslash sequences (\E, \D, \F,
# \i, \T) that are not valid escapes, so a plain string literal only yields the
# intended path by accident and is flagged as an invalid escape sequence by Python.
df = pd.read_csv(r'D:\EDU\DAS 601 ML\Final Project\Data\input\Telco_Customer_kaggle.csv')
print(f"Dataset loaded successfully with shape: {df.shape}")

Dataset loaded successfully with shape: (7043, 21)


In [7]:
# Quick structural overview: the (rows, columns) tuple, followed by a tally of
# how many columns share each dtype
print("Dataset Shape:", df.shape, sep="\n")
print("\nData Types:", df.dtypes.value_counts(), sep="\n")

Dataset Shape:
(7043, 21)

Data Types:
object     18
int64       2
float64     1
Name: count, dtype: int64


In [8]:
# Peek at the top of the table; the bare final expression is what the cell displays
print("First 5 rows:")
df.head(n=5)

First 5 rows:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
# Summary statistics for every column regardless of dtype, transposed so that
# each dataset column becomes one row of the displayed summary
print("Descriptive statistics (all columns):")
df.describe(include='all').transpose()

Descriptive statistics (all columns):


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
customerID,7043.0,7043.0,7590-VHVEG,1.0,,,,,,,
gender,7043.0,2.0,Male,3555.0,,,,,,,
SeniorCitizen,7043.0,,,,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
Partner,7043.0,2.0,No,3641.0,,,,,,,
Dependents,7043.0,2.0,No,4933.0,,,,,,,
tenure,7043.0,,,,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
PhoneService,7043.0,2.0,Yes,6361.0,,,,,,,
MultipleLines,7043.0,3.0,No,3390.0,,,,,,,
InternetService,7043.0,3.0,Fiber optic,3096.0,,,,,,,
OnlineSecurity,7043.0,3.0,No,3498.0,,,,,,,


In [10]:
# Missing-value audit: per-column NaN counts, showing only columns that have any
null_counts = df.isna().sum()

print("Null values per column:")
print(null_counts.loc[null_counts.gt(0)])

# The grand total is the sum of the per-column counts computed above
print(f"\nTotal null values: {null_counts.sum()}")

Null values per column:
Series([], dtype: int64)

Total null values: 0


In [11]:
# Duplicate audit: rows repeated in full, then distinct customerID values
# compared against the total number of rows
duplicate_row_count = df.duplicated().sum()
distinct_customer_ids = df['customerID'].nunique()

print(f"Number of duplicate rows: {duplicate_row_count}")
print(f"Unique customerID count: {distinct_customer_ids}")
print(f"Total rows: {df.shape[0]}")

Number of duplicate rows: 0
Unique customerID count: 7043
Total rows: 7043


In [12]:
# TotalCharges: report its current dtype and a sample, then cast it to numeric
total_charges_before = df['TotalCharges']
print("TotalCharges data type before conversion:", total_charges_before.dtype)
print("Sample TotalCharges values:", total_charges_before.head())

# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising, so the NaN count below shows how many values failed to parse
df['TotalCharges'] = pd.to_numeric(total_charges_before, errors='coerce')

total_charges_after = df['TotalCharges']
print("\nTotalCharges data type after conversion:", total_charges_after.dtype)
print(f"Number of NaN values in TotalCharges: {total_charges_after.isnull().sum()}")

TotalCharges data type before conversion: object
Sample TotalCharges values: 0      29.85
1     1889.5
2     108.15
3    1840.75
4     151.65
Name: TotalCharges, dtype: object

TotalCharges data type after conversion: float64
Number of NaN values in TotalCharges: 11


In [13]:
# Build df_clean by discarding the rows whose TotalCharges is NaN;
# df itself is not reassigned, so the unfiltered frame stays available
initial_shape = df.shape
df_clean = df.dropna(subset=['TotalCharges'])
rows_dropped = initial_shape[0] - len(df_clean)

print(f"Shape before dropping NaN: {initial_shape}")
print(f"Shape after dropping NaN: {df_clean.shape}")
print(f"Rows dropped: {rows_dropped}")

Shape before dropping NaN: (7043, 21)
Shape after dropping NaN: (7032, 21)
Rows dropped: 11


In [17]:
# Save cleaned dataset

# Create interim directory if it doesn't exist
interim_dir = r'D:\EDU\DAS 601 ML\Final Project\Data\interim'
os.makedirs(interim_dir, exist_ok=True)

# Derive the file path from interim_dir rather than repeating the directory literal,
# so the directory that is created and the directory that is written to always match
output_path = os.path.join(interim_dir, 'telco_clean.csv')

# index=False keeps the DataFrame index out of the CSV
df_clean.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to {output_path}")

Cleaned dataset saved to D:\EDU\DAS 601 ML\Final Project\Data\interim\telco_clean.csv


In [18]:
# Stratified train-test split
# Features are every column except the target; the target is the Churn column
X = df_clean.drop(columns=['Churn'])
y = df_clean['Churn']

# stratify=y keeps the Churn class proportions the same in both partitions;
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Combine features and target for saving (Churn is appended as the last column)
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Sanity check: the two partitions together must account for every cleaned row
assert len(train_df) + len(test_df) == len(df_clean), "split lost or duplicated rows"

# Create output directory if it doesn't exist
output_dir = r'D:\EDU\DAS 601 ML\Final Project\Data\output'
os.makedirs(output_dir, exist_ok=True)

# Derive both file paths from output_dir rather than repeating the directory literal,
# so the directory that is created and the directory that is written to always match
train_path = os.path.join(output_dir, 'train.csv')
test_path = os.path.join(output_dir, 'test.csv')

# Save train and test sets; index=False keeps the DataFrame index out of the CSVs
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Training set saved to: {train_path}")
print(f"Test set saved to: {test_path}")

# Class proportions in each partition, to confirm the stratification worked
print("\nChurn distribution in training set:")
print(y_train.value_counts(normalize=True))
print("\nChurn distribution in test set:")
print(y_test.value_counts(normalize=True))

Training set shape: (5625, 21)
Test set shape: (1407, 21)
Training set saved to: D:\EDU\DAS 601 ML\Final Project\Data\output\train.csv
Test set saved to: D:\EDU\DAS 601 ML\Final Project\Data\output\test.csv

Churn distribution in training set:
Churn
No     0.734222
Yes    0.265778
Name: proportion, dtype: float64

Churn distribution in test set:
Churn
No     0.734186
Yes    0.265814
Name: proportion, dtype: float64
