# 🧪 Data Pipeline Development using Pandas and Scikit-learn

## 📌 Objective
To automate the data preprocessing workflow using an ETL pipeline that:
- Loads raw customer churn data
- Cleans and preprocesses it
- Transforms it using encoding and scaling
- Splits it into train/test sets
- Saves the processed data

Dataset used: Telco Customer Churn (CSV), loaded from `data/projectdata.csv`.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Enable inline plots
%matplotlib inline


In [None]:
# Read the raw churn CSV into a DataFrame and preview its first rows.
raw_csv_path = "data/projectdata.csv"
data = pd.read_csv(raw_csv_path)
data.head()


## 📊 Exploratory Data Analysis (EDA) and Visualizations


In [None]:
# Bar chart of the number of rows in each Churn class.
churn_ax = sns.countplot(x="Churn", data=data, palette="Set2")
churn_ax.set_title("Churn Distribution")
churn_ax.set_xlabel("Churn")
churn_ax.set_ylabel("Count")
plt.show()


In [None]:
# Plot the null mask of the frame so missing cells stand out visually.
missing_mask = data.isnull()
missing_ax = sns.heatmap(missing_mask, cmap="viridis", cbar=False)
missing_ax.set_title("Missing Data Heatmap")
plt.show()


In [None]:
# Columns stored as int64/float64 are treated as the numerical features.
numerical_cols = data.select_dtypes(include=["int64", "float64"]).columns

# One histogram per numerical column, all drawn on the same figure.
numeric_view = data[numerical_cols]
numeric_view.hist(bins=20, color="skyblue", figsize=(12, 10))
hist_fig = plt.gcf()
hist_fig.suptitle("Distribution of Numerical Features")
hist_fig.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the numerical columns, annotated to 2 decimals.
corr_matrix = data[numerical_cols].corr()
corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=corr_ax)
corr_ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Scatter-matrix of a few key columns, coloured by the Churn label.
selected_cols = ["tenure", "MonthlyCharges", "TotalCharges", "Churn"]
pair_frame = data[selected_cols]
sns.pairplot(pair_frame, hue="Churn", palette="husl")
# y > 1 lifts the title above the top row of the grid.
plt.gcf().suptitle("Pair Plot of Key Features", y=1.02)
plt.show()


In [None]:
# Coerce TotalCharges to a numeric dtype before looking for missing values.
# NOTE(review): the Telco churn CSV is known to store TotalCharges as text with
# blank entries for some rows; as text, those blanks are not NaN (so dropna()
# would keep them) and the column would later be treated as categorical.
# Confirm against data/projectdata.csv. Unparseable entries become NaN and are
# removed by the dropna() below. This is a no-op if the column is already numeric.
if "TotalCharges" in data.columns:
    data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")

# Drop exact duplicate rows.
data.drop_duplicates(inplace=True)

# Drop rows with any missing value.
data.dropna(inplace=True)

# Confirm cleanup
print(f"Dataset shape after cleaning: {data.shape}")


In [None]:
# Separate the target column from the features.
# NOTE(review): the Telco churn dataset carries a per-customer identifier
# column ("customerID"). It holds no predictive signal and, left in X, would be
# one-hot encoded into roughly one column per row. It is dropped here only if
# present — confirm the column name against data/projectdata.csv.
identifier_cols = [col for col in ("customerID",) if col in data.columns]

X = data.drop(columns=["Churn", *identifier_cols])
y = data["Churn"]


In [None]:
# Infer column roles from dtypes; replace these lists by hand if the
# inference picks the wrong columns.
numeric_part = X.select_dtypes(include=["int64", "float64"])
categorical_part = X.select_dtypes(include=["object", "bool"])
numerical_cols = list(numeric_part.columns)
categorical_cols = list(categorical_part.columns)

print("Numerical:", numerical_cols)
print("Categorical:", categorical_cols)


In [None]:
# Numerical columns go through a StandardScaler.
scaling_steps = [("scaler", StandardScaler())]
numerical_pipeline = Pipeline(steps=scaling_steps)

# Categorical columns are one-hot encoded; categories not seen during fit
# are ignored at transform time instead of raising.
encoding_steps = [("encoder", OneHotEncoder(handle_unknown="ignore"))]
categorical_pipeline = Pipeline(steps=encoding_steps)

# Route each column list to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)


In [None]:
# Fit the preprocessor on the whole feature frame and transform it in one call.
# NOTE(review): this fit sees every row of X, including rows that a later
# train/test split will hold out.
X_processed = preprocessor.fit_transform(X)
n_rows, n_features = X_processed.shape
print(f"Processed feature shape: {(n_rows, n_features)}")


In [None]:
# Split the raw features first, then fit the preprocessor on the training rows
# only. Fitting on the full dataset before splitting would let the scaler's
# statistics and the encoder's category list be learned from the held-out test
# rows (train/test leakage). The row count and random_state are the same as a
# split of the already-transformed matrix, so the row assignment is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

X_train = preprocessor.fit_transform(X_train_raw)
# The test rows are only transformed, never fitted on.
X_test = preprocessor.transform(X_test_raw)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


In [None]:
from scipy import sparse

output_folder = "processed_data"
os.makedirs(output_folder, exist_ok=True)


def _features_to_frame(features):
    """Wrap a transformed feature matrix in a DataFrame.

    The column transformer may return either a dense array or a SciPy sparse
    matrix. Passing a sparse matrix straight to ``pd.DataFrame`` does not
    yield a 2-D numeric frame, so sparse input is routed through the pandas
    sparse accessor instead (this also avoids densifying a wide matrix).
    """
    if sparse.issparse(features):
        return pd.DataFrame.sparse.from_spmatrix(features)
    return pd.DataFrame(features)


# Feature matrices: one CSV per split, without the index column.
for split_name, features in (("X_train", X_train), ("X_test", X_test)):
    csv_path = os.path.join(output_folder, f"{split_name}.csv")
    _features_to_frame(features).to_csv(csv_path, index=False)

# Target series: one CSV per split, without the index column.
y_train.to_csv(os.path.join(output_folder, "y_train.csv"), index=False)
y_test.to_csv(os.path.join(output_folder, "y_test.csv"), index=False)

print("✅ Processed data saved to 'processed_data/' folder.")


## ✅ Conclusion

- The dataset was cleaned, encoded, and scaled.
- Final data was split into training and testing sets.
- Processed files were saved in the `processed_data/` folder.
- Ready for model training or further analysis!
