# Task 1 (Pro Level): Data Pipeline for ETL

**Internship**: CODTECH Data Science Track

---
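This notebook implements a professional-level ETL (Extract, Transform, Load) pipeline on the Iris dataset: it loads the data from scikit-learn, removes duplicate rows, standardizes the feature columns, and saves the result to a CSV file.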

## 🔍 Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

## 📥 Step 2: Define Functions

In [None]:
def load_data():
    """Load the scikit-learn Iris dataset into a DataFrame.

    Returns:
        pandas.DataFrame whose columns are the dataset's feature names,
        followed by a 'target' column holding the values of ``iris.target``.
    """
    bunch = load_iris()
    measurements = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
    # assign() appends 'target' as the last column of a new frame.
    return measurements.assign(target=bunch.target)

def preprocess_data(df):
    """Return a copy of *df* with exact duplicate rows removed.

    The input frame is left untouched, so re-running this step (for example,
    re-executing a notebook cell) always starts from the same data.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame that keeps the first occurrence of each distinct row.
        Surviving rows retain their original index labels.
    """
    # Without inplace=True, drop_duplicates() returns a new frame instead of
    # mutating the caller's object.
    return df.drop_duplicates()

def transform_data(df):
    """Standardize the feature columns and label-encode 'target'.

    Works on a copy, so the caller's DataFrame is not modified. Every column
    other than 'target' is treated as a feature, regardless of where the
    'target' column sits in the frame.

    Args:
        df: DataFrame with numeric feature columns and a 'target' column.

    Returns:
        A new DataFrame with the same columns, where each feature column has
        been rescaled by ``StandardScaler`` and 'target' has been replaced by
        the codes produced by ``LabelEncoder``.

    Raises:
        KeyError: if *df* has no 'target' column.
    """
    result = df.copy()
    # Select features by name rather than by position: slicing off "the last
    # column" would scale the label itself if 'target' were not last.
    feature_cols = [col for col in result.columns if col != 'target']
    result[feature_cols] = StandardScaler().fit_transform(result[feature_cols])
    result['target'] = LabelEncoder().fit_transform(result['target'])
    return result

def save_data(df, path="processed_iris_data.csv"):
    """Write *df* to *path* as CSV and print a confirmation message.

    Args:
        df: DataFrame to export.
        path: Destination file; defaults to "processed_iris_data.csv".
    """
    # index=False keeps the row index out of the written file.
    csv_options = {"index": False}
    df.to_csv(path, **csv_options)
    confirmation = f"✅ Data saved to: {path}"
    print(confirmation)

## 📊 Step 3: Load and Explore Data

In [None]:
# Pull the raw Iris data into a DataFrame and inspect it before any cleaning.
df = load_data()
print(df.head())      # first rows
print(df.describe())  # per-column summary statistics
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

## 🧼 Step 4: Preprocess

In [None]:
# Drop duplicate rows, then report the (rows, columns) shape that remains.
df = preprocess_data(df)
print(f"✅ Data shape after preprocessing: {df.shape}")

## 🔁 Step 5: Transform

In [None]:
# Scale the feature columns and encode 'target', then preview the first rows.
df = transform_data(df)
print(df.head())

## 📈 Step 6: Visualize

In [None]:
# Grid of pairwise plots for the transformed data, colored by 'target'.
sns.pairplot(df, hue='target')
# suptitle acts on the current figure, i.e. the one pairplot just created.
# NOTE(review): y=1.02 presumably nudges the title just above the top of the
# figure so it does not overlap the first row of subplots — confirm visually.
plt.suptitle("Iris Data Visualization", y=1.02)
plt.show()

## 💾 Step 7: Save Data

In [None]:
# Export the transformed frame as CSV; no path is passed, so save_data's
# default destination is used.
save_data(df)

## ✅ Step 8: Assertions / Checks

In [None]:
# Sanity checks on the processed frame: no missing cells, at least one row.
assert not df.isna().to_numpy().any(), "There are still missing values!"
assert len(df.index) > 0, "No rows in processed data!"
print("✅ All checks passed!")