# 🧼 Data Cleaning & Preprocessing

This notebook includes practical demonstrations for:
- Missing values
- Outliers
- Categorical encoding
- Feature engineering
- Scaling
- Time series
- Basic text preprocessing

## 🔍 Handling Missing Values

In [None]:
import pandas as pd
import numpy as np

# Small demo frame with one missing numeric value and one missing category.
people = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25, np.nan, 35, 40],
    "Gender": ["F", "M", np.nan, "M"],
}
df = pd.DataFrame(people)

print(df)

# Impute per column: numeric gaps take the column median, categorical gaps
# take an explicit "Unknown" label.
fill_values = {"Age": df["Age"].median(), "Gender": "Unknown"}
df = df.fillna(value=fill_values)
df

## ⚠️ Detecting and Handling Outliers

In [None]:
# IQR method: values further than 1.5 * IQR outside the quartiles are
# treated as outliers and pulled back to the nearest fence.
df_outliers = pd.DataFrame({"Income": [30000, 35000, 40000, 1200000]})
income = df_outliers["Income"]

Q1, Q3 = income.quantile(0.25), income.quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Cap (rather than drop) so the row count is preserved.
df_outliers["CappedIncome"] = income.clip(lower=lower, upper=upper)
df_outliers

## 🔤 Encoding Categorical Variables

In [None]:
# Two purely categorical columns to demonstrate one-hot encoding.
city_gender = {
    "City": ["Paris", "London", "Paris", "Berlin"],
    "Gender": ["F", "M", "F", "M"],
}
df = pd.DataFrame(city_gender)

# One indicator column per distinct category value, named "<column>_<value>".
encoded = pd.get_dummies(df, columns=["City", "Gender"])
encoded

## 🧠 Feature Engineering

In [None]:
def tenure_years(join_dates, as_of=None):
    """Return the number of completed calendar years since each join date.

    Dividing a day count by 365 ignores leap days, so it rounds up to the next
    year shortly before an anniversary (e.g. 2016-01-02 -> 2024-01-01 is 2921
    days, and 2921 // 365 == 8 although only 7 full years have passed).
    Comparing calendar fields avoids that drift.

    Parameters
    ----------
    join_dates : pd.Series
        Datetime-typed series of join dates.
    as_of : datetime-like, optional
        Reference date. Defaults to the current date; pass a fixed value to
        make the result reproducible.

    Returns
    -------
    pd.Series
        Whole years elapsed between each join date and ``as_of``.
    """
    as_of = pd.to_datetime("today") if as_of is None else pd.Timestamp(as_of)
    years_elapsed = as_of.year - join_dates.dt.year
    # The anniversary for the reference year has not happened yet when the
    # join month/day falls later in the year than the reference month/day.
    anniversary_pending = (join_dates.dt.month > as_of.month) | (
        (join_dates.dt.month == as_of.month) & (join_dates.dt.day > as_of.day)
    )
    return years_elapsed - anniversary_pending.astype(int)


df = pd.DataFrame(
    {
        "JoinDate": pd.to_datetime(["2019-01-01", "2021-06-15", "2020-03-01"]),
        "Salary": [40000, 50000, 60000],
        "Age": [28, 32, 40],
    }
)
# Derived features: completed years of tenure and a simple salary/age ratio.
df["TenureYears"] = tenure_years(df["JoinDate"])
df["SalaryPerAge"] = df["Salary"] / df["Age"]
df

## 📏 Scaling Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Two features on very different scales.
raw_features = {"Feature1": [1, 2, 3], "Feature2": [100, 200, 300]}
data = pd.DataFrame(raw_features)

# Learn each column's mean / standard deviation, then standardise with them.
scaler = StandardScaler()
scaled = scaler.fit(data).transform(data)

# The scaler returns a bare array, so re-attach the column labels for display.
pd.DataFrame(scaled, columns=list(data.columns))

## ⏳ Time Series Processing

In [None]:
import numpy as np
import pandas as pd

# Seed the generator so the synthetic series (and everything derived from it)
# is identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Ten consecutive days of random integer values in [10, 100).
date_range = pd.date_range(start="2023-01-01", periods=10, freq="D")
ts = pd.DataFrame(
    {"Date": date_range, "Value": rng.integers(10, 100, size=10)}
).set_index("Date")

# Rolling average: the first two rows are NaN because the 3-day window is
# not yet full.
ts["RollingMean"] = ts["Value"].rolling(window=3).mean()
# Lag feature: the previous day's value (NaN for the first row).
ts["Lag1"] = ts["Value"].shift(1)
ts

## 📝 Basic Text Preprocessing

In [None]:
import re
import nltk

# Fetch the NLTK resources used below. "punkt_tab" is requested alongside
# "punkt" because recent NLTK releases load the tokenizer model from
# "punkt_tab"; older releases still use "punkt".
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

text = "Data Science is AMAZING. Visit http://example.com for more! 😊"
# Normalise case first so the character filter below only has to keep a-z.
text = text.lower()
# Drop URLs, then every character that is not a lowercase letter or whitespace
# (punctuation, digits, emoji).
text = re.sub(r"http\S+|[^a-z\s]", "", text)

# Load the stop-word list once and keep it as a set: calling
# stopwords.words() inside the comprehension would rebuild the list for every
# token and search it linearly.
stop_words = set(stopwords.words("english"))
tokens = [w for w in word_tokenize(text) if w not in stop_words]

# Reduce each remaining token to its dictionary form.
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(w) for w in tokens]
tokens

## 🔡 Vectorizing Text with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short documents to vectorise.
corpus = [
    "Data science is the best!",
    "I love data and machine learning.",
    "Machine learning is a subset of AI.",
]

# Learn the vocabulary and produce the sparse document-term matrix in one step.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Densify for display: one row per document, one column per vocabulary term.
feature_names = vectorizer.get_feature_names_out()
pd.DataFrame(X.toarray(), columns=feature_names)