# Python Notebook: Outliers and Data Scaling

# Outliers
**Definition**:
Outliers are data points that significantly deviate from the majority of data in a dataset.
They can skew results, distort statistical analyses, and impact machine learning models.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Generate random data with an outlier: draw 100 samples from a normal
# distribution with mean 50 and standard deviation 10. No random seed is set
# in this cell, so the samples differ from run to run.
data = np.random.normal(loc=50, scale=10, size=100)
data = np.append(data, [150])  # Add an outlier: one extra value of 150 at the end

# Plot the data as a boxplot on a 10x6 inch figure
plt.figure(figsize=(10, 6))
plt.boxplot(data)
plt.title("Boxplot with Outlier")
plt.show()

In [None]:
# Exercise: Generate your own data and visualize it
"""
Task:
1. Generate a dataset with a mean of 30 and standard deviation of 5.
2. Add an outlier to the dataset.
3. Plot the data using a boxplot.
"""

## **Finding Outliers**

In [None]:
# Method 1: Interquartile Range (IQR)
def identify_outliers_iqr(data, factor=1.5):
    """Return the values in *data* that lie outside the IQR fences.

    A value counts as an outlier when it is strictly below
    ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles and IQR is the interquartile
    range of *data*.

    Args:
        data: One-dimensional sequence or NumPy array of numbers.
        factor: Multiplier applied to the IQR to position the fences.
            Defaults to 1.5; larger values flag fewer points.

    Returns:
        A list of the outlying values, in their original order. An empty
        list is returned when *data* is empty or contains no outliers.
    """
    # len() instead of truthiness: the truth value of a NumPy array with more
    # than one element is ambiguous and raises.
    if len(data) == 0:
        return []

    q1, q3 = np.percentile(data, [25, 75])
    # With its default range, stats.iqr is the 75th minus the 25th percentile.
    iqr = stats.iqr(data)
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    return [x for x in data if x < lower_bound or x > upper_bound]


# Run the IQR detector on the sample data and show what it flagged.
# `outliers_iqr` is reused further down when the outliers are removed.
outliers_iqr = identify_outliers_iqr(data)
print(f"Outliers using IQR: {outliers_iqr}")

In [None]:
# Exercise: Apply IQR to your dataset
"""
Task:
1. Rewrite the function identify_outliers_iqr without using 'np.percentile()' and 'stats.iqr()'
2. Compute the IQR for your generated dataset.
3. Print the outliers.
"""

In [None]:
# Method 2: Z-Score
def identify_outliers_zscore(data, threshold=3):
    """Return the values in *data* whose absolute Z-score exceeds *threshold*.

    The Z-score of a value is its distance from the mean of *data* measured
    in standard deviations; it is computed here with ``scipy.stats.zscore``.

    Args:
        data: One-dimensional sequence or NumPy array of numbers.
        threshold: Cut-off for ``abs(z)``. Defaults to 3; values whose
            absolute Z-score is strictly greater are reported.

    Returns:
        A list of the outlying values, in their original order. An empty
        list is returned when *data* is empty or contains no outliers.
    """
    # len() instead of truthiness: the truth value of a NumPy array with more
    # than one element is ambiguous and raises.
    if len(data) == 0:
        return []

    z_scores = stats.zscore(data)  # Calculate Z-scores using scipy
    # Pair each value with its score directly instead of indexing by position.
    return [x for x, z in zip(data, z_scores) if abs(z) > threshold]


# Run the Z-score detector on the same sample data and show what it flagged.
outliers_z = identify_outliers_zscore(data)
print(f"Outliers using Z-Score: {outliers_z}")

In [None]:
# Exercise: Apply Z-Score to your dataset
"""
Task:
1. Rewrite the function identify_outliers_zscore without using 'np.mean()', 'np.std()', or 'stats.zscore()'.
2. Compute the Z-scores for your generated dataset.
3. Print the outliers.
"""

## **Handling Outliers**

In [None]:
# Strategy 1: Removing Outliers
# Keep every value that was not flagged by the IQR method. Membership is
# tested by value against the `outliers_iqr` list.
filtered_data = [x for x in data if x not in outliers_iqr]

# Boxplot of the remaining values on a 10x6 inch figure
plt.figure(figsize=(10, 6))
plt.boxplot(filtered_data)
plt.title("Boxplot After Removing Outliers")
plt.show()

In [None]:
# Exercise: Remove outliers from your dataset and visualize the result




In [None]:
# Strategy 2: Capping Outliers
def cap_outliers(data):
    """Clamp every value in *data* to the 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are replaced by that lower fence, values
    above ``Q3 + 1.5 * IQR`` are replaced by that upper fence, and all other
    values are passed through unchanged.

    Args:
        data: One-dimensional sequence or NumPy array of numbers.

    Returns:
        A new list with the same length and order as *data*.
    """
    first_quartile = np.percentile(data, 25)
    third_quartile = np.percentile(data, 75)
    spread = third_quartile - first_quartile
    floor = first_quartile - 1.5 * spread
    ceiling = third_quartile + 1.5 * spread

    capped = []
    for value in data:
        if value < floor:
            capped.append(floor)
        elif value > ceiling:
            capped.append(ceiling)
        else:
            capped.append(value)
    return capped


# Cap the sample data at the IQR fences and draw a boxplot of the result on a
# 10x6 inch figure.
capped_data = cap_outliers(data)
plt.figure(figsize=(10, 6))
plt.boxplot(capped_data)
plt.title("Boxplot After Capping Outliers")
plt.show()

In [None]:
# Exercise: Cap outliers in your dataset and visualize the result



In [None]:
## Exercise (hard)
"""
In this exercise, we will detect and remove outliers based on logical rules rather than statistical methods.
The dataset consists of transaction records with customer age, purchase amount, number of items,
and membership type (Regular or Premium). Some transactions may be valid high-value purchases,
while others may be suspicious depending on the context.

Task
1. Identify suspicious transactions that don't align with typical customer behavior.
2. Implement rule-based outlier detection:
    - High purchase amounts from Regular members are more suspicious than from Premium members.
    - Younger customers making very large purchases could indicate an anomaly.
    - A single-item purchase with an unusually high amount is suspicious.
3. Extend the function to allow dynamic thresholds based on membership type and overall spending trends.

Example dataset:
transactions = [
    {"id": 101, "age": 25, "purchase_amount": 50, "num_items": 2, "membership": "Regular"},
    {"id": 102, "age": 65, "purchase_amount": 1000, "num_items": 1, "membership": "Premium"},
    {"id": 103, "age": 19, "purchase_amount": 5000, "num_items": 1, "membership": "Regular"},  # Suspicious
    {"id": 104, "age": 40, "purchase_amount": 120, "num_items": 5, "membership": "Regular"},
    {"id": 105, "age": 30, "purchase_amount": 20000, "num_items": 1, "membership": "Regular"}, # Suspicious
    {"id": 106, "age": 50, "purchase_amount": 300, "num_items": 10, "membership": "Premium"},
    {"id": 107, "age": 75, "purchase_amount": 4000, "num_items": 2, "membership": "Premium"},
]
"""

# Data Scaling

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the sample data with scikit-learn's StandardScaler (default
# settings), fitting and transforming in one step.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(np.array(data).reshape(-1, 1))  # Reshape to a single column: one feature, one row per sample

# Histogram of the standardized values (20 bins, semi-transparent bars)
plt.figure(figsize=(10, 6))
plt.hist(standardized_data, bins=20, alpha=0.7, label="Standardized Data")
plt.legend()
plt.title("Histogram of Standardized Data")
plt.show()

In [None]:
## Exercise:
"""
  1. Write your own standardization function instead of using the StandardScaler
  2. Standardize your dataset and visualize it
"""


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler with its default settings. This rebinds the
# `scaler` name used for the StandardScaler in the earlier cell.
scaler = MinMaxScaler()

# Fit and transform the data in one step; reshape to a single column so the
# values are treated as one feature with one row per sample.
normalized_data = scaler.fit_transform(np.array(data).reshape(-1, 1))

# Histogram of the normalized values (20 bins, semi-transparent bars)
plt.figure(figsize=(10, 6))
plt.hist(normalized_data, bins=20, alpha=0.7, label="Normalized Data")
plt.legend()
plt.title("Histogram of Normalized Data")
plt.show()

In [None]:
## Exercise:
"""
  1. Write your own normalization function instead of using the MinMaxScaler
  2. Normalize your dataset and visualize it
"""


In [None]:

## Exercise
"""
Experiment with other scaling and normalization techniques and write down your findings.
https://scikit-learn.org/1.5/modules/preprocessing.html
"""


In [None]:
## Exercise
"""
  1. Load the Iris Dataset
  2. Remove Outliers
  3. Apply Standardization and/or Normalization (which one is best suited?)
"""
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from scipy import stats

# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns are named after the dataset's features.
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# your turn ...