In [5]:
# Install package for UCI repo
!pip install ucimlrepo



In [6]:
import ucimlrepo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, zscore

In [7]:
# 1. Fetch a dataset: Breast Cancer Wisconsin (Diagnostic), UCI repository id 17
dataset = ucimlrepo.fetch_ucirepo(id=17)
# The feature and target tables are used as returned (their own column labels are kept);
# copying them means later cells cannot mutate the fetched object by accident.
X = dataset.data.features.copy()
y = dataset.data.targets.copy()
# One frame holding the features followed by the target column(s).
data = pd.concat([X, y], axis=1)
# Name of the first target column.
target = y.columns[0]

In [8]:
# 2. Anomaly Detection using Normal Distribution (Z-score method)
# For every numeric column of `data`, two columns are appended to a copy:
#   `<column>_zscore`  - absolute Z-score of each value within its column
#   `<column>_anomaly` - True where that absolute Z-score exceeds `threshold`
threshold = 3  # Anomaly threshold for the absolute Z-score
data_with_anomalies = data.copy()  # Work on a copy so the original `data` is not modified
# `.columns` is read once when the loop starts, so the columns added inside the
# loop are not themselves visited.
for column in data_with_anomalies.columns:
  if pd.api.types.is_numeric_dtype(data_with_anomalies[column]):
    # nan_policy='omit' computes the mean/std from the non-missing values only.
    # With the default ('propagate'), one missing value would turn every Z-score
    # of the column into NaN and silently flag nothing. Missing entries still get
    # a NaN Z-score, and NaN > threshold is False, so they are never flagged.
    z_scores = np.abs(zscore(data_with_anomalies[column], nan_policy='omit'))
    data_with_anomalies[f'{column}_zscore'] = z_scores  # Store Z-scores
    data_with_anomalies[f'{column}_anomaly'] = z_scores > threshold
print("Anomaly Detection Results:")
print(data_with_anomalies.head())

Anomaly Detection Results:
   radius1  texture1  perimeter1   area1  smoothness1  compactness1  \
0    17.99     10.38      122.80  1001.0      0.11840       0.27760   
1    20.57     17.77      132.90  1326.0      0.08474       0.07864   
2    19.69     21.25      130.00  1203.0      0.10960       0.15990   
3    11.42     20.38       77.58   386.1      0.14250       0.28390   
4    20.29     14.34      135.10  1297.0      0.10030       0.13280   

   concavity1  concave_points1  symmetry1  fractal_dimension1  ...  \
0      0.3001          0.14710     0.2419             0.07871  ...   
1      0.0869          0.07017     0.1812             0.05667  ...   
2      0.1974          0.12790     0.2069             0.05999  ...   
3      0.2414          0.10520     0.2597             0.09744  ...   
4      0.1980          0.10430     0.1809             0.05883  ...   

   compactness3_zscore  compactness3_anomaly  concavity3_zscore  \
0             2.616665                 False           2.1

# Feature Engineering
> A high _prob value for a data point in a given feature means that the data point's value is more likely to occur according to the normal distribution that was fitted to that feature.

1. Normal Distribution Fit: For each numerical feature (e.g., 'radius1', 'texture1', etc.), the code calculates the mean and standard deviation of that feature in the dataset. It then uses these statistics to define a normal distribution curve that best represents the distribution of that feature's values.
2. Probability Density: The norm.pdf() function calculates the probability density function (PDF) value for each data point's feature value. The PDF essentially measures the "likelihood" of observing that specific value within the fitted normal distribution.
3. High _prob Implication: A high _prob value indicates that the data point's feature value is close to the mean of the fitted normal distribution. In other words, it's a "typical" value for that feature, commonly observed in the data.
4. Low _prob Implication: Conversely, a low _prob value suggests that the data point's feature value is far from the mean, residing in the tails of the normal distribution curve. Such values are less common and potentially more "unusual" for that feature.
---
> For a normal distribution, the maximum value of the probability density function (PDF) occurs at the mean of the distribution.

1. PDF Peak at Mean: The normal distribution curve is highest at its center (the mean) and tapers off symmetrically on both sides. Therefore, data points with feature values very close to the mean will have the highest _prob values.
2. Theoretical Maximum: The exact maximum value of the PDF depends on the standard deviation $\sigma$ of the fitted normal distribution: the peak, reached at the mean, equals $\frac{1}{\sigma\sqrt{2\pi}}$. However, it's important to understand that the _prob value is a probability density, not a probability. Therefore, its maximum value can be greater than 1 (this happens whenever $\sigma < \frac{1}{\sqrt{2\pi}} \approx 0.4$).
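3. Practical Range: In practice, the magnitude of the _prob values depends on the scale of each feature: features whose values are widely spread (e.g., 'area1') produce very small densities, while features whose values are tightly clustered (e.g., 'smoothness1') can produce densities greater than 1. The important thing is to use these values for comparison within a single feature, not across features. A higher _prob value means the data point is more "central" to the distribution of that feature, while a lower value means it's more "extreme".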

In [10]:
# 3. Feature Engineering using Normal Distribution Parameters
# Each numeric column gets a companion `<column>_prob` column holding the probability
# density of every value under a normal distribution parameterised by that column's
# own mean and standard deviation. Non-numeric columns are left as they are.
data_with_engineered_features = data.copy()
for column in data_with_engineered_features.columns:
  values = data_with_engineered_features[column]
  if not pd.api.types.is_numeric_dtype(values):
    continue  # nothing to fit for non-numeric columns
  mean = values.mean()
  std = values.std()
  # Density of each observed value under the normal fitted to this column.
  data_with_engineered_features[f'{column}_prob'] = norm.pdf(values, loc=mean, scale=std)
print("Feature Engineering Results:")
print(data_with_engineered_features.head())

Feature Engineering Results:
   radius1  texture1  perimeter1   area1  smoothness1  compactness1  \
0    17.99     10.38      122.80  1001.0      0.11840       0.27760   
1    20.57     17.77      132.90  1326.0      0.08474       0.07864   
2    19.69     21.25      130.00  1203.0      0.10960       0.15990   
3    11.42     20.38       77.58   386.1      0.14250       0.28390   
4    20.29     14.34      135.10  1297.0      0.10030       0.13280   

   concavity1  concave_points1  symmetry1  fractal_dimension1  ...  \
0      0.3001          0.14710     0.2419             0.07871  ...   
1      0.0869          0.07017     0.1812             0.05667  ...   
2      0.1974          0.12790     0.2069             0.05999  ...   
3      0.2414          0.10520     0.2597             0.09744  ...   
4      0.1980          0.10430     0.1809             0.05883  ...   

   radius3_prob  texture3_prob  perimeter3_prob  area3_prob  smoothness3_prob  \
0      0.013966       0.025810         0.0