# 03 - Noise Management (MAD)

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the dataset; floats are rendered with a single decimal place.
pd.options.display.float_format = "{:.1f}".format
df = pd.read_csv("Numeric data (last year work).csv")

In [5]:
# define a function to detect outliers using MAD
def detect_outliers_mad(data, threshold=3.5):
    """
    Detect outliers in a dataset using the Median Absolute Deviation (MAD).

    Each value is scored with the modified z-score
    ``0.6745 * |x - median| / MAD`` and flagged when the score exceeds
    ``threshold``. Missing values (NaN) are ignored when computing the median
    and the MAD, and are never flagged as outliers themselves.

    Args:
    - data: A Pandas Series or 1D array-like object of numeric values.
    - threshold: The number of deviations from the median to consider as an outlier.

    Returns:
    - A boolean mask where True indicates an outlier. For a Pandas Series the
      mask is a Series sharing the input's index and name; for any other
      input it is a NumPy array. The mask is all False when the input is
      empty, contains only NaN, or has a MAD of zero.
    """
    values = np.asarray(data, dtype=float)

    if values.size == 0 or np.isnan(values).all():
        # Nothing to measure a deviation against.
        mask = np.zeros(values.shape, dtype=bool)
    else:
        # nanmedian skips missing entries; a plain median would turn the whole
        # computation into NaN and silently report "no outliers".
        median = np.nanmedian(values)
        abs_deviation = np.abs(values - median)
        mad = np.nanmedian(abs_deviation)
        if mad == 0:
            # More than half of the values equal the median, so the score is
            # undefined (division by zero); report no outliers.
            mask = np.zeros(values.shape, dtype=bool)
        else:
            # Comparing NaN scores yields False, so missing values stay unflagged.
            with np.errstate(invalid="ignore"):
                mask = 0.6745 * abs_deviation / mad > threshold

    if isinstance(data, pd.Series):
        # Keep the index so the mask aligns with the originating DataFrame.
        return pd.Series(mask, index=data.index, name=data.name)
    return mask

# Collect the labels of every column whose dtype is numeric.
numeric_columns = df.select_dtypes(include=[np.number]).columns


In [7]:
# process each numeric column for outliers
# Every mask is computed against the same, unfiltered DataFrame so that the
# per-column counts do not depend on the order in which columns are visited.
outlier_counts = {}
rows_to_drop = np.zeros(len(df), dtype=bool)
for col in numeric_columns:
    print(f"Processing column: {col}")
    # Normalise to a plain boolean array so masks can be combined positionally.
    outliers = np.asarray(detect_outliers_mad(df[col]), dtype=bool)
    outlier_count = int(outliers.sum())
    outlier_counts[col] = outlier_count
    print(f"Number of outliers in {col}: {outlier_count}")
    rows_to_drop |= outliers

# remove outliers: drop each row flagged in at least one column, once
df = df[~rows_to_drop]

Processing column: Rating
Number of outliers in Rating: 0
Processing column: Votes
Number of outliers in Votes: 0
Processing column: Meta Score
Number of outliers in Meta Score: 0
Processing column: Year
Number of outliers in Year: 0
Processing column: Duration
Number of outliers in Duration: 0
Processing column: Action
Number of outliers in Action: 0
Processing column: Adventure
Number of outliers in Adventure: 0
Processing column: Animation
Number of outliers in Animation: 0
Processing column: Biography
Number of outliers in Biography: 0
Processing column: Comedy
Number of outliers in Comedy: 0
Processing column: Crime
Number of outliers in Crime: 0
Processing column: Documentary
Number of outliers in Documentary: 0
Processing column: Drama
Number of outliers in Drama: 0
Processing column: Family
Number of outliers in Family: 0
Processing column: Fantasy
Number of outliers in Fantasy: 0
Processing column: History
Number of outliers in History: 0
Processing column: Horror
Number of ou

In [None]:
# it is quite surprising that I have zero outliers
# I am not sure that this reflects the real picture

# let's try Noise Management (RANSAC)
# maybe we will get different results
# or maybe I am too skeptical :)