In [42]:
import pandas as pd
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Generate 20 random data points for two columns
data = {
    'one': np.random.normal(loc=50, scale=5, size=20),
    'two': np.random.normal(loc=50, scale=5, size=20),
    'three': np.random.normal(loc=50, scale=5, size=20),
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Introduce outliers in Column1 (3 outliers)
df.loc[0:1, 'one'] *= 3

# Introduce outliers in Column2 (5 outliers)
df.loc[2:4, 'two'] *= 3

# Introduce outliers in Column2 (5 outliers)
df.loc[5:8, 'three'] *= 3

df


Unnamed: 0,one,two,three
0,157.450712,57.328244,53.692333
1,147.926035,48.871118,50.856841
2,53.238443,151.012923,49.421759
3,57.615149,128.628777,48.494482
4,48.829233,141.834259,42.60739
5,48.829315,50.554613,139.202337
6,57.896064,44.245032,143.090418
7,53.837174,51.87849,165.856833
8,47.652628,46.996807,155.154274
9,52.7128,48.541531,41.184799


In [43]:
def calculate_outliers(df):
    outliers_rows = []
    for col in df.columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        display(Q1, Q3, IQR)
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        display(lower_bound, upper_bound)
        print(f'Number of outliers in {col}: {len(outliers)}, Percentage of outliers: {len(outliers) / len(df) * 100:.2f}%')
        outliers_rows.extend(outliers.index.tolist())
    
    # Get unique row indices with outliers
    outliers_rows = list(set(outliers_rows))
    print('\n')
    print(f'Number of rows with at least one outlier: {len(outliers_rows)}, Percentage of rows with at least one outlier: {len(outliers_rows) / df.shape[0] * 100:.2f}%')

    # Create boolean array indicating which rows have at least one outlier
    outliers_bool = df.index.isin(outliers_rows)
    
    return outliers_bool

iqr_outliers = calculate_outliers(df)
len(iqr_outliers)

no_outliers = df[~iqr_outliers]
no_outliers

46.756391670944836

53.388125429318734

6.631733758373898

36.80879103338399

63.33572606687958

Number of outliers in one: 2, Percentage of outliers: 10.00%


46.42146154294488

54.9166043815389

8.49514283859402

33.67874728505385

67.65931863942993

Number of outliers in two: 3, Percentage of outliers: 15.00%


48.35910074003755

54.947043629828784

6.587942889791236

38.47718640535069

64.82895796451564

Number of outliers in three: 4, Percentage of outliers: 20.00%


Number of rows with at least one outlier: 9, Percentage of rows with at least one outlier: 45.00%


Unnamed: 0,one,two,three
9,52.7128,48.541531,41.184799
10,47.682912,46.991467,51.62042
11,47.671351,59.261391,48.074589
12,51.209811,49.932514,46.61539
13,40.433599,44.711445,53.058381
14,41.375411,54.112725,55.154998
15,47.188562,43.895782,54.656401
16,44.935844,51.044318,45.803912
17,51.571237,40.201649,48.453938
18,45.45988,43.35907,51.656317


In [44]:
def remove_outliers_iqr(clean_df):
    
    for col in clean_df.columns:
        print('Processing column: ', col)
        # Calculate Q1, Q3 and IQR
        Q1 = clean_df[col].quantile(0.25)
        Q3 = clean_df[col].quantile(0.75)
        IQR = Q3 - Q1
        display(Q1, Q3, IQR)
        # Define the acceptable range (values within 1.5 IQR from Q1 and Q3)
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Filter out the outliers
        clean_df = clean_df[(clean_df[col] >= lower_bound) & (clean_df[col] <= upper_bound)]
        display(lower_bound, upper_bound)

    return clean_df

# Remove outliers from the dataset
df_no_outliers = remove_outliers_iqr(df)
display(df_no_outliers)
# Get the number of rows in the cleaned dataset
new_num_rows = df_no_outliers.shape[0]
new_num_rows


Processing column:  one


46.756391670944836

53.388125429318734

6.631733758373898

36.80879103338399

63.33572606687958

Processing column:  two


45.28145075112863

53.55416594331905

8.272715192190418

32.872377962843004

65.96323873160468

Processing column:  three


48.264263359331174

97.17866724327956

48.91440388394839

-25.107342466591412

170.55027306920215

Unnamed: 0,one,two,three
5,48.829315,50.554613,139.202337
6,57.896064,44.245032,143.090418
7,53.837174,51.87849,165.856833
8,47.652628,46.996807,155.154274
9,52.7128,48.541531,41.184799
10,47.682912,46.991467,51.62042
11,47.671351,59.261391,48.074589
12,51.209811,49.932514,46.61539
13,40.433599,44.711445,53.058381
14,41.375411,54.112725,55.154998


15