# Removing outliers Z-Score method

In [1]:
# libraries

import pandas as pd
import numpy as np

In [2]:
# create data with outliers

np.random.seed(42)
data = np.random.normal(0, 1, 1000)
data[0] = 1000  # outlier

df = pd.DataFrame(data, columns=['values'])
df

Unnamed: 0,values
0,1000.000000
1,-0.138264
2,0.647689
3,1.523030
4,-0.234153
...,...
995,-0.281100
996,1.797687
997,0.640843
998,-0.571179


In [3]:
df = df.sort_values(by='values', ascending=True).reset_index(drop=True)
df

Unnamed: 0,values
0,-3.241267
1,-2.696887
2,-2.650970
3,-2.619745
4,-2.471645
...,...
995,2.632382
996,2.720169
997,3.078881
998,3.852731


In [4]:
mean = np.mean(df['values'])
std = np.std(df['values'])

In [5]:
df['Z-score'] = df['values'] - mean / std

In [6]:
df

Unnamed: 0,values,Z-score
0,-3.241267,-3.273487
1,-2.696887,-2.729106
2,-2.650970,-2.683189
3,-2.619745,-2.651965
4,-2.471645,-2.503864
...,...,...
995,2.632382,2.600162
996,2.720169,2.687949
997,3.078881,3.046661
998,3.852731,3.820512


In [7]:
print('---------------------------------------')
print(f'Here is the Data with outliers:\n {df}')
print('---------------------------------------')

---------------------------------------
Here is the Data with outliers:
           values     Z-score
0      -3.241267   -3.273487
1      -2.696887   -2.729106
2      -2.650970   -2.683189
3      -2.619745   -2.651965
4      -2.471645   -2.503864
..           ...         ...
995     2.632382    2.600162
996     2.720169    2.687949
997     3.078881    3.046661
998     3.852731    3.820512
999  1000.000000  999.967780

[1000 rows x 2 columns]
---------------------------------------


In [8]:
df[df['values'] == 1000]

Unnamed: 0,values,Z-score
999,1000.0,999.96778


In [9]:
# maximum Z-score
print(f"The maximum Z-score is: {df['Z-score'].max()}")
print('---------------------------------------')

The maximum Z-score is: 999.9677803088885
---------------------------------------


In [10]:
print(f"Here are the outliers based on the Z-score threshold, 3:\n {df[df['Z-score'] > 3]}")
print('---------------------------------------')

Here are the outliers based on the Z-score threshold, 3:
           values     Z-score
997     3.078881    3.046661
998     3.852731    3.820512
999  1000.000000  999.967780
---------------------------------------


In [11]:
df = df[df['Z-score'] <= 4]
df

Unnamed: 0,values,Z-score
0,-3.241267,-3.273487
1,-2.696887,-2.729106
2,-2.650970,-2.683189
3,-2.619745,-2.651965
4,-2.471645,-2.503864
...,...,...
994,2.573360,2.541140
995,2.632382,2.600162
996,2.720169,2.687949
997,3.078881,3.046661


# Removing outliers using scipy

In [12]:
import numpy as np
from scipy import stats

In [13]:
# create a data with outliers and new pattern

np.random.seed(42)
data = np.random.normal(0, 1, 1000)
data[0] = 1000  # outlier

df = pd.DataFrame(data, columns=['values'])

In [14]:
df

Unnamed: 0,values
0,1000.000000
1,-0.138264
2,0.647689
3,1.523030
4,-0.234153
...,...
995,-0.281100
996,1.797687
997,0.640843
998,-0.571179


In [15]:
# Calculate the Z-score of each data point
z_scores = np.abs(stats.zscore(df['values']))

# Set a threshold for identifying outliers
threshold = 3
outliers = np.where(z_scores > threshold)[0]

print('------------------------------------------')
print('Data:', df)
print('------------------------------------------')

print('Indices of Outliers:', outliers)
print('Outliers:', df.iloc[outliers]['values'].tolist())

# Remove outliers
df_cleaned = df.drop(index=outliers)

print('------------------------------------------')
print('Data without outliers:', df_cleaned)


------------------------------------------
Data:           values
0    1000.000000
1      -0.138264
2       0.647689
3       1.523030
4      -0.234153
..           ...
995    -0.281100
996     1.797687
997     0.640843
998    -0.571179
999     0.572583

[1000 rows x 1 columns]
------------------------------------------
Indices of Outliers: [0]
Outliers: [1000.0]
------------------------------------------
Data without outliers:        values
1   -0.138264
2    0.647689
3    1.523030
4   -0.234153
5   -0.234137
..        ...
995 -0.281100
996  1.797687
997  0.640843
998 -0.571179
999  0.572583

[999 rows x 1 columns]


# Remove outliers with IQR method

In [16]:
# Remove outliers with IQR method

Q1 = df['values'].quantile(0.25)
Q3 = df['values'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

outliers = df[(df['values'] < lower_bound) | (df['values'] > upper_bound)]
print('------------------------------------------')
print('Data:', df)
print('Indices of Outliers:', outliers.index.tolist())
print('Outliers:', outliers['values'].tolist())

# Remove outliers
df_cleaned = df.drop(index=outliers.index)
print('------------------------------------------')
print('Data without outliers:', df_cleaned)

------------------------------------------
Data:           values
0    1000.000000
1      -0.138264
2       0.647689
3       1.523030
4      -0.234153
..           ...
995    -0.281100
996     1.797687
997     0.640843
998    -0.571179
999     0.572583

[1000 rows x 1 columns]
Indices of Outliers: [0, 74, 179, 209, 262, 478, 646, 668, 755]
Outliers: [1000.0, -2.6197451040897444, 2.720169166589619, 3.852731490654721, -3.2412673400690726, 3.0788808084552377, -2.6968866429415717, -2.650969808393012, 2.632382064837391]
------------------------------------------
Data without outliers:        values
1   -0.138264
2    0.647689
3    1.523030
4   -0.234153
5   -0.234137
..        ...
995 -0.281100
996  1.797687
997  0.640843
998 -0.571179
999  0.572583

[991 rows x 1 columns]


# K Means clustering

In [17]:
# important library
from sklearn.cluster import KMeans

# sample data
data = [[2,2], [3,3], [3,4], [30,30], [31,31], [32,32]]

# Kmeans model with two clusters (normal and outliers)
kmeans = KMeans(n_clusters=2, n_init=10)
kmeans.fit(data)

# predict the cluster for each data point
labels = kmeans.predict(data)

# identify outliers based on cluster labels 
outliers = [data[i] for i, label in enumerate(labels) if label == 1]

# print data
print('Data:', data)
print('------------------------------------------')
print('Outliers:', outliers)

# Remove outliers
data = [data[i] for i, label in enumerate(labels) if label == 0]
print('data without outliers:', data)

Data: [[2, 2], [3, 3], [3, 4], [30, 30], [31, 31], [32, 32]]
------------------------------------------
Outliers: [[30, 30], [31, 31], [32, 32]]
data without outliers: [[2, 2], [3, 3], [3, 4]]
