# Removing outliers with the Z-score method

In [34]:
# libraries
import pandas as pd
import numpy as np

In [35]:
# Toy sample: ages 20 through 30 plus one far-away value (50).
ages = [*range(20, 31), 50]
data = pd.DataFrame({'Age': ages})
data

Unnamed: 0,Age
0,20
1,21
2,22
3,23
4,24
5,25
6,26
7,27
8,28
9,29


In [36]:

# Mean and population standard deviation of the Age column.
# ddof=0 is spelled out because pandas' own default (ddof=1) would give the
# sample standard deviation instead.
age = data['Age']
mean = age.mean()
std = age.std(ddof=0)

# Standardise: number of standard deviations each age lies from the mean.
data['Z Score'] = age.sub(mean).div(std)
data

Unnamed: 0,Age,Z Score
0,20,-0.938954
1,21,-0.806396
2,22,-0.673838
3,23,-0.54128
4,24,-0.408721
5,25,-0.276163
6,26,-0.143605
7,27,-0.011047
8,28,0.121512
9,29,0.25407


In [37]:
# Print the full frame before any outlier filtering.
report = f'Here is the data with outliers:\n {data}'
print(report)

Here is the data with outliers:
     Age   Z Score
0    20 -0.938954
1    21 -0.806396
2    22 -0.673838
3    23 -0.541280
4    24 -0.408721
5    25 -0.276163
6    26 -0.143605
7    27 -0.011047
8    28  0.121512
9    29  0.254070
10   30  0.386628
11   50  3.037793


In [38]:
# Flag rows whose z-score magnitude exceeds 3. Comparing the absolute value
# catches unusually low values as well as unusually high ones; a plain `> 3`
# test would never report anything below -3.
print(f"Here are the outliers based on Z score threshold, 3:\n {data[data['Z Score'].abs() > 3]}")

Here are the outliers based on Z score threshold, 3:
     Age   Z Score
11   50  3.037793


# Removing outliers using scipy

In [39]:
import numpy as np
from scipy import stats

In [40]:
# Build a synthetic sample: 1000 draws from a standard normal distribution
# with one planted extreme value.
# NOTE: this rebinds `data`, which held the Age DataFrame in the previous section.

np.random.seed(42)  # fixed seed so the draws are the same on every run
data = np.random.normal(loc=0, scale=1, size=1000)
data[0] = 1000  # planted outlier

df = pd.DataFrame(data, columns=['values'])

In [41]:
# Display the generated frame.
df

Unnamed: 0,values
0,1000.000000
1,-0.138264
2,0.647689
3,1.523030
4,-0.234153
...,...
995,-0.281100
996,1.797687
997,0.640843
998,-0.571179


In [42]:
# Absolute z-score of every value: its distance from the mean, measured in
# standard deviations.
z_scores = np.abs(stats.zscore(df['values']))

# Values more than `threshold` standard deviations from the mean count as outliers.
threshold = 3
# Boolean mask aligned by position with the rows of `df`.
is_outlier = np.asarray(z_scores > threshold)
# Integer *positions* (not index labels) of the flagged rows.
outliers = np.where(is_outlier)[0]

print('------------------------------------------')
print('Data:', df)
print('------------------------------------------')

print('Indices of Outliers:', outliers)
print('Outliers:', df['values'].iloc[outliers].tolist())

# Remove outliers by filtering with the positional mask. Passing the positions
# to `df.drop(index=...)` would treat them as index labels, which is only
# correct when the frame carries the default 0..n-1 index.
df_cleaned = df[~is_outlier]

print('------------------------------------------')
print('Data without outliers:', df_cleaned)


------------------------------------------
Data:           values
0    1000.000000
1      -0.138264
2       0.647689
3       1.523030
4      -0.234153
..           ...
995    -0.281100
996     1.797687
997     0.640843
998    -0.571179
999     0.572583

[1000 rows x 1 columns]
------------------------------------------
Indices of Outliers: [0]
Outliers: [1000.0]
------------------------------------------
Data without outliers:        values
1   -0.138264
2    0.647689
3    1.523030
4   -0.234153
5   -0.234137
..        ...
995 -0.281100
996  1.797687
997  0.640843
998 -0.571179
999  0.572583

[999 rows x 1 columns]


# Removing outliers with the IQR method

In [43]:
# IQR rule: anything further than 1.5 * IQR outside the quartiles is an outlier.

column = df['values']
Q1 = column.quantile(0.25)
Q3 = column.quantile(0.75)
IQR = Q3 - Q1

# Fences beyond which a value is treated as an outlier.
whisker = 1.5 * IQR
lower_bound = Q1 - whisker
upper_bound = Q3 + whisker

too_low = column < lower_bound
too_high = column > upper_bound
outliers = df[too_low | too_high]
print('------------------------------------------')
print('Data:', df)
print('Indices of Outliers:', outliers.index.tolist())
print('Outliers:', outliers['values'].tolist())

# Drop the flagged rows by their index labels.
df_cleaned = df.drop(index=outliers.index)
print('------------------------------------------')
print('Data without outliers:', df_cleaned)

------------------------------------------
Data:           values
0    1000.000000
1      -0.138264
2       0.647689
3       1.523030
4      -0.234153
..           ...
995    -0.281100
996     1.797687
997     0.640843
998    -0.571179
999     0.572583

[1000 rows x 1 columns]
Indices of Outliers: [0, 74, 179, 209, 262, 478, 646, 668, 755]
Outliers: [1000.0, -2.6197451040897444, 2.720169166589619, 3.852731490654721, -3.2412673400690726, 3.0788808084552377, -2.6968866429415717, -2.650969808393012, 2.632382064837391]
------------------------------------------
Data without outliers:        values
1   -0.138264
2    0.647689
3    1.523030
4   -0.234153
5   -0.234137
..        ...
995 -0.281100
996  1.797687
997  0.640843
998 -0.571179
999  0.572583

[991 rows x 1 columns]
