In [1]:
import pandas as pd
import numpy as np
from scipy import stats

## Outlier removal

<b> Aim: </b> <br> Given a DataFrame with multiple columns, remove every row that has an outlier in at least one column. <br>

<b> Solution: </b> <br>
1. For each column, the function first computes the Z-score of each value, relative to the column mean and standard deviation.
2. It then takes the absolute value of each Z-score, because the direction of the deviation does not matter, only whether its magnitude is below the threshold.
3. `all(axis=1)` ensures that, for each row, all columns satisfy the constraint.
4. Finally, the result of this condition is used to index the dataframe.

In [2]:
import numpy as np
from scipy import stats

In [3]:
def fetch_df_without_outliers(df, threshold=3.0):
    """Return the rows of ``df`` that have no outlier in any numeric column.

    A value counts as an outlier when the absolute value of its z-score,
    computed per column with ``scipy.stats.zscore``, is greater than or equal
    to ``threshold``. A row is kept only if every numeric column passes.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Non-numeric columns are not scored, but they are
        carried through to the result unchanged.
    threshold : float, default 3.0
        Cut-off on the absolute z-score. It is a fixed value and does not
        depend on the number of columns in ``df``.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame holding the surviving rows, with the original index
        labels and all original columns.

    Notes
    -----
    A NaN z-score never compares as smaller than the threshold, so a row
    with a NaN score in any numeric column is dropped.
    """
    numeric = df.select_dtypes(include="number")
    if numeric.empty:
        # No rows or no numeric columns: there is nothing to score.
        return df.copy()

    # Column-wise z-scores; the sign is irrelevant, only the magnitude counts.
    abs_z = np.abs(np.asarray(stats.zscore(numeric, axis=0)))

    # Keep a row only when every numeric column is inside the threshold.
    keep = (abs_z < threshold).all(axis=1)
    return df[keep]

In [4]:
# Sample data: 100 rows x 3 columns drawn from the standard normal distribution.
df = pd.DataFrame(data=np.random.standard_normal(size=(100, 3)))

In [5]:
# Show the generated data; pandas elides the middle rows in the printed output.
print(df)

           0         1         2
0  -1.205940 -0.171950  1.233098
1  -0.541471  0.784178 -0.773538
2   1.005979  0.234469  0.139138
3  -0.163023 -0.235651  0.277707
4   1.656640 -0.427673  0.559094
..       ...       ...       ...
95 -1.075334 -0.140123 -0.173869
96  1.875462 -2.121093 -0.138070
97  0.147739 -1.831472 -0.003133
98 -0.925892 -1.732796 -0.059455
99  0.702822  2.295511 -0.771203

[100 rows x 3 columns]


In [6]:
# Call function to remove outliers: only rows in which every column passes
# the z-score check are kept. Then print the filtered frame.
new_df = fetch_df_without_outliers(df)
print(new_df)

           0         1         2
0  -1.205940 -0.171950  1.233098
1  -0.541471  0.784178 -0.773538
2   1.005979  0.234469  0.139138
3  -0.163023 -0.235651  0.277707
4   1.656640 -0.427673  0.559094
..       ...       ...       ...
95 -1.075334 -0.140123 -0.173869
96  1.875462 -2.121093 -0.138070
97  0.147739 -1.831472 -0.003133
98 -0.925892 -1.732796 -0.059455
99  0.702822  2.295511 -0.771203

[99 rows x 3 columns]
