<a href="https://colab.research.google.com/github/elyorbek8/data_processing/blob/main/handling_outliers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Handling Outliers

*   Finding outliers
*   Fixing outlier-related problems where possible
*   Removing outliers





In [1]:
import numpy as np
import pandas as pd

In [27]:
# Location of the mock user CSV; the "id" column is used as the row index.
DATA_URL = "https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/refs/heads/main/MOCK_USER_DATA.csv"
df = pd.read_csv(DATA_URL, index_col="id")
df

Unnamed: 0_level_0,first_name,last_name,email,gender,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Dorothy,Verryan,dverryan0@who.int,Non-binary,1
2,Pietrek,Scanlin,pscanlin1@yolasite.com,Male,91
3,Lilith,Sarjeant,lsarjeant2@creativecommons.org,Genderfluid,1233
4,Knox,Cunningham,kcunningham3@yellowpages.com,Genderfluid,6
5,Mohammed,Habbershon,mhabbershon4@sourceforge.net,Female,48
...,...,...,...,...,...
96,Baxy,Marcu,bmarcu2n@list-manage.com,Agender,64
97,Kattie,Claxson,kclaxson2o@booking.com,Female,18
98,Clem,Fullerd,cfullerd2p@creativecommons.org,Bigender,55
99,Desdemona,Jurasek,djurasek2q@aboutads.info,Genderfluid,46


In [28]:
# Summary statistics for the numeric columns; the min and max of `age`
# are the quickest way to spot implausible values.
df.describe()

Unnamed: 0,age
count,100.0
mean,145.66
std,565.631411
min,-85.0
25%,22.75
50%,53.5
75%,84.25
max,4444.0


In [29]:
# Show the rows whose age is implausible: greater than 123 or negative.
df.loc[df['age'].gt(123) | df['age'].lt(0)]

Unnamed: 0_level_0,first_name,last_name,email,gender,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,Lilith,Sarjeant,lsarjeant2@creativecommons.org,Genderfluid,1233
11,Elijah,Heine,eheinea@imdb.com,Bigender,-85
19,Luisa,Bradborne,lbradbornei@fema.gov,Female,-75
21,Levin,Worner,lwornerk@ihg.com,Genderqueer,-43
33,Ebenezer,Greeves,egreevesw@myspace.com,Male,-26
38,Hana,Langwade,hlangwade11@washingtonpost.com,Genderqueer,-42
39,Godard,Miere,gmiere12@1688.com,Polygender,3434
47,Cleve,Sparshatt,csparshatt1a@economist.com,Genderfluid,333
49,Trace,Stockney,tstockney1c@flavors.me,Female,344
65,Eloise,Snoddon,esnoddon1s@elegantthemes.com,Genderfluid,332


In [38]:
# Fix negative ages by replacing each age with its absolute value, using the
# vectorized Series.abs() rather than routing NumPy's abs through .apply().
# NOTE(review): this assumes a negative age is a sign typo (e.g. -85 meant 85) — confirm.
df['age'] = df['age'].abs()

In [42]:
# Sanity check: no rows should be listed once negative ages have been fixed.
df.loc[df['age'].lt(0)]

Unnamed: 0_level_0,first_name,last_name,email,gender,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [47]:
# Collect the index labels (ids) of the rows whose age is greater than 123,
# so they can be dropped in the next step.
idx = df.index[df['age'].gt(123)]

pandas.core.indexes.base.Index

In [49]:
# Remove the rows whose labels were collected in `idx`. The result is
# reassigned instead of using inplace=True, which offers no benefit and
# hides the mutation from the reader.
df = df.drop(index=idx)

In [50]:
# Sanity check: no rows with age above 123 should be listed after the drop.
df.loc[df['age'].gt(123)]

Unnamed: 0_level_0,first_name,last_name,email,gender,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [53]:
# Inspect how often each gender value occurs before filtering on this column.
df['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
Male,21
Bigender,12
Female,11
Genderfluid,11
Non-binary,10
Genderqueer,10
Agender,9
Polygender,9


In [56]:
# Keep only the rows whose gender is 'Male' or 'Female'.
# NOTE(review): the value counts show the other categories occur about as often
# as 'Female' (9-12 rows each vs. 11), so they are not outliers by frequency —
# confirm that discarding them is intended for this analysis.
df = df.loc[df['gender'].isin(['Male', 'Female'])]

In [58]:
# Confirm that only the 'Male' and 'Female' categories remain after filtering.
df['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
Male,21
Female,11


In [59]:
# Display the frame that remains after the age and gender clean-up steps.
df

Unnamed: 0_level_0,first_name,last_name,email,gender,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,Pietrek,Scanlin,pscanlin1@yolasite.com,Male,91
5,Mohammed,Habbershon,mhabbershon4@sourceforge.net,Female,48
10,David,Meredyth,dmeredyth9@telegraph.co.uk,Female,68
12,Raffaello,Firminger,rfirmingerb@bloomberg.com,Female,74
17,Byram,Martynikhin,bmartynikhing@jiathis.com,Male,40
19,Luisa,Bradborne,lbradbornei@fema.gov,Female,75
22,Ray,Padly,rpadlyl@twitpic.com,Male,85
26,Ephrayim,Westmacott,ewestmacottp@ning.com,Male,77
29,Biron,Acklands,backlandss@comsenz.com,Male,13
33,Ebenezer,Greeves,egreevesw@myspace.com,Male,26
