## Missing Values Handling

In [3]:
import pandas as pd
import numpy as np

# Small example frame with missing scores (NaN) scattered across every column.
# The data is passed inline rather than through a variable called `dict`,
# which would shadow the builtin `dict` type for the rest of the session.
df = pd.DataFrame({
    'Score_A': [80, 90, np.nan, 80],
    'Score_B': [30, 45, np.nan, np.nan],
    'Score_C': [np.nan, 50, 80, 90],
})
print(df)

   Score_A  Score_B  Score_C
0     80.0     30.0      NaN
1     90.0     45.0     50.0
2      NaN      NaN     80.0
3     80.0      NaN     90.0


In [2]:
# Count the missing entries in each column (sum of the null mask down the rows).
df.isnull().sum(axis=0)

Score_A    1
Score_B    2
Score_C    1
dtype: int64

In [3]:
# Show a copy of the frame with every missing value replaced by the constant 0.
print(df.fillna(value=0))

   Score_A  Score_B  Score_C
0     80.0     30.0      0.0
1     90.0     45.0     50.0
2      0.0      0.0     80.0
3     80.0      0.0     90.0


In [4]:
# Forward-fill: each missing value takes the last valid value above it in the
# same column. A NaN in the first row stays NaN because nothing precedes it.
# `ffill()` replaces the deprecated `fillna(method="pad")` spelling.
print(df.ffill())

   Score_A  Score_B  Score_C
0     80.0     30.0      NaN
1     90.0     45.0     50.0
2     90.0     45.0     80.0
3     80.0     45.0     90.0


In [5]:
# Example frame with a missing categorical value in the Gender column.
# The data is passed inline rather than through a variable called `dict`,
# which would shadow the builtin `dict` type for the rest of the session.
df = pd.DataFrame({
    'Gender': ["남자", "여자", np.nan, "남자"],
    'Salary': [30, 45, 90, 70],
})
print(df)

  Gender  Salary
0     남자      30
1     여자      45
2    NaN      90
3     남자      70


In [6]:
# Replace the missing Gender entry with a placeholder label.
# The result is only displayed; it is not assigned back to df.
df['Gender'].fillna(value="성별 없음")

0       남자
1       여자
2    성별 없음
3       남자
Name: Gender, dtype: object

In [7]:
# Example frame for dropna: Score_D is the only column without missing values,
# and rows 1 and 3 are the only rows without missing values.
# The data is passed inline rather than through a variable called `dict`,
# which would shadow the builtin `dict` type for the rest of the session.
df = pd.DataFrame({
    'Score_A': [80, 90, np.nan, 80],
    'Score_B': [30, 45, np.nan, 70],
    'Score_C': [np.nan, 50, 80, 90],
    'Score_D': [50, 30, 80, 60],
})
print(df)

   Score_A  Score_B  Score_C  Score_D
0     80.0     30.0      NaN       50
1     90.0     45.0     50.0       30
2      NaN      NaN     80.0       80
3     80.0     70.0     90.0       60


In [8]:
# Drop every column that contains at least one missing value.
print(df.dropna(axis="columns"))

   Score_D
0       50
1       30
2       80
3       60


In [9]:
# Drop every row that contains at least one missing value.
print(df.dropna(axis="index"))

   Score_A  Score_B  Score_C  Score_D
1     90.0     45.0     50.0       30
3     80.0     70.0     90.0       60


## Outliers

In [7]:
# Load the supermarket sales data, then summarise its columns, dtypes and
# non-null counts.
sales = pd.read_csv(filepath_or_buffer="data/supermarket_sales.csv")
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Invoice ID     1000 non-null   object 
 1   Branch         1000 non-null   object 
 2   City           1000 non-null   object 
 3   Customer type  1000 non-null   object 
 4   Gender         1000 non-null   object 
 5   Product line   1000 non-null   object 
 6   Unit price     1000 non-null   float64
 7   Quantity       1000 non-null   int64  
 8   Date           1000 non-null   object 
 9   Time           1000 non-null   object 
 10  Payment        1000 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 86.1+ KB


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for Unit price.
print(sales.loc[:, ['Unit price']].describe())

        Unit price
count  1000.000000
mean     55.672130
std      26.494628
min      10.080000
25%      32.875000
50%      55.230000
75%      77.935000
max      99.960000


In [12]:
# First quartile (25th percentile) of Unit price, returned as a one-entry Series.
sales.loc[:, ['Unit price']].quantile(q=0.25)

Unit price    32.875
Name: 0.25, dtype: float64

In [13]:
# Third quartile (75th percentile) of Unit price, returned as a one-entry Series.
sales.loc[:, ['Unit price']].quantile(q=0.75)

Unit price    77.935
Name: 0.75, dtype: float64

In [21]:
# Scalar first and third quartiles of Unit price, used as cut-offs below.
Q1, Q3 = (sales['Unit price'].quantile(level) for level in (0.25, 0.75))

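- Q1 보다 낮은 값을 이상치로 간주
- Q3 보다 높은 값을 이상치로 간주
- 참고: 이 기준은 설명을 위한 단순화된 예시로, 정의상 전체 데이터의 약 절반이 이상치로 분류된다. 일반적으로는 IQR = Q3 - Q1 을 구한 뒤, Q1 - 1.5 × IQR 보다 작거나 Q3 + 1.5 × IQR 보다 큰 값을 이상치로 간주한다.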

In [23]:
# Boolean masks: values strictly below Q1 and values strictly above Q3.
outliers_q1 = sales['Unit price'] < Q1
outliers_q3 = sales['Unit price'] > Q3

# Number of rows left once both flagged groups are removed.
# Summing a boolean mask counts its True entries; int() keeps a plain Python int.
len(sales) - int(outliers_q1.sum()) - int(outliers_q3.sum())

500

In [29]:
# Unit prices flagged by either mask (below Q1 or above Q3).
print(sales.loc[outliers_q1 | outliers_q3, 'Unit price'])

1      15.28
4      86.31
5      85.39
10     14.48
11     25.51
       ...  
988    82.34
993    17.49
996    97.38
997    31.84
999    88.34
Name: Unit price, Length: 500, dtype: float64


In [30]:
# Unit prices flagged by neither mask, i.e. values between Q1 and Q3 inclusive.
print(sales.loc[~outliers_q1 & ~outliers_q3, 'Unit price'])

0      74.69
2      46.33
3      58.22
6      68.84
7      73.56
       ...  
991    76.60
992    58.03
994    60.95
995    40.35
998    65.82
Name: Unit price, Length: 500, dtype: float64
