In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Load the raw sales file (parsing 'datesold' as datetimes) and derive the
# calendar year and month of each sale as separate columns.
sales = (
    pd.read_csv("data/raw_sales.csv", parse_dates=['datesold'])
    .assign(
        year=lambda frame: frame['datesold'].dt.year,
        month=lambda frame: frame['datesold'].dt.month,
    )
)

# Peek at the first record to confirm the load and the derived columns.
sales.head(1)

In [None]:
# Keep only the sales recorded in 2007-2010.
df = sales[sales['year'].isin([2007, 2008, 2009, 2010])]

# Price distribution per year, split by property type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    x='year',
    y='price',
    hue='propertyType',
    data=df,
    ax=ax,
)
plt.show()

In [None]:
# Small toy frame for the IQR walkthrough; 102 is far larger than the
# other values.
data = dict(
    values=[10, 12, 12, 13, 12, 11, 14, 13, 15, 102, 12, 14, 14, 17, 18, 19, 20],
)
temp_df = pd.DataFrame(data)

In [None]:
# Step 1: first/third quartiles and the interquartile range (IQR)
Q1 = temp_df['values'].quantile(0.25)
Q3 = temp_df['values'].quantile(0.75)
IQR = Q3 - Q1

# Step 2: outlier thresholds, 1.5 * IQR beyond each quartile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Step 3: rows lying strictly outside the thresholds
outliers = temp_df.loc[(temp_df['values'] < lower_bound) | (temp_df['values'] > upper_bound), :]

# Show the detected outliers. A notebook only renders a bare expression when
# it is the last statement of the cell, so this mid-cell check has to be
# printed explicitly to be visible.
print(outliers)

# Step 4: keep only the rows inside the thresholds (bounds inclusive)
df_clean = temp_df.loc[(temp_df['values'] >= lower_bound) & (temp_df['values'] <= upper_bound), :]
df_clean

In [None]:
import pandas as pd

def remove_outliers_iqr(df, column, factor=1.5):
    """Split ``df`` into inlier and outlier rows using the IQR rule on ``column``.

    The thresholds are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. The quartiles are computed over the whole column, not
    per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column the rule is applied to.
    factor : float, default 1.5
        Multiplier of the IQR that sets how far beyond the quartiles a value
        may lie before it counts as an outlier. The default reproduces the
        previously hard-coded 1.5.

    Returns
    -------
    df_clean : pandas.DataFrame
        Rows whose value lies within the thresholds (inclusive), with the
        index reset to 0..n-1.
    outliers : pandas.DataFrame
        Rows whose value lies strictly outside the thresholds, keeping their
        original index labels.

    Notes
    -----
    Rows where ``column`` is NaN fail every comparison below, so they appear
    in neither returned frame.
    """
    values = df[column]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    # Outliers keep the caller's index so they can be traced back to ``df``.
    outliers = df.loc[(values < lower_bound) | (values > upper_bound), :]
    # The cleaned frame gets a fresh 0..n-1 index.
    df_clean = df.loc[(values >= lower_bound) & (values <= upper_bound), :].reset_index(drop=True)

    return df_clean, outliers

In [None]:
# Select the 2007-2010 sales again, then split them into inlier rows and
# price outliers with the IQR helper.
df = sales[sales['year'].isin([2007, 2008, 2009, 2010])]
df_clean, outliers = remove_outliers_iqr(df=df, column='price')
print(outliers)

In [None]:
# Compare the 2007-2010 price distribution before and after removing the
# IQR-based outliers.
from matplotlib.ticker import ScalarFormatter

fig, ax = plt.subplots(nrows=2, figsize=(10, 12))

sns.boxplot(data = df, x = 'year', y = 'price', hue = 'propertyType', ax = ax[0])
ax[0].set_title("with outliers")

sns.boxplot(data = df_clean, x = 'year', y = 'price', hue = 'propertyType', ax = ax[1])
ax[1].set_title("without outliers")

# Use plain (non-scientific) price labels on BOTH panels so their y-axes read
# the same way. Each panel gets its own formatter instance rather than sharing
# one, since set_major_formatter attaches the formatter to that axis.
for panel in ax:
    formatter = ScalarFormatter()
    formatter.set_scientific(False)
    panel.yaxis.set_major_formatter(formatter)

plt.tight_layout()
plt.show()