In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import RANSACRegressor, HuberRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import zscore
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data_route = "C:\\workspace\\project_final\\data\\dataset_sample_10percent.csv"
data = pd.read_csv(data_route, encoding='utf-8')

In [3]:
# Z-score 계산 및 이상치 여부 확인 함수
def calculate_z_scores(df, group_column):
    # 그룹 내에서 가격에 대한 Z-score 계산
    df[f'{group_column}_zscore'] = df.groupby(group_column)['price'].transform(lambda x: zscore(x, ddof=0))
    return df

# 기준 컬럼별로 Z-score 계산
data = calculate_z_scores(data, 'cate2_nm')
data = calculate_z_scores(data, 'supplier_code')
data = calculate_z_scores(data, 'class_name')
data = calculate_z_scores(data, 'brand_name')

# NaN이 있는 행 제거
data = data.dropna(subset=['supplier_code_zscore', 'cate2_nm_zscore', 'class_name_zscore', 'brand_name_zscore'])


In [4]:
def robust_regression_outliers(data, features, target_column, model_type='ransac'):
    X = data[features]
    y = data[target_column]
    
    # 강인 회귀 모델 선택
    if model_type == 'ransac':
        model = RANSACRegressor()
    elif model_type == 'huber':
        model = HuberRegressor()
    else:
        raise ValueError("model_type should be either 'ransac' or 'huber'")
    
    # 모델 학습
    model.fit(X, y)
    y_pred = model.predict(X)
    
    # 잔차 계산
    residuals = y - y_pred
    data['residuals'] = residuals
    
    # 잔차 Z-score 계산
    data['residuals_zscore'] = zscore(residuals)
    
    return data, model


In [5]:
# 적용할 Z-score 컬럼 지정
features = ['supplier_code_zscore', 'cate2_nm_zscore', 'class_name_zscore', 'brand_name_zscore']

# 회귀 모델과 잔차 Z-score 계산
data, model = robust_regression_outliers(data, features, 'price', model_type='ransac')

# 잔차 Z-score를 통해 이상치 여부를 확인하고, 결과 출력
print(data[['price', 'residuals', 'residuals_zscore']])

           price     residuals  residuals_zscore
0        2313000  2.208136e+06          2.909326
1           8000  5.510304e+03         -0.173868
2         199900  1.656339e+05          0.050270
3         986900  8.720556e+05          1.039106
4         183400  1.322065e+05          0.003479
...          ...           ...               ...
1591786    15000 -1.576845e+04         -0.203653
1591787    15000 -1.576845e+04         -0.203653
1591788     5000  7.907922e+03         -0.170512
1591789    10000 -3.930266e+03         -0.187083
1591790     5000  7.907922e+03         -0.170512

[1587168 rows x 3 columns]


In [6]:
outliers = data[data['comprehensive_outlier'] == True]

# 필요한 컬럼만 선택해서 출력
display(outliers[['prd_id', 'supplier_code', 'cate2_nm', 'class_name', 'brand_name', 
                  'supplier_code_zscore', 'cate2_nm_zscore', 'class_name_zscore', 'brand_name_zscore', 
                  'comprehensive_outlier', 'price']])

KeyError: 'comprehensive_outlier'