In [None]:
# 1. 표준 라이브러리
import sys
from pathlib import Path
import platform

# 2. 서드파티 라이브러리
import pandas as pd

# 2-2. 시각화
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# 2-3. 통계
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import spearmanr

# 3. 로컬 모듈
sys.path.append(str(Path.cwd().parent))
from utils import DATA_DIR, FONT_DIR

In [5]:
# Korean font setup.
# Per-OS default Korean font. It is only used when the bundled font file below
# cannot be found; previously it was always overwritten by the bundled font.
fallback_font = {
    'Windows': 'Malgun Gothic',
    'Darwin': 'AppleGothic',  # macOS
}.get(platform.system(), 'NanumGothic')  # any other system is treated like Linux

# Draw the minus sign as ASCII '-' instead of the Unicode minus glyph.
# plt.rcParams and mpl.rcParams are the same object, so one assignment is enough.
plt.rcParams['axes.unicode_minus'] = False

# Prefer the font bundled with the project.
font_path = FONT_DIR / 'HakgyoansimBareondotumB.ttf'
if Path(font_path).is_file():
    # Register the file with matplotlib so it can be selected by family name.
    fm.fontManager.addfont(str(font_path))
    font_prop = fm.FontProperties(fname=str(font_path))
    plt.rcParams['font.family'] = font_prop.get_name()
else:
    # Bundled font missing: fall back to the per-OS font instead of failing.
    font_prop = fm.FontProperties(family=fallback_font)
    plt.rcParams['font.family'] = fallback_font

# Colour palette
colors = ["#0A4EE4", "#AED6E0", "#9BC4D5", "#5271B7", "#758A98", "#3E4E5F", "#5D3A99", "#9370DB"]


In [6]:
# Load the processed milk records, parsing the '검정일자' column as dates
# written in YYYY-MM-DD form.
pandas_kwargs = dict(
    parse_dates=['검정일자'],
    date_format='%Y-%m-%d',
)

milk_csv_path = DATA_DIR / 'processed' / 'milk.csv'
milk: pd.DataFrame = pd.read_csv(milk_csv_path, **pandas_kwargs)
milk.head()

Unnamed: 0,농장아이디,개체번호,검정일자,누적착유일(연계),유지방율,유단백,무지고형분,체세포수,MUN,305일유량,305일유지량,305일유단백,305일무지고형분,전산차비유지속성,전산차건유전유량,출생일자,산차,농후사료비(연계),농후사료급여량(연계),공태일수,분만후첫수정일까지일수,질병군,비유단계,계절,농장구분,가격,가격미달,분만간격,분만월령
0,21133,20071210010043,2021-03-18,445,6.6,3.28,8.87,274,11.6,15158.0,509.0,504.0,1346.0,,38.0,2007-12-10,8,0.0,0.0,,236.0,0,2,1,1,1062.31,0,,144.0
1,21133,20071210010043,2021-04-28,486,3.64,3.33,8.91,166,11.0,15158.0,509.0,504.0,1346.0,,38.0,2007-12-10,8,0.0,0.0,,236.0,0,2,1,1,1029.4,0,,144.0
2,21133,20071210010043,2021-05-20,508,2.61,3.35,9.14,156,10.8,15158.0,509.0,504.0,1346.0,,38.0,2007-12-10,8,0.0,0.0,,236.0,0,2,1,1,916.1,0,,144.0
3,21133,20071210010043,2021-06-25,544,3.75,3.49,8.99,307,14.1,15158.0,509.0,504.0,1346.0,,38.0,2007-12-10,8,0.0,0.0,,236.0,0,2,2,1,1036.56,0,,144.0
4,21133,20071210010043,2021-07-07,556,3.68,3.74,9.31,34,16.2,15158.0,509.0,504.0,1346.0,,38.0,2007-12-10,8,0.0,0.0,,236.0,0,2,2,1,1039.7,0,,144.0


In [7]:
# Working alias for the loaded frame: this binds the same object, no copy is made.
df = milk

In [8]:
# Summary statistics of df; as the cell's last expression, the notebook renders the result.
df.describe()

Unnamed: 0,농장아이디,개체번호,검정일자,누적착유일(연계),유지방율,유단백,무지고형분,체세포수,MUN,305일유량,305일유지량,305일유단백,305일무지고형분,전산차비유지속성,전산차건유전유량,산차,농후사료비(연계),농후사료급여량(연계),공태일수,분만후첫수정일까지일수,질병군,비유단계,계절,농장구분,가격,가격미달,분만간격,분만월령
count,40264.0,40264.0,40264,40264.0,40264.0,40264.0,40264.0,40264.0,40264.0,31766.0,40264.0,40264.0,31766.0,17618.0,24721.0,40264.0,40264.0,40264.0,24538.0,37533.0,40264.0,40264.0,40264.0,40264.0,40264.0,40264.0,16195.0,40263.0
mean,20909.488774,20165020000000.0,2021-01-21 09:32:45.587124736,197.482515,4.199805,3.325569,8.882036,178.330245,14.438876,10815.97469,345.534944,272.757376,943.111566,75.316512,24.522285,2.291799,810.311569,2.321714,126.411199,92.329657,0.001515,2.697025,2.563183,0.268081,961.174764,0.106994,438.400988,43.930234
min,20249.0,20071210000000.0,2020-01-01 00:00:00,0.0,0.66,1.51,4.1,0.0,0.0,2416.0,0.0,0.0,174.0,11.8,2.4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,300.0,0.0,181.0,19.0
25%,20269.0,20151210000000.0,2020-08-19 00:00:00,90.0,3.65,3.09,8.6,33.0,11.9,9469.0,272.0,241.0,825.0,61.7,19.5,1.0,0.0,0.0,51.0,59.0,0.0,2.0,2.0,0.0,998.5,0.0,364.0,26.0
50%,21137.0,20170420000000.0,2021-02-02 00:00:00,181.0,4.16,3.3,8.88,70.0,14.0,10737.0,395.0,321.0,934.0,74.2,24.6,2.0,0.0,0.0,106.0,79.0,0.0,3.0,3.0,0.0,1060.3,0.0,407.0,38.0
75%,21276.0,20180410000000.0,2021-07-06 00:00:00,280.0,4.7,3.54,9.16,164.0,16.6,12132.75,481.0,376.0,1059.0,88.2,29.7,3.0,0.0,0.0,182.0,111.0,0.0,4.0,3.0,1.0,1075.75,0.0,489.0,55.0
max,21876.0,20210100000000.0,2022-02-04 00:00:00,1050.0,9.99,8.9,13.93,20484.0,57.2,21284.0,1235.0,659.0,1768.0,235.7,58.0,10.0,8400.0,14.0,974.0,538.0,1.0,4.0,4.0,1.0,1075.75,1.0,1167.0,179.0
std,584.671255,19370250000.0,,133.528471,0.930125,0.36005,0.44365,443.933403,3.830026,2023.436154,203.111118,152.419993,177.270165,19.972068,7.809552,1.44734,1971.355382,4.336625,111.593139,51.360161,0.038894,1.069442,1.072019,0.442965,233.757091,0.309109,105.323199,21.302751


In [12]:
# Candidate explanatory columns examined in the correlation analyses.
features = [
    '누적착유일(연계)',
    '전산차비유지속성',
    '전산차건유전유량',
    '산차',
    '농후사료급여량(연계)',
    '공태일수',
    '비유단계',
    '계절',
    '농장구분',
    '분만간격',
    '분만월령',
]

# Column the features are compared against (appears to be a 0/1 flag — confirm).
target = '가격미달'

## 변수들 간의 상관관계 확인해서 상관계수 높은 것들은 제거하거나 처리

In [13]:
# Columns for the correlation heatmap: every feature followed by the target.
# dict.fromkeys removes duplicates while keeping the declared order. The previous
# list(set(...)) produced an arbitrary order that could change between kernel
# sessions, reshuffling the heatmap axes from run to run.
# NOTE(review): `vars` shadows the builtin of the same name; the name is kept
# because later cells read it.
vars = list(dict.fromkeys(features + [target]))

# Pairwise Spearman rank correlations, rounded for readable cell annotations.
corr_matrix = df[vars].corr(method='spearman').round(3)

fig_corr = px.imshow(
    corr_matrix,
    text_auto=True,  # print each coefficient inside its cell
    color_continuous_scale="RdBu_r",
    title="Correlation Matrix (Plotly)",
    template="plotly_dark"
)
fig_corr.update_layout(
    xaxis_title="Features",
    yaxis_title="Features",
    width=1200, height=1000
)
fig_corr.show()

## 다중공선성 조정

In [14]:
# Column pairs noted by the author for the multicollinearity adjustment
# (presumably the strongly correlated pairs from the heatmap — confirm):
#   공태일수        - 누적착유일(연계)
#   산차            - 분만월령
#   305일무지고형분 - 305일유량

# All columns considered for the VIF check, in display order.
_vif_candidates = (
    '누적착유일(연계)',
    '전산차비유지속성',
    '산차',
    '농후사료급여량(연계)',
    '공태일수',
    '305일유량',
    '농장구분',
    '분만간격',
    '분만월령',
    '비유단계',
    '계절',
)

# Columns currently switched off; remove a name here to bring it back.
_vif_excluded = {
    '누적착유일(연계)',
    '305일유량',
    '분만간격',
    '분만월령',
}

vars = [column for column in _vif_candidates if column not in _vif_excluded]

In [15]:
# Complete cases only: any row with a missing value in a selected column is dropped.
X = df[vars].dropna()

# variance_inflation_factor regresses one column on the remaining columns exactly
# as given and does not add an intercept. Without a constant column the result is
# the uncentered VIF, which is inflated and not comparable with the usual 5 / 10
# guide lines drawn below. Append a constant as the LAST column so that indices
# 0..len(vars)-1 still map onto the features, and pass a plain float ndarray.
design = X.assign(const=1.0).to_numpy(dtype=float)

vif_data = pd.DataFrame({
    "Feature": vars,
    # The constant column itself is skipped; only the features are reported.
    "VIF": [variance_inflation_factor(design, i) for i in range(X.shape[1])]
})

fig_vif = px.bar(
    vif_data.sort_values("VIF", ascending=True),
    x="VIF",
    y="Feature",
    orientation="h",
    title="Variance Inflation Factor (VIF) — 다중공선성 진단",
    template="plotly_dark",
    color="VIF",
    color_continuous_scale="Viridis"
)
# Conventional guide lines: 5 as a caution level and 10 as a severe level.
fig_vif.add_vline(x=5, line_dash="dash", line_color="orange",
                  annotation_text="주의 경계선 (VIF=5)")
fig_vif.add_vline(x=10, line_dash="dot", line_color="red",
                  annotation_text="심각 (VIF=10)")
fig_vif.show()

## 스피어만 상관분석

In [16]:
# Effect-size labels as (exclusive upper bound of |r|, label), in ascending order.
# Anything at or above the last bound gets the strongest label.
effect_bands = (
    (0.1, '매우 작은 효과'),
    (0.2, '작은 효과'),
    (0.4, '중간 효과'),
    (0.6, '비교적 큰 효과'),
    (0.8, '큰 효과'),
)

# Print the Spearman rank correlation between each feature and the target,
# followed by a qualitative effect-size label.
for feature in features:
    print('='*50)
    print(f'{feature}와 {target} 상관분석')
    print('='*50)

    # Use only rows where both values are present for this pair.
    temp = df.dropna(subset=[feature, target])
    r, pval = spearmanr(temp[target], temp[feature])
    # The statistic is Spearman's rho; the label previously said "Pearson".
    print(f"Spearman correlation: r = {r:.3f}, p-value = {pval:.5f}")

    if pd.isna(r):
        # spearmanr yields NaN when an input is constant. NaN fails every "<"
        # comparison, so it used to be reported as the strongest effect.
        effect = '계산 불가'
    else:
        abs_r = abs(r)
        effect = next(
            (label for bound, label in effect_bands if abs_r < bound),
            '매우 큰 효과',
        )
    print(effect)

누적착유일(연계)와 가격미달 상관분석
Pearson correlation: r = 0.060, p-value = 0.00000
매우 작은 효과
전산차비유지속성와 가격미달 상관분석
Pearson correlation: r = -0.064, p-value = 0.00000
매우 작은 효과
전산차건유전유량와 가격미달 상관분석
Pearson correlation: r = -0.051, p-value = 0.00000
매우 작은 효과
산차와 가격미달 상관분석
Pearson correlation: r = 0.154, p-value = 0.00000
작은 효과
농후사료급여량(연계)와 가격미달 상관분석
Pearson correlation: r = -0.004, p-value = 0.39835
매우 작은 효과
공태일수와 가격미달 상관분석
Pearson correlation: r = 0.038, p-value = 0.00000
매우 작은 효과
비유단계와 가격미달 상관분석
Pearson correlation: r = 0.003, p-value = 0.52449
매우 작은 효과
계절와 가격미달 상관분석
Pearson correlation: r = -0.012, p-value = 0.01588
매우 작은 효과
농장구분와 가격미달 상관분석
Pearson correlation: r = -0.006, p-value = 0.26087
매우 작은 효과
분만간격와 가격미달 상관분석
Pearson correlation: r = 0.048, p-value = 0.00000
매우 작은 효과
분만월령와 가격미달 상관분석
Pearson correlation: r = 0.153, p-value = 0.00000
작은 효과
