In [3]:
import pandas as pd

file_path='/Users/gimhagyeong/test/B0005_with_lowess_features.csv'
df=pd.read_csv(file_path)

# 기본 피처 (lowess 파생피처 사용 X)

In [4]:
import numpy as np
import plotly.express as px
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler



# 1. 사용할 feature 선택
features1 = [
    'Voltage_measured', 'Current_measured', 'Temperature_measured',
    'Current_load', 'Voltage_load'
]
X = df[features1]

# 2. 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lof = LocalOutlierFactor(
        n_neighbors=30,
        contamination=0.005   # 예상 이상치 비율
    )

y_pred = lof.fit_predict(X_scaled)
    
lof_scores = -lof.negative_outlier_factor_

    # 결과를 데이터 프레임에 추가

df['lof_pred'] = y_pred
df['lof_scores'] = lof_scores
df['lof_anomaly'] = np.where(y_pred == -1, 1, 0)  # -1을 이상치(Anomaly)로 변환

print("LOF 이상치:", df['lof_anomaly'].sum())



LOF 이상치: 252


In [5]:
contamination=0.005 # 정한 이상치 비율
threshold = np.quantile(lof_scores, 1 - contamination)
print("LOF 임계값(threshold):", threshold)

LOF 임계값(threshold): 1.676333654209548


In [6]:
anom_cycles=df[df['lof_anomaly']==1]['cycle_idx'].unique()
print('Anomalic cycles:', anom_cycles)
print('number of Anomalic cycles:', len(anom_cycles))

Anomalic cycles: [ 46 118 140 144 148 150 154 158 206 274 298 313 348 379 395 431 433 614]
number of Anomalic cycles: 18


In [7]:
cycle_anomaly_ratio = df.groupby('cycle_idx')['lof_anomaly'].mean().reset_index() # 이상 윈도우비율
# cycle_anomaly_ratio
cycle_anomaly_ratio['is_cycle_anomaly'] = (cycle_anomaly_ratio['lof_anomaly'] > 0.1).astype(int)
print("이상치윈도우가 0.1 초과인 싸이클 수:", cycle_anomaly_ratio['is_cycle_anomaly'].sum())
print("이상치윈도우가 0.1 초과인 싸이클 번호:", cycle_anomaly_ratio[cycle_anomaly_ratio['is_cycle_anomaly']==1]['cycle_idx'].values)

이상치윈도우가 0.1 초과인 싸이클 수: 2
이상치윈도우가 0.1 초과인 싸이클 번호: [ 46 158]


In [8]:
cycle_anomaly_ratio = df.groupby('cycle_idx')['lof_anomaly'].mean().reset_index() # 이상 윈도우비율
# cycle_anomaly_ratio
cycle_anomaly_ratio['is_cycle_anomaly'] = (cycle_anomaly_ratio['lof_anomaly'] > 0.05).astype(int)
print("이상치윈도우가 0.05 초과인 싸이클 수:", cycle_anomaly_ratio['is_cycle_anomaly'].sum())
print("이상치윈도우가 0.05 초과인 싸이클 번호:", cycle_anomaly_ratio[cycle_anomaly_ratio['is_cycle_anomaly']==1]['cycle_idx'].values)

이상치윈도우가 0.05 초과인 싸이클 수: 4
이상치윈도우가 0.05 초과인 싸이클 번호: [ 46 150 154 158]


In [9]:
from sklearn.ensemble import IsolationForest

iforest = IsolationForest(
    n_estimators=300,
    contamination=0.005,
    random_state=42
)
y_if = iforest.fit_predict(X_scaled)   # 1 = normal, -1 = anomaly
df['if_anomaly'] = (y_if == -1).astype(int)

both = (df['lof_anomaly'] == 1) & (df['if_anomaly'] == 1)
print("LOF 이상치:", df['lof_anomaly'].sum())
print("IF 이상치 :", df['if_anomaly'].sum())
print("둘 다 이상 :", both.sum())


LOF 이상치: 252
IF 이상치 : 252
둘 다 이상 : 15


In [10]:
both_mask = (df['lof_anomaly'] == 1) & (df['if_anomaly'] == 1)
both_cycles = (
    df.loc[both_mask, 'cycle_idx']   # 공통 이상인 윈도우들의 cycle 번호
      .dropna()
      .unique()
)

both_cycles = np.sort(both_cycles)   # 정렬 (선택)
print("공통 이상 cycle 인덱스들:", both_cycles)
print("공통 이상 cycle 개수:", len(both_cycles))

공통 이상 cycle 인덱스들: [118 140 144 148 150 154 274 313 431 433]
공통 이상 cycle 개수: 10


In [11]:
import plotly.express as px

cycle_df = df.groupby('cycle_idx')['lof_scores'].mean().reset_index()

fig = px.line(
    cycle_df,
    x='cycle_idx',
    y='lof_scores',      
    title='Average LOF Score per Cycle',
    markers=True
)
fig.show()

In [12]:
cycle_anom = df.groupby('cycle_idx')['lof_anomaly'].sum().reset_index()

fig = px.bar(
    cycle_anom,
    x='cycle_idx',
    y='lof_anomaly',
    title='Number of Anomalies per Cycle'
)
fig.show()


In [13]:
cycle_peak = df.groupby('cycle_idx')['lof_scores'].max().reset_index()

fig = px.line(
    cycle_peak,
    x='cycle_idx',
    y='lof_scores',
    title='Peak LOF Score per Cycle',
    markers=True
)
fig.show()

In [14]:
# 히스토그램으로 점수 분포 겹쳐서 그리기
import plotly.express as px

fig_score = px.histogram(df, x='lof_scores', color='lof_anomaly',
                         nbins=100,
                         title='LOF Score 분포: 정상(0) vs 이상(1)',
                         labels={'lof_anomaly': 'Label (0:Normal, 1:Anomaly)'},
                         opacity=0.7,
                         log_y=True) # 데이터 개수 차이가 크므로 y축을 로그 스케일로
fig_score.show()

In [15]:
# Boxplot으로 잔차 분포 비교
fig_res = px.box(df, x='lof_anomaly', y='Voltage_measured_residual',
                 color='lof_anomaly',
                 title='정상 vs 이상: 전압 잔차(Residual) 크기 비교',
                 points='outliers') # 모든 점을 찍지 않고 이상치만 점으로 표시
fig_res.show()

In [16]:
# Cycle별 이상치 빈도 (Aging Correlation)
cycle_anomaly_counts = df[df['lof_anomaly'] == 1]['cycle_idx'].value_counts().sort_index()

fig_cycle = px.bar(
    x=cycle_anomaly_counts.index,
    y=cycle_anomaly_counts.values,
    title='<b>[Temporal Trend] Anomaly Frequency per Cycle</b>',
    labels={'x': 'Cycle Index', 'y': 'Anomaly Count'},
    color=cycle_anomaly_counts.values,
    color_continuous_scale='Viridis'
)

fig_cycle.update_layout(xaxis_title="Battery Aging (Cycle)", yaxis_title="Number of Anomalies")
fig_cycle.show()

In [17]:
import plotly.graph_objects as go

# 158번 사이클(이상치 폭발) vs 86번 사이클(정상) 비교
# 1. 비교 대상 자동 선택
# (1) 이상치 사이클: 사용자가 지정한 134번
bad_cycle_idx = 158

# (2) 정상 사이클: 데이터 개수가 가장 많은(끊김 없는) 정상 사이클 하나를 자동 선택
# 정상 데이터(Label 0) 중에서 사이클별 데이터 개수를 센다
normal_counts = df[df['lof_anomaly'] == 0]['cycle_idx'].value_counts()
# 가장 데이터가 풍부한 사이클 번호를 가져옵니다.
normal_cycle_idx = normal_counts.index[0]

print(f"=== 비교 대상 선정 ===")
print(f"이상치 사이클: {bad_cycle_idx}번")
print(f"정상 사이클 (자동선택): {normal_cycle_idx}번")

# 2. 데이터 추출
cycle_bad = df[df['cycle_idx'] == bad_cycle_idx]
cycle_normal = df[df['cycle_idx'] == normal_cycle_idx]

# 데이터 개수 확인 (혹시 0개인지 체크)
print(f" - 이상치 데이터 개수: {len(cycle_bad)}개")
print(f" - 정상 데이터 개수: {len(cycle_normal)}개")

# 3. 그래프 그리기
if len(cycle_normal) > 0 and len(cycle_bad) > 0:
    fig_compare = go.Figure()

    # 정상 사이클 (파란색 실선)
    fig_compare.add_trace(go.Scatter(
        x=np.arange(len(cycle_normal)),
        y=cycle_normal['Voltage_measured'],
        mode='lines',
        name=f'Normal Cycle ({normal_cycle_idx})',
        line=dict(color='blue', width=2),
        opacity=0.7
    ))

    # 이상치 사이클 (빨간색 점선)
    fig_compare.add_trace(go.Scatter(
        x=np.arange(len(cycle_bad)),
        y=cycle_bad['Voltage_measured'],
        mode='lines',
        name=f'Anomaly Cycle ({bad_cycle_idx})',
        line=dict(color='red', width=3, dash='dot') # 두께를 키움
    ))

    fig_compare.update_layout(
        title=f'<b>[Comparison] Normal vs Anomaly (Cycle {normal_cycle_idx} vs {bad_cycle_idx})</b>',
        xaxis_title='Time Step (Sequence)',
        yaxis_title='Voltage (V)',
        template='plotly_white',
        hovermode="x unified"
    )

    fig_compare.show()
else:
    print("오류: 선택된 사이클에 데이터가 없습니다.")

=== 비교 대상 선정 ===
이상치 사이클: 158번
정상 사이클 (자동선택): 86번
 - 이상치 데이터 개수: 353개
 - 정상 데이터 개수: 371개


In [18]:
# 누적 이상치 그래프 (Cumulative Anomaly Count)
# 시간이 지날수록 이상치가 얼마나 '쌓이는가'를 봅니다.

# 사이클별 이상치 개수 집계 (없는 사이클은 0으로 채움)
all_cycles = pd.DataFrame({'cycle_idx': df['cycle_idx'].unique()})
anomaly_counts = df[df['lof_anomaly'] == 1]['cycle_idx'].value_counts().reset_index()
anomaly_counts.columns = ['cycle_idx', 'count']

# 병합 및 정렬
trend_df = pd.merge(all_cycles, anomaly_counts, on='cycle_idx', how='left').fillna(0)
trend_df = trend_df.sort_values('cycle_idx')

# 누적 합 계산
trend_df['cumulative_count'] = trend_df['count'].cumsum()

# 시각화
fig_trend = px.area(
    trend_df,
    x='cycle_idx',
    y='cumulative_count',
    title='<b>[Cumulative Trend] Total Anomalies Over Time</b>',
    labels={'cumulative_count': 'Accumulated Anomalies', 'cycle_idx': 'Cycle Index'}
)

fig_trend.update_layout(template='plotly_white')
fig_trend.show()

In [19]:
# Feature Contribution (원인 분석)
# 로직: (이상치 그룹의 평균 - 정상 그룹의 평균) / 데이터 전체의 표준편차
# 의미: "이 피처가 정상보다 몇 시그마(Standard Deviation)만큼 벗어나 있는가?"

# 데이터 분리
normal_df = df[df['lof_anomaly'] == 0][features1]
anomaly_df = df[df['lof_anomaly'] == 1][features1]

# 차이 계산 (Z-score 관점의 차이)
diff_series = (anomaly_df.mean() - normal_df.mean()) / df[features1].std()
diff_df = diff_series.reset_index()
diff_df.columns = ['Feature', 'Deviation_Score']
diff_df = diff_df.sort_values(by='Deviation_Score', key=abs, ascending=True) # 절대값 크기순 정렬

# 시각화
fig_reason = px.bar(
    diff_df,
    x='Deviation_Score',
    y='Feature',
    orientation='h',
    title='<b>이상치 원인분석 (Feature Deviation)</b>',
    color='Deviation_Score',
    color_continuous_scale='RdBu_r', # 빨강: 높아서 문제, 파랑: 낮아서 문제
    text_auto='.2f'
)

fig_reason.add_vline(x=0, line_width=2, line_color='black')
fig_reason.update_layout(height=800)
fig_reason.show()

In [20]:
from sklearn.preprocessing import StandardScaler  
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance 
import pandas as pd

features = [
    'Voltage_measured', 'Current_measured', 'Temperature_measured',
    'Current_load', 'Voltage_load'
]

X = df[features]  
y = df['lof_anomaly'].astype(int)

scaler = StandardScaler()  
X_scaled = scaler.fit_transform(X)

rf_model = RandomForestClassifier(random_state=42,class_weight='balanced')  # 이상치가 적으므로 class_weight로 가중치 조절
rf_model.fit(X_scaled, y)

perm_result = permutation_importance(
    rf_model,
    X_scaled,
    y,
    n_repeats=10,
    random_state=42,
    scoring='f1'  # F1은 불균형 데이터에서 더 안정적
)
				
# importance_df 결과 정리
importance_df = pd.DataFrame({  
    'feature': features,
    'importance': perm_result.importances_mean,  
    'std': perm_result.importances_std  
}).sort_values('importance', ascending=False)
				
print("\n=== Permutation Importance ===")
print(importance_df.head())

# importance_df 시각화

import plotly.express as px

fig = px.bar(
    importance_df.head(),  
    x='importance',  
    y='feature', 
    orientation='h',  
    title='Permutation Importance Result'
)

fig.update_layout(
    yaxis={'categoryorder': 'total ascending'}  # 큰 값이 위로
)

fig.show()


=== Permutation Importance ===
                feature  importance       std
2  Temperature_measured    0.924774  0.014449
0      Voltage_measured    0.816497  0.023688
4          Voltage_load    0.809647  0.023453
1      Current_measured    0.337335  0.012284
3          Current_load    0.288725  0.021667


In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor
import plotly.graph_objects as go


# 1. Train/Validation/Test 분할 (6:2:2)

# 고유 cycle 개수
total_cycles = df['cycle_idx'].nunique()

# 6:2:2 split 기준 cycle 개수
train_cycles = int(total_cycles * 0.6)
val_cycles = int(total_cycles * 0.8)  # train + val

# 정렬된 cycle 리스트
cycle_list = sorted(df['cycle_idx'].unique())

# 기준 cycle 번호
train_threshold_cycle = cycle_list[train_cycles - 1]
val_threshold_cycle = cycle_list[val_cycles - 1]

print(f"총 Cycle 수: {total_cycles}")
print(f"Train Cycles: 0 ~ {train_threshold_cycle} ({train_cycles}개)")
print(f"Validation Cycles: {train_threshold_cycle+1} ~ {val_threshold_cycle} ({val_cycles - train_cycles}개)")
print(f"Test Cycles: {val_threshold_cycle+1} ~ {cycle_list[-1]} ({total_cycles - val_cycles}개)")

# 1. Train / Validation / Test split
train_df = df[df['cycle_idx'] <= train_threshold_cycle].copy()
val_df = df[(df['cycle_idx'] > train_threshold_cycle) & (df['cycle_idx'] <= val_threshold_cycle)].copy()
test_df = df[df['cycle_idx'] > val_threshold_cycle].copy()

print(f"\nTrain size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")


# 2. Feature 준비

features1 = [
    'Voltage_measured', 'Current_measured', 'Temperature_measured',
    'Current_load', 'Voltage_load'
]

X_train = train_df[features1]
X_val = val_df[features1]
X_test = test_df[features1]


# 3. Scaler: Train만으로 학습

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("\n Scaler 학습 완료 (Train 기준)")


# 4. LOF: Train만으로 학습

lof = LocalOutlierFactor(
    n_neighbors=30,
    contamination='auto',
    novelty=True
)

lof.fit(X_train_scaled)

print("LOF 모델 학습 완료 (Train 기준)")


# 5. 점수 계산 (Train/Validation/Test)

train_scores = -lof.negative_outlier_factor_
val_scores = -lof.score_samples(X_val_scaled)
test_scores = -lof.score_samples(X_test_scaled)

print(f"\n LOF 점수 범위:")
print(f"   Train: {train_scores.min():.4f} ~ {train_scores.max():.4f}")
print(f"   Validation: {val_scores.min():.4f} ~ {val_scores.max():.4f}")
print(f"   Test: {test_scores.min():.4f} ~ {test_scores.max():.4f}")


# 6. Validation으로 Quantile 기반 Threshold 찾기

# 여러 quantile 시도
quantiles = [0.99, 0.995, 0.999]

print("\n" + "="*70)
print(" Quantile 기반 Threshold 비교 (Validation 기준)")
print("="*70)

quantile_results = []

for q in quantiles:
    # Threshold 계산
    thr = np.quantile(val_scores, q)
    
    # 각 데이터셋의 이상치 개수
    val_anom = (val_scores >= thr).sum()
    test_anom = (test_scores >= thr).sum()
    
    quantile_results.append({
        'quantile': q,
        'threshold': thr,
        'val_count': val_anom,
        'val_pct': val_anom / len(val_df) * 100,
        'test_count': test_anom,
        'test_pct': test_anom / len(test_df) * 100
    })
    
    print(f"\nQuantile {q:.3f} (상위 {(1-q)*100:.1f}%):")
    print(f"   Threshold: {thr:.4f}")
    print(f"   Validation 이상치개수/비율: {val_anom}개 ({val_anom/len(val_df)*100:.2f}%)")
    print(f"   Test 이상치개수/비율: {test_anom}개 ({test_anom/len(test_df)*100:.2f}%)")





총 Cycle 수: 168
Train Cycles: 0 ~ 352 (100개)
Validation Cycles: 353 ~ 483 (34개)
Test Cycles: 484 ~ 614 (34개)

Train size: 29361
Validation size: 10664
Test size: 10260

 Scaler 학습 완료 (Train 기준)
LOF 모델 학습 완료 (Train 기준)

 LOF 점수 범위:
   Train: 0.9562 ~ 6.8677
   Validation: 0.9640 ~ 3.9085
   Test: 0.9681 ~ 6.1001

 Quantile 기반 Threshold 비교 (Validation 기준)

Quantile 0.990 (상위 1.0%):
   Threshold: 2.3855
   Validation 이상치개수/비율: 107개 (1.00%)
   Test 이상치개수/비율: 1797개 (17.51%)

Quantile 0.995 (상위 0.5%):
   Threshold: 2.6356
   Validation 이상치개수/비율: 54개 (0.51%)
   Test 이상치개수/비율: 978개 (9.53%)

Quantile 0.999 (상위 0.1%):
   Threshold: 3.2174
   Validation 이상치개수/비율: 11개 (0.10%)
   Test 이상치개수/비율: 215개 (2.10%)


✔ 0.99 threshold는 너무 낮아서 test에서 과도한 이상치가 검출됨

(= 정상도 많이 이상치로 판단하는 false positive 증가)

✔ 0.995는 괜찮지만 test에서 아직 10% 가까이 잡힘

(= threshold가 여전히 낮음)

✔ 0.999는 정상 section에서는 거의 FP 없음

= test에서도 열화 구간만 적당히 잡힘
→ 가장 안정적인 threshold

In [22]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

selected_quantile = 0.999
threshold = np.quantile(val_scores, selected_quantile)


# 1. 점수 & split 정보 붙이기
train_plot = train_df.copy()
val_plot   = val_df.copy()
test_plot  = test_df.copy()

train_plot['score'] = train_scores
val_plot['score']   = val_scores
test_plot['score']  = test_scores

train_plot['split'] = 'train'
val_plot['split']   = 'val'
test_plot['split']  = 'test'

full = pd.concat([train_plot, val_plot, test_plot], ignore_index=True)

# 이상치 여부
full['is_anom'] = full['score'] >= threshold

# 2. cycle 단위 요약 (예: max score, 이상치 비율)
cycle_summary = (
    full.groupby(['split', 'cycle_idx'])
        .agg(
            max_score=('score', 'max'),
            mean_score=('score', 'mean'),
            frac_anom=('is_anom', 'mean')   # 해당 cycle 내 이상치 비율
        )
        .reset_index()
)

# cycle에 하나라도 이상치 있으면 True
cycle_summary['has_anom'] = cycle_summary['frac_anom'] > 0
cycle_summary['has_anom'] = cycle_summary['has_anom'].map({True: 'anomaly cycle', False: 'normal cycle'})

# 3. cycle-wise anomaly plot (max score 기준)
fig = px.scatter(
    cycle_summary,
    x='cycle_idx',
    y='max_score',
    color='split',            # train/val/test 구분
    symbol='has_anom',        # 이상 cycle 표시
    hover_data=['frac_anom', 'mean_score'],
    title='Cycle-wise LOF anomaly (max score per cycle)'
)

# threshold 수평선 추가
fig.add_hline(
    y=thr,
    line_dash='dash',
    line_color='red',
    annotation_text=f'threshold={threshold:.4f}',
    annotation_position='top left'
)

fig.update_layout(xaxis_title='cycle_idx', yaxis_title='max LOF score')
fig.show()


In [23]:
import plotly.graph_objects as go

fig_hist = go.Figure()

# Histogram - Validation
fig_hist.add_trace(go.Histogram(
    x=val_scores,
    name="Validation",
    opacity=0.6,
    marker_color='blue',
    nbinsx=50
))

# Histogram - Test
fig_hist.add_trace(go.Histogram(
    x=test_scores,
    name="Test",
    opacity=0.6,
    marker_color='orange',
    nbinsx=50
))

# Threshold 표시
fig_hist.add_vline(
    x=threshold,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Threshold={threshold:.3f}",
)

fig_hist.update_layout(
    title="Histogram: Validation vs Test LOF Scores",
    barmode='overlay',
    xaxis_title="LOF anomaly score",
    yaxis_title="Count"
)

fig_hist.show()


In [24]:
import plotly.figure_factory as ff

fig_kde = ff.create_distplot(
    [val_scores, test_scores], 
    group_labels=["Validation", "Test"],
    show_hist=False,     # 히스토그램 숨김
    show_rug=False       # rug 숨김
)

# Threshold 선 표시
fig_kde.add_vline(
    x=threshold,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Threshold={threshold:.3f}"
)

fig_kde.update_layout(
    title="KDE Density: Validation vs Test LOF Scores",
    xaxis_title="LOF anomaly score",
    yaxis_title="Density"
)

fig_kde.show()


In [25]:
full

Unnamed: 0,Voltage_measured,Current_measured,Temperature_measured,Current_load,Voltage_load,cycle_idx,Voltage_measured_smooth,Voltage_measured_residual,Voltage_measured_trend,Current_measured_smooth,...,Voltage_load_smooth,Voltage_load_residual,Voltage_load_trend,lof_pred,lof_scores,lof_anomaly,if_anomaly,score,split,is_anom
0,4.191492,-0.004902,24.330034,-0.0006,0.000,2,4.191492,0.000000,-0.000743,-2.012707,...,0.000,0.0,4.2060,1,1.024591,0,1,1.002931,train,False
1,4.190749,-0.001478,24.325993,-0.0006,4.206,2,4.190749,0.000000,-0.108310,-2.012723,...,4.206,0.0,1.5310,1,1.037682,0,1,1.007765,train,False
2,3.974871,-2.012528,24.389085,-1.9982,3.062,2,3.974871,0.000000,-0.119516,-2.012746,...,3.062,0.0,-0.5880,1,1.005383,0,0,1.005709,train,False
3,3.951717,-2.013979,24.544752,-1.9982,3.030,2,3.951717,0.000000,-0.020259,-2.012799,...,3.030,0.0,-0.0255,1,1.018462,0,0,1.023338,train,False
4,3.934352,-2.011144,24.731385,-1.9982,3.011,2,3.934352,0.000000,-0.015829,-2.012904,...,3.011,0.0,-0.0195,1,1.017859,0,0,1.016981,train,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50280,3.579262,-0.001569,34.864823,0.0006,0.000,614,3.578546,0.000716,0.003035,-0.000686,...,0.000,0.0,0.0000,1,1.405421,0,0,2.875240,test,False
50281,3.581964,-0.003067,34.814770,0.0006,0.000,614,3.581578,0.000386,0.003028,-0.000689,...,0.000,0.0,0.0000,1,1.311484,0,0,3.066022,test,False
50282,3.584484,-0.003079,34.676258,0.0006,0.000,614,3.584602,-0.000118,0.003018,-0.000717,...,0.000,0.0,0.0000,1,1.469193,0,0,3.452818,test,True
50283,3.587336,0.001219,34.565580,0.0006,0.000,614,3.587614,-0.000278,0.003007,-0.000761,...,0.000,0.0,0.0000,1,1.626031,0,0,3.791177,test,True


In [26]:
from sklearn.preprocessing import StandardScaler  
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance 
import pandas as pd

features = [
    'Voltage_measured', 'Current_measured', 'Temperature_measured',
    'Current_load', 'Voltage_load'
]

X = full[features]  
y = full['is_anom'].astype(int)

scaler = StandardScaler()  
X_scaled = scaler.fit_transform(X)

rf_model = RandomForestClassifier(random_state=42,class_weight='balanced')  # 이상치가 적으므로 class_weight로 가중치 조절
rf_model.fit(X_scaled, y)

perm_result = permutation_importance(
    rf_model,
    X_scaled,
    y,
    n_repeats=10,
    random_state=42,
    scoring='f1'  # F1은 불균형 데이터에서 더 안정적
)
				
# importance_df 결과 정리
importance_df = pd.DataFrame({  
    'feature': features,
    'importance': perm_result.importances_mean,  
    'std': perm_result.importances_std  
}).sort_values('importance', ascending=False)
				
print("\n=== Permutation Importance ===")
print(importance_df.head())

# importance_df 시각화

import plotly.express as px

fig = px.bar(
    importance_df.head(),  
    x='importance',  
    y='feature', 
    orientation='h',  
    title='Permutation Importance Result'
)

fig.update_layout(
    yaxis={'categoryorder': 'total ascending'}  # 큰 값이 위로
)

fig.show()


=== Permutation Importance ===
                feature  importance       std
1      Current_measured    0.599999  0.014329
0      Voltage_measured    0.577707  0.034668
2  Temperature_measured    0.563521  0.019233
4          Voltage_load    0.370892  0.016343
3          Current_load    0.031885  0.005758
