In [2]:
from google.colab import drive

# Mount Google Drive into the runtime's filesystem; the data paths used by
# later cells live underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
pip install seaborn

In [None]:
pip install imbalanced-learn

In [3]:
import os
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from tqdm import tqdm

In [1]:
# --- Configuration -----------------------------------------------------------
# RANDOM_STATE is the single seed reused by every stochastic step below.
RANDOM_STATE = 110
ROOT_DIR = "/content/drive/MyDrive/LG_AIMERS/data"

# --- Load the raw train / test tables ----------------------------------------
train_csv_path = os.path.join(ROOT_DIR, "train.csv")
test_csv_path = os.path.join(ROOT_DIR, "test.csv")

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# Bar chart of the relative frequency of each value in the 'target' column.
target_counts = train_data['target'].value_counts(normalize=True)

fig, ax = plt.subplots(figsize=(6, 4))
target_counts.plot(kind='bar', color=['blue', 'orange'], ax=ax)
ax.set_title('Distribution of Normal and AbNormal in target')
ax.set_xlabel('Class')
ax.set_ylabel('Proportion')
# Keep the class names horizontal (pandas rotates bar labels by default).
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()

# Remove every column that contains at least one NaN in BOTH the train and the
# test table. Columns with NaNs in only one of the two tables are kept.
train_has_nan = train_data.isna().any()
test_has_nan = test_data.isna().any()

nan_columns_train = train_has_nan[train_has_nan].index.tolist()
nan_columns_test = test_has_nan[test_has_nan].index.tolist()

test_nan_lookup = set(nan_columns_test)
common_nan_columns = [col for col in nan_columns_train if col in test_nan_lookup]

train_data = train_data.drop(columns=common_nan_columns)
test_data = test_data.drop(columns=common_nan_columns)

# Separate the label from the features.
train_y = train_data["target"]
train_x = train_data.drop(columns=["target"])

# Keep the test identifiers aside for the submission file, then strip the
# identifier (and a 'target' column, if one exists) from the test features.
test_set_id = test_data['Set ID']
test_x = test_data.drop(columns=["Set ID", "target"], errors='ignore')

# Partition the feature columns by dtype, as seen in the training frame.
numeric_cols = train_x.select_dtypes(include=['number']).columns
categorical_cols = train_x.select_dtypes(include=['object']).columns

# Median for numeric columns, mode for categorical columns.
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Each imputer learns its fill values from train only and is then applied
# unchanged to test.
for _imputer, _cols in (
    (numeric_imputer, numeric_cols),
    (categorical_imputer, categorical_cols),
):
    train_x[_cols] = _imputer.fit_transform(train_x[_cols])
    test_x[_cols] = _imputer.transform(test_x[_cols])

# Encode categorical variables
def preprocess_data(df, categorical_cols, label_encoders=None):
    """Integer-encode the given categorical columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``categorical_cols`` are overwritten with integer codes.
    categorical_cols : iterable of str
        Names of the columns to encode.
    label_encoders : dict, optional
        Registry mapping column name -> fitted ``LabelEncoder``. For a column
        that is missing from the registry a new encoder is fitted on ``df``
        and stored in it; for a column that is already present the stored
        encoder's classes are reused, and any value that encoder has not seen
        is encoded as -1. When omitted, a throwaway registry is used, so every
        column is fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns encoded.
    """
    if label_encoders is None:
        label_encoders = {}

    for column in categorical_cols:
        encoder = label_encoders.get(column)
        if encoder is None:
            encoder = LabelEncoder()
            df[column] = encoder.fit_transform(df[column])
            label_encoders[column] = encoder
        else:
            # LabelEncoder.transform raises on labels it was not fitted on, so
            # look the codes up manually and send unseen labels to -1.
            code_of = {label: code for code, label in enumerate(encoder.classes_)}
            df[column] = df[column].map(code_of).fillna(-1).astype(int)
    return df

# Fit the encoders on train and reuse them for test. Fitting a separate
# encoder per split would let the same category receive different integer
# codes in train and test.
categorical_encoders = {}
train_x = preprocess_data(train_x, categorical_cols, categorical_encoders)
test_x = preprocess_data(test_x, categorical_cols, categorical_encoders)

# Additional Feature Engineering

WORKMODE_COL = 'WorkMode Collect Result_Fill2'
# Derived columns that get one-hot encoded and then removed in their raw form.
ONE_HOT_SOURCE_COLS = ['Combined_Factor', 'Model_Equipment_Combined', 'WorkMode_Binned']


def _add_engineered_features(df):
    """Add the derived feature columns to ``df`` in place and return it.

    Columns added:
      * ``WorkMode_High``            - 1 where ``WORKMODE_COL`` > 2.0, else 0.
      * ``Model_Equipment_Combined`` - "<Model.Suffix_Dam>_<Equipment_Dam>".
      * ``WorkMode_Binned``          - ``WORKMODE_COL`` cut into Low / Medium /
        High at the edges 1.0 and 2.0 (right-inclusive).
      * ``Combined_Factor``          - "<Equipment_Dam>_<Model.Suffix_Dam>_<WorkMode_High>"
        as a string; integer-encoded by the caller.
    """
    model_suffix = df['Model.Suffix_Dam'].astype(str)
    equipment = df['Equipment_Dam'].astype(str)

    df['WorkMode_High'] = (df[WORKMODE_COL] > 2.0).astype(int)
    df['Model_Equipment_Combined'] = model_suffix + "_" + equipment
    df['WorkMode_Binned'] = pd.cut(
        df[WORKMODE_COL],
        bins=[-np.inf, 1.0, 2.0, np.inf],
        labels=['Low', 'Medium', 'High'],
    )
    df['Combined_Factor'] = (
        equipment + '_' + model_suffix + '_' + df['WorkMode_High'].astype(str)
    )
    return df


train_x = _add_engineered_features(train_x)
test_x = _add_engineered_features(test_x)

# Integer-encode Combined_Factor with an encoder fitted on train only.
# LabelEncoder.transform raises on labels it has not seen, so test values are
# looked up manually and combinations absent from train become -1.
combined_factor_le = LabelEncoder()
train_x['Combined_Factor'] = combined_factor_le.fit_transform(train_x['Combined_Factor'])
combined_factor_codes = {
    label: code for code, label in enumerate(combined_factor_le.classes_)
}
test_x['Combined_Factor'] = (
    test_x['Combined_Factor'].map(combined_factor_codes).fillna(-1).astype(int)
)

# One-hot encode the derived columns.
# * `columns=` is passed explicitly because get_dummies leaves numeric columns
#   untouched by default; the integer-coded Combined_Factor would otherwise be
#   passed through as a duplicate column and removed again by the drop below.
# * drop_first is applied to train only. Test is encoded in full and then
#   reindexed onto the train dummy columns, so both frames end up with the same
#   columns in the same order; categories that train never produced (or that
#   train dropped as its baseline) become all-zero rows.
train_dummies = pd.get_dummies(
    train_x[ONE_HOT_SOURCE_COLS], columns=ONE_HOT_SOURCE_COLS, drop_first=True
)
test_dummies = pd.get_dummies(
    test_x[ONE_HOT_SOURCE_COLS], columns=ONE_HOT_SOURCE_COLS
).reindex(columns=train_dummies.columns, fill_value=0)

# Replace the raw derived columns with their one-hot expansion.
train_x = pd.concat([train_x.drop(columns=ONE_HOT_SOURCE_COLS), train_dummies], axis=1)
test_x = pd.concat([test_x.drop(columns=ONE_HOT_SOURCE_COLS), test_dummies], axis=1)

# Split data into training and validation sets FIRST.
# Anything that is fitted (scaler statistics, SMOTE neighbours) must only see
# the training fold. If the data is oversampled before splitting, synthetic
# training rows are interpolated from rows that later land in the validation
# fold, and validation itself becomes artificially class-balanced, so the
# reported validation scores are inflated.
X_train, X_val, y_train, y_val = train_test_split(
    train_x, train_y, test_size=0.3, stratify=train_y, random_state=RANDOM_STATE
)

# Scaling features: statistics come from the training fold only and are then
# applied unchanged to the validation fold and the test set.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_val = pd.DataFrame(
    scaler.transform(X_val), columns=X_val.columns, index=X_val.index
)
test_x = pd.DataFrame(scaler.transform(test_x), columns=test_x.columns)

# Balancing dataset using SMOTE: oversample the training fold only, so the
# validation fold keeps the original class distribution.
smote = SMOTE(random_state=RANDOM_STATE)
X_train, y_train = smote.fit_resample(X_train, y_train)

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Base learners: Random Forest, Gradient Boosting and k-nearest neighbours.
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
gb_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
knn_model = KNeighborsClassifier()

# Soft-voting ensemble: the final prediction is based on the members'
# predicted class probabilities rather than on their hard votes.
ensemble_members = [
    ('rf', rf_model),
    ('gb', gb_model),
    ('knn', knn_model),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

# Train the ensemble.
voting_clf.fit(X_train, y_train)

# Predict on the validation split.
val_predictions = voting_clf.predict(X_val)

# F1 score with "AbNormal" treated as the positive class.
f1 = f1_score(y_val, val_predictions, pos_label="AbNormal")
print(f"Voting Classifier F1 Score: {f1:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_val, val_predictions))

# Confusion matrix heatmap (rows = actual, columns = predicted).
class_labels = ["Normal", "AbNormal"]
cm = confusion_matrix(y_val, val_predictions, labels=class_labels)

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Voting Classifier Confusion Matrix")
plt.show()


NameError: name 'pd' is not defined

In [None]:
# Predict on the test set and save the results.
test_predictions = voting_clf.predict(test_x)

# Build the submission frame: the saved 'Set ID' column, the predictions under
# 'Prediction', and the same predictions again under 'target'.
output = pd.DataFrame(
    {'Set ID': test_set_id, 'Prediction': test_predictions}
).assign(target=test_predictions)

# Write the submission file (the 'Set ID' column is included, the index is not).
output.to_csv("submission.csv", index=False)