#### csv 저장 시 index 함께 저장할 경우에 unnamed 컬럼 삭제 시 명령
- df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

#### 조건에 따라 모든 열값 바꾸기
- df.loc[df["isDefective"] == 1, "isDefective"] = "Y"

#### row option
- pd.set_option('display.max_row', 20)

### 1. Load & merge data

In [6]:
df_sfa = pd.read_csv("SFA_Faulty_Dataset.csv")
df_rwa = pd.read_csv("RWA_Faulty_Dataset.csv")

In [7]:
df = df_sfa.append(df_rwa).reset_index(drop=True)
df = df.drop(columns=["Order", "swcName", "swcVer", "createdDate"], axis=1)

In [227]:
df.loc[df["isDefective"] == 1, "isDefective"] = "Y"
df.loc[df["isDefective"] == 0, "isDefective"] = "N"

In [228]:
# df["isDefective"].value_counts()

N    360
Y     86
Name: isDefective, dtype: int64

### 2. Save merged data

In [1]:
df.to_csv("sbw_faulty_dataset,csv", index=False)

NameError: name 'df' is not defined

### 3. Load merged data to Data Frame

In [3]:
df = pd.read_csv("sbw_faulty_dataset,csv")

In [4]:
df.tail(2)

Unnamed: 0,isDefective,progLength,vocaCnt,volume,difficultyLv,progLv,effortToImpl,timeToImpl,nloc,minParaCnt,maxParaCnt,avgParaCnt,funCnt,minCyclo,maxCyclo,avgCyclo,expLvOfDev,affectedBoundary,revFreq
444,Y,214,57,1248.238463,14.325581,6.082373,17881.7417,993.430095,48,1,1,1.0,2,1,9,5.0,22.393443,1,6
445,N,116,31,574.686772,12.272727,3.815479,7052.97402,391.83189,15,1,1,1.0,0,2,10,5.416667,49.47541,3,35


In [232]:
y = df["isDefective"]
X = df.drop("isDefective", axis=1)

In [233]:
y.replace({"Y":1, "N":-1}, inplace=True)

In [234]:
y.value_counts()

-1    360
 1     86
Name: isDefective, dtype: int64

In [242]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [236]:
train_X, test_X, train_y, test_y = train_test_split(X, y)

In [237]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import *

In [276]:
DTC_mdl = DecisionTreeClassifier(max_depth=20, criterion='entropy', random_state=0).fit(train_X, train_y)
GNB_mdl = GaussianNB().fit(train_X, train_y)

In [277]:
pred_y_DTC = DTC_mdl.predict(test_X)
pred_y_GNB = GNB_mdl.predict(test_X)

In [283]:
from sklearn.model_selection import ParameterGrid
grid = ParameterGrid({"max_depth":[3, 5, 8, 15, 20],
                     "min_samples_leaf":[1, 2, 3, 5 ,10]}) # 총 25개의 파라미터가 포함

In [291]:
best_score = 0 # best score를 초기화할때는 score 가 가질 수 있는 최소값보다 작거나 같아야 함

for parameter in grid:
    model = DecisionTreeClassifier(**parameter).fit(train_X, train_y) # 모델 인스턴스화 작업
    pred_y = model.predict(test_X)
    score = accuracy_score(test_y, pred_y)
    
    if score > best_score:
        best_score = score
        best_parameter = parameter
        best_model = model

In [289]:
best_score, best_model

(0.9196428571428571,
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=20, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'))

In [290]:
# Decision Tree Classifier
print(classification_report(test_y, pred_y_DTC))

              precision    recall  f1-score   support

          -1       0.98      0.94      0.96        93
           1       0.74      0.89      0.81        19

    accuracy                           0.93       112
   macro avg       0.86      0.92      0.88       112
weighted avg       0.94      0.93      0.93       112



In [281]:
cv = KFold(5, shuffle=True, random_state=0)
cross_val_score(DTC_mdl, X, y, scoring="accuracy", cv=cv).mean()

0.9217478152309614

In [297]:
print(classification_report(test_y, pred_y_GNB))

              precision    recall  f1-score   support

          -1       0.84      0.91      0.88        93
           1       0.27      0.16      0.20        19

    accuracy                           0.79       112
   macro avg       0.56      0.54      0.54       112
weighted avg       0.75      0.79      0.76       112



In [298]:
probs = GNB_mdl.predict_proba(test_X)
probs_df = pd.DataFrame(probs, columns=model.classes_, index=test_X.index)
probs_df.tail(2)

Unnamed: 0,-1,1
87,0.000934,0.999066
435,0.942051,0.057949


In [299]:
cut_off_value = 0.2
pred_Y_with_cut_off = 2 * (probs_df.iloc[:, 1] >= cut_off_value) - 1

In [301]:
print(classification_report(test_y, pred_Y_with_cut_off))

              precision    recall  f1-score   support

          -1       0.92      0.71      0.80        93
           1       0.33      0.68      0.44        19

    accuracy                           0.71       112
   macro avg       0.62      0.70      0.62       112
weighted avg       0.82      0.71      0.74       112

