In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import random
import seaborn as sns
import warnings
import time
from collections import Counter
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from scipy import stats
# ignore python warnings
warnings.filterwarnings("ignore")

In [2]:
seed = 101

In [3]:
featureNames= ['Region-centroid-col', 'Region-centroid-row', 'Region-pixel-count',
       'Short-line-density-5', 'Short-line-density-2', 'Vedge-mean',
       'Vedge-sd', 'Hedge-mean', 'Hedge-sd', 'Intensity-mean',
       'Rawred-mean', 'Rawblue-mean', 'Rawgreen-mean', 'Exred-mean',
       'Exblue-mean', 'Exgreen-mean', 'Value-mean', 'Saturation-mean',
       'Hue-mean', 'class']
df = pd.read_csv(r'dataset/segment.csv', header=None, names=featureNames)

In [4]:
k = df.describe()

In [5]:
df.head()

Unnamed: 0,Region-centroid-col,Region-centroid-row,Region-pixel-count,Short-line-density-5,Short-line-density-2,Vedge-mean,Vedge-sd,Hedge-mean,Hedge-sd,Intensity-mean,Rawred-mean,Rawblue-mean,Rawgreen-mean,Exred-mean,Exblue-mean,Exgreen-mean,Value-mean,Saturation-mean,Hue-mean,class
0,218,178,9,0.111111,0.0,0.833333,0.547722,1.111109,0.544331,59.62963,52.444443,75.22222,51.22222,-21.555555,46.77778,-25.222221,75.22222,0.318996,-2.040554,negative
1,113,130,9,0.0,0.0,0.277778,0.250924,0.333333,0.365148,0.888889,0.0,2.555556,0.111111,-2.666667,5.0,-2.333333,2.555556,1.0,-2.123254,negative
2,202,41,9,0.0,0.0,0.944448,0.772202,1.111112,1.025597,123.03704,111.888885,139.77779,117.44444,-33.444443,50.22222,-16.777779,139.77779,0.199347,-2.299918,negative
3,32,173,9,0.0,0.0,1.722222,1.781593,9.0,6.749488,43.592594,39.555557,52.88889,38.333336,-12.111111,27.88889,-15.777778,52.88889,0.266914,-1.998858,negative
4,61,197,9,0.0,0.0,1.444444,1.515353,2.611111,1.925463,49.592594,44.22222,61.555557,43.0,-16.11111,35.88889,-19.777779,61.555557,0.302925,-2.022274,negative


In [6]:
df = df.drop(columns=['Short-line-density-5', 'Short-line-density-2', 'Region-pixel-count'])
df = df.drop(columns=['Region-centroid-col', 'Region-centroid-row'])

In [7]:
def removeOutlier(df, col_name, threshold, upper=True):    
    if(upper==True):
        df = df.drop(df[(df[col_name] > threshold)].index)
    else:
        df = df.drop(df[(df[col_name] < threshold)].index)
    return df
    
df = removeOutlier(df, 'Hedge-sd', 10)
df = removeOutlier(df, 'Vedge-sd', 8)
df = removeOutlier(df, 'Hedge-mean', 7)
df = removeOutlier(df, 'Vedge-mean', 8)
df = removeOutlier(df, 'Hue-mean', -.25)
df = removeOutlier(df, 'Hue-mean', -2.75, False)

In [8]:
df = df.reset_index()
df = df.drop(columns='index')

In [9]:
X = df
y = X.pop('class')

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [11]:
X = pd.DataFrame(X, columns=['Vedge-mean',
       'Vedge-sd', 'Hedge-mean', 'Hedge-sd', 'Intensity-mean',
       'Rawred-mean', 'Rawblue-mean', 'Rawgreen-mean', 'Exred-mean',
       'Exblue-mean', 'Exgreen-mean', 'Value-mean', 'Saturation-mean',
       'Hue-mean'])

In [12]:
p_test = pd.DataFrame(columns=['name', 'p-value'])
p_values = []
p_names = []
for column in X:
   stat, p = stats.shapiro(X[column])
   p_names.append(column)
   p_values.append(p)
   
for i in range(len(p_values)):
    p_test.loc[i] = [p_names[i]] + [p_values[i]]
p_test

Unnamed: 0,name,p-value
0,Vedge-mean,8.299744e-39
1,Vedge-sd,9.430739e-43
2,Hedge-mean,1.105662e-37
3,Hedge-sd,5.324934e-44
4,Intensity-mean,1.608171e-37
5,Rawred-mean,3.707636e-37
6,Rawblue-mean,1.944802e-36
7,Rawgreen-mean,2.2430750000000002e-39
8,Exred-mean,6.128499e-24
9,Exblue-mean,1.765567e-22


In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=seed)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [19]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


# Parameter decider
from sklearn.model_selection import GridSearchCV

svm_par = {'C': [0.01, 0.1, 0.5, 1, 10, 100], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
mlp_par = {'activation': ['logistic', 'tanh', 'relu'], 'solver': ['lbfgs', 'sgd', 'adam'], 'hidden_layer_sizes': [(32,), (16,), (16,16), (8,8)]}

In [20]:
classifier = GridSearchCV(MLPClassifier(random_state=seed), mlp_par).fit(X, y).best_estimator_
clf_svm = GridSearchCV(SVC(random_state=seed), svm_par).fit(X, y).best_estimator_

In [21]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def GetMetrics(estimator, X, y):
    y_pred = estimator.predict(X)
    acc = accuracy_score(y, y_pred)
    pre, rec, fs, sup = precision_recall_fscore_support(y, y_pred)
    return acc, pre, rec, fs

In [22]:
# 5 cv, 6+1 resampling, 5 classifier
scores = 0
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    c = clf_svm.fit(X_train, y_train)
    scores = GetMetrics(c, X_test, y_test)

In [23]:
print(scores)

(0.9970760233918129, array([0.99640288, 1.        ]), array([1.        , 0.98461538]), array([0.9981982 , 0.99224806]))


In [25]:
print("Akurasi\t\t\t: %.3f" % scores[0])
print("Presisi Majority\t: %.3f" % scores[1][0])
print("Presisi Minority\t: %.3f" % scores[1][1])
print("Recall Majority\t\t: %.3f" % scores[2][0])
print("Recall Minority\t\t: %.3f" % scores[2][1])
print("F-Score Majority\t: %.3f" % scores[3][0])
print("F-Score Minority\t: %.3f" % scores[3][1])

Akurasi			: 0.997
Presisi Majority	: 0.996
Presisi Minority	: 1.000
Recall Majority		: 1.000
Recall Minority		: 0.985
F-Score Majority	: 0.998
F-Score Minority	: 0.992
