In [36]:
import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix,\
    recall_score,precision_recall_curve,auc,roc_curve,\
    roc_auc_score,classification_report,accuracy_score,\
    precision_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

try:
    # scikit-learn >= 0.18 exposes the splitter here.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only ship the (since removed) cross_validation module.
    from sklearn.cross_validation import train_test_split

In [2]:
# Read the raw table and keep just the predictor column and the label column.
data = pd.read_csv('final.csv')
used = data[['mean_pras', 'y']]
# Displayed result: non-null `mean_pras` count for each label in `y`.
used.groupby(by='y').count()

Unnamed: 0_level_0,mean_pras
y,Unnamed: 1_level_1
hati hati,624
lulus,907
tidak lulus,27


In [3]:
# Discard every row that has a missing value in any of the kept columns.
used = used.dropna(axis=0, how='any')

In [4]:
# Re-check the per-label row counts now that incomplete rows are gone.
used.groupby(by='y').count()

Unnamed: 0_level_0,mean_pras
y,Unnamed: 1_level_1
hati hati,624
lulus,907
tidak lulus,27


# Dengan Normalisasi

In [7]:
# Standardise `mean_pras` to z-scores using the population standard deviation (ddof=0).
# The target column name equals the source name, so the raw values are replaced in `used`.
col = 'mean_pras'
col_zscore = col
used[col_zscore] = used[col].sub(used[col].mean()).div(used[col].std(ddof=0))

# Tanpa Over Sampling

In [75]:
# Hold out 30% of the rows for testing; the training rows are used as they are (no resampling).
features = used['mean_pras']
target = used['y']
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features, target, test_size=0.3, random_state=10)

# Dengan Over Sampling

In [68]:
# Alternative to the plain split: same 70/30 hold-out, but the TRAINING portion is
# oversampled with SMOTE. The test portion is left untouched.
from imblearn.over_sampling import SMOTE
target = used['y']
features = used['mean_pras']
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=10)

sm = SMOTE(random_state=20)
# Newer imbalanced-learn releases expose `fit_resample`; older ones only have
# `fit_sample`. Use whichever this installation provides.
resample = sm.fit_resample if hasattr(sm, 'fit_resample') else sm.fit_sample
features_resampled, target_train = resample(features_train.values.reshape(-1, 1), target_train)

# The resampler was fed a 2-D NumPy array, so the feature result is not a pandas
# object. The model cells all call `features_train.values.reshape(-1, 1)`, which
# fails on an ndarray, so wrap the single column back into a 1-D Series.
features_train = pd.Series(np.asarray(features_resampled).ravel(), name=features.name)

# Random Forest

In [78]:
# Random forest with 600 trees, evaluated on the hold-out split.
# The feature is a 1-D Series, so it is reshaped into a single-column 2-D array
# before being handed to the estimator.
X_train = features_train.values.reshape(-1, 1)
X_test = features_test.values.reshape(-1, 1)

clf = RandomForestClassifier(
    n_estimators=600,
    min_samples_split=3,
    random_state=100,
)
clf.fit(X_train, target_train)
y_pred = clf.predict(X_test)

# Per-class arrays: one entry per label for each metric.
precision, recall, fscore, support = score(target_test, y_pred)
accuracy = accuracy_score(target_test, y_pred)

print("accuracy : {}".format(accuracy))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

# Last expression of the cell: the confusion matrix is shown as the cell output.
confusion_matrix(target_test, y_pred)

accuracy : 0.5854700854700855
precision: [0.42857143 0.58785249 0.        ]
recall: [0.01630435 0.98545455 0.        ]
fscore: [0.03141361 0.73641304 0.        ]
support: [184 275   9]


  'precision', 'predicted', average, warn_for)


array([[  3, 181,   0],
       [  4, 271,   0],
       [  0,   9,   0]], dtype=int64)

# Extra Trees

In [79]:
# Extra-trees ensemble with 600 trees, evaluated on the hold-out split.
# The 1-D feature values are reshaped into a single-column 2-D array for the estimator.
X_train = features_train.values.reshape(-1, 1)
X_test = features_test.values.reshape(-1, 1)

clf = ExtraTreesClassifier(
    n_estimators=600,
    min_samples_split=2,
    random_state=100,
)
clf.fit(X_train, target_train)
y_pred = clf.predict(X_test)

# Per-class arrays: one entry per label for each metric.
precision, recall, fscore, support = score(target_test, y_pred)
accuracy = accuracy_score(target_test, y_pred, normalize=True)

print("accuracy : {}".format(accuracy))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

# Last expression of the cell: the confusion matrix is shown as the cell output.
confusion_matrix(target_test, y_pred)

accuracy : 0.5854700854700855
precision: [0.42857143 0.58785249 0.        ]
recall: [0.01630435 0.98545455 0.        ]
fscore: [0.03141361 0.73641304 0.        ]
support: [184 275   9]


  'precision', 'predicted', average, warn_for)


array([[  3, 181,   0],
       [  4, 271,   0],
       [  0,   9,   0]], dtype=int64)

# Ada Boost

In [80]:
# AdaBoost with its default base learner and 600 boosting rounds.
# The 1-D feature values are reshaped into a single-column 2-D array for the estimator.
X_train = features_train.values.reshape(-1, 1)
X_test = features_test.values.reshape(-1, 1)

clf = AdaBoostClassifier(n_estimators=600, random_state=100)
clf.fit(X_train, target_train)
y_pred = clf.predict(X_test)

# Per-class arrays: one entry per label for each metric.
precision, recall, fscore, support = score(target_test, y_pred)
accuracy = accuracy_score(target_test, y_pred, normalize=True)

print("accuracy : {}".format(accuracy))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

# Last expression of the cell: the confusion matrix is shown as the cell output.
confusion_matrix(target_test, y_pred)

accuracy : 0.5876068376068376
precision: [0.5        0.58874459 0.        ]
recall: [0.01630435 0.98909091 0.        ]
fscore: [0.03157895 0.73812754 0.        ]
support: [184 275   9]


  'precision', 'predicted', average, warn_for)


array([[  3, 181,   0],
       [  3, 272,   0],
       [  0,   9,   0]], dtype=int64)

In [81]:
# AdaBoost over depth-2 decision trees, 600 rounds, learning rate 1.
# The 1-D feature values are reshaped into a single-column 2-D array for the estimator.
X_train = features_train.values.reshape(-1, 1)
X_test = features_test.values.reshape(-1, 1)

weak_learner = DecisionTreeClassifier(max_depth=2)
clf = AdaBoostClassifier(
    weak_learner,
    n_estimators=600,
    learning_rate=1,
    random_state=100,
)
clf.fit(X_train, target_train)
y_pred = clf.predict(X_test)

# Per-class arrays: one entry per label for each metric.
precision, recall, fscore, support = score(target_test, y_pred)
accuracy = accuracy_score(target_test, y_pred, normalize=True)

print("accuracy : {}".format(accuracy))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

# Last expression of the cell: the confusion matrix is shown as the cell output.
confusion_matrix(target_test, y_pred)

accuracy : 0.5854700854700855
precision: [0.42857143 0.58785249 0.        ]
recall: [0.01630435 0.98545455 0.        ]
fscore: [0.03141361 0.73641304 0.        ]
support: [184 275   9]


  'precision', 'predicted', average, warn_for)


array([[  3, 181,   0],
       [  4, 271,   0],
       [  0,   9,   0]], dtype=int64)

In [95]:
# AdaBoost over depth-2 decision trees using the SAMME variant, 600 rounds, learning rate 1.5.
# The 1-D feature values are reshaped into a single-column 2-D array for the estimator.
X_train = features_train.values.reshape(-1, 1)
X_test = features_test.values.reshape(-1, 1)

weak_learner = DecisionTreeClassifier(max_depth=2)
clf = AdaBoostClassifier(
    weak_learner,
    n_estimators=600,
    learning_rate=1.5,
    algorithm="SAMME",
    random_state=100,
)
clf.fit(X_train, target_train)
y_pred = clf.predict(X_test)

# Per-class arrays: one entry per label for each metric.
precision, recall, fscore, support = score(target_test, y_pred)
accuracy = accuracy_score(target_test, y_pred, normalize=True)

print("accuracy : {}".format(accuracy))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

# Last expression of the cell: the confusion matrix is shown as the cell output.
confusion_matrix(target_test, y_pred)

accuracy : 0.561965811965812
precision: [0.35      0.5817757 0.       ]
recall: [0.07608696 0.90545455 0.        ]
fscore: [0.125     0.7083926 0.       ]
support: [184 275   9]


  'precision', 'predicted', average, warn_for)


array([[ 14, 170,   0],
       [ 26, 249,   0],
       [  0,   9,   0]], dtype=int64)

# Gradient Boosting

In [82]:
# Gradient boosting: 600 stages, learning rate 0.5, no explicit depth limit on the trees.
# The 1-D feature values are reshaped into a single-column 2-D array for the estimator.
X_train = features_train.values.reshape(-1, 1)
X_test = features_test.values.reshape(-1, 1)

clf = GradientBoostingClassifier(
    n_estimators=600,
    learning_rate=0.5,
    max_depth=None,
    random_state=100,
)
clf.fit(X_train, target_train)
y_pred = clf.predict(X_test)

# Per-class arrays: one entry per label for each metric.
precision, recall, fscore, support = score(target_test, y_pred)
accuracy = accuracy_score(target_test, y_pred, normalize=True)

print("accuracy : {}".format(accuracy))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

# Last expression of the cell: the confusion matrix is shown as the cell output.
confusion_matrix(target_test, y_pred)

accuracy : 0.5854700854700855
precision: [0.42857143 0.58785249 0.        ]
recall: [0.01630435 0.98545455 0.        ]
fscore: [0.03141361 0.73641304 0.        ]
support: [184 275   9]


  'precision', 'predicted', average, warn_for)


array([[  3, 181,   0],
       [  4, 271,   0],
       [  0,   9,   0]], dtype=int64)

# Ada Boost Naive Bayes

In [84]:
# AdaBoost with Gaussian naive Bayes as the base learner, 600 rounds, learning rate 0.1.
# The 1-D feature values are reshaped into a single-column 2-D array for the estimator.
X_train = features_train.values.reshape(-1, 1)
X_test = features_test.values.reshape(-1, 1)

weak_learner = GaussianNB()
clf = AdaBoostClassifier(
    weak_learner,
    n_estimators=600,
    learning_rate=0.1,
    random_state=100,
)
clf.fit(X_train, target_train)
y_pred = clf.predict(X_test)

# Per-class arrays: one entry per label for each metric.
precision, recall, fscore, support = score(target_test, y_pred)
accuracy = accuracy_score(target_test, y_pred, normalize=True)

print("accuracy : {}".format(accuracy))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

# Last expression of the cell: the confusion matrix is shown as the cell output.
confusion_matrix(target_test, y_pred)

accuracy : 0.5876068376068376
precision: [0.         0.58760684 0.        ]
recall: [0. 1. 0.]
fscore: [0.         0.74024226 0.        ]
support: [184 275   9]


  'precision', 'predicted', average, warn_for)


array([[  0, 184,   0],
       [  0, 275,   0],
       [  0,   9,   0]], dtype=int64)

# Bagging

In [93]:
# Bagging with its default base learner, 600 bootstrap members.
# The 1-D feature values are reshaped into a single-column 2-D array for the estimator.
X_train = features_train.values.reshape(-1, 1)
X_test = features_test.values.reshape(-1, 1)

clf = BaggingClassifier(n_estimators=600, random_state=100)
clf.fit(X_train, target_train)
y_pred = clf.predict(X_test)

# Per-class arrays: one entry per label for each metric.
precision, recall, fscore, support = score(target_test, y_pred)
accuracy = accuracy_score(target_test, y_pred, normalize=True)

print("accuracy : {}".format(accuracy))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

# Last expression of the cell: the confusion matrix is shown as the cell output.
confusion_matrix(target_test, y_pred)

accuracy : 0.5854700854700855
precision: [0.42857143 0.58785249 0.        ]
recall: [0.01630435 0.98545455 0.        ]
fscore: [0.03141361 0.73641304 0.        ]
support: [184 275   9]


  'precision', 'predicted', average, warn_for)


array([[  3, 181,   0],
       [  4, 271,   0],
       [  0,   9,   0]], dtype=int64)

In [94]:
# Bagging over Gaussian naive Bayes models, 600 bootstrap members.
# The 1-D feature values are reshaped into a single-column 2-D array for the estimator.
X_train = features_train.values.reshape(-1, 1)
X_test = features_test.values.reshape(-1, 1)

member_model = GaussianNB()
clf = BaggingClassifier(member_model, n_estimators=600, random_state=100)
clf.fit(X_train, target_train)
y_pred = clf.predict(X_test)

# Per-class arrays: one entry per label for each metric.
precision, recall, fscore, support = score(target_test, y_pred)
accuracy = accuracy_score(target_test, y_pred, normalize=True)

print("accuracy : {}".format(accuracy))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

# Last expression of the cell: the confusion matrix is shown as the cell output.
confusion_matrix(target_test, y_pred)

accuracy : 0.5876068376068376
precision: [0.         0.58760684 0.        ]
recall: [0. 1. 0.]
fscore: [0.         0.74024226 0.        ]
support: [184 275   9]


  'precision', 'predicted', average, warn_for)


array([[  0, 184,   0],
       [  0, 275,   0],
       [  0,   9,   0]], dtype=int64)