In [1]:
import pandas
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
# load data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
data_values = dataframe.values # 768 samples, 9 features

In [3]:
print data_values
print data_values.shape

[[   6.     148.      72.    ...,    0.627   50.       1.   ]
 [   1.      85.      66.    ...,    0.351   31.       0.   ]
 [   8.     183.      64.    ...,    0.672   32.       1.   ]
 ..., 
 [   5.     121.      72.    ...,    0.245   30.       0.   ]
 [   1.     126.      60.    ...,    0.349   47.       1.   ]
 [   1.      93.      70.    ...,    0.315   23.       0.   ]]
(768, 9)


In [4]:
feature_matrix = data_values[:,0:8]
labels = data_values[:,8]

print len(labels)

768


In [5]:
train_features = feature_matrix[:450]
train_labels = labels[:450]

test_features = feature_matrix[450:]
test_labels = labels[450:]

In [6]:
model = MultinomialNB()

model.fit(train_features,train_labels)

  if np.rank(self.data) != 1 or np.rank(self.indices) != 1 or np.rank(self.indptr) != 1:
  if np.rank(self.data) != 1 or np.rank(self.row) != 1 or np.rank(self.col) != 1:


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
result = model.predict(test_features)

cfm = metrics.confusion_matrix(test_labels,result)

#True Positive
tp = cfm[0][0]
#FalsePositive
fp = cfm[0][1]
#FalseNegative
fn = cfm[1][0]
#TrueNegative
tn = cfm[1][1]

print('Ergebnisse von Model Naive Bayes')
print(cfm)

klassifikationsfehler = float(fp + fn)/(tp+fp+tn+fn)
print('Klassifikationsfehler: ', klassifikationsfehler)
guete = float(tp + tn)/(tp+fp+tn+fn)
print('Guete: ', guete)

Ergebnisse von Model Naive Bayes
[[142  80]
 [ 49  47]]
('Klassifikationsfehler: ', 0.4056603773584906)
('Guete: ', 0.5943396226415094)


In [8]:
print(metrics.classification_report(test_labels, result, target_names=['Gesund','Diabetes']))

             precision    recall  f1-score   support

     Gesund       0.74      0.64      0.69       222
   Diabetes       0.37      0.49      0.42        96

avg / total       0.63      0.59      0.61       318



In [9]:
# feature selection with SelectKBest
test = SelectKBest(score_func=chi2, k=5)
fit = test.fit(feature_matrix, labels)
# summarize scores
numpy.set_printoptions(precision=3)
print(fit.scores_)
selected_features = fit.transform(feature_matrix)
# summarize selected features
print(selected_features[0:5,:])

# show selected features
for i, selected in enumerate(fit.get_support()):
    if selected:
        print names[i]

[  111.52   1411.887    17.605    53.108  2175.565   127.669     5.393
   181.304]
[[   6.   148.     0.    33.6   50. ]
 [   1.    85.     0.    26.6   31. ]
 [   8.   183.     0.    23.3   32. ]
 [   1.    89.    94.    28.1   21. ]
 [   0.   137.   168.    43.1   33. ]]
preg
plas
test
mass
age


In [10]:
selected_train_features = selected_features[:450]
selected_test_features = selected_features[450:]

model2 = MultinomialNB()

model2.fit(selected_train_features, train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
result2 = model2.predict(selected_test_features)

cfm = metrics.confusion_matrix(test_labels,result2)

#True Positive
tp = cfm[0][0]
#FalsePositive
fp = cfm[0][1]
#FalseNegative
fn = cfm[1][0]
#TrueNegative
tn = cfm[1][1]

print('Ergebnisse von Model Naive Bayes')
print(cfm)

klassifikationsfehler = float(fp + fn)/(tp+fp+tn+fn)
print('Klassifikationsfehler: ', klassifikationsfehler)
guete = float(tp + tn)/(tp+fp+tn+fn)
print('Guete: ', guete)

Ergebnisse von Model Naive Bayes
[[138  84]
 [ 53  43]]
('Klassifikationsfehler: ', 0.4308176100628931)
('Guete: ', 0.5691823899371069)


In [12]:
print(metrics.classification_report(test_labels, result2, target_names=['Gesund','Diabetes']))

             precision    recall  f1-score   support

     Gesund       0.72      0.62      0.67       222
   Diabetes       0.34      0.45      0.39        96

avg / total       0.61      0.57      0.58       318



In [13]:
# feature selection RFE
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(feature_matrix, labels)

selected_features = fit.transform(feature_matrix)
# summarize selected features
print(selected_features[0:5,:])

# show selected features
for i, selected in enumerate(fit.support_):
    if selected:
        print names[i]

[[  6.     33.6     0.627]
 [  1.     26.6     0.351]
 [  8.     23.3     0.672]
 [  1.     28.1     0.167]
 [  0.     43.1     2.288]]
preg
mass
pedi


In [14]:
selected_train_features = selected_features[:450]
selected_test_features = selected_features[450:]

model3 = MultinomialNB()

model3.fit(selected_train_features, train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
result3 = model3.predict(selected_test_features)

cfm = metrics.confusion_matrix(test_labels,result3)

#True Positive
tp = cfm[0][0]
#FalsePositive
fp = cfm[0][1]
#FalseNegative
fn = cfm[1][0]
#TrueNegative
tn = cfm[1][1]

print('Ergebnisse von Model Naive Bayes')
print(cfm)

klassifikationsfehler = float(fp + fn)/(tp+fp+tn+fn)
print('Klassifikationsfehler: ', klassifikationsfehler)
guete = float(tp + tn)/(tp+fp+tn+fn)
print('Guete: ', guete)

Ergebnisse von Model Naive Bayes
[[174  48]
 [ 59  37]]
('Klassifikationsfehler: ', 0.33647798742138363)
('Guete: ', 0.6635220125786163)


In [16]:
print(metrics.classification_report(test_labels, result3, target_names=['Gesund','Diabetes']))

             precision    recall  f1-score   support

     Gesund       0.75      0.78      0.76       222
   Diabetes       0.44      0.39      0.41        96

avg / total       0.65      0.66      0.66       318



In [17]:
# feature selection with Trees
model = ExtraTreesClassifier()
model.fit(feature_matrix, labels)

print names[:-1]
print model.feature_importances_

importances = model.feature_importances_
indices = numpy.argsort(importances)[::-1]

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(feature_matrix.shape[1]), importances[indices],
       color="r", align="center")
plt.xticks(range(feature_matrix.shape[1]), indices)
plt.xlim([-1, feature_matrix.shape[1]])
plt.show()

#plas, age, mass
selected_indices = [1, 5, 7]
selected_features = []
for sample in range(len(feature_matrix)):
    sample_features = []
    for i in selected_indices:
        sample_features.append(feature_matrix[sample, i])
    selected_features.append(sample_features)

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']
[ 0.105  0.238  0.099  0.089  0.061  0.141  0.123  0.143]


In [18]:
selected_train_features = selected_features[:450]
selected_test_features = selected_features[450:]

model4 = MultinomialNB()

model4.fit(selected_train_features, train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
result4 = model4.predict(selected_test_features)

cfm = metrics.confusion_matrix(test_labels,result4)

#True Positive
tp = cfm[0][0]
#FalsePositive
fp = cfm[0][1]
#FalseNegative
fn = cfm[1][0]
#TrueNegative
tn = cfm[1][1]

print('Ergebnisse von Model Naive Bayes')
print(cfm)

klassifikationsfehler = float(fp + fn)/(tp+fp+tn+fn)
print('Klassifikationsfehler: ', klassifikationsfehler)
guete = float(tp + tn)/(tp+fp+tn+fn)
print('Guete: ', guete)

Ergebnisse von Model Naive Bayes
[[186  36]
 [ 62  34]]
('Klassifikationsfehler: ', 0.3081761006289308)
('Guete: ', 0.6918238993710691)


In [20]:
print(metrics.classification_report(test_labels, result3, target_names=['Gesund','Diabetes']))

             precision    recall  f1-score   support

     Gesund       0.75      0.78      0.76       222
   Diabetes       0.44      0.39      0.41        96

avg / total       0.65      0.66      0.66       318

