In [None]:
# Part 1

# 1.1
# Accuracy is how close a measured value is to the actual (true) value
# Precision is how close the measured values are to each other

# 1.2
# precision is the fraction of retrieved documents that are relevant to the query
# For example, for a text search on a set of documents, 
# precision is the number of correct results divided by the number of all returned results

# recall is the fraction of the relevant documents that are successfully retrieved
# For example, for a text search on a set of documents, 
# recall is the number of correct results divided by the number of results that should have been returned.


In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import linear_model, metrics
from sklearn.model_selection import KFold, cross_val_predict

In [3]:
df = pd.read_csv('BreastCancer.csv')
df.head()

Unnamed: 0,ID,Diagnosis,Mean radius,Mean texture,Mean perimeter,Mean area,Mean smoothness,Mean compactness,Mean concavity,Mean concave points,...,Worst radius,Worst texture,Worst perimeter,Worst area,Worst smoothness,Worst compactness,Worst concavity,Worst concave points,Worst symmetry,Worst fractal dimension
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [17]:
logistic = linear_model.LogisticRegression()
variables = ['Mean radius','Mean texture','Mean perimeter','Mean area','Mean smoothness','Mean compactness','Mean symmetry']

x = df[variables].values.reshape(-1,len(variables))
y = df['Diagnosis']

folds = KFold(n_splits=10)

accuracies = []
    
print()
print("accuracies:")

for train_indices, test_indices in folds.split(x, y):
    x_train, x_test = x[train_indices], x[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    logreg.fit(x_train, y_train)

    prediction = cross_val_predict(logistic,x_test,y_test, cv=10)

    accuracy = metrics.classification_report(y_test, prediction, target_names=["Benign","Malignant"])

    accuracies.append(accuracy)

    print(accuracy)



accuracies:
             precision    recall  f1-score   support

     Benign       0.62      0.45      0.53        11
  Malignant       0.88      0.93      0.91        46

avg / total       0.83      0.84      0.83        57

             precision    recall  f1-score   support

     Benign       0.92      0.94      0.93        35
  Malignant       0.90      0.86      0.88        22

avg / total       0.91      0.91      0.91        57

             precision    recall  f1-score   support

     Benign       0.87      0.94      0.91        36
  Malignant       0.89      0.76      0.82        21

avg / total       0.88      0.88      0.87        57

             precision    recall  f1-score   support

     Benign       0.93      0.90      0.91        29
  Malignant       0.90      0.93      0.91        28

avg / total       0.91      0.91      0.91        57

             precision    recall  f1-score   support

     Benign       0.93      0.93      0.93        29
  Malignant       0.

In [None]:
# most of our guesses lay around a score of 0.9
# which means we are very consistent with guessing right
# but our support is'nt that great, so to be more secure in our guesses
# we need a bigger data sample. 