## TASK1

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as pt
from sklearn.model_selection import train_test_split
from models.five_fold_validation import k_fold_validation

In [2]:
wine = pd.read_csv("data/winequality/winequality-red.csv", sep=";")

In [3]:
wine['classified']=[1 if x>=6 else 0 for x in wine["quality"] ]
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,classified
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,0
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,0


In [4]:
cancer = pd.read_csv("data/breastcancer/breast-cancer-wisconsin.data", delimiter=",")

# remove entries where cancer.bare_nuclie == ? and convert the column to int
cancer = cancer[cancer.bare_nuclei != "?"]
cancer.bare_nuclei = pd.to_numeric(cancer.bare_nuclei)

In [5]:
cancer['classified'] = [1 if x == 4 else 0 for x in cancer["class"]]

In [6]:
wine_X = np.array(wine[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']])
wine_y = np.array(wine[['classified']])

In [7]:
cancer_X = np.array(cancer[['thickness', 'cell_size', 'cell_shape', 'adhesion', 'e_cell_size',
       'bare_nuclei', 'chromatin', 'nucleoli', 'mitosis']])
cancer_y = np.array(cancer[["classified"]])

## TASK 2

#### Logistic Regression

In [8]:
# built in LR from sklearn

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
wine_model=LogisticRegression(solver="lbfgs", max_iter=1000)
k_fold_validation(wine_X, wine_y.reshape(len(wine_y),), wine_model,5)

0.690625

In [11]:
cancer_model=LogisticRegression(solver="lbfgs", max_iter=1000)
k_fold_validation(cancer_X, cancer_y.reshape(len(cancer_y),), cancer_model,5)

0.9343065693430657

In [12]:
# custom LR

In [13]:
from models.logistic_regression import LogisticRegression as LR

In [14]:
learnrate, max_iter = 0.0001, 1000
wine_custom_model = LR( learnrate, max_iter)
k_fold_validation(wine_X, wine_y, wine_custom_model,5)

array([0.321875])

In [15]:
cancer_custom_model = LR(learnrate, max_iter)
k_fold_validation(cancer_X, cancer_y, cancer_custom_model, 5)

array([0.79562044])

#### Linear Discriminant Analysis

In [85]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [86]:
wine_model = LinearDiscriminantAnalysis()
k_fold_validation(wine_X, wine_y.reshape(len(wine_y),), wine_model,5)

0.69375

In [87]:
cancer_model = LinearDiscriminantAnalysis()
k_fold_validation(cancer_X, cancer_y.reshape(len(cancer_y),), cancer_model,5)

0.8905109489051095

In [88]:
from models.linear_discriminant_analysis import LearnDiscriminantAnalysis as LDA

In [89]:
wine_custom_lda = LDA()
k_fold_validation(wine_X, wine_y, wine_custom_lda,5)

array([0.69375])

In [90]:
cancer_custom_model = LDA()
k_fold_validation(cancer_X, cancer_y, cancer_custom_model, 5)

array([0.89051095])

## TASK 3

In [94]:
#do runtime measurements
lr = [0.1, 0.01, 0.001, 0.0001, 0.00001]
n_iter = 300

In [96]:
for learn in lr:
    wine_custom_model = LR(learn, n_iter)
    print("accuracy of learning rate " + str(learn) + ": " + k_fold_validation(wine_X, wine_y, wine_custom_model,5))


Model Trained with params [  -81.41164614 -1278.87633125   659.17546804  -370.36087741
   -81.03362504  3962.97509646 -3503.1344365   -349.73833589
 -1273.77059922   713.84308198  3101.61930208]
Model Trained with params [ -764.27980944 -1154.84359609   459.99702224  -299.46869531
  -119.39094312  4971.31261907 -1254.483898    -263.05015047
  -881.86295813   551.72624194  3839.83943801]
Model Trained with params [ 1034.37667684 -1157.00227092   524.18427467  -147.43197824
  -117.79998809  4695.89954972 -1242.31039978  -271.48083872
  -972.58673939   693.88560451  3662.2212854 ]
Model Trained with params [  607.72055329 -1019.37893418   521.13944363   538.97083147
   -70.09071699  3214.29087712  -424.08063382  -270.94410527
  -934.26677903   608.9109946   3090.11170456]
Model Trained with params [  956.78431395 -1105.61965501   543.60972859  -482.48434405
  -103.08604612  4740.34880695 -1269.62033882  -274.0943443
 -1026.40818715   587.0352352   3606.10156926]
[0.509375]
Model Trained w

In [98]:
for learn in lr:
    cancer_custom_model = LR(learnrate, max_iter)
    acc = k_fold_validation(cancer_X, cancer_y, cancer_custom_model, 5)
    print("accuracy of learning rate " + str(learn) + ": " + str(acc))

Model Trained with params [-0.49600497  1.14398828  0.22088083  0.26713495 -1.21210938  0.73346922
 -0.46622025  0.37694076 -0.43737134]
Model Trained with params [-0.37420852  0.8615293   0.18302236  0.12155875 -0.81862282  0.52428512
 -0.43296915  0.4206701  -0.23937308]
Model Trained with params [-0.38093246  0.91163657  0.25894011  0.11819197 -0.72017841  0.47040403
 -0.51237423  0.33900947 -0.20705145]
Model Trained with params [-0.26098629  1.01061371  0.12527862  0.19141175 -0.73089829  0.55224301
 -0.71148925  0.3672397  -0.2588642 ]
Model Trained with params [-0.26952521  0.70271451  0.24362436  0.10533021 -0.58740615  0.5068322
 -0.59282275  0.29666805 -0.19794776]
accuracy of learning rate0.1: [0.79562044]
Model Trained with params [-0.49600497  1.14398828  0.22088083  0.26713495 -1.21210938  0.73346922
 -0.46622025  0.37694076 -0.43737134]
Model Trained with params [-0.37420852  0.8615293   0.18302236  0.12155875 -0.81862282  0.52428512
 -0.43296915  0.4206701  -0.23937308]