## TASK1

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as pt
from sklearn.model_selection import train_test_split
from models.five_fold_validation import k_fold_validation

In [2]:
wine = pd.read_csv("data/winequality/winequality-red.csv", sep=";")

In [3]:
wine['classified']=[1 if x>=6 else 0 for x in wine["quality"] ]
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,classified
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,0
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,0


In [4]:
cancer = pd.read_csv("data/breastcancer/breast-cancer-wisconsin.data", delimiter=",")

# remove entries where cancer.bare_nuclie == ? and convert the column to int
cancer = cancer[cancer.bare_nuclei != "?"]
cancer.bare_nuclei = pd.to_numeric(cancer.bare_nuclei)

In [5]:
cancer['classified'] = [1 if x == 4 else 0 for x in cancer["class"]]

In [6]:
wine_X = np.array(wine[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']])
wine_y = np.array(wine[['classified']])

In [7]:
cancer_X = np.array(cancer[['thickness', 'cell_size', 'cell_shape', 'adhesion', 'e_cell_size',
       'bare_nuclei', 'chromatin', 'nucleoli', 'mitosis']])
cancer_y = np.array(cancer[["classified"]])

## TASK 2

#### Logistic Regression

In [8]:
# built in LR from sklearn

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
wine_model=LogisticRegression(solver="lbfgs", max_iter=1000)
k_fold_validation(wine_X, wine_y.reshape(len(wine_y),), wine_model,5)

score: 0.690625
time: 0.23902416229248047


0.690625

In [11]:
cancer_model=LogisticRegression(solver="lbfgs", max_iter=1000)
k_fold_validation(cancer_X, cancer_y.reshape(len(cancer_y),), cancer_model,5)

score: 0.9343065693430657
time: 0.02443385124206543


0.9343065693430657

In [12]:
# custom LR

In [13]:
from models.logistic_regression import LogisticRegression as LR

In [14]:
learnrate, max_iter = 0.0001, 1000
wine_custom_model = LR( learnrate, max_iter)
k_fold_validation(wine_X, wine_y, wine_custom_model,5)

score: [0.321875]
time: 19.620015144348145


array([0.321875])

In [15]:
cancer_custom_model = LR(learnrate, max_iter)
k_fold_validation(cancer_X, cancer_y, cancer_custom_model, 5)

score: [0.79562044]
time: 9.725764036178589


array([0.79562044])

#### Linear Discriminant Analysis

In [16]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [17]:
wine_model = LinearDiscriminantAnalysis()
k_fold_validation(wine_X, wine_y.reshape(len(wine_y),), wine_model,5)

score: 0.69375
time: 0.03426790237426758


0.69375

In [18]:
cancer_model = LinearDiscriminantAnalysis()
k_fold_validation(cancer_X, cancer_y.reshape(len(cancer_y),), cancer_model,5)

score: 0.8905109489051095
time: 0.004364728927612305


0.8905109489051095

In [19]:
from models.linear_discriminant_analysis import LearnDiscriminantAnalysis as LDA

In [20]:
wine_custom_lda = LDA()
k_fold_validation(wine_X, wine_y, wine_custom_lda,5)

score: [0.69375]
time: 0.04214000701904297


array([0.69375])

In [21]:
cancer_custom_model = LDA()
k_fold_validation(cancer_X, cancer_y, cancer_custom_model, 5)

score: [0.89051095]
time: 0.024304866790771484


array([0.89051095])

## TASK 3

In [24]:
#do runtime measurements
lr = [0.1, 0.01, 0.001, 0.0001, 0.00001]
n_iter = 300

In [25]:
for learn in lr:
    print("accuracy of learning rate " + str(learn) + ": ")
    wine_custom_model = LR(learn, n_iter)
    k_fold_validation(wine_X, wine_y, wine_custom_model,5)
    print("")


accuracy of learning rate 0.1: 
score: [0.509375]
time: 6.410499095916748

accuracy of learning rate 0.01: 
score: [0.50625]
time: 6.231107711791992

accuracy of learning rate 0.001: 
score: [0.315625]
time: 6.278414011001587

accuracy of learning rate 0.0001: 
score: [0.39375]
time: 6.084130048751831

accuracy of learning rate 1e-05: 
score: [0.50625]
time: 6.041225910186768



In [26]:
for learn in lr:
    print("accuracy of learning rate " + str(learn) + ": ")
    cancer_custom_model = LR(learnrate, max_iter)
    k_fold_validation(cancer_X, cancer_y, cancer_custom_model, 5)
    print("")

score: [0.79562044]
time: 9.389914751052856
accuracy of learning rate 0.1: [0.79562044]
score: [0.79562044]
time: 10.347959041595459
accuracy of learning rate 0.01: [0.79562044]
score: [0.79562044]
time: 10.600045204162598
accuracy of learning rate 0.001: [0.79562044]
score: [0.79562044]
time: 11.596494913101196
accuracy of learning rate 0.0001: [0.79562044]
score: [0.79562044]
time: 9.325804948806763
accuracy of learning rate 1e-05: [0.79562044]
