In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import pandas as pd
from sklearn import datasets

# Load the scikit-learn copy of the breast-cancer data as (features, target)
# arrays and wrap both in DataFrames with generic feature names feat_1..feat_30.
x1, y1 = datasets.load_breast_cancer(return_X_y=True)
x1 = pd.DataFrame(x1, columns=[f'feat_{i}' for i in range(1,31)])
y1 = pd.DataFrame(y1, columns=['Diagnosis'])

# Column names for the local CSV copy; they are passed to read_csv via `names`.
columns = [
    "ID Number",
    "Diagnosis",
    # "Mean ..." columns
    "Mean Radius",
    "Mean texture",
    "Mean perimeter",
    "Mean area",
    "Mean smoothness (local variation in radius lengths)",
    "Mean compactness (perimeter^2 / area - 1.0)",
    "Mean concavity (severity of concave portions of the contour)",
    "Mean concave points (number of concave portions of the contour)",
    "Mean symmetry",
    "Mean fractal dimension ('coastline approximation' - 1)",
    # "SE ..." columns
    "SE radius",
    "SE texture",
    "SE perimeter",
    "SE area",
    "SE smoothness (local variation in radius lengths)",
    "SE compactness (perimeter^2 / area - 1.0)",
    "SE concavity (severity of concave portions of the contour)",
    "SE concave points (number of concave portions of the contour)",
    "SE symmetry",
    "SE fractal dimension ('coastline approximation' - 1)",
    # "Worse ..." columns
    "Worse radius",
    "Worse texture",
    "Worse perimeter",
    "Worse area",
    "Worse smoothness (local variation in radius lengths)",
    "Worse compactness (perimeter^2 / area - 1.0)",
    "Worse concavity (severity of concave portions of the contour)",
    "Worse concave points (number of concave portions of the contour)",
    "Worse symmetry",
    "Worse fractal dimension ('coastline approximation' - 1)",
]

# Read the CSV with the explicit column names and discard the ID column.
df = pd.read_csv(r"data - Copy.csv", index_col=False, names=columns)
df = df.drop(columns="ID Number")

# Split the rows by diagnosis label ("M" / "B" -- presumably malignant / benign,
# TODO confirm against the data source) and keep only the feature columns.
classby = df.groupby(df["Diagnosis"])
m = classby.get_group("M").drop(columns="Diagnosis")
b = classby.get_group("B").drop(columns="Diagnosis")

In [2]:
# Fisher linear discriminant: find the single direction W that best separates
# the two classes held in `m` and `b`, then project the feature matrix onto it.

# Convert the per-class frames to NumPy arrays. np.asarray (unlike
# DataFrame.to_numpy) also accepts an ndarray, so re-running this cell after
# m and b have already been converted does not raise.
m = np.asarray(m)
b = np.asarray(b)

# Per-class mean vectors, reshaped from (n_features,) to (n_features, 1)
mean_a = m.mean(axis=0).reshape(-1, 1)
mean_b = b.mean(axis=0).reshape(-1, 1)

# Within-class scatter, taken here as the sum of the two class covariances
Sw1 = np.cov(m.T)
Sw2 = np.cov(b.T)
Sw = Sw1 + Sw2
inv_Sw = np.linalg.inv(Sw)

# Between-class scatter: explicit outer product of the mean difference
# (the same values the broadcasted `*` of a column and a row vector produces)
mean_diff = mean_a - mean_b
Sb = np.outer(mean_diff, mean_diff)

# Eigen-decomposition of inv(Sw) @ Sb. The product is not symmetric in general,
# so np.linalg.eig may return a complex dtype; only the real parts are kept.
# Sb is an outer product of one vector (rank 1), so at most one eigenvalue is
# non-zero and its eigenvector is the discriminant direction.
eig_vals, eig_vecs = np.linalg.eig(inv_Sw.dot(Sb))
eig_vals = eig_vals.real
eig_vecs = eig_vecs.real

# (|eigenvalue|, eigenvector) pairs, sorted by decreasing magnitude
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True)

# Feature matrix and labels as arrays
x = x1.to_numpy()
y = y1.to_numpy()

# Projection vector: the leading eigenvector as an (n_features, 1) column.
# reshape(-1, 1) avoids hard-coding the feature count.
W = eig_pairs[0][1].reshape(-1, 1)
print(x.shape)

# Project every sample onto the discriminant direction
x_flda = x.dot(W)

# Projected value side by side with the "Diagnosis" label. `axis` must be passed
# by keyword: positional use was deprecated and is rejected by pandas 2.x.
flda = pd.concat([pd.DataFrame(x_flda, index=y1.index), y1], axis=1)
print(flda)
# Optional export:
#flda.to_csv("flda.csv")

(569, 30)
            0  Diagnosis
0   -0.159679          0
1   -0.152098          0
2   -0.166145          0
3   -0.164152          0
4   -0.151759          0
..        ...        ...
564 -0.172394          0
565 -0.159236          0
566 -0.141741          0
567 -0.183787          0
568 -0.098718          1

[569 rows x 2 columns]


In [3]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier evaluated on the 1-D projected data.
# KFold is given as many splits as there are samples, so every fold tests
# exactly one sample and each entry of `score` is that sample's accuracy.
# NOTE(review): W appears to have been fitted on the full dataset before this
# cross-validation, which would make the estimate optimistic -- confirm.
classifier = KNeighborsClassifier(n_neighbors=3)
score = cross_val_score(
    classifier,
    X=x_flda,
    y=flda["Diagnosis"],
    cv=KFold(x_flda.shape[0]),
)

# Indices of the folds scored 0 (misclassified), and tallies of both outcomes
er = [idx for idx, fold_score in enumerate(score) if fold_score == 0]
err = len(er)
count = sum(1 for fold_score in score if fold_score == 1)
#print(er)

# Fraction of folds classified correctly
print((count)/(err+count))

0.968365553602812
