In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
import statsmodels.api as sm
from mlxtend.plotting import plot_decision_regions
from sklearn import metrics 
from sklearn import neighbors
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

pio.renderers.default = "notebook_connected"

<h1> Functions </h1>

In [None]:
# Recursive feature elimination (RFE) wrapped around a logistic regression
def RFE_logistic(dataframe, y, n_features):
    """Run RFE with a logistic-regression estimator and print the outcome.

    Parameters
    ----------
    dataframe : feature table passed to ``RFE.fit`` as X.
    y : target values; cast to ``int`` before fitting.
    n_features : number of features RFE should keep.

    Prints the fitted selector's ``support_`` array followed by its
    ``ranking_`` array. Nothing is returned.
    """
    selector = RFE(LogisticRegression(), n_features_to_select=n_features)
    selector.fit(dataframe, y.astype(int))

    for fitted_attribute in (selector.support_, selector.ranking_):
        print(fitted_attribute)

In [None]:
def fit_logistic (dataframe, y, random_state = 0):
    """Fit a logistic regression on a 70/30 split and report test-set quality.

    Parameters
    ----------
    dataframe : feature table (X).
    y : target values; must support ``.astype`` (e.g. a pandas Series).
    random_state : seed forwarded to ``train_test_split``. A fixed default
        means repeated calls evaluate on the same rows, so models fitted on
        different feature subsets are compared on equal footing. Pass
        ``None`` for a fresh random split on every call.

    Prints the test accuracy and the classification report, and draws the
    confusion matrix. Nothing is returned.
    """
    # Cast the whole target once, before splitting, so y_train and y_test share
    # an integer dtype. Casting only y_train leaves y_test as object dtype,
    # which the scikit-learn metrics cannot compare with integer predictions.
    y = y.astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        dataframe, y, test_size = 0.3, random_state = random_state
    )

    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)
    print('Accuracy on test set: {:.2f}'.format (logreg.score(X_test, y_test)))
    plot_confusion_matrix(logreg, X_test, y_test)
    print(classification_report(y_test, y_pred))

In [None]:
def knn_comparison(data, k):
    """Plot the decision regions of a k-nearest-neighbours classifier.

    The classifier is fitted on the ``radius_mean`` and ``texture_mean``
    columns of ``data`` with the ``prediction`` column (cast to int) as the
    label, then its decision regions are drawn and shown.

    Parameters
    ----------
    data : table containing ``radius_mean``, ``texture_mean`` and ``prediction``.
    k : number of neighbours used by the classifier.
    """
    feature_names = ['radius_mean', 'texture_mean']
    features = data[feature_names].values
    labels = data['prediction'].astype(int).values

    model = neighbors.KNeighborsClassifier(n_neighbors=k)
    model.fit(features, labels)

    # Decision surface first, then the axis labels and title on top of it
    plot_decision_regions(features, labels, clf=model, legend=2)
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[1])
    plt.title('Knn with K='+ str(k))
    plt.show()

In [None]:
# Show up to 100 columns and 100 rows when a DataFrame is displayed
# (set_option accepts several option/value pairs in one call).
pd.set_option(
    'display.max_columns', 100,
    'display.max_rows', 100,
)

<h1> Loading dataset and EDA </h1>

In [None]:
 # Load the raw CSV into `df` and preview the first rows.
 # NOTE(review): the path is absolute and specific to one machine; anyone else
 # running the notebook has to point it at their own copy of the file.
 df = pd.read_csv("/Users/dominik/Desktop/breast-cancer.csv")  
 df.head()

In [None]:
# Data type of every column
df.dtypes


In [None]:
# Summary statistics for the columns
df.describe()

In [None]:
# Remove the 'id' column. errors='ignore' turns this into a no-op when the
# column is already gone, so re-running the cell does not raise a KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Number of rows per diagnosis label
df['diagnosis'].value_counts()

**We can see that we have enough "malignant" labels, so we don't have to worry about an insufficient number of true labels**

In [None]:
# Encode the target as integers: 'M' -> 1, 'B' -> 0.
# Assigning through .loc leaves the column with object dtype, which
# scikit-learn classifiers do not accept as a label array, so the column is
# mapped and cast to int in one step. Already-encoded values map to themselves,
# which keeps the cell safe to re-run; any other label becomes NaN and makes
# the int cast fail instead of passing through silently.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0, 1: 1, 0: 0}).astype(int)

In [None]:
# Preview the frame after recoding the diagnosis column
df.head()

In [None]:
# Count of missing values in each column
df.isna().sum()

In [None]:
# Every column label except the first one.
# NOTE(review): presumably meant to skip 'diagnosis', which is the first column
# only once 'id' has been removed - confirm. The plotting loops in the cells
# shown here recompute df.columns[1:] instead of reading this name.
columns = df.columns[1:]

In [None]:
# One 60-bin histogram per column after the first, drawn without gaps between bars
for col in df.columns[1:]:
    px.histogram(df[col], nbins = 60).update_layout(bargap=0).show()

In [None]:
# Pairwise correlations shown as a heatmap on a fixed [-1, 1] diverging scale
df_corr = df.corr()

fig = go.Figure(
    data=go.Heatmap(
        z=df_corr.to_numpy(),
        x=df_corr.index,
        y=df_corr.columns,
        colorscale=px.colors.diverging.RdBu,
        zmin=-1,
        zmax=1,
    )
)
fig.show()


In [None]:
# Mean of every column within each diagnosis group
df.groupby('diagnosis').mean()

**This gives us a really clear view on which variables can matter while building a classification algorithm. For example, area_mean or concavity_mean seem to be really important, whereas fractal_dimension_mean or symmetry_mean have similar values for both labels**

In [None]:
# All columns after the first (not read by the loop below)
plot_data = df.iloc[:, 1:]

# For each of those columns, plot its average per diagnosis group,
# with one colour per group.
for col in df.columns[1:]:
    px.histogram(df, x = "diagnosis", y = col, color = "diagnosis", histfunc = "avg").show()

**I've decided to create a classification model based on parameters describing mean values**

In [None]:
# Feature table: every column whose name contains 'mean'. Selecting the columns
# directly (rather than building a list of Series and transposing) keeps each
# column's dtype; .copy() makes df_model independent of df.
mean_columns = [col for col in df.columns if 'mean' in col]
df_model = df[mean_columns].copy()

# Integer target: statsmodels and scikit-learn need numeric labels, not an
# object-dtype column.
y = df['diagnosis'].astype(int)
df_model.head()

<h1> Configuring classification algorithms </h1>

<h2> I will try to fit 3 logistic regression models: </h2>
<h3>
1. Model with all variables from df_model dataframe <br>
2. 2 RFE models, which build multiple models and check whether adding an additional variable makes the model better (will select 8 and 5 features)
</h3>


In [None]:
# Statsmodels logit on all mean features; add_constant supplies the intercept column
exog_all = sm.add_constant(df_model)
smlog = sm.Logit(y, exog_all)
results = smlog.fit()
print(results.summary2())

In [None]:
# Pass the model's fitted values through the logistic function and plot
# the distribution of the resulting probabilities.
fitted = results.fittedvalues
probability = 1 / (1 + np.exp(-fitted))
px.histogram(probability)

In [None]:
# Train/test evaluation of the logistic regression on all mean features
fit_logistic(df_model, y)

**Second model (RFE with 8 features)**

In [None]:
# Work on a copy so df_model itself stays untouched
df_model2 = df_model.copy()

In [None]:
# Ask RFE to keep 8 features; prints the support mask and the ranking
RFE_logistic(df_model2, y, 8)

In [None]:
# Drop two features to leave the 8-feature model.
# NOTE(review): the dropped names are hard-coded, presumably read off the RFE
# output above - re-check them if the data or the estimator changes.
# Deriving from df_model (instead of reassigning df_model2 from itself) keeps
# the cell re-runnable: a second run no longer fails on missing columns.
df_model2 = df_model.drop(columns=['area_mean', 'fractal_dimension_mean'])

In [None]:
# Statsmodels logit on the reduced feature set in df_model2 (intercept added)
exog_model2 = sm.add_constant(df_model2)
smlog = sm.Logit(y, exog_model2)
results = smlog.fit()
print(results.summary2())

In [None]:
# Logistic transform of the latest fit's fitted values, shown as a histogram
fitted = results.fittedvalues
probability = 1 / (1 + np.exp(-fitted))
px.histogram(probability)

In [None]:
# Train/test evaluation of the logistic regression on df_model2
fit_logistic(df_model2, y)

**Third model: RFE Logistic with 5 features**

In [None]:
# Work on a copy so df_model itself stays untouched
df_model3 = df_model.copy()

In [None]:
# Ask RFE to keep 5 features; prints the support mask and the ranking
RFE_logistic(df_model3, y, 5)

In [None]:
# Drop five features to leave the 5-feature model.
# NOTE(review): the dropped names are hard-coded, presumably read off the RFE
# output above - re-check them if the data or the estimator changes.
# Deriving from df_model (instead of reassigning df_model3 from itself) keeps
# the cell re-runnable: a second run no longer fails on missing columns.
df_model3 = df_model.drop(
    columns=['texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'fractal_dimension_mean']
)

In [None]:
# Statsmodels logit on the reduced feature set in df_model3 (intercept added)
exog_model3 = sm.add_constant(df_model3)
smlog_3 = sm.Logit(y, exog_model3)
results = smlog_3.fit()
print(results.summary2())

In [None]:
# Logistic transform of the latest fit's fitted values, shown as a histogram
fitted = results.fittedvalues
probability = 1 / (1 + np.exp(-fitted))
px.histogram(probability)

In [None]:
# Train/test evaluation of the logistic regression on df_model3
fit_logistic(df_model3, y)

<h2> Using SVM </h2>

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible. The target is cast to int because scikit-learn treats an
# object-dtype label array as an unknown target type.
X_train, X_test, y_train, y_test = train_test_split(
    df_model, y.astype(int), test_size = 0.3, random_state = 0
)

# Learn the scaling statistics (mean / std) from the training rows only and
# apply the same transform to the test rows. Standardising each split on its
# own statistics would put train and test on different scales and let test-set
# information shape the features the model is evaluated on.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Default SVC as a baseline
svm = SVC()
svm.fit(X_train_scaled, y_train)
y_pred = svm.predict(X_test_scaled)
plot_confusion_matrix(svm, X_test_scaled, y_test)
print("Accuracy on test set: {}".format(svm.score(X_test_scaled, y_test)))

In [None]:
# Classification report for the predictions currently held in y_pred
print(classification_report(y_test, y_pred))

**As we can see, SVM performs better. It is often the preferred choice for small datasets. Let's find out whether we can optimize its parameters with cross-validation (GridSearchCV)**

In [None]:
# Candidate gamma values: the 'scale' heuristic plus 0.01, 0.02, ..., 0.99.
# gamma = 0 is deliberately excluded: it makes the RBF kernel exp(-gamma * d^2)
# equal to 1 for every pair of points, so those fits carry no information.
# Building the grid from integers avoids float drift such as
# 0.060000000000000005 showing up in best_params_.
initial_gamma = ['scale']
additional_gammas = np.round(np.arange(1, 100) / 100, 2).tolist()
gammas = initial_gamma + additional_gammas

param_grid = [
    {'C': np.arange(1, 100, 1),
    'gamma': gammas,
    'kernel': ['rbf']}
]

# 5-fold cross-validated search over 99 C values x 100 gamma values.
# n_jobs=-1 spreads the fits over all available cores; verbose=1 prints
# progress summaries instead of one line per individual fit.
optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv = 5,
    scoring = 'accuracy',
    n_jobs = -1,
    verbose = 1
)

optimal_params.fit(X_train_scaled, y_train)
print(optimal_params.best_params_)

**It seems like C = 81, gamma = 0.02 are the optimal values of parameters, let's fit a final SVM model**

In [None]:
# Final SVM with fixed C and gamma, fitted on the scaled training data;
# fit() returns the estimator, so construction and fitting are chained.
reg_svm = SVC(C = 81, gamma = 0.02).fit(X_train_scaled, y_train)
plot_confusion_matrix(reg_svm, X_test_scaled, y_test)

In [None]:
# Test-set predictions of the tuned SVM and the matching classification report
y_pred = reg_svm.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

**We were able to improve the model's performance with regularization**

**Now let's try other classification algorithms (KNN, Decision Tree Classification, Random Forest)**

In [None]:
# Candidate neighbour counts: 1 through 20 inclusive
n_neighbors = list(range(1, 21))

In [None]:
# Fit one KNN classifier per candidate k and print its test-set accuracy.
# The neighbour count is an integer, so it is printed as-is rather than with
# a float format (which rendered k as "1.00", "2.00", ...).
for neighbor in n_neighbors:
    knn = KNeighborsClassifier(n_neighbors = neighbor)
    knn.fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test_scaled)
    print('For the number of neighbors {} accuracy = {:.2f}'.format(neighbor, knn.score(X_test_scaled, y_test)))

**The smallest number of neighbors giving the best predictions is 6, let's take a closer look at this configuration**

In [None]:
# KNN with 6 neighbours: fit on the scaled training data, predict the test set,
# then show the confusion matrix and the classification report.
knn = KNeighborsClassifier(n_neighbors = 6).fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

plot_confusion_matrix(knn, X_test_scaled, y_test)
print(classification_report(y_test, y_pred))

**Once again we've found a better performance than logistic regression, however, a little bit worse than SVM. Let's try to visualize the Test Set results**

In [None]:
# Scaled test features as a DataFrame carrying the original column names,
# plus the predictions currently held in y_pred.
df_visualize = pd.DataFrame(X_test_scaled.copy(), columns=X_test.columns)
df_visualize['prediction'] = y_pred

# Scatter of two features, coloured by predicted class
sns.relplot(x = 'radius_mean', y = 'texture_mean', hue = 'prediction', data = df_visualize)

**As we can see, a larger mean tumor radius and texture leads to a positive cancer prediction.
Let's find out what the decision surface looks like for these two characteristics for different numbers of neighbors.**

In [None]:
# Draw the KNN decision regions for a range of neighbour counts
for k_value in (1, 5, 6, 20, 30, 40, 80):
    knn_comparison(df_visualize, k_value)