In [None]:
import pandas as pd
import numpy as np
import plotly.io as pio
pio.renderers.default = "notebook_connected"
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Raise pandas' display limits so frames with up to 100 columns / 100 rows
# are rendered in full instead of being truncated.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 100)



   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"/Users/dominik/Desktop/breast-cancer.csv\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**DATASET DESCRIPTION**\n",
    "\n",
    "radius_mean - the mean of the radius of the breast lobe \\\n",
    "texture_mean - mean of surface texture \\\n",
    "perimeter_mean - outer perimeter of lobes (obwód) \\\n",
    "area_mean - mean area of lobes \\\n",
    "smoothness_mean - mean of smoothness levels \\\n",
    "compactness_mean - mean of compactness \\\n",
    "concavity_mean - mean of concavity \\\n",
    "concave points_mean - mean of concave points (punkty wklęsłe) \\\n",
    "symmetry_mean - mean of symmetry \\\n",
    "fractal_dimension_mean - mean of fractal dimension \n",
    "\n",
    "SE means the standard error of the mean of the breast examination variables \n",
    "\n",
    "worst means the worst examination result\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del df['id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['diagnosis'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the target column: 'M' -> 1, 'B' -> 0\n",
    "df.loc[df['diagnosis'] == 'M', 'diagnosis'] = 1\n",
    "df.loc[df['diagnosis'] == 'B', 'diagnosis'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = px.histogram(df['radius_mean'], nbins = 60)\n",
    "fig.update_layout(bargap=0)\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "df_corr = df.corr()\n",
    "\n",
    "fig = go.Figure()\n",
    "fig.add_trace(\n",
    "    go.Heatmap(\n",
    "    z=np.array(df_corr),\n",
    "    x=df_corr.index ,\n",
    "    y=df_corr.columns,\n",
    "    colorscale=px.colors.diverging.RdBu,\n",
    "    zmin=-1,\n",
    "    zmax=1\n",
    "    )\n",
    ")\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby('diagnosis').mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every column except the first one (presumably 'diagnosis' once 'id' is dropped - TODO confirm)\n",
    "plot_data = df.iloc[:, 1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for col in plot_data.columns:\n",
    "\n",
    "    fig = px.histogram(\n",
    "        df,\n",
    "        x =\"diagnosis\",\n",
    "        y = col,\n",
    "        color = \"diagnosis\",\n",
    "        histfunc = \"avg\"\n",
    "    )\n",
    "\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the columns whose name contains 'mean'. Selecting them directly\n",
    "# (instead of a list of Series + transpose) keeps df_model a DataFrame throughout,\n",
    "# and .copy() makes it independent of df so later edits never touch the raw data.\n",
    "mean_columns = [col for col in df.columns if 'mean' in col]\n",
    "df_model = df[mean_columns].copy()\n",
    "df_model['diagnosis'] = df['diagnosis']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_model.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = df_model['diagnosis']\n",
    "del df_model['diagnosis']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    I will try to fit 3 logistic regression models:\n",
    "1. A model with all variables from the df_model dataframe\n",
    "2. Two RFE (recursive feature elimination) models, which fit the model repeatedly and drop the least important variable each round (8 and 5 features will be selected) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    First model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = y.astype(int)\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(df_model, y, test_size = 0.3, random_state = 0)\n",
    "logreg = LogisticRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "smlog = sm.Logit(y, sm.add_constant(df_model)).fit()\n",
    "smlog.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "probability = 1 / (1 + np.exp(-smlog.fittedvalues))\n",
    "px.histogram(probability)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = y_train.astype(int)\n",
    "logreg = LogisticRegression()\n",
    "logreg.fit(X_train, y_train)\n",
    "y_pred = logreg.predict(X_test)\n",
    "print('Accuracy on test set: {:.2f}'.format (logreg.score(X_test, y_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bind the result to its own name: assigning to `confusion_matrix` would shadow\n",
    "# the imported sklearn function and make every later call fail.\n",
    "cm = confusion_matrix(y_test, y_pred)\n",
    "print(cm)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    Second model (RFE with 8 features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "logreg = LogisticRegression()\n",
    "df_model2 = df_model\n",
    "y = y.astype(int)\n",
    "\n",
    "rfe = RFE(logreg, n_features_to_select=8)\n",
    "rfe = rfe.fit(df_model2, y)\n",
    "print(rfe.support_)\n",
    "print(rfe.ranking_)\n",
    "print(rfe)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_model.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_model2 = df_model2.drop(['area_mean', 'fractal_dimension_mean'], axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "smlog_2 = sm.Logit(y, sm.add_constant(df_model2))\n",
    "results = smlog_2.fit()\n",
    "print(results.summary2())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "probability = 1 / (1 + np.exp(-results.fittedvalues))\n",
    "px.histogram(probability)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(df_model2, y, test_size=0.3, random_state=0)\n",
    "logreg = LogisticRegression()\n",
    "logreg.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = logreg.predict(X_test)\n",
    "print('Accuracy on test set: {:.2f}'.format (logreg.score(X_test, y_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bind the result to its own name: assigning to `confusion_matrix` would shadow\n",
    "# the imported sklearn function and make every later call fail.\n",
    "cm = confusion_matrix(y_test, y_pred)\n",
    "print(cm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    Third model (RFE with 5 features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rfe = RFE(logreg, n_features_to_select=5)\n",
    "rfe = rfe.fit(df_model, y)\n",
    "print(rfe.support_)\n",
    "print(rfe.ranking_)\n",
    "print(rfe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_model.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_model3 = df_model.drop(['texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'fractal_dimension_mean'], axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "smlog_3 = sm.Logit(y, sm.add_constant(df_model3))\n",
    "results = smlog_3.fit()\n",
    "print(results.summary2())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "probability = 1 / (1 + np.exp(-results.fittedvalues))\n",
    "px.histogram(probability)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(df_model3, y, test_size=0.3, random_state=0)\n",
    "# Start from a fresh estimator, as the first two model cells do\n",
    "logreg = LogisticRegression()\n",
    "logreg.fit(X_train, y_train)\n",
    "y_pred = logreg.predict(X_test)\n",
    "\n",
    "print('Accuracy on test set: {:.2f}'.format(logreg.score(X_test, y_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bind the result to its own name: assigning to `confusion_matrix` would shadow\n",
    "# the imported sklearn function and make every later call fail.\n",
    "cm = confusion_matrix(y_test, y_pred)\n",
    "print(cm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# assign() returns a new frame, so we do not write in place into the slice\n",
    "# returned by train_test_split (which raises SettingWithCopyWarning).\n",
    "X_test = X_test.assign(predicted=y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test.groupby('predicted').mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    Let's try to fit the data with another algorithm - SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions\n",
    "# use ConfusionMatrixDisplay.from_estimator instead.\n",
    "from sklearn.metrics import plot_confusion_matrix\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "y = y.astype(int)\n",
    "X_train, X_test, y_train, y_test = train_test_split(df_model, y, test_size = 0.3, random_state = 0)\n",
    "\n",
    "# Learn the mean/std on the training split only and apply the same transform to\n",
    "# the test split, so both share one scale and no test statistics are used.\n",
    "scaler = StandardScaler().fit(X_train)\n",
    "X_train_scaled = scaler.transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "svm = SVC()\n",
    "svm.fit(X_train_scaled, y_train)\n",
    "\n",
    "y_pred = svm.predict(X_test_scaled)\n",
    "plot_confusion_matrix(svm, X_test_scaled, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    As we can see, SVM performs better. It is often preferred when only a small number of observations is available. Let's find out if we can optimize its parameters with cross-validation (GridSearchCV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy import linspace\n",
    "\n",
    "initial_gamma = ['scale']\n",
    "additional_gammas = np.arange(0, 1, 0.01)\n",
    "additional_gammas = additional_gammas.tolist()\n",
    "gammas = initial_gamma + additional_gammas\n",
    "\n",
    "param_grid = [\n",
    "    {'C': np.arange(1, 100, 1),\n",
    "    'gamma': gammas,\n",
    "    'kernel': ['rbf']}\n",
    "]\n",
    "\n",
    "optimal_params = GridSearchCV(\n",
    "    SVC(),\n",
    "    param_grid,\n",
    "    cv = 5,\n",
    "    scoring = 'accuracy',\n",
    "    verbose = 2\n",
    ")\n",
    "\n",
    "optimal_params.fit(X_train_scaled, y_train)\n",
    "print(optimal_params.best_params_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    It seems like C = 81, gamma = 0.02 are the optimal values of parameters, let's fit a final SVM model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg_svm = SVC(C = 81, gamma = 0.02)\n",
    "reg_svm.fit(X_train_scaled, y_train)\n",
    "\n",
    "plot_confusion_matrix(reg_svm, X_test_scaled, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = reg_svm.predict(X_test_scaled)\n",
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    We were able to achieve better performance of the model with a regularization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    Now let's start with non-linear classification models (KNN, Decision Tree Classification, Random Forest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import neighbors\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "n_neighbors = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]\n",
    "\n",
    "for neighbor in n_neighbors:\n",
    "    knn = KNeighborsClassifier(n_neighbors = neighbor)\n",
    "    knn.fit(X_train_scaled, y_train)\n",
    "    y_pred = knn.predict(X_test_scaled)\n",
    "    print('For the number of neighbors {:.2f}'.format(neighbor) + ' accuracy = {:.2f}'.format(knn.score(X_test_scaled, y_test)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    The smallest number of neighbors giving the best predictions is 6, let's take a closer look at this configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "knn = KNeighborsClassifier(n_neighbors = 6)\n",
    "knn.fit(X_train_scaled, y_train)\n",
    "y_pred = knn.predict(X_test_scaled)\n",
    "\n",
    "plot_confusion_matrix(knn, X_test_scaled, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    Once again we've found a better performance than logistic regression, however, a little bit worse than SVM. Let's try to visualize the Test Set results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_visualize = X_test_scaled.copy()\n",
    "df_visualize = pd.DataFrame(df_visualize)\n",
    "df_visualize.columns = X_test.columns\n",
    "df_visualize ['prediction'] = y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.relplot(x = 'radius_mean', y = 'texture_mean', hue = 'prediction', data = df_visualize)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As we can see - a larger mean tumor radius and texture lead to a positive cancer prediction \\\n",
    "    Let's find out what the decision surface looks like for these two characteristics for different configurations of the number of neighbors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def knn_comparison(data, k):\n",
    "    '''Fit a k-NN classifier on two features and plot its decision regions.\n",
    "\n",
    "    data: frame providing the 'radius_mean', 'texture_mean' and 'prediction' columns.\n",
    "    k: number of neighbours passed to KNeighborsClassifier.\n",
    "    '''\n",
    "    feature_names = ['radius_mean', 'texture_mean']\n",
    "    features = data[feature_names].values\n",
    "    labels = data['prediction'].astype(int).values\n",
    "\n",
    "    model = neighbors.KNeighborsClassifier(n_neighbors=k)\n",
    "    model.fit(features, labels)\n",
    "\n",
    "    # Draw the regions the fitted classifier assigns to each class\n",
    "    plot_decision_regions(features, labels, clf=model, legend=2)\n",
    "\n",
    "    # Name the axes after the two plotted features, then render the figure\n",
    "    horizontal_label, vertical_label = feature_names\n",
    "    plt.xlabel(horizontal_label)\n",
    "    plt.ylabel(vertical_label)\n",
    "    plt.title(f'Knn with K={k}')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in [1, 5, 6, 20, 30, 40, 80]:\n",
    "    knn_comparison(df_visualize, i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    Let's try to fit the first decision tree to our training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf_dt = DecisionTreeClassifier()\n",
    "clf_dt = clf_dt.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's see what our first tree looks like"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize = (150,120))\n",
    "_ = tree.plot_tree(clf_dt, feature_names = X_train.columns, class_names = [\"No cancer\", \"Cancer\"], filled = True)\n",
    "fig.savefig(\"/Users/dominik/Desktop/binary_tree.png\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_confusion_matrix(clf_dt, X_test, y_test)\n",
    "y_pred = clf_dt.predict(X_test)\n",
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can clearly see that our classification tree overfits the data. Moreover, the tree is really huge and maybe we can make it smaller. Let's try pruning the tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = clf_dt.cost_complexity_pruning_path(X_train, y_train)\n",
    "ccp_alphas = path.ccp_alphas\n",
    "ccp_alphas = ccp_alphas[:-1]\n",
    "\n",
    "clf_dts = []\n",
    "\n",
    "for ccp_alpha in ccp_alphas:\n",
    "    clf_dt = DecisionTreeClassifier(ccp_alpha = ccp_alpha)\n",
    "    clf_dt.fit(X_train, y_train)\n",
    "    clf_dts.append(clf_dt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]\n",
    "test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train vs. test accuracy for every candidate pruning strength (ccp_alpha)\n",
    "fig, ax = plt.subplots()\n",
    "ax.set_xlabel('alpha')\n",
    "ax.set_ylabel('accuracy')\n",
    "ax.set_title('Accuracy depending on the alpha parameter: training vs. testing')\n",
    "ax.plot(ccp_alphas, train_scores, marker = 'o', label = 'train', drawstyle = 'steps-post')\n",
    "ax.plot(ccp_alphas, test_scores, marker = 'o', label = 'test', drawstyle = 'steps-post')\n",
    "ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "we can see that we don't lose any testing accuracy between alpha = 0 and alpha = 0.005, let's choose the \"marginal\" value to make the tree simpler and limit the overfitting problem. Additionally, we will do some crossvalidation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "\n",
    "clf_dt = DecisionTreeClassifier(ccp_alpha = 0.00489138)\n",
    "scores = cross_val_score(clf_dt, X_train, y_train, cv = 5)\n",
    "# Use a dedicated name: rebinding `df` here would overwrite the loaded dataset.\n",
    "cv_fold_scores = pd.DataFrame(data = {'tree' : range(5), 'accuracy' : scores})\n",
    "\n",
    "cv_fold_scores.plot(x = 'tree', y = 'accuracy', marker = 'o')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We may assume that our \"best\" tree performed this well only on one specific subset of the training data. A good idea is to perform cross-validation with every value of alpha and check which configuration has the highest mean accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "statistics = []\n",
    "for ccp_alpha in ccp_alphas:\n",
    "    clf_dt = DecisionTreeClassifier(ccp_alpha = ccp_alpha)\n",
    "    clf_dt.fit(X_train, y_train)\n",
    "    scores = cross_val_score(clf_dt, X_train, y_train, cv = 10)\n",
    "    statistics.append([ccp_alpha, np.mean(scores), np.std(scores)])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv_results = pd.DataFrame(statistics, columns = ['alpha_value', 'mean_accuracy', 'standard_deviation'])\n",
    "cv_results.plot(x = 'alpha_value', y = 'mean_accuracy', yerr = 'standard_deviation', marker = 'o')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can see that the optimal value for alpha is lower than the previous one that was used. Let's see our final model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf_dt = DecisionTreeClassifier(ccp_alpha = 0.00326737)\n",
    "clf_dt.fit(X_train, y_train)\n",
    "plot_confusion_matrix(clf_dt, X_test, y_test)\n",
    "\n",
    "y_pred = clf_dt.predict(X_test)\n",
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize = (100,80))\n",
    "_ = tree.plot_tree(clf_dt, feature_names = X_train.columns, class_names = [\"No cancer\", \"Cancer\"], filled = True)\n",
    "fig.savefig(\"/Users/dominik/Desktop/binary_tree_final.png\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "By performing cross-validation and tree pruning we were able to achieve higher accuracy and construct a simpler tree."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    Let's check the performance of the last model - Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_model.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "#we will check what is the optimal number of trees\n",
    "num_trees = [20, 40, 60, 80, 100, 120, 140]\n",
    "configurations = []\n",
    "for trees in num_trees:\n",
    "    performance_train = []\n",
    "    performance_test = []\n",
    "    \n",
    "    for i in range(10):\n",
    "        model = RandomForestClassifier(n_estimators = trees, oob_score=True)\n",
    "        model.fit(df_model, y)\n",
    "    \n",
    "        score_train = model.score(df_model, y)\n",
    "        score_test = model.oob_score_\n",
    "        performance_train.append(score_train)\n",
    "        performance_test.append(score_test)\n",
    "\n",
    "    configurations.append([trees, np.mean(performance_train), np.mean(performance_test)])\n",
    "    print(\"For the number of trees: {}, traininig accuracy = {}, testing accuracy= {}\".format(trees, np.mean(performance_train), np.mean(performance_test)))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It seems like the number of trees = 100 is an optimal value. Let's fit a final model and check the importance of the features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = df_model.columns\n",
    "\n",
    "# Spell out the tree count chosen above instead of relying on the library default\n",
    "forest = RandomForestClassifier(n_estimators = 100)\n",
    "forest.fit(df_model, y)\n",
    "\n",
    "importances = forest.feature_importances_\n",
    "# Spread of each feature's importance across the individual trees of the forest\n",
    "std = np.std([estimator.feature_importances_ for estimator in forest.estimators_], axis = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forest_importances = pd.Series(importances, index=feature_names)\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "forest_importances.plot.bar(yerr=std, ax=ax)\n",
    "ax.set_title(\"Feature importances using MDI\")\n",
    "ax.set_ylabel(\"Mean decrease in impurity\")\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can observe that some of the features (smoothness_mean, symmetry_mean, fractal_dimension_mean) are not that useful while building a Random Forest model."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.8 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "f640a9078f8e092032cb0185e0c2b2c1ad2376cf3b94da3c4476fa7bf4b3609c"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}