<img src="../../../../images/classifications.png" style="background:white; display: block; margin-left: auto;margin-right: auto; width:80%"/>

In [1]:
import pandas as pd
import numpy as np

# Breast cancer dataset: each row corresponds to one patient.
df = pd.read_csv('../../../../data/clean/Data_Classification.csv')
display(df.head())

# The first column (sample code number) is skipped and the last column
# (Class) is the label; every column in between is used as a feature.
feature_frame = df.iloc[:, 1:-1]
label_series = df.iloc[:, -1]
x = feature_frame.to_numpy()
y = label_series.to_numpy()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the observations for testing. Stratifying on y keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

n_train, n_test = len(x_train), len(x_test)
print("train dataset size : {} observations\ntest dataset size : {} observations".format(n_train, n_test))

train dataset size : 546 observations
test dataset size : 137 observations


In [3]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only and scale that split in the same
# step; `stand_x` stays available so the test split can later be scaled with
# the statistics learned here.
stand_x = StandardScaler()
x_ss = stand_x.fit_transform(x_train)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Train six classifiers on the same standardized training split so their
# test-set results can be compared side by side.
# Every estimator that has a random component is seeded with 42 (the same
# seed used for the train/test split) so that Restart & Run All reproduces
# the reported scores.

# Logistic regression.
logreg = LogisticRegression(random_state=42, n_jobs=-1)
logreg.fit(x_ss, y_train)

# k-nearest neighbours: 5 neighbours, Minkowski metric with p=2
# (i.e. Euclidean distance).
knn = KNeighborsClassifier(n_neighbors=5, algorithm='auto', p=2, metric='minkowski', n_jobs=-1)
knn.fit(x_ss, y_train)

# Support vector machine with an RBF kernel.
svm = SVC(C=1.0, kernel='rbf', gamma='scale', random_state=42)
svm.fit(x_ss, y_train)

# Gaussian naive Bayes (deterministic, so no seed is needed).
nb = GaussianNB(priors=None, var_smoothing=1e-09)
nb.fit(x_ss, y_train)

# Decision tree using entropy as the split criterion. Seeded: ties between
# equally good splits are otherwise broken randomly.
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt.fit(x_ss, y_train)

# Random forest of 100 entropy-based trees. Seeded: bootstrap sampling and
# feature sub-sampling are random.
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', n_jobs=-1, random_state=42)
rf.fit(x_ss, y_train)

RandomForestClassifier(criterion='entropy', n_jobs=-1)

In [5]:
# Scale the test split once, using the scaler that was fitted on the training
# split, and reuse the result for every model instead of re-transforming
# x_test for each prediction.
x_test_ss = stand_x.transform(x_test)

y_pred_logreg = logreg.predict(x_test_ss)
y_pred_knn = knn.predict(x_test_ss)
y_pred_svm = svm.predict(x_test_ss)
y_pred_nb = nb.predict(x_test_ss)
y_pred_dt = dt.predict(x_test_ss)
y_pred_rf = rf.predict(x_test_ss)

# Show the true labels next to each model's predictions for the first ten
# test observations. Each column label is paired directly with its data so
# the two cannot drift out of order.
pd.DataFrame({
    'y_actual': y_test,
    'LogisticRegression_prediction': y_pred_logreg,
    'KNN_prediction': y_pred_knn,
    'SuportVectorMachine_prediction': y_pred_svm,
    'NaiveBayes_prediction': y_pred_nb,
    'DecisionTree_prediction': y_pred_dt,
    'RandomForest_prediction': y_pred_rf,
}).head(10)

Unnamed: 0,y_actual,LogisticRegression_prediction,KNN_prediction,SuportVectorMachine_prediction,NaiveBayes_prediction,DecisionTree_prediction,RandomForest_prediction
0,2,2,2,2,2,2,2
1,2,2,2,2,2,2,2
2,2,2,2,2,2,2,2
3,2,2,2,2,2,2,2
4,2,2,2,2,2,2,2
5,4,4,4,4,4,4,4
6,2,2,2,2,2,2,2
7,2,2,2,2,2,2,2
8,4,4,4,4,4,4,4
9,2,2,2,2,2,2,2


In [6]:
from sklearn.metrics import accuracy_score

# Accuracy = fraction of test observations whose predicted label matches the
# true label, computed once per classifier.
score_logreg, score_knn, score_svm, score_nb, score_dt, score_rf = (
    accuracy_score(y_test, predicted)
    for predicted in (y_pred_logreg, y_pred_knn, y_pred_svm, y_pred_nb, y_pred_dt, y_pred_rf)
)

# One row per model, single 'Accuracy Score' column.
pd.Series(
    {
        "Logistic Regression": score_logreg,
        "KNN": score_knn,
        "Suport Vector Machine": score_svm,
        "Naive Bayes": score_nb,
        "Decision Tree": score_dt,
        "Random Forest": score_rf,
    },
    name='Accuracy Score',
).to_frame()

Unnamed: 0,Accuracy Score
Logistic Regression,0.963504
KNN,0.956204
Suport Vector Machine,0.963504
Naive Bayes,0.963504
Decision Tree,0.956204
Random Forest,0.970803


<img src="../../../../images/false_pos_neg.png" style="background:white; display: block; margin-left: auto;margin-right: auto; width:80%"/><br>
<img src="../../../../images/cap.png" style="background:white; display: block; margin-left: auto;margin-right: auto; width:80%"/>
<img src="../../../../images/cap_analysis.png" style="background:white; display: block; margin-left: auto;margin-right: auto; width:80%"/>

<p style="font-size:14px">From a business point of view<ul style="font-size:14px">
    <li>when we want to rank our predictions by their probability<ul>
            <li><strong>Logistic Regression</strong> or <strong>Naive Bayes</strong>
            <li>for this type of business problem, we should use Logistic Regression if the problem is linear, and Naive Bayes if the problem is non-linear
            <li>for example if we want to rank our customers from the highest probability that they buy a certain product, to the lowest probability
        </ul>
    <li>when we want to predict which segment our customers belong to<ul>
            <li><strong>Support Vector Machine</strong>
            <li>segments can be any kind of segments
        </ul>
    <li>when we want to have clear interpretation of our model results<ul>
            <li><strong>Decision Tree Classification</strong>
        </ul>
    <li>when we are just looking for high performance with less need for interpretation<ul>
            <li><strong>Random Forest Classification</strong>
        </ul>
</ul>
</p>