```
Topic:        Project 3
Subject:      Determining Benign or Malignant Breast Cancer
Date:         08/05/2020
Name:         David Weon

Notebook has been cleaned using nbextension 'Code prettify'
```

# Connecting to AWS PostgreSQL

In [1]:
import os

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [2]:
#connect to aws machine with elastic IP
#the URL can be overridden through the BREASTCANCER_DB_URL environment variable
#so the notebook can target another host without editing this cell; when the
#variable is unset the original elastic-IP address is used
DB_URL = os.environ.get(
    'BREASTCANCER_DB_URL',
    'postgresql://ubuntu@34.196.129.175:5432/breastcancer')
cnx = create_engine(DB_URL)

In [3]:
#sanity-check the connection: pull a small sample and show its first rows
pd.read_sql_query(sql='''SELECT * FROM dataset limit 20''', con=cnx).head()

Unnamed: 0,index,id,clumpthickness,uniformcellsize,uniformcellshape,margadhesion,epithelial,barenuclei,blandchromatin,normalnucleoli,mitoses,benormal
0,0,1000025,5,1,1,1,2,1,3,1,1,0
1,1,1002945,5,4,4,5,7,10,3,2,1,0
2,2,1015425,3,1,1,1,2,2,3,1,1,0
3,3,1016277,6,8,8,1,3,4,3,7,1,0
4,4,1017023,4,1,1,3,2,1,3,1,1,0


# Data Import, Clean, Check

## Data Import

In [4]:
#import data from aws postgresql
#read_sql_query already returns a DataFrame, so no extra pd.DataFrame() wrapper
#is needed; index_col promotes the stored 'index' column to the frame's index
breastcancer = pd.read_sql_query('''SELECT * FROM dataset''',
                                 cnx,
                                 index_col='index')

## Data Clean

In [5]:
#flag each column that contains at least one '?' placeholder value
has_placeholder = breastcancer.isin(['?']).any(axis=0)
has_placeholder

id                  False
clumpthickness      False
uniformcellsize     False
uniformcellshape    False
margadhesion        False
epithelial          False
barenuclei           True
blandchromatin      False
normalnucleoli      False
mitoses             False
benormal            False
dtype: bool

In [6]:
#replace '?' with nan for cleaning
#assign the result back instead of using inplace=True on a column selection:
#under pandas Copy-on-Write an in-place call on breastcancer['barenuclei']
#modifies a temporary object and leaves the frame unchanged
breastcancer['barenuclei'] = breastcancer['barenuclei'].replace('?', np.nan)

#convert columns to int/float
breastcancer = breastcancer.apply(pd.to_numeric)

#fill the missing (former '?') barenuclei values with the column mean
breastcancer['barenuclei'] = breastcancer['barenuclei'].fillna(
    breastcancer['barenuclei'].mean())

## Data Check

In [7]:
#list each column's dtype and non-null count to verify the cleaning step
breastcancer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                699 non-null    int64  
 1   clumpthickness    699 non-null    int64  
 2   uniformcellsize   699 non-null    int64  
 3   uniformcellshape  699 non-null    int64  
 4   margadhesion      699 non-null    int64  
 5   epithelial        699 non-null    int64  
 6   barenuclei        699 non-null    float64
 7   blandchromatin    699 non-null    int64  
 8   normalnucleoli    699 non-null    int64  
 9   mitoses           699 non-null    int64  
 10  benormal          699 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 65.5 KB


In [8]:
#per-column summary statistics (count, mean, std, min, quartiles, max)
breastcancer.describe()

Unnamed: 0,id,clumpthickness,uniformcellsize,uniformcellshape,margadhesion,epithelial,barenuclei,blandchromatin,normalnucleoli,mitoses,benormal
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.544656,3.437768,2.866953,1.589413,0.344778
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,3.601852,2.438364,3.053634,1.715078,0.475636
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,5.0,4.0,1.0,1.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


# Modeling

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
#use the 'ggplot' matplotlib style and render figures inline in the notebook
plt.style.use('ggplot')
%matplotlib inline

In [10]:
#features: every column except the sample 'id' and the 'benormal' target
X = breastcancer.drop(columns=['id', 'benormal'])
#target: the 'benormal' label column
y = breastcancer['benormal']

## Cross Validation Train/Val Scores for Various Models

In [11]:
import sys

#add the parent directory to the module search path so `src` can be imported
sys.path.append("..")

#cross-validation scoring helpers, one per model family
from src.models.cv_models_train import (
    crossval_knn_scores,
    crossval_logit_scores,
    crossval_rfc_scores,
    crossval_dtc_scores,
    crossval_svm_scores,
    crossval_gnb_scores,
)

In [12]:
#k nearest neighbors model
#crossval_knn_scores is a project helper from src.models.cv_models_train; it
#returns a list of single-metric dicts (accuracy, precision, recall, f1, ROC/AUC)
#NOTE(review): presumably each value is averaged over the CV folds - confirm
knn_scores = crossval_knn_scores(X, y)
knn_scores

[{'accuracy': 0.9642240493319632},
 {'precision': 0.9592571562176426},
 {'recall': 0.9379397712573402},
 {'f1': 0.9482365472504792},
 {'ROC/AUC': 0.9874148705242783}]

In [13]:
#logistic regression model
#crossval_logit_scores is a project helper from src.models.cv_models_train; it
#returns a list of single-metric dicts (accuracy, precision, recall, f1, ROC/AUC)
#NOTE(review): presumably each value is averaged over the CV folds - confirm
logit_scores = crossval_logit_scores(X, y)
logit_scores

[{'accuracy': 0.9685097636176774},
 {'precision': 0.9552438592887885},
 {'recall': 0.9546791693421556},
 {'f1': 0.9546995052858188},
 {'ROC/AUC': 0.9926251066009648}]

In [14]:
#decision tree model
#crossval_dtc_scores is a project helper from src.models.cv_models_train; it
#returns a list of single-metric dicts (accuracy, precision, recall, f1, ROC/AUC)
#NOTE(review): presumably each value is averaged over the CV folds - confirm
dtc_scores = crossval_dtc_scores(X, y)
dtc_scores

[{'accuracy': 0.9441932168550874},
 {'precision': 0.9173221014231887},
 {'recall': 0.9251946732181245},
 {'f1': 0.9199361471425089},
 {'ROC/AUC': 0.9394718648069501}]

In [15]:
#random forest model
#crossval_rfc_scores is a project helper from src.models.cv_models_train; it
#returns a list of single-metric dicts (accuracy, precision, recall, f1, ROC/AUC)
#NOTE(review): presumably each value is averaged over the CV folds - confirm
rfc_scores = crossval_rfc_scores(X, y)
rfc_scores

[{'accuracy': 0.9599280575539568},
 {'precision': 0.9394144457121023},
 {'recall': 0.9456162418455755},
 {'f1': 0.941988506171354},
 {'ROC/AUC': 0.9883412717409239}]

In [16]:
#support vector machine model
#crossval_svm_scores is a project helper from src.models.cv_models_train; it
#returns a list of single-metric dicts (accuracy, precision, recall, f1, ROC/AUC)
#NOTE(review): presumably each value is averaged over the CV folds - confirm
svm_scores = crossval_svm_scores(X, y)
svm_scores

[{'accuracy': 0.9684994861253855},
 {'precision': 0.9402995205852349},
 {'recall': 0.9707706059167853},
 {'f1': 0.9551725466726554},
 {'ROC/AUC': 0.9864928552677336}]

In [17]:
#gaussian naive bayes
#crossval_gnb_scores is a project helper from src.models.cv_models_train; it
#returns a list of single-metric dicts (accuracy, precision, recall, f1, ROC/AUC)
#NOTE(review): presumably each value is averaged over the CV folds - confirm
gnb_scores = crossval_gnb_scores(X, y)
gnb_scores

[{'accuracy': 0.9584892086330935},
 {'precision': 0.9149477124183006},
 {'recall': 0.9707706059167853},
 {'f1': 0.9418555676129801},
 {'ROC/AUC': 0.9851055147405902}]