In [106]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2

In [107]:
dataset = pd.read_csv('static/data/dataset.csv')
dataset.head()

Unnamed: 0,age_c,assess_c,cancer_c,compfilm_c,density_c,famhx_c,hrt_c,prvmam_c,biophx_c,mammtype,CaTypeO,bmi_c,ptid
0,62,1,0,1,2,0,0,1,0,1,8,24.023544,1
1,65,1,0,1,4,0,0,1,0,1,8,-99.0,2
2,69,0,0,1,2,0,0,1,0,1,8,29.052429,3
3,64,2,0,1,2,0,0,1,0,1,8,-99.0,4
4,63,3,0,1,2,0,0,1,1,1,8,33.729523,5


In [108]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age_c       40000 non-null  int64  
 1   assess_c    40000 non-null  int64  
 2   cancer_c    40000 non-null  int64  
 3   compfilm_c  40000 non-null  int64  
 4   density_c   40000 non-null  int64  
 5   famhx_c     40000 non-null  int64  
 6   hrt_c       40000 non-null  int64  
 7   prvmam_c    40000 non-null  int64  
 8   biophx_c    40000 non-null  int64  
 9   mammtype    40000 non-null  int64  
 10  CaTypeO     40000 non-null  int64  
 11  bmi_c       40000 non-null  float64
 12  ptid        40000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 4.0 MB


In [109]:
dataset.describe()

Unnamed: 0,age_c,assess_c,cancer_c,compfilm_c,density_c,famhx_c,hrt_c,prvmam_c,biophx_c,mammtype,CaTypeO,bmi_c,ptid
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,69.55585,1.20345,0.006475,1.90345,2.229725,0.219875,0.504925,1.1082,0.444675,1.5,7.95975,-46.164809,18376.250225
std,7.202974,0.575775,0.080207,2.589304,0.716355,0.762853,1.854633,0.959435,1.309191,0.500006,0.499686,62.225752,10601.991024
min,60.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-99.0,1.0
25%,63.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,8.0,-99.0,9202.75
50%,68.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,1.5,8.0,-99.0,18392.5
75%,75.0,2.0,0.0,1.0,3.0,0.0,0.0,1.0,1.0,2.0,8.0,24.691544,27562.25
max,89.0,5.0,1.0,9.0,4.0,9.0,9.0,9.0,9.0,2.0,8.0,71.721314,36714.0


In [110]:
dataset.isna().sum()

age_c         0
assess_c      0
cancer_c      0
compfilm_c    0
density_c     0
famhx_c       0
hrt_c         0
prvmam_c      0
biophx_c      0
mammtype      0
CaTypeO       0
bmi_c         0
ptid          0
dtype: int64

In [111]:
# replace Nil values with 9
dataset['bmi_c'] = dataset['bmi_c'].replace([-99], 9)
#Convert 8 placeholder to 0 
dataset['CaTypeO'] = dataset['CaTypeO'].replace([8], 0)
# Merge positive cancer diagnosis, versus positive cancer types
dataset['CaTypeO'] = dataset['CaTypeO'].replace([2], 1)
#drop null values identified by the numeric value '9'
dataset = dataset[dataset.compfilm_c != 9]
dataset = dataset[dataset.famhx_c != 9]
dataset = dataset[dataset.hrt_c != 9]
dataset = dataset[dataset.prvmam_c != 9]
dataset = dataset[dataset.biophx_c != 9]
dataset = dataset[dataset.bmi_c != 9]
#delete ptid column
del dataset['ptid']
#Print info again 
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15159 entries, 0 to 39999
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age_c       15159 non-null  int64  
 1   assess_c    15159 non-null  int64  
 2   cancer_c    15159 non-null  int64  
 3   compfilm_c  15159 non-null  int64  
 4   density_c   15159 non-null  int64  
 5   famhx_c     15159 non-null  int64  
 6   hrt_c       15159 non-null  int64  
 7   prvmam_c    15159 non-null  int64  
 8   biophx_c    15159 non-null  int64  
 9   mammtype    15159 non-null  int64  
 10  CaTypeO     15159 non-null  int64  
 11  bmi_c       15159 non-null  float64
dtypes: float64(1), int64(11)
memory usage: 1.5 MB


In [112]:
#Encode existing features
dataset['density_c'].value_counts()

#Not needed, breast tissue is already encoded in chronological order... 

2    7695
3    3969
1    2934
4     561
Name: density_c, dtype: int64

In [113]:
#Analyze BMI 
dataset['bmi_c'].value_counts()
#Unable to further categorize based on wide array of data 

24.799606    117
25.606049    116
27.463623    109
28.342407    100
23.777039    100
            ... 
33.628937      1
20.083695      1
18.748505      1
32.459320      1
35.646667      1
Name: bmi_c, Length: 1847, dtype: int64

In [114]:
# label encode categorical columns

lblEncoder_state = LabelEncoder()
lblEncoder_state.fit(dataset['assess_c'])
dataset['assess_c'] = lblEncoder_state.transform(dataset['assess_c'])

lblEncoder_cons = LabelEncoder()
lblEncoder_cons.fit(dataset['cancer_c'])
dataset['cancer_c'] = lblEncoder_cons.transform(dataset['cancer_c'])

lblEncoder_name = LabelEncoder()
lblEncoder_name.fit(dataset['compfilm_c'])
dataset['compfilm_c'] = lblEncoder_name.transform(dataset['compfilm_c'])

lblEncoder_party = LabelEncoder()
lblEncoder_party.fit(dataset['density_c'])
dataset['density_c'] = lblEncoder_party.transform(dataset['density_c'])

lblEncoder_symbol = LabelEncoder()
lblEncoder_symbol.fit(dataset['famhx_c'])
dataset['famhx_c'] = lblEncoder_symbol.transform(dataset['famhx_c'])

lblEncoder_gender = LabelEncoder()
lblEncoder_gender.fit(dataset['hrt_c'])
dataset['hrt_c'] = lblEncoder_gender.transform(dataset['hrt_c'])

lblEncoder_category = LabelEncoder()
lblEncoder_category.fit(dataset['prvmam_c'])
dataset['prvmam_c'] = lblEncoder_category.transform(dataset['prvmam_c'])

lblEncoder_edu = LabelEncoder()
lblEncoder_edu.fit(dataset['bmi_c'])
dataset['bmi_c'] = lblEncoder_edu.transform(dataset['bmi_c'])

lblEncoder_state = LabelEncoder()
lblEncoder_state.fit(dataset['biophx_c'])
dataset['biophx_c'] = lblEncoder_state.transform(dataset['biophx_c'])

lblEncoder_state = LabelEncoder()
lblEncoder_state.fit(dataset['mammtype'])
dataset['mammtype'] = lblEncoder_state.transform(dataset['mammtype'])

In [115]:
# separate train features and label
y = dataset["CaTypeO"]
X = dataset.drop(labels=["CaTypeO"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# train and test knn model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.predict(X_test)
print("Testing Accuracy is: ", knn.score(X_test, y_test)*100, "%")

Testing Accuracy is:  99.40633245382587 %


In [116]:
# scaling values into 0-1 range
scaler = MinMaxScaler(feature_range=(0, 1))
features = [
    'assess_c', 'cancer_c', 'compfilm_c', 'density_c', 'famhx_c', 'hrt_c', 'prvmam_c', 'prvmam_c', 'bmi_c', 'biophx_c', 'mammtype']
dataset[features] = scaler.fit_transform(dataset[features])

#TEST ACCURACY AGAIN
# separate train features and label
y = dataset["CaTypeO"]
X = dataset.drop(labels=["CaTypeO"], axis=1)
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# train and test knn model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.predict(X_test)
print("Testing Accuracy is: ", knn.score(X_test, y_test)*100, "%")

Testing Accuracy is:  99.43931398416886 %
