In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
# Let pandas render up to 100 columns when a frame is displayed, then read
# the dataset from the CSV file in the working directory.
pd.options.display.max_columns = 100
data = pd.read_csv(filepath_or_buffer='cancer.csv')

In [None]:
# Remove the 'Sample Code Number' column from the frame (modified in place).
data.drop(columns=['Sample Code Number'], inplace=True)

In [None]:
# Display the 'Bare Nuclei' column to inspect its values and dtype.
data.loc[:, 'Bare Nuclei']

0       3
1       3
2       3
3       3
4       3
       ..
694     1
695     1
696     8
697    10
698    10
Name: Bare Nuclei, Length: 699, dtype: int64

In [None]:
# '?' is presumably this file's missing-value marker. Map it to NaN rather
# than 0: the SimpleImputer used afterwards only fills entries equal to
# np.nan, so a literal 0 would never be imputed and would instead be treated
# as a genuine measurement (skewing column means and the min-max scaling).
data = data.replace('?', np.nan)

In [None]:
# Pull the raw values out of the DataFrame as a NumPy array.
values = data.to_numpy()

# Fill every NaN entry with the mean of its column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputedData = imputer.fit_transform(values)

In [None]:
# Rescale every column (the target column included) to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics leak into the features — consider
# fitting it on the training portion only.
scaler = MinMaxScaler(feature_range=(0, 1))
normalizedData = scaler.fit_transform(imputedData)

# Re-attach the column labels. They are taken from the source frame instead of
# a hand-typed list so each label is guaranteed to sit on the column it came
# from; a hard-coded list silently mislabels features if its order differs
# from the CSV header. `cols` is kept as a plain list for any later use.
cols = list(data.columns)
normalizedData = pd.DataFrame(normalizedData, columns=cols)
print(normalizedData.head())

   Clump Thickness  Uniformity of Cell Size  Uniformity of Cell Shape  \
0         0.444444                 0.000000                  0.000000   
1         0.444444                 0.333333                  0.333333   
2         0.222222                 0.000000                  0.000000   
3         0.555556                 0.777778                  0.777778   
4         0.333333                 0.000000                  0.000000   

   Marginal Adhesion  Single Epithelial Cell Size  Bland Chromatin  \
0           0.000000                     0.111111              0.1   
1           0.444444                     0.666667              1.0   
2           0.000000                     0.111111              0.2   
3           0.000000                     0.222222              0.4   
4           0.222222                     0.111111              0.1   

   Bare Nuclei  Normal Nucleoli  Mitosis  Class  
0     0.222222         0.000000      0.0    0.0  
1     0.222222         0.111111      0.0

In [None]:
# Columns 0-8 are the features; column 9 is the target.
X = normalizedData.iloc[:, :9]
y = normalizedData.iloc[:, 9]

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Baseline to beat: always predict the most frequent class seen in training.
d = DummyClassifier(strategy='most_frequent')
d.fit(X=X_train, y=y_train)
d.score(X_test, y_test)

0.680952380952381

In [None]:
# Generic Bagging model
#
# Bagging ensemble of 10 decision trees, scored on the held-out test set.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, whereas the first positional argument works on both.
bag_model = BaggingClassifier(DecisionTreeClassifier(), n_estimators=10, random_state=42)
bag_model = bag_model.fit(X_train, y_train)
bag_pred = bag_model.predict(X_test)
print("Generic Bagging Accuracy: {}".format(accuracy_score(y_test, bag_pred)))

Generic Bagging Accuracy: 0.9571428571428572


In [None]:
# Random Forest model
#
# 100 trees, each split choosing among at most 7 candidate features; fitted
# on the training set and scored on the held-out test set.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_features=7,
    random_state=42,
).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("Random Forest Accuracy: {}".format(accuracy_score(y_true=y_test, y_pred=rf_pred)))

Random Forest Accuracy: 0.9619047619047619


In [None]:
# Rank the features by Random Forest importance, highest first; the first
# three rows of the table are the top 3 features.
# Column 0 holds the feature name, column 1 its importance score.
imp = pd.DataFrame(list(zip(X_train.columns, rf_model.feature_importances_)))
imp.sort_values(by=1, ascending=False)

Unnamed: 0,0,1
1,Uniformity of Cell Size,0.396144
2,Uniformity of Cell Shape,0.249673
5,Bland Chromatin,0.176869
6,Bare Nuclei,0.067437
0,Clump Thickness,0.03689
7,Normal Nucleoli,0.029999
4,Single Epithelial Cell Size,0.022517
3,Marginal Adhesion,0.016603
8,Mitosis,0.003868


In [None]:
# AdaBoost Classification
#
# Boosts 200 depth-4 decision trees with a learning rate of 0.05, then scores
# the ensemble on the held-out test set.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, whereas the first positional argument works on both.
boost_model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4), n_estimators=200, random_state=42, learning_rate=0.05)
boost_model = boost_model.fit(X_train, y_train)
boost_pred = boost_model.predict(X_test)
print("AdaBoost Classification Accuracy: {}".format(accuracy_score(y_test, boost_pred)))

AdaBoost Classification Accuracy: 0.9619047619047619


In [None]:
# Rank the features by AdaBoost importance, highest first; the first three
# rows of the table are the top 3 features.
# Column 0 holds the feature name, column 1 its importance score.
imp = pd.DataFrame(list(zip(X_train.columns, boost_model.feature_importances_)))
imp.sort_values(by=1, ascending=False)

Unnamed: 0,0,1
0,Clump Thickness,0.194414
6,Bare Nuclei,0.190095
5,Bland Chromatin,0.142675
2,Uniformity of Cell Shape,0.131433
7,Normal Nucleoli,0.09574
4,Single Epithelial Cell Size,0.092518
3,Marginal Adhesion,0.055472
1,Uniformity of Cell Size,0.053106
8,Mitosis,0.044548


In [None]:
# Voting Ensemble for Classification
#
# Soft-voting ensemble over four different model families: the predicted
# class probabilities of the members are averaged to pick the final class.
# The stochastic members are seeded (42, like every other model in this
# notebook) so the reported accuracy is reproducible across runs. SVC needs
# the seed because probability=True fits its probability estimates on
# shuffled internal cross-validation folds.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
dt = DecisionTreeClassifier(max_depth=4, random_state=42)
svm = SVC(probability=True, random_state=42)
# LogisticRegression is left unseeded: its default solver is deterministic.
lr = LogisticRegression()

eclf = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('svm', svm), ('lr', lr)], voting='soft')
eclf = eclf.fit(X_train, y_train)
eclf_pred = eclf.predict(X_test)
print("Ensemble Classification Accuracy: {}".format(accuracy_score(y_test, eclf_pred)))

Ensemble Classification Accuracy: 0.9666666666666667


In [None]:
#Ensemble