In [None]:
import pandas as pd

cancer_set = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', 
                        header = None)
print(cancer_set.shape)
print(cancer_set.head(5))
cancer_set[1].unique()

(569, 32)
         0  1      2      3       4   ...      27      28      29      30       31
0    842302  M  17.99  10.38  122.80  ...  0.6656  0.7119  0.2654  0.4601  0.11890
1    842517  M  20.57  17.77  132.90  ...  0.1866  0.2416  0.1860  0.2750  0.08902
2  84300903  M  19.69  21.25  130.00  ...  0.4245  0.4504  0.2430  0.3613  0.08758
3  84348301  M  11.42  20.38   77.58  ...  0.8663  0.6869  0.2575  0.6638  0.17300
4  84358402  M  20.29  14.34  135.10  ...  0.2050  0.4000  0.1625  0.2364  0.07678

[5 rows x 32 columns]


array(['M', 'B'], dtype=object)

In [None]:
cancer_features = cancer_set.iloc[:,2:] ## df.iloc[rows_start:row_end, col_start:col_end]

print(cancer_features.shape) #(rows, cols)
print(cancer_features.size)  # rows*cols
print(type(cancer_features))

(569, 30)
17070
<class 'pandas.core.frame.DataFrame'>


**cancer_features is a dataframe. It is converted to a numpy array with below code.**

In [None]:
cancer_features = cancer_features.values
print(type(cancer_features))
print(cancer_features.shape)

<class 'numpy.ndarray'>
(569, 30)


**The 30 features used associated with cancer_features dataset are labeled with the following listed names.**

In [None]:
cancer_features_names = ['mean radius', 
'mean texture', 'mean perimeter', 
'mean area', 'mean smoothness', 
'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry',
'mean fractal dimension','radius error',
'texture error','perimeter error',
'area error', 'smoothness error',
'compactness error','concavity error',
'concave points error','symmetry error',
'fractal dimension error','worst radius',
'worst texture', 'worst perimeter', 
'worst area','worst smoothness', 
'worst compactness', 'worst concavity',
'worst concave points','worst symmetry',
'worst fractal dimension']

# Preprocessing
**Target values of each patient are extracted with below code snippet.**

In [None]:
cancer_target = cancer_set.iloc[:, 1]

# Replacing 'M' with 0 and 'B' with 1
cancer_target = cancer_target.replace(['M', 'B'], [0, 1])

# Converting to numpy array
cancer_target = cancer_target.values

print(type(cancer_target))
print(cancer_target.shape)

<class 'numpy.ndarray'>
(569,)


**Thus obtained cancer_features and cancer_target can be used by a ML algorithm.**

**The same processed data is available in scikit-learn. The below code snippet illustrates accessing features and target arrays.**

# Standardization

In [None]:
import sklearn.datasets as datasets

cancer = datasets.load_breast_cancer()

print(cancer.data.shape)
print(cancer.target.shape)

In [None]:
import sklearn.preprocessing as preprocessing
import sklearn.datasets as datasets

breast_cancer = datasets.load_breast_cancer()

standardizer = preprocessing.StandardScaler()
standardizer = standardizer.fit(cancer.data)
breast_cancer_standardized = standardizer.transform(breast_cancer.data)

print('Mean of each feature after Standardization :\n\n')
print(len(breast_cancer_standardized.mean(axis=0)))
print(breast_cancer_standardized.mean(axis=0))
print('\nStd. of each feature after Standardization :\n\n')
print(breast_cancer_standardized.std(axis=0))


Mean of each feature after Standardization :


30
[-3.16286735e-15 -6.53060890e-15 -7.07889127e-16 -8.79983452e-16
  6.13217737e-15 -1.12036918e-15 -4.42138027e-16  9.73249991e-16
 -1.97167024e-15 -1.45363120e-15 -9.07641468e-16 -8.85349205e-16
  1.77367396e-15 -8.29155139e-16 -7.54180940e-16 -3.92187747e-16
  7.91789988e-16 -2.73946068e-16 -3.10823423e-16 -3.36676596e-16
 -2.33322442e-15  1.76367415e-15 -1.19802625e-15  5.04966114e-16
 -5.21317026e-15 -2.17478837e-15  6.85645643e-16 -1.41265636e-16
 -2.28956670e-15  2.57517109e-15]

Std. of each feature after Standardization :


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


In [None]:
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 10)).fit(breast_cancer.data)

breast_cancer_minmaxscaled10 = min_max_scaler.transform(breast_cancer.data)

**In the above example, data is transformed to range 0 and 10.**

In [None]:
max_abs_scaler = preprocessing.MaxAbsScaler().fit(breast_cancer.data)

breast_cancer_maxabsscaled = max_abs_scaler.transform(breast_cancer.data)

**By default, MaxAbsScaler transforms data to the range -1 and 1.**

# Normalization

**In above example, l1 norm is used with norm parameter.**

In [None]:
normalizer = preprocessing.Normalizer(norm='l1').fit(breast_cancer.data)

breast_cancer_normalized = normalizer.transform(breast_cancer.data)

# Binerization
**Binarization is the process of transforming data points to 0 or 1 based on a given threshold.**

In [None]:
binarizer = preprocessing.Binarizer(threshold=3.0).fit(breast_cancer.data)
breast_cancer_binarized = binarizer.transform(breast_cancer.data)
print(breast_cancer_binarized[:5,:5])

[[1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]]


# One Hot Encoder

In [None]:
onehotencoder = preprocessing.OneHotEncoder()
onehotencoder = onehotencoder.fit([[1], [1], [1], [2], [2], [1]])

# Transforming category values 1 and 2 to one-hot vectors
print(onehotencoder.transform([[1]]).toarray())
print(onehotencoder.transform([[2]]).toarray())

[[1. 0.]]
[[0. 1.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


# Imputer

In [None]:
imputer = preprocessing.Imputer(missing_values='NaN', strategy='mean')  ## Import impute.SimpleImputer from sklearn instead.
#Imputer was deprecated
imputer = imputer.fit(breast_cancer.data)
breast_cancer_imputed = imputer.transform(breast_cancer.data)



# Label Encoding

In [None]:
labels = ['malignant', 'benign', 'malignant', 'benign']

labelencoder = preprocessing.LabelEncoder()

labelencoder = labelencoder.fit(labels)

bc_labelencoded = labelencoder.transform(breast_cancer.target_names)

***Full Preprocessing on Irirs Program  ***



In [1]:
import sklearn.datasets
import sklearn.preprocessing as prep
import numpy as np
iris = sklearn.datasets.load_iris()

normalizer = prep.Normalizer(norm='l2').fit(iris.data)  # fit to data

iris_normalized = normalizer.transform(iris.data) # transform

print(iris_normalized.mean(axis=0))


from sklearn.preprocessing import OneHotEncoder
onehotencoder = prep.OneHotEncoder()
onehotencoder = onehotencoder.fit(iris.target.reshape(-1,1))
#iris_target_onehot
#print(iris.target.reshape(-1,1))

#iris_target_onehot = ohe.fit(iris.target.reshape(-1,1)) # fit to data

iris_target_onehot = onehotencoder.transform(iris.target.reshape(-1,1))
print(iris_target_onehot.toarray()[[0,50,100]])

iris.data[:50] = [np.nan]
#print(iris.data[:60])


imputer = prep.Imputer(missing_values='NaN', strategy='mean')
imputer = imputer.fit(iris.data)
iris_imputed = imputer.transform(iris.data)
print(iris_imputed.mean(axis=0))

[0.75140029 0.40517418 0.45478362 0.14107142]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


AttributeError: ignored

# Nearest Neighbor
Nearest neighbors method is used to determine a predefined number of data points that are closer to a sample point and predict its label.

In [None]:
import sklearn.datasets as datasets

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

cancer = datasets.load_breast_cancer()  # Loading the data set

X_train, X_test, Y_train, Y_test = train_test_split(cancer.data, cancer.target, stratify=cancer.target,random_state=42)

knn_classifier = KNeighborsClassifier()   

knn_classifier = knn_classifier.fit(X_train, Y_train)


**The following code determines the accuracy of model on train and test data sets.**

In [None]:
print('Accuracy of Train Data :', knn_classifier.score(X_train,Y_train))
print('Accuracy of Test Data :', knn_classifier.score(X_test,Y_test))

Accuracy of Train Data : 0.9460093896713615
Accuracy of Test Data : 0.9300699300699301


In [None]:
import sklearn.datasets as datasets

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()  # Loading the data set

X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, stratify=iris.target, random_state=30)

print(X_train.shape)
print(X_test.shape)

knn_classifier = KNeighborsClassifier()
knn_clf = knn_classifier.fit(X_train, Y_train)

print(knn_classifier.score(X_train,Y_train))  #accuracy on training data set 
print(knn_classifier.score(X_test,Y_test))  # accuracy on testing data set

accs = []

for n in range(3,11):
  knn_classifier = KNeighborsClassifier(n_neighbors = n)
  knn_clf = knn_classifier.fit(X_train, Y_train)
  acc = knn_classifier.score(X_test,Y_test)
  accs.append(acc)
  
  
print(accs)
accs.sort()
print(accs[-1])# highest accuracy
  

# Decision Tree


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
import sklearn.datasets as datasets

from sklearn.model_selection import train_test_split

import numpy as np

np.random.seed(100)

boston = datasets.load_boston()  # Loading the data set

X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, random_state=30)

print(X_train.shape)
print(X_test.shape)

dt_reg = DecisionTreeRegressor()   

dt_reg = dt_reg.fit(X_train, Y_train) 

print(dt_reg.score(X_train,Y_train))
print(dt_reg.score(X_test,Y_test))

print(dt_reg.predict(X_test[:2]))

#print('Accuracy of Train Data :', dt_classifier.score(X_train,Y_train))  # Overfitted

#print('Accuracy of Test Data :', dt_classifier.score(X_test,Y_test))

accs = []
for n in range(2,6):
  dt_reg = DecisionTreeRegressor(max_depth=n)   
  dt_reg = dt_reg.fit(X_train, Y_train) 
  acc = dt_reg.score(X_test,Y_test)
  accs.append(acc)

print(accs)
print(accs.index(max(accs))+2)

**Further the model is improved with change in max_depth value to 2.**

In [None]:

dt_classifier = DecisionTreeClassifier(max_depth=2)   

dt_classifier = dt_classifier.fit(X_train, Y_train) 

print('Accuracy of Train Data :', dt_classifier.score(X_train,Y_train))

print('Accuracy of Test Data :', dt_classifier.score(X_test,Y_test))

# Ensemble Methods

Example of creating a Random forest model is shown below.

In [None]:
from sklearn.ensemble import RandomForestRegressor
import sklearn.datasets as datasets

from sklearn.model_selection import train_test_split

import numpy as np

np.random.seed(100)

boston = datasets.load_boston()  # Loading the data set

X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, random_state=30)

print(X_train.shape)
print(X_test.shape)

rf_classifier = RandomForestRegressor()

rf_classifier = rf_classifier.fit(X_train, Y_train) 

print(rf_classifier.score(X_train,Y_train))

print(rf_classifier.score(X_test,Y_test))

print(rf_classifier.predict(X_test[:2]))


accs = []
for n,e in zip([3,4,5], [50,100,200]):
  dt_reg = RandomForestRegressor(max_depth=n, n_estimators=e)   
  dt_reg = dt_reg.fit(X_train, Y_train) 
  acc = dt_reg.score(X_test,Y_test)
  accs.append(acc)

print(accs)
print((accs.index(max(accs))+3, 200))


# SVM
The shown model overfits the training data.

In the following example, scaled input data is used to improve the accuracy of SVM classifier.

In [None]:
import sklearn.preprocessing as preprocessing

standardizer = preprocessing.StandardScaler()
standardizer = standardizer.fit(cancer.data)
cancer_standardized = standardizer.transform(cancer.data)

svm_classifier = SVC()

svm_classifier = svm_classifier.fit(X_train, Y_train) 

from sklearn.svm import SVC

svm_classifier = SVC()

svm_classifier = svm_classifier.fit(X_train, Y_train) 

print('Accuracy of Train Data :', svm_classifier.score(X_train,Y_train))

print('Accuracy of Test Data :', svm_classifier.score(X_test,Y_test))

from sklearn import metrics

Y_pred = svm_classifier.predict(X_test)

print('Classification report : \n',metrics.classification_report(Y_test, Y_pred))

In [None]:
import sklearn.datasets as datasets


from sklearn.model_selection import train_test_split

import numpy as np

np.random.seed(100)

digits = datasets.load_digits()  # Loading the data set

X_train, X_test, Y_train, Y_test = train_test_split(digits.data, digits.target, random_state=30)

print(X_train.shape)
print(X_test.shape)

from sklearn.svm import SVC

svm_clf = SVC()

svm_clf = svm_clf.fit(X_train, Y_train) 

print(svm_clf.score(X_test,Y_test))

import sklearn.preprocessing as preprocessing

standardizer = preprocessing.StandardScaler()
digits_standardized = standardizer.fit(digits.data)
digits_standardized = standardizer.transform(digits.data)

X_train, X_test, Y_train, Y_test = train_test_split(digits_standardized, digits.target, random_state=30)
svm_clf2 = SVC()

svm_clf2 = svm_clf2.fit(X_train, Y_train) 

print(svm_clf2.score(X_test,Y_test))

#breast_cancer_standardized = standardizer.transform(breast_cancer.data)

#from sklearn import metrics

#Y_pred = svm_classifier.predict(X_test)

#print('Classification report : \n',metrics.classification_report(Y_test, Y_pred))




# Clustering

In [None]:
from sklearn.cluster import KMeans

kmeans_cluster = KMeans(n_clusters=2)

kmeans_cluster = kmeans_cluster.fit(X_train) 

kmeans_cluster.predict(X_test)

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
import sklearn.datasets as datasets

iris = datasets.load_iris()  # Loading the data set

#X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, stratify=iris.target, random_state=30)

km_cls = KMeans(n_clusters=3)

km_cls = km_cls.fit(iris.data) 

#kmeans_cluster.predict(X_test)
print(metrics.homogeneity_score(km_cls.predict(iris.data), iris.target))

from sklearn.cluster import AgglomerativeClustering

agg_cls = AgglomerativeClustering(n_clusters=3)
agg_cls = agg_cls.fit(iris.data)
print(metrics.homogeneity_score(agg_cls.fit_predict(iris.data), iris.target))


from sklearn.cluster import AffinityPropagation

af_cls = AffinityPropagation()
af_cls = af_cls.fit(iris.data)
print(metrics.homogeneity_score(af_cls.fit_predict(iris.data), iris.target))

#print(X_train.shape)
#print(X_test.shape)

pd.read_table()

Evaluation

In [None]:
from sklearn import metrics

#print(metrics.homogeneity_score(kmeans_cluster.predict(X_test), Y_test))

#print(metrics.completeness_score(kmeans_cluster.predict(X_test), Y_test))

#print(metrics.v_measure_score(kmeans_cluster.predict(X_test), Y_test))

#print(metrics.adjusted_rand_score(kmeans_cluster.predict(X_test), Y_test))




In [None]:
import sklearn.preprocessing as preprocessing

x = [[7.8], [1.3], [4.5], [0.9]]
print(preprocessing.Binarizer().fit(x).transform(x).shape)

In [None]:
from sklearn import datasets

iris = datasets.load_iris()
type(iris)