# **Scikit-learn (Sklearn)**

*   A library for machine learning in Python
*   Contains several classification, regression, and clustering algorithms
*   Designed to work with other Python libraries such as NumPy, Matplotlib, and SciPy
*   Provides several popular (toy) datasets


Import Libraries

In [None]:
import numpy as np
from sklearn import datasets

## **Breast Cancer Wisconsin (Diagnostic) Dataset**
*   569 instances (212 Malignant, 357 Benign)
*   30 numerical features (computed from a digitized image of a breast mass)
*   2 classes (Malignant, Benign)

Load Data

In [None]:
# Load the breast cancer dataset that ships with scikit-learn.
wisconsin = datasets.load_breast_cancer()

Investigate Data

In [None]:
# List the fields available on the loaded dataset object.
print(wisconsin.keys())

In [None]:
# Show the dimensions of the feature matrix, then its first three rows.
print(wisconsin.data.shape)
print(wisconsin.data[0:3,:])

In [None]:
# Names of the feature columns.
print(wisconsin.feature_names)

In [None]:
# Class names, followed by the labels of the first 30 samples.
print(wisconsin.target_names)
print(wisconsin.target[0:30])

Split Dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; random_state=1 fixes the shuffle so
# the same split is produced on every run.
TrainData, TestData, TrainLabel, TestLabel = train_test_split(wisconsin.data, wisconsin.target, test_size=0.3, random_state=1)

In [None]:
# Confirm the sizes of the two partitions.
print('Train Data: ', TrainData.shape)
print('Test Data: ', TestData.shape)

Compare feature values between the 2 classes

In [None]:
# Per-feature means over the training split: all samples first, then each
# class on its own. Label 0 is reported as malignant and label 1 as benign.
is_malignant = TrainLabel == 0
is_benign = TrainLabel == 1

avg_feat = TrainData.mean(axis=0)
avg_feat_mg = TrainData[is_malignant].mean(axis=0)
avg_feat_bn = TrainData[is_benign].mean(axis=0)

print('Avg. Features =', avg_feat)
print('\nMalignant: Avg. Features =', avg_feat_mg)
print('\nBenign: Avg. Features =', avg_feat_bn)

Visualize features

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot the per-feature training means (all samples, malignant only, benign
# only) so the two classes can be compared feature by feature.
# The x positions are derived from the data instead of being hard-coded to 30.
feat_idx = np.arange(avg_feat.shape[0])

plt.figure(figsize=(15,5))
plt.plot(feat_idx, avg_feat, 'bo-',
         feat_idx, avg_feat_mg, 'r*:',
         feat_idx, avg_feat_bn, 'gd-.')
plt.title('Avg. Features')
# x is the feature's column index, y is that feature's mean value.
plt.ylabel('Average feature value')
plt.xlabel('Feature index')
plt.legend(['All', 'Malignant', 'Benign'])
#plt.ylim([0, 0.5])
plt.show()

Normalize data

In [None]:
# Min-max scale every feature using statistics taken from the training split
# only. The training data lands in [0, 1]; the test split reuses the same
# minimum and range, so its values may fall slightly outside that interval.
TrainData_min = TrainData.min(axis=0)
TrainData_max = TrainData.max(axis=0)
feat_range = TrainData_max - TrainData_min

TrainData = (TrainData - TrainData_min) / feat_range
TestData = (TestData - TrainData_min) / feat_range

Visualize features again

In [None]:
# Recompute the per-feature means on the (now rescaled) training data:
# overall, malignant only (label 0) and benign only (label 1).
avg_feat = np.mean(TrainData, axis=0)
avg_feat_mg = np.mean(TrainData[TrainLabel==0,:], axis=0)
avg_feat_bn = np.mean(TrainData[TrainLabel==1,:], axis=0)

# The x positions are derived from the data instead of being hard-coded to 30.
feat_idx = np.arange(avg_feat.shape[0])

plt.figure(figsize=(15,5))
plt.plot(feat_idx, avg_feat, 'bo-',
         feat_idx, avg_feat_mg, 'r*:',
         feat_idx, avg_feat_bn, 'gd-.')
plt.title('Avg. Features')
# x is the feature's column index, y is that feature's mean value.
plt.ylabel('Average feature value')
plt.xlabel('Feature index')
plt.legend(['All', 'Malignant', 'Benign'])
plt.show()

Let's make random guesses 🙃

In [None]:
# Trial 1: fix the seed so the random weights are reproducible, then draw one
# weight per feature from a normal distribution with mean 0.3 and std 1.9.
np.random.seed(3)
Weights_trial1 = 1.9 * np.random.randn(30) + 0.3  # Randomly set the weights

res_tr1 = np.matmul(TrainData, Weights_trial1)     # Raw linear score per training sample (thresholded at 0 later)

In [None]:
# Trial 2: fix the seed so the random weights are reproducible, then draw one
# weight per feature from a normal distribution with mean 0.3 and std 2.5.
np.random.seed(1)
Weights_trial2 = 2.5 * np.random.randn(30) + 0.3   # Randomly set the weights

res_tr2 = np.matmul(TrainData, Weights_trial2)     # Raw linear score per training sample (thresholded at 0 later)

In [None]:
# Trial 3: fix the seed so the random weights are reproducible, then draw one
# weight per feature from a normal distribution with mean 0.3 and std 1.8.
np.random.seed(7)
Weights_trial3 = 1.8 * np.random.randn(30) + 0.3  # Randomly set the weights

res_tr3 = np.matmul(TrainData, Weights_trial3)     # Raw linear score per training sample (thresholded at 0 later)

Prediction on training data

In [None]:
# Turn each trial's linear score into a class decision by thresholding at
# zero: a positive score is compared as class 1, anything else as class 0.
pred_tr1 = res_tr1 > 0
pred_tr2 = res_tr2 > 0
pred_tr3 = res_tr3 > 0

# Accuracy is the fraction of training samples whose decision matches the label.
acc_tr1 = (pred_tr1 == TrainLabel).mean()
acc_tr2 = (pred_tr2 == TrainLabel).mean()
acc_tr3 = (pred_tr3 == TrainLabel).mean()

print("<Training Accuracy>")
print("Trial 1 = ", acc_tr1 * 100)
print("Trial 2 = ", acc_tr2 * 100)
print("Trial 3 = ", acc_tr3 * 100)

Prediction on test data

In [None]:
# Score the held-out split with the trial-2 weights, threshold at zero, and
# report the fraction of test samples classified correctly.
res_ts = TestData @ Weights_trial2
pred_ts = res_ts > 0

acc_ts = (pred_ts == TestLabel).mean()
print("Test Accuracy = ", acc_ts * 100)

### Use sklearn metrics to measure performance

In [None]:
from sklearn import metrics

In [None]:
# Evaluate the trial-2 training predictions with scikit-learn's scorers.
# Accuracy is printed as a percentage; the other three are printed as fractions.
train_scorers = (
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
)
acc_tr, prec_tr, rec_tr, f1_tr = [score(TrainLabel, pred_tr2) for score in train_scorers]

print("<Training>")
print("Accuracy = ", acc_tr*100, "Precision = ", prec_tr, "Recall = ", rec_tr, "F1-score = ", f1_tr)

In [None]:
# Evaluate the trial-2 test predictions with scikit-learn's scorers.
# Accuracy is printed as a percentage; the other three are printed as fractions.
test_scorers = (
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
)
acc_ts, prec_ts, rec_ts, f1_ts = [score(TestLabel, pred_ts) for score in test_scorers]

print("<Test>")
print("Accuracy = ", acc_ts*100, "Precision = ", prec_ts, "Recall = ", rec_ts, "F1-score = ", f1_ts)

Confusion matrix

In [None]:
# Confusion matrix of the trial-2 predictions on the training split.
# Per the scikit-learn docs, rows are true classes and columns predicted classes.
tr_cmat = metrics.confusion_matrix(TrainLabel, pred_tr2)
print("<Confusion Matrix: Training>")
print(tr_cmat)

# Same for the held-out test split.
ts_cmat = metrics.confusion_matrix(TestLabel, pred_ts)
print("\n<Confusion Matrix: Test>")
print(ts_cmat)

ROC curve & AUC

In [None]:
# ROC points are computed from the raw trial-2 scores (not the thresholded
# predictions), with label 1 treated as the positive class. Each call returns
# the false positive rates, true positive rates, and the thresholds used.
tr_fpr, tr_tpr, tr_th = metrics.roc_curve(TrainLabel, res_tr2, pos_label=1)
ts_fpr, ts_tpr, ts_th = metrics.roc_curve(TestLabel, res_ts, pos_label=1)

In [None]:
import matplotlib.pyplot as plt

# ROC curves of the trial-2 scores on the training and test splits.
plt.plot(tr_fpr, tr_tpr, color='b', label='Train')
plt.plot(ts_fpr, ts_tpr, color='r', label='Test')
# Diagonal reference line: the curve a chance-level classifier would trace.
plt.plot([0.0, 1.0], [0.0, 1.0], color='k', linestyle='--', label='Chance')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# Label the axes so the plot is readable on its own.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()

In [None]:
# Area under the ROC curve for both splits, computed from the raw trial-2 scores.
tr_auc = metrics.roc_auc_score(TrainLabel, res_tr2)
ts_auc = metrics.roc_auc_score(TestLabel, res_ts)

print('Training AUC : ', tr_auc)
print('Test AUC : ', ts_auc)

## **Diabetes Dataset**
*   442 diabetes patients
*   10 numerical features (age, sex, body mass index, avg. blood pressure, ...)
*   Disease progression one year after baseline 

In [None]:
import numpy as np
from sklearn import datasets

Load Data

In [None]:
# Load the diabetes regression dataset that ships with scikit-learn.
diabetes = datasets.load_diabetes()

Investigate Data

In [None]:
# List the fields available on the loaded dataset object.
print(diabetes.keys())

In [None]:
# Show the dimensions of the feature matrix, then its first three rows.
print(diabetes.data.shape)
print(diabetes.data[0:3,:])

In [None]:
# Names of the feature columns.
print(diabetes.feature_names)

In [None]:
# Shape of the regression target, followed by its first 30 values.
print(diabetes.target.shape)
print(diabetes.target[0:30])

In [None]:
# Min-max scale each feature column to [0, 1]. The statistics come from the
# whole dataset because this section does not split into train and test.
Data_min = diabetes.data.min(axis=0)
Data_max = diabetes.data.max(axis=0)
feat_span = Data_max - Data_min

diabetes_data = (diabetes.data - Data_min) / feat_span

Let's make random guesses 🙃

In [None]:
# Regression trial 1: seeded for reproducibility; one weight per feature drawn
# from a normal distribution with mean 10.0 and std 2.0.
# NOTE: Weights_trial1 and pred_tr1 reuse names from the classification section
# and overwrite those earlier values.
np.random.seed(2)
Weights_trial1 = 2.0 * np.random.randn(10) + 10.0   # Randomly set the weights

pred_tr1 = np.matmul(diabetes_data, Weights_trial1)     # Make prediction

In [None]:
# Regression trial 2: seeded for reproducibility; one weight per feature drawn
# from a normal distribution with mean 15.0 and std 1.5.
# NOTE: Weights_trial2 and pred_tr2 reuse names from the classification section
# and overwrite those earlier values.
np.random.seed(8)
Weights_trial2 = 1.5 * np.random.randn(10) + 15.0   # Randomly set the weights

pred_tr2 = np.matmul(diabetes_data, Weights_trial2)     # Make prediction

In [None]:
# Regression trial 3: seeded for reproducibility; one weight per feature drawn
# from a normal distribution with mean 20.0 and std 3.8.
# NOTE: Weights_trial3 and pred_tr3 reuse names from the classification section
# and overwrite those earlier values.
np.random.seed(21)
Weights_trial3 = 3.8 * np.random.randn(10) + 20.0   # Randomly set the weights

pred_tr3 = np.matmul(diabetes_data, Weights_trial3)     # Make prediction

Visualize the prediction results

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One scatter panel per trial: true target on the x-axis, the random-weight
# prediction on the y-axis. A good model would put the points near a diagonal.
trial_panels = [
    ('trial1', pred_tr1, 'b'),
    ('trial2', pred_tr2, 'r'),
    ('trial3', pred_tr3, 'm'),
]

plt.figure(figsize=(14,4))
for panel_no, (panel_title, panel_pred, panel_color) in enumerate(trial_panels, start=1):
    plt.subplot(1, len(trial_panels), panel_no)
    plt.scatter(diabetes.target, panel_pred, c=panel_color, s=10)
    plt.title(panel_title)
    # Label both axes so it is clear which one is the target.
    plt.xlabel('Target')
    plt.ylabel('Prediction')
plt.tight_layout()  # keep the axis labels of neighbouring panels from overlapping
plt.show()

### Use sklearn metrics to measure performance

In [None]:
from sklearn import metrics

In [None]:
def reg_measure(target, pred):
    """Print MAE, MSE and R2 of a regression prediction.

    Args:
        target: Ground-truth values.
        pred: Predicted values, aligned element-wise with ``target``.

    Returns:
        None. The three scores are printed on a single line.
    """
    MAE = metrics.mean_absolute_error(target, pred)
    MSE = metrics.mean_squared_error(target, pred)
    # Score against the `target` argument, like the two metrics above, rather
    # than the global `diabetes.target`, so the function works for any data.
    R2 = metrics.r2_score(target, pred)

    print("MAE = ", MAE, " MSE = ", MSE, " R2 = ", R2)

In [None]:
# Print the regression scores of each random-weight trial under its heading.
trial_reports = (
    ('Trial1>', pred_tr1),
    ('Trial2>', pred_tr2),
    ('Trial3>', pred_tr3),
)
for trial_heading, trial_pred in trial_reports:
    print(trial_heading)
    reg_measure(diabetes.target, trial_pred)

This is NOT the way we build an ML model 😰

We will see how we build an actual ML model in the coming lectures!