# Scikit-learn library

## Install libraries (add to binder/environment.yml)

In [None]:
!pip install scikit-learn
!pip install matplotlib
!pip install numpy

## Import libraries

In [None]:
import numpy as np
import matplotlib
# Default size (in inches) for every figure drawn below.
matplotlib.rcParams.update({'figure.figsize': [14, 14]})

## Classification

### Dataset

#### Load Data

In [None]:
import sklearn.datasets
# Breast cancer is a two-class dataset; the clustering and logistic-regression
# cells further down are written for exactly two classes.
data = sklearn.datasets.load_breast_cancer()
# data = sklearn.datasets.load_iris()
# data = sklearn.datasets.load_diabetes()
X = data['data']
y = data['target']
# Wrap in np.array so the names can be indexed with an array of selected
# feature indexes even when a loader returns them as a plain Python list
# (the regression section does the same for load_diabetes).
feature_names = np.array(data['feature_names'])
target_names = data['target_names']

#### Train-Validation-Test Split

In [None]:
import sklearn.model_selection
# First hold out 20% of all samples as the test set ...
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
)
# ... then take 25% of what is left as validation (0.25 * 0.80 = 20% overall),
# which leaves 60% of the data for training.
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
    X_train, y_train,
    test_size=0.25,
    random_state=42,
)

### Preprocessing

#### Standardising data

In [None]:
import sklearn.preprocessing
# The scaler statistics are learned from the training split only and then
# re-used for validation and test, so no information leaks from those splits.
standard_scaler = sklearn.preprocessing.StandardScaler()
X_train_standardised = standard_scaler.fit_transform(X_train)
X_val_standardised, X_test_standardised = (
    standard_scaler.transform(split) for split in (X_val, X_test)
)

#### Feature selection

In [None]:
import sklearn.feature_selection
# Score every feature against the class labels with f_classif and keep the
# 5 best-scoring ones.
feature_selection_function = sklearn.feature_selection.f_classif
select_best_features = sklearn.feature_selection.SelectKBest(
    score_func=feature_selection_function,
    k=5,
)
# The selector is fitted on the training split only; validation and test are
# reduced to the same columns afterwards.
X_train_selected = select_best_features.fit_transform(X_train_standardised, y_train)
X_val_selected, X_test_selected = (
    select_best_features.transform(split)
    for split in (X_val_standardised, X_test_standardised)
)
# Column indexes of the kept features, and their human-readable names.
selected_features_indexes = select_best_features.get_support(indices=True)
selected_features_names = feature_names[selected_features_indexes]


#### Dimensionality reduction

In [None]:
import sklearn.decomposition
# Project the selected features onto 2 principal components so the data can
# be drawn on a plane. The projection is fitted on the training split only.
pca_model = sklearn.decomposition.PCA(n_components=2)
X_train_reduced = pca_model.fit_transform(X_train_selected)
X_val_reduced, X_test_reduced = (
    pca_model.transform(split) for split in (X_val_selected, X_test_selected)
)
# Fraction of the variance captured by each of the two components.
explained_variance_ratio = pca_model.explained_variance_ratio_

#### Plot dataset

In [None]:
import matplotlib.pyplot as plt
# Training samples in the 2-D PCA space, coloured by their true class label.
plt.scatter(
    X_train_reduced[:, 0],
    X_train_reduced[:, 1],
    c=y_train,
)

#### Unsupervised learning

In [None]:
import sklearn.cluster
kmeans_model = sklearn.cluster.KMeans(n_clusters = 2, random_state = 42)
y_pred_train_cluster = kmeans_model.fit_predict(X_train_reduced)

# K-means numbers its clusters arbitrarily, so "cluster 1" is not guaranteed to
# be "class 1". If the raw ids agree with the labels on fewer than half of the
# samples, the two ids are swapped so that the comparison below is meaningful.
if (y_pred_train_cluster == y_train).mean() < 0.5:
    y_pred_train_cluster = 1 - y_pred_train_cluster

# Let's see the result: split the training points by true label vs. cluster id.
true_positive_train = X_train_reduced[(y_train == 1) & (y_pred_train_cluster == 1)]
true_negative_train = X_train_reduced[(y_train == 0) & (y_pred_train_cluster == 0)]
false_positive_train = X_train_reduced[(y_train == 0) & (y_pred_train_cluster == 1)]
false_negative_train = X_train_reduced[(y_train == 1) & (y_pred_train_cluster == 0)]

# Colour encodes the assigned cluster (blue = 1, green = 0); the marker encodes
# whether that assignment matches the true label ('.' = match, 'x' = mismatch).
# Labels are attached to each series so the legend does not depend on call order.
plt.scatter(true_positive_train[:,0], true_positive_train[:,1], color = 'blue', marker = '.', label = 'TP')
plt.scatter(true_negative_train[:,0], true_negative_train[:,1], color = 'green', marker = '.', label = 'TN')
plt.scatter(false_positive_train[:,0], false_positive_train[:,1], color = 'blue', marker = 'x', label = 'FP')
plt.scatter(false_negative_train[:,0], false_negative_train[:,1], color = 'green', marker = 'x', label = 'FN')
plt.legend()
plt.title('Clustering data');

#### Supervised learning

##### Logistic Regression

In [None]:
import sklearn.linear_model

# Fit on the training split and predict on the validation split.
logistic_regression_model = sklearn.linear_model.LogisticRegression(penalty = 'l2')
logistic_regression_model.fit(X_train_selected, y_train)
y_pred_val = logistic_regression_model.predict(X_val_selected)

# Change the classifier to get the best validation performance
# Once the settings are chosen, refit on train + validation together so the
# final model sees all non-test data.
complete_X_train_selected = np.vstack((X_train_selected, X_val_selected))
complete_y_train = np.hstack((y_train, y_val))
logistic_regression_model.fit(complete_X_train_selected, complete_y_train)

y_pred_test_lr = logistic_regression_model.predict(X_test_selected)
lr_intercept = logistic_regression_model.intercept_
lr_coefs = np.squeeze(logistic_regression_model.coef_)

# Build the equation from however many coefficients the model has, so it stays
# correct when the number of selected features (k) is changed.
# Assumes a binary problem, i.e. a single row of coefficients in coef_.
equation_terms = ["%.2f" % lr_intercept[0]]
equation_terms += [
    "(%.2f x%d)" % (coefficient, index)
    for index, coefficient in enumerate(logistic_regression_model.coef_[0])
]
equation = "logit(X) = " + " + ".join(equation_terms)
print(equation)

##### Decision Tree

In [None]:
import sklearn.tree

# random_state is fixed, like in the split and clustering cells, so that ties
# between equally good splits are broken the same way on every run.
decision_tree_model = sklearn.tree.DecisionTreeClassifier(max_depth = 3, random_state = 42)
decision_tree_model.fit(X_train_selected, y_train)
y_pred_val = decision_tree_model.predict(X_val_selected)

# Change the classifier to get the best validation performance
# Once the settings are chosen, refit on train + validation together so the
# final model sees all non-test data.
complete_X_train_selected = np.vstack((X_train_selected, X_val_selected))
complete_y_train = np.hstack((y_train, y_val))
decision_tree_model.fit(complete_X_train_selected, complete_y_train)

y_pred_test_dt = decision_tree_model.predict(X_test_selected)

# Names are passed as plain lists: NOTE(review) some scikit-learn releases
# appear to validate these arguments as lists and reject NumPy arrays.
# The trailing ';' keeps the list of annotation objects out of the cell output.
sklearn.tree.plot_tree(
    decision_tree_model,
    feature_names = list(selected_features_names),
    class_names = list(target_names),
    filled = True,
);

#### Evaluation

In [None]:
import sklearn.metrics
# Confusion matrices on the held-out test split (kept for inspection; they are
# not printed by this cell).
confusion_matrix_test_lr = sklearn.metrics.confusion_matrix(y_test, y_pred_test_lr)
confusion_matrix_test_dt = sklearn.metrics.confusion_matrix(y_test, y_pred_test_dt)

# Per-class precision / recall / F1 for each model on the test split.
report_test_lr = sklearn.metrics.classification_report(
    y_test, y_pred_test_lr, target_names=target_names
)
report_test_dt = sklearn.metrics.classification_report(
    y_test, y_pred_test_dt, target_names=target_names
)

print("---------------------Logistic Regression---------------------")
print(report_test_lr)

print("------------------------Decision Tree------------------------")
print(report_test_dt)

## Regression

### Dataset

#### Load Data

In [None]:
import sklearn.datasets
# From here on `data`, `X`, `y` and `feature_names` refer to the diabetes
# regression dataset and replace the classification ones defined earlier.
data = sklearn.datasets.load_diabetes()
X, y = data['data'], data['target']
# Converted to an array so it can be indexed with an array of feature indexes.
feature_names = np.array(data['feature_names'])

#### Train-validation-test split

In [None]:
import sklearn.model_selection
# First hold out 20% of all samples as the test set ...
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
)
# ... then take 25% of what is left as validation (0.25 * 0.80 = 20% overall),
# which leaves 60% of the data for training.
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
    X_train, y_train,
    test_size=0.25,
    random_state=42,
)

### Preprocessing

#### Standardising data

In [None]:
import sklearn.preprocessing
# The scaler statistics are learned from the training split only and then
# re-used for validation and test, so no information leaks from those splits.
standard_scaler = sklearn.preprocessing.StandardScaler()
X_train_standardised = standard_scaler.fit_transform(X_train)
X_val_standardised, X_test_standardised = (
    standard_scaler.transform(split) for split in (X_val, X_test)
)

#### Feature selection

In [None]:
import sklearn.feature_selection
# Score every feature against the continuous target with f_regression and
# keep the 2 best-scoring ones.
feature_selection_function = sklearn.feature_selection.f_regression
select_best_features = sklearn.feature_selection.SelectKBest(
    score_func=feature_selection_function,
    k=2,
)
# The selector is fitted on the training split only; validation and test are
# reduced to the same columns afterwards.
X_train_selected = select_best_features.fit_transform(X_train_standardised, y_train)
X_val_selected, X_test_selected = (
    select_best_features.transform(split)
    for split in (X_val_standardised, X_test_standardised)
)
# Column indexes of the kept features, and their human-readable names.
selected_features_indexes = select_best_features.get_support(indices=True)
selected_features_names = feature_names[selected_features_indexes]

### Supervised learning

#### Linear Regression

In [None]:
import sklearn.linear_model

# Fit on the training split and predict on the validation split.
linear_regression_model = sklearn.linear_model.LinearRegression()
linear_regression_model.fit(X_train_selected, y_train)
y_pred_val = linear_regression_model.predict(X_val_selected)

# Change the regressor to get the best validation performance
# Once the settings are chosen, refit on train + validation together so the
# final model sees all non-test data.
complete_X_train_selected = np.vstack((X_train_selected, X_val_selected))
complete_y_train = np.hstack((y_train, y_val))
linear_regression_model.fit(complete_X_train_selected, complete_y_train)

y_pred_test_linreg = linear_regression_model.predict(X_test_selected)
linreg_intercept = linear_regression_model.intercept_
linreg_coefs = np.squeeze(linear_regression_model.coef_)

# Use this model's own intercept (linreg_intercept), not the logistic-regression
# intercept left over from the classification section. The terms are generated
# from the fitted coefficients so the equation stays correct when the number
# of selected features (k) is changed.
equation_terms = ["%.2f" % linreg_intercept]
equation_terms += [
    "(%.2f x%d)" % (coefficient, index)
    for index, coefficient in enumerate(linear_regression_model.coef_)
]
equation = "y = " + " + ".join(equation_terms)
print(equation)

#### Evaluation

In [None]:
import sklearn.metrics
# Coefficient of determination on the held-out test split.
r2_score_linreg = sklearn.metrics.r2_score(y_test, y_pred_test_linreg)
# RMSE is taken as the square root of the MSE instead of passing
# squared=False: that keyword is deprecated in recent scikit-learn releases,
# while this form works on old and new versions alike.
rmse_linreg = np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred_test_linreg))
# Relative RMSE: the error as a percentage of the root-mean-square of the
# true test targets.
rms_target = np.sqrt(np.mean(y_test ** 2))
rrmse_linreg = rmse_linreg/rms_target * 100


## Exercises