# 1 - Load & Initialize Data
## Import Libraries & Load Data
First, we must initialize the environment and import data from the CSV file into a Pandas dataframe:

In [1]:
# Imports
import math
import pandas as pd
import numpy as np
from pandas import DataFrame
from IPython import display
from sklearn import preprocessing # For label encoding
from sklearn.model_selection import cross_val_score

# Test that Pandas is installed and imported
pd.__version__

df_skin = pd.read_csv("C:/Users/blazn/Desktop/Machine_Learning/Skin_Segmentation.data", encoding = "ISO-8859-1")
# Source: https://archive.ics.uci.edu/ml/datasets/Skin+Segmentation#

## Display Numerical Data
Next, we clean up the data a bit and print basic stats on the number-based columns:

In [2]:
df_skin.describe(include = ['number'])

Unnamed: 0,B,G,R,Class
count,245057.0,245057.0,245057.0,245057.0
mean,125.065446,132.507327,123.177151,1.792461
std,62.255653,59.941197,72.562165,0.405546
min,0.0,0.0,0.0,1.0
25%,68.0,87.0,70.0,2.0
50%,139.0,153.0,128.0,2.0
75%,176.0,177.0,164.0,2.0
max,255.0,255.0,255.0,2.0


## Display Non-Numerical Data
Next, we print basic (and less useful) stats on the non-number columns:

In [3]:
cm_labels = df_skin["Class"].unique()
print(cm_labels)
print("No non-numeric data.\n1 is Skin, 2 is Non-Skin")
print("Manually setting cm_labels to [skin, non-skin]")
cm_labels = ["skin","non-skin"]

[1 2]
No non-numeric data.
1 is Skin, 2 is Non-Skin
Manually setting cm_labels to [skin, non-skin]


# 2 - Pre-process Data

## Randomize Data
Randomize data and print first few rows for confirmation:

In [4]:
df_skin = df_skin.reindex(np.random.permutation(df_skin.index))
df_skin.to_csv("C:/Users/blazn/Desktop/Machine_Learning/skin_RANDOMIZED.data", index = False)
df_skin.head(n = 10)

Unnamed: 0,B,G,R,Class
67687,139,151,157,2
85095,62,63,23,2
158262,139,139,85,2
139479,54,54,18,2
42485,96,147,220,1
165346,181,178,133,2
58626,97,83,77,2
146945,32,34,12,2
18263,141,181,246,1
122920,68,65,50,2


## Select Columns for Features & Labels
The following methods pre-process the data by extracting the relevant features and targets into separate dataframes:

In [5]:
# Takes in a Pandas DataFrame that contains a raw dataset and returns a
# Pandas DataFrame that contains only the selected features used for a model
def get_features_dataframe(df_input):
    
    # Create a new/blank DataFrame
    df_selected = pd.DataFrame()
    
    # Grab any features already available
    df_selected["R"] = df_input["R"]
    df_selected["B"] = df_input["B"]
    df_selected["G"] = df_input["G"]
    
    # Make a copy of the selected features
    df_processed = df_selected.copy()
    
    # Return the selected features (both pre-existing and synthetic)
    return df_processed


# Takes in a Pandas DataFrame that contains a raw dataset and returns a
# Pandas DataFrame that contains only the selected target(s) used for a model
def get_targets_dataframe(df_input):
    
    # Create a new/blank DataFrame
    df_selected = pd.DataFrame()
    
    # Grab any features already available
    df_selected["skin_label"] = df_input["Class"]
    
    # Make a copy of the selected features
    df_processed = df_selected.copy()
    
    # Return the selected features (both pre-existing and synthetic)
    return df_processed

## Seperate Data into Training & Testing Sets
Select the:

percentage of data to be used for classic test/validation split training
number of folds for cross-validation

In [6]:
# Percentage (0-1.0 corresponds to 0% to 100%) of dataset
percent_training_data = .8           # Parameter, can be changed
percent_validation_data = 1 - percent_training_data
num_cv_folds = 5                     # Parameter, can be changed

Now separate the data into training and validation sets by setting the percentage of data to be used for training:

In [7]:
# Choose the first (percent_training_data)% examples for training
num_total_examples = len(df_skin)
num_training_examples = math.ceil(num_total_examples * percent_training_data)
num_validation_examples = num_total_examples - num_training_examples

# Get all examples (useful later on...)
df_features_all = get_features_dataframe(df_skin.head(num_total_examples))
df_targets_all = get_targets_dataframe(df_skin.head(num_total_examples))

# Choose the first (percent_training_data)% for training examples
df_features_training = get_features_dataframe(df_skin.head(num_training_examples))
df_targets_training = get_targets_dataframe(df_skin.head(num_training_examples))

# Choose the last (1-percent_training_data)% for validation examples
df_features_validation = get_features_dataframe(df_skin.tail(num_validation_examples))
df_targets_validation = get_targets_dataframe(df_skin.tail(num_validation_examples))

## Display Summary of Training/Testing Data (SANITY CHECK)
Print out basic stats of the training and validation data for both of the features and targets/labels. Means (averages) between the training and validation features/targets should be close if the data was properly randomized:

In [8]:
# Print summary of data split:
print (str(num_total_examples) + " total examples used: ")
print("\t" + str(round(num_training_examples / num_total_examples * 100, 2)) +
      "% (" + str(num_training_examples) + " examples used for training)")
print("\t" + str(round(num_validation_examples / num_total_examples * 100, 2)) +
      "% (" + str(num_validation_examples) + " examples used for validation)")

# Display summary of features data:
print ("\nTraining examples summary:")
display.display(df_features_training.describe())
print ("\nValidation examples summary:")
display.display(df_features_validation.describe())

# Display summary of labels/targets data:
print ("\nTraining labels/targets summary:")
display.display(df_targets_training.describe())
print ("\nValidation labels/targets summary:")
display.display(df_targets_validation.describe())

245057 total examples used: 
	80.0% (196046 examples used for training)
	20.0% (49011 examples used for validation)

Training examples summary:


Unnamed: 0,R,B,G
count,196046.0,196046.0,196046.0
mean,123.085837,125.089668,132.493303
std,72.494166,62.272005,59.960809
min,0.0,0.0,0.0
25%,70.0,68.0,86.0
50%,128.0,139.0,153.0
75%,164.0,176.0,177.0
max,255.0,255.0,255.0



Validation examples summary:


Unnamed: 0,R,B,G
count,49011.0,49011.0,49011.0
mean,123.542409,124.968558,132.563425
std,72.833126,62.190742,59.863265
min,0.0,0.0,0.0
25%,70.0,68.0,87.0
50%,128.0,139.0,153.0
75%,165.0,176.0,177.0
max,255.0,255.0,255.0



Training labels/targets summary:


Unnamed: 0,skin_label
count,196046.0
mean,1.793156
std,0.405044
min,1.0
25%,2.0
50%,2.0
75%,2.0
max,2.0



Validation labels/targets summary:


Unnamed: 0,skin_label
count,49011.0
mean,1.78968
std,0.40754
min,1.0
25%,2.0
50%,2.0
75%,2.0
max,2.0


## Standardize Data and Display (SANITY CHECK)
Stardardize all feature data so that it looks like Gaussian distribution with 0 MEAN and UNIT 1 variation (standard deviation). Display results for sanity check:

In [9]:
# Create scaler from training examples and normalize both training and validation examples
scaler = preprocessing.StandardScaler().fit(df_features_training)
df_features_training_normalized = pd.DataFrame(scaler.transform(df_features_training))
df_features_validation_normalized = pd.DataFrame(scaler.transform(df_features_validation))

# Display summary of feature data
print ("\nTraining examples summary:")
display.display(df_features_training_normalized.describe())
print ("\nValidation examples summary:")
display.display(df_features_validation_normalized.describe())


Training examples summary:


Unnamed: 0,0,1,2
count,196046.0,196046.0,196046.0
mean,5.5253480000000006e-17,8.372289e-17,-1.589648e-16
std,1.000003,1.000003,1.000003
min,-1.697877,-2.008768,-2.209671
25%,-0.7322792,-0.9167814,-0.7753968
50%,0.06778719,0.2233808,0.3420026
75%,0.5643801,0.8175498,0.742265
max,1.819657,2.086181,2.043118



Validation examples summary:


Unnamed: 0,0,1,2
count,49011.0,49011.0,49011.0
mean,0.006298,-0.001945,0.001169
std,1.004678,0.998698,0.998376
min,-1.697877,-2.008768,-2.209671
25%,-0.732279,-0.916781,-0.758719
50%,0.067787,0.223381,0.342003
75%,0.578174,0.81755,0.742265
max,1.819657,2.086181,2.043118


# 3 - Generate Machine Learning Models and Make Predictions
## Variable Initialization
The following code creates multiple arrays for the purpose of code simplicity

In [10]:
lst_model_names = ["Logistic Regression (LR)", "K Nearest Neighbors (KNN)", "Support Vector Classification (SVC)"]
lst_models = []
lst_model_predictions =[]
lst_model_CMs = []

## Train Data
The following code fits several classifiers to the training data:

In [11]:
print("Model Parameters: ", end="\n\n\t")

# Train/fit Logistic Regresstion Model
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(solver="lbfgs", max_iter=1000, multi_class="multinomial")
logistic.fit(df_features_training_normalized, df_targets_training.to_numpy().ravel())
lst_models.append(logistic)
print(logistic, end="\n\n\t")

# Train/fit K-nearest Neighbors Model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(df_features_training_normalized, df_targets_training.to_numpy().ravel())
lst_models.append(knn)
print(knn, end="\n\n\t")

# Train/fit Support Vector Classification Model
from sklearn import svm
svc = svm.SVC(kernel="linear", class_weight="balanced")
svc.fit(df_features_training_normalized, df_targets_training.to_numpy().ravel())
lst_models.append(svc)
print(svc, end="\n\n\t")

# Train/fit Linear Support Vector Classification Model

Model Parameters: 

	LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

	KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

	SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

	

## Make Predictions
The following code makes predictions and prints the raw prediction arrays:

In [12]:
# Predict validation examples and print
for i in range(len(lst_model_names)):
    prediction = lst_models[i].predict(df_features_validation_normalized)
    lst_model_predictions.append(prediction)
    print(lst_model_names[i] + " Predictions:")
    print("\t", prediction, "<== PREDICTION")
    
# Print actual validation labels
print("\n\t", df_targets_validation.to_numpy().ravel(), "<== ACTUAL\n")

Logistic Regression (LR) Predictions:
	 [2 2 2 ... 2 2 2] <== PREDICTION
Support Vector Classification (SVC) Predictions:
	 [2 2 2 ... 2 2 2] <== PREDICTION
K Nearest Neighbors (KNN) Predictions:
	 [2 2 2 ... 2 2 2] <== PREDICTION

	 [2 2 2 ... 2 2 2] <== ACTUAL



# 4 - Formatted Results
## Generate Stats
Generate confusion matrices and labels to display:

In [13]:
# Intuitive labels for Data
#cm_labels = [1, 2] # skin, non-skin

from sklearn.metrics import confusion_matrix

# Generate confusion matrices
for i in range(len(lst_model_names)):
    cm = confusion_matrix(df_targets_validation, lst_model_predictions[i])
    lst_model_CMs.append(cm)

## Display Basic Summary
The following code prints basic results:

In [None]:
# Print correctness of each model
for i in range(len(lst_model_names)):
    print(lst_model_names[i] + " Prediction Accuracy: ")
    
    # Print results for classic split of test and validation data
    print("\tResults for classic {:.0f}/{:.0f} (training/testing) split".format(percent_training_data*100, percent_validation_data*100))
    overall_score = lst_models[i].score(df_features_validation_normalized, df_targets_validation)
    print("\t\tOverall: {:.2f}%".format(overall_score * 100))
    
    # Print out scores for individual classes
    for j in range(len(cm_labels)):
        print("\t\t{:s}: {:.2f}%".format(cm_labels[j], lst_model_CMs[i][j][j] / sum(lst_model_CMs[i][j]) * 100))
    
    
    # Print results for cross-validation
    cv_results = cross_val_score(lst_models[i],df_features_all, df_targets_all.to_numpy().ravel(), cv=num_cv_folds)
    print("\tResults for classic {:d}-fold cross-validation:".format(num_cv_folds))
    print("\t\tOverall: {:.2f}%\n".format(np.mean(cv_results) * 100))

Logistic Regression (LR) Prediction Accuracy: 
	Results for classic 80/20 (training/testing) split
		Overall: 91.75%
		skin: 82.03%
		non-skin: 94.34%
	Results for classic 5-fold cross-validation:
		Overall: 91.88%

Support Vector Classification (SVC) Prediction Accuracy: 
	Results for classic 80/20 (training/testing) split
		Overall: 99.94%
		skin: 99.97%
		non-skin: 99.94%
	Results for classic 5-fold cross-validation:
		Overall: 99.96%

K Nearest Neighbors (KNN) Prediction Accuracy: 
	Results for classic 80/20 (training/testing) split
		Overall: 93.03%
		skin: 100.00%
		non-skin: 91.18%


## Display Confusion Matrices
The following code generates the confusion matrix of the previous predictions:

In [None]:
# Import method from file (changes were made so there are no deprecated methods)
# File/package sourced from

from confusion_matrix_pretty_print import pretty_plot_confusion_matrix

# Print confusion matrices
for i in range(len(lst_model_names)):
    title = lst_model_names[i] + " Confusion Matrix"
    df_cm = DataFrame(lst_model_CMs[i], index=cm_labels, columns=cm_labels)
    pretty_plot_confusion_matrix(df_cm, cmap="PuRd", pred_val_axis="X", title=title)
