In [39]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [1]:
from PIL import Image

import PIL
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import glob
import matplotlib.pyplot as plt
import scipy

In [2]:
%matplotlib inline  

# Load the dataset
- Load all the images
- Make them into numpy array
- assign label

In [3]:
# parameters
img_base_dir = '../data/images/'
skin_types = {
    'normal': 0,
    'dry': 1,
    'oily': 2}
img_size = (64,64) 

imgs = []
labels = []
for skin_type in skin_types.keys():
    print('Getting data in here:', skin_type)
    
    # define folder direction
    folder = '%s/%s' %(img_base_dir, skin_type)
    
    # grab all images in the folder
    files = glob.glob ('%s/*.jpg' %(folder)) 

    # for each image, load into numpy array
    for f in files:
        img = np.asarray(Image.open(f).resize(img_size))

        # average color-pixel: 3 channel --> 1 channel
        img = np.average(img, axis=2)
        
        # normalize each pixel from range 0-255 to 0-1
        img /= 255
        
        # plot and check
        # plt.imshow(img)
        imgs.append(img)
        labels.append(skin_types[skin_type])
        
# concat all the array in list
# --> num_images x width x height
imgs = np.stack(imgs)
labels = np.asarray(labels)
print(imgs.shape)
print(labels.shape)

Getting data in here: normal
Getting data in here: dry
Getting data in here: oily
(2441, 64, 64)
(2441,)


# Preprocessing step
1. Identify skin/face images only; remove other images
2. Identify correct label using NLP (keyword search, sentiment analysis)
    - raw test also need clean-up
        - remove stop-word
        - lemmitization
        - contraction
        - etc
3. images preprocessing:
    - zoom or not zoom
    - remove reducdnat background
    - sampling techniques to increases the number of traing samples (keras)

# Split the data into train, validation, test sets
- Simple way: train and test sets only
- correct way: train, validation, and test sets

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Split data into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(
    imgs, labels, test_size=0.3, shuffle=True, random_state=100)

# check
print(X_train.shape)
print(X_test.shape)

(1708, 64, 64)
(733, 64, 64)


# Train a machine learning model for skin classification

### Flatten the images so that it has num_images x num_features dimension

In [6]:
# flatten the images
dimension = img_size[0] * img_size[1] # 64*64 = 4096
X_train = X_train.reshape((X_train.shape[0], dimension))  #rmb to change this to 1, dimension for the test image
X_test = X_test.reshape((X_test.shape[0], dimension))

# check
print(X_train.shape)
print(X_test.shape)

(1708, 4096)
(733, 4096)


### Train ML model
- Random forest probably v. slow, use linear version of SVM or logistic regression

In [7]:
from sklearn import svm, metrics
from sklearn.linear_model import LogisticRegression

In [8]:
# Create a support vector classifier
clf = LogisticRegression()

# train
clf.fit(X_train, y_train)

# predict
preds = clf.predict(X_test)
pred_probs = clf.predict_proba(X_test)
print(preds)
print(pred_probs)

[2 0 0 1 0 0 2 2 1 2 1 1 2 2 2 0 1 1 2 1 1 0 0 0 0 1 0 0 0 1 1 1 1 2 1 1 2
 2 2 2 1 0 2 1 2 2 1 1 2 2 0 1 2 1 2 0 0 0 1 2 0 1 1 1 1 0 1 1 0 0 0 2 0 2
 1 0 1 1 2 1 0 2 1 0 1 0 0 1 1 0 2 2 1 0 0 2 2 2 1 2 1 2 0 0 2 2 2 0 0 2 0
 1 2 2 2 2 0 1 0 0 0 1 0 2 2 2 2 2 2 2 1 2 1 1 0 2 2 2 2 1 1 2 0 1 2 0 2 2
 1 1 2 1 1 2 1 0 2 2 1 2 0 2 0 1 0 0 0 0 1 0 1 1 1 0 2 1 1 2 0 1 0 1 0 2 2
 1 2 1 1 0 2 2 1 0 2 2 0 1 2 1 0 2 2 2 2 0 1 2 1 1 2 0 1 1 2 0 2 1 1 0 2 1
 0 1 1 1 1 0 1 1 0 2 2 2 1 1 2 2 1 0 2 0 2 1 2 2 0 0 1 1 0 1 1 1 2 2 0 0 0
 1 1 2 2 2 0 0 1 2 2 0 0 2 2 1 0 2 1 0 2 1 0 1 0 0 2 2 2 1 1 0 0 1 1 1 2 1
 2 0 0 2 1 2 1 2 1 1 0 1 0 2 0 1 0 2 0 0 2 1 1 2 1 2 1 0 2 0 0 1 0 1 2 0 2
 2 1 2 2 0 1 2 2 2 2 2 0 2 2 0 1 2 1 0 2 0 1 1 2 0 1 2 0 1 1 1 0 2 2 2 2 1
 2 1 2 1 0 1 1 2 0 1 0 2 1 0 2 2 1 1 1 1 1 1 1 1 0 1 0 0 0 2 1 0 1 2 1 2 1
 0 1 1 1 1 2 1 0 0 1 2 1 0 1 0 1 2 0 2 2 0 1 0 1 2 0 1 2 0 0 1 1 2 0 0 0 2
 1 1 2 0 1 1 2 1 2 0 2 2 0 1 2 1 0 2 0 1 0 0 2 2 0 2 1 0 2 1 1 1 1 1 1 2 2
 2 2 1 2 2 2 2 1 2 2 0 2 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Evaluation

In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score


In [10]:
cm = confusion_matrix(y_test, preds)

# check
print(cm)
print(accuracy_score(y_test, preds))
print(precision_score(y_test,preds, average='micro'))
print(recall_score(y_test,preds, average='micro'))

[[ 47 101  98]
 [ 74  55 118]
 [ 79 107  54]]
0.21282401091405184
0.21282401091405184
0.21282401091405184


# Save the model

In [11]:
import pickle

In [12]:
fn = 'lr_skin_classification'
model_dir = '../models'

# save
with open('%s/%s.pkl' %(model_dir, fn), 'wb') as f:
    pickle.dump(clf, f)

# Load the model

In [13]:
fn = 'lr_skin_classification'
model_dir = '../models'

# save
with open('%s/%s.pkl' %(model_dir, fn), 'rb') as f:
    model = pickle.load(f)
    
# check
print(model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [29]:
# label prediction
array = model.predict(X_test)

In [22]:
# probability prediction
array2 = model.predict_proba(X_test)

In [30]:
result2 = array2[0]

# Make a single prediction

In [50]:
image_size = (64, 64)
image = np.asarray(Image.open('../data/dry_extra/dry_skin_B007E9F86Q_pid633_0.jpg').resize(image_size))
image = np.average(image, axis=2)
image /= 255
dimension = image_size[0] * image_size[1]
image = image.reshape(1, dimension)

In [52]:
array = model.predict(image)

In [57]:
if array == 0:
    print("Label: Normal")
elif array == 1:
    print("Label: Dry")
elif array == 2:
    print("Label: Oily")

Label: Normal
