In [1]:
from PIL import Image
import PIL.ImageOps

from collections import defaultdict
import glob
from random import shuffle, seed
import numpy as np
import pylab as pl
import pandas as pd
import re
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt

def img_to_array(filename):
    """
    takes a filename and turns it into a numpy array of RGB pixels
    """
    img = Image.open(filename)
    desired_size = 200
    old_size = img.size  # old_size[0] is in (width, height) format
    ratio = float(desired_size)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    img = img.resize(new_size, Image.ANTIALIAS)
    new_img = Image.new("RGB", (desired_size, desired_size))
    
    new_img.paste(img, ((desired_size-new_size[0])//2,
                        (desired_size-new_size[1])//2))
    img = new_img
    
    img = list(img.getdata())
    img = map(list, img)
    img = np.array(img)
    img_wide = img.reshape(1, -1)
    return img_wide[0]

girls_files = glob.glob('/Users/franklooi/Desktop/Minerva/CS156 Machine Learning/Women/*.JPEG')
boys_files = glob.glob('/Users/franklooi/Desktop/Minerva/CS156 Machine Learning/Men/*.JPEG')

process_file = img_to_array

raw_data = [(process_file(filename),'girl',filename) for filename in girls_files] + \
           [(process_file(filename),'boy',filename) for filename in boys_files]
    
# randomly order the data
seed(0)
shuffle(raw_data)

# pull out the features and the labels
data = np.array([cd for (cd,_y,f) in raw_data])
labels = np.array([_y for (cd,_y,f) in raw_data])

X = data
Y = [1 if label == 'boy' else 0 for label in labels]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [2]:
from sklearn.svm import LinearSVC

print "Full Data without PCA or LDA"
logreg = LogisticRegression(penalty='l2', class_weight='balanced', solver = 'liblinear')
logreg.fit(X_train, Y_train)
predictions = logreg.predict(X_test) 

#obtain the scores to measure 
print ("Accuracy score for Logistic Regression: {}".format(metrics.accuracy_score(Y_test, predictions)))
print("Classification report for classifier %s:\n%s\n"
      % ('Logistic Regression', metrics.classification_report(Y_test, predictions)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(Y_test, predictions))

svc = LinearSVC(random_state=0)
svc.fit(X_train, Y_train)
svcpredictions = svc.predict(X_test)

print ("Accuracy score for Linear SVC: {}".format(metrics.accuracy_score(Y_test, svcpredictions)))
print("Classification report for classifier %s:\n%s\n"
      % ('Linear SVC', metrics.classification_report(Y_test, svcpredictions)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(Y_test, svcpredictions))

Full Data without PCA or LDA
Accuracy score for Logistic Regression: 0.632558139535
Classification report for classifier Logistic Regression:
             precision    recall  f1-score   support

          0       0.60      0.67      0.63       204
          1       0.67      0.60      0.63       226

avg / total       0.64      0.63      0.63       430


Confusion matrix:
[[136  68]
 [ 90 136]]
Accuracy score for Linear SVC: 0.63023255814
Classification report for classifier Linear SVC:
             precision    recall  f1-score   support

          0       0.60      0.66      0.63       204
          1       0.66      0.60      0.63       226

avg / total       0.63      0.63      0.63       430


Confusion matrix:
[[135  69]
 [ 90 136]]


In [3]:
from sklearn.decomposition import PCA
from skimage import io  
from pylab import *
import operator

accuracy_scores = []
n_comps = []
for i in range(1,10):
    n_comp = i * 8
    # Fit different PCA on all images
    pca = PCA(n_components = n_comp)
    x_x = pca.fit_transform(X)
    
    X_train_PCA_, X_test_PCA_, Y_train_PCA_, Y_test_PCA_ = train_test_split(x_x, Y, test_size=0.2, random_state=42)

    logreg_logreg = LogisticRegression(penalty='l2', class_weight='balanced', solver = 'saga')
    logreg_logreg.fit(X_train_PCA_, Y_train_PCA_)
    predictions_PCA_ = logreg_logreg.predict(X_test_PCA_) 
    accuracy_scores.append(metrics.accuracy_score(Y_test_PCA_, predictions_PCA_))
    n_comps.append(n_comp)

index_with_higest_accuracy = max(enumerate(accuracy_scores),key=lambda x: x[1])[0]
n_components = n_comps[index_with_higest_accuracy]

print "PCA with {} number of components".format(n_components)

pca_final = PCA(n_components = n_components)
new_X = pca_final.fit_transform(X)

X_train_PCA, X_test_PCA, Y_train_PCA, Y_test_PCA = train_test_split(new_X, Y, test_size=0.2, random_state=42)

logreg_PCA = LogisticRegression(penalty='l2', class_weight='balanced', solver = 'liblinear')
logreg_PCA.fit(X_train_PCA, Y_train_PCA)
predictions_PCA = logreg_PCA.predict(X_test_PCA) 

#obtain the scores to measure 
print ("Accuracy score for Logistic Regression: {}".format(metrics.accuracy_score(Y_test_PCA, predictions_PCA)))
print("Classification report for classifier %s:\n%s\n"
      % ('Logistic Regression', metrics.classification_report(Y_test_PCA, predictions_PCA)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(Y_test_PCA, predictions_PCA))

svc_PCA = LinearSVC(random_state=0)
svc_PCA.fit(X_train_PCA, Y_train_PCA)
svcpredictions_PCA = svc_PCA.predict(X_test_PCA)

print ("Accuracy score for Linear SVC: {}".format(metrics.accuracy_score(Y_test_PCA, svcpredictions_PCA)))
print("Classification report for classifier %s:\n%s\n"
      % ('Linear SVC', metrics.classification_report(Y_test_PCA, svcpredictions_PCA)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(Y_test_PCA, svcpredictions_PCA))

PCA with 32 number of components
Accuracy score for Logistic Regression: 0.695348837209
Classification report for classifier Logistic Regression:
             precision    recall  f1-score   support

          0       0.66      0.73      0.69       204
          1       0.73      0.66      0.70       226

avg / total       0.70      0.70      0.70       430


Confusion matrix:
[[149  55]
 [ 76 150]]
Accuracy score for Linear SVC: 0.544186046512
Classification report for classifier Linear SVC:
             precision    recall  f1-score   support

          0       0.52      0.63      0.57       204
          1       0.58      0.47      0.52       226

avg / total       0.55      0.54      0.54       430


Confusion matrix:
[[128  76]
 [120 106]]


In [6]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

print "LDA"

X_train_LDA, X_test_LDA, Y_train_LDA, Y_test_LDA = train_test_split(X, Y, test_size=0.2, random_state=42)

lda = LDA(n_components=1) 
lda.fit_transform(X_train_LDA,Y_train_LDA) 
predictions_LDA = lda.predict(X_test_LDA) 

print ("Accuracy score for Linear Discriminant Analysis: {}".format(metrics.accuracy_score(Y_test_LDA, predictions_LDA)))
print("Classification report for classifier %s:\n%s\n"
      % ('LDA', metrics.classification_report(Y_test_LDA, predictions_LDA)))

logreg_LDA = LogisticRegression(penalty='l2', class_weight='balanced', solver = 'saga')
logreg_LDA.fit(X_train_LDA, Y_train_LDA)
log_predictions_LDA = logreg_LDA.predict(X_test_LDA) 

print ("Accuracy score for Logistic Regression: {}".format(metrics.accuracy_score(Y_test_LDA, log_predictions_LDA)))
print("Classification report for classifier %s:\n%s\n"
      % ('Logistic Regression', metrics.classification_report(Y_test_LDA, log_predictions_LDA)))

LDA
Accuracy score for Linear Discriminant Analysis: 0.625581395349
Classification report for classifier LDA:
             precision    recall  f1-score   support

          0       0.60      0.63      0.62       204
          1       0.65      0.62      0.63       226

avg / total       0.63      0.63      0.63       430






Accuracy score for Logistic Regression: 0.655813953488
Classification report for classifier Logistic Regression:
             precision    recall  f1-score   support

          0       0.63      0.68      0.65       204
          1       0.69      0.63      0.66       226

avg / total       0.66      0.66      0.66       430


