# Exploring Facial Features For Gender Recognition

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, log_loss, confusion_matrix
%matplotlib inline

DATAFOLDER = "/Users/snuffles753/Documents/NYU-GSAS/ds1004/term-project/data"

In [2]:
# Pyspark related imports
import time
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Matrices


spark = SparkSession.builder.appName("Python Spark SQL basic example2").getOrCreate()
spark.conf.set("spark.executor.memory", '8g')
spark.conf.set('spark.executor.cores', '2')
spark.conf.set('spark.cores.max', '2')
spark.conf.set("spark.driver.memory",'8g')
sc = spark.sparkContext
sqlCtx = SQLContext(spark)

In [3]:
# Load the sparse matrices containing the image feature data
sp_face_features = None
first = True
num_loaded = 0
for filename in os.listdir(os.path.join(DATAFOLDER, 'sparse-images2/')):
    fn_path = os.path.join(DATAFOLDER, 'sparse-images2/' + filename)
    b = np.load(fn_path)
    data = b['data']
    m_format = b['format']
    shape = b['shape']
    row = b['row']
    col = b['col']
    tmp = sp.csr_matrix( (data,(row,col)), shape=shape )
    if first:
        sp_face_features = sp.vstack((tmp,sp_face_features), format="csr")
    else:
        sp_face_features = tmp
        first = False
print(sp_face_features.shape)

(34917, 90003)


In [4]:
NUM_FEATURES = 90000
GENDER_INDEX = NUM_FEATURES + 2
AGE_INDEX = NUM_FEATURES + 1
NUM_F_SETS = 9
COLS_PER_SET = 10000
PIC_DIM = 100
FIRST_COL_OFFSET = 1
NOSE_BRIDGE_INDEX = 3

def get_x_y_coord(z):
    z = z - FIRST_COL_OFFSET
    x = (z - ((z // COLS_PER_SET) * COLS_PER_SET)) % PIC_DIM
    y = (z - ((z // COLS_PER_SET) * COLS_PER_SET)) // PIC_DIM
    feature_set = z // COLS_PER_SET
    return (x, y, feature_set)
        
def get_distance_features(active_cols):
    coords = list(map(lambda z: get_x_y_coord(z), active_cols))
    np_coords = np.array(coords) 
    distances = []
    nose_feature_set = np_coords[np_coords[:,2] == NOSE_BRIDGE_INDEX]       
    for i in range(0, NUM_F_SETS):
        feature_set = np_coords[np_coords[:,2] == i]
        if i != 2:
            if len(nose_feature_set) == 0 or len(feature_set) == 0:
                distances.append( (NUM_FEATURES + i, 0) )
            else:
                nose_centroid = np.mean(nose_feature_set, axis=0)[0:2]
                feature_centroid = np.mean(feature_set, axis=0)[0:2]
                dist = np.linalg.norm(nose_centroid - feature_centroid)
                distances.append( (NUM_FEATURES + i, dist) )
    return distances

def get_spark_gender_dataframe_from_image_matrix(image_matrix, label_index):
    """
    Process the sparse scipy matrix with image features and return a spark dataframe with sparse vectors
    """
    spark_rows_formatted = []
    skip_count = 0
    for i, row in enumerate(image_matrix):
        active_cols = row.nonzero()[1]
        # Remove first column if index col and remove last two label columns
        if active_cols[0] == 0:
            active_cols = active_cols[1:-2]
        else:
            active_cols = active_cols[:-2]
        indexes = list(map(lambda z: (z, 1), active_cols))
        indexes += get_distance_features(active_cols)
        try:
            label = int(image_matrix[i, label_index])
            spark_rows_formatted.append( (label, indexes) )
        except ValueError:
            skip_count += 1
    print("Note that {} images were skipped due to nan label.".format(str(skip_count)))
    mapped_f = map(lambda x: (x[0], Vectors.sparse(NUM_FEATURES + NUM_F_SETS, x[1][1:])), 
                   spark_rows_formatted)
    df_analysis = spark.createDataFrame(mapped_f, schema=["label", "features"])
    return df_analysis


In [5]:
df_gender_analysis = get_spark_gender_dataframe_from_image_matrix(sp_face_features,
                                                                 GENDER_INDEX)
df_gender_analysis.show(5)

Note that 677 images were skipped due to nan label.
+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|(90009,[3091,4001...|
|    0|(90009,[2800,3401...|
|    0|(90009,[4296,5393...|
|    0|(90009,[3195,4105...|
|    0|(90009,[2576,3678...|
+-----+--------------------+
only showing top 5 rows



In [6]:
# Prepare the training and test data
splits = df_gender_analysis.randomSplit([0.75, 0.25])
data_train = splits[0]
data_test = splits[1]
print("The training data has {} instances.".format(data_train.count()))
print("The test data has {} instances.".format(data_test.count()))

The training data has 25708 instances.
The test data has 8532 instances.


In [None]:
# Modeling with scikit
model = LogisticRegression()
model.fit(data_train, labels_train2)
y_pred = model.predict_proba(data_test)[:, 1]
accuracy = accuracy_score(labels_test2, (y_pred > 0.5).astype(int))
logloss = log_loss(labels_test2, y_pred)
fpr, tpr, thresholds = roc_curve(labels_test2, y_pred)
roc_auc = auc(fpr, tpr)
metrics = {'Accuracy': accuracy, 'ROC AUC': roc_auc, 'Log Loss': logloss}
plt.plot(fpr, tpr, label='AUC = {0:.3f}'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')
plt.legend(loc="lower right")

# Logistic Regression (base case)

In [7]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=.3)

# Fit the model
lrModel = lr.fit(data_train)
trainingSummary = lrModel.summary
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

+--------------------+-------------------+
|                 FPR|                TPR|
+--------------------+-------------------+
|                 0.0|                0.0|
|                 0.0|0.01885020944677163|
|                 0.0|0.03798931099234436|
|                 0.0|0.05669507438971544|
|                 0.0|0.07540083778708652|
|                 0.0|0.09388993211035679|
|                 0.0|0.11266791853242814|
|                 0.0|0.13187924310270113|
|                 0.0|0.15087389859887332|
|                 0.0|0.16979633107034522|
|8.430281571404485E-5|0.18857431749241657|
|8.430281571404485E-5|0.20749674996388848|
|8.430281571404485E-5|0.22656362848476094|
|3.372112628561794E-4|0.24490827675863064|
|3.372112628561794E-4|0.24519716885743176|
|4.215140785702242E-4|0.26383070923010254|
|5.058168942842691E-4| 0.2823920265780731|
|6.744225257123588E-4|0.30124223602484473|
|0.001095936604282583|0.31929799219991334|
|0.001180239419996...|0.33778708652318357|
+----------

In [None]:
predictions = lrModel.transform(data_test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)


In [None]:
evaluator.getMetricName()


In [15]:
fn_path = os.path.join(DATAFOLDER, '1_our_faces_sparse.npz')
b = np.load(fn_path)
data = b['data']
m_format = b['format']
shape = b['shape']
row = b['row']
col = b['col']
sp_face_features_me = sp.csr_matrix( (data,(row,col)), shape=shape )
df_age_analysis_me = get_spark_gender_dataframe_from_image_matrix(sp_face_features_me,
                                                                 GENDER_INDEX)
df_age_analysis_me.show(5)                                                                  

Note that 0 images were skipped due to nan label.
+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(90009,[1811,2589...|
|    1|(90009,[4801,5100...|
|    1|(90009,[3313,4197...|
+-----+--------------------+



In [16]:
predictions = lrModel.transform(df_age_analysis_me)
predictions.show()
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    0|(90009,[1811,2589...|[-0.1088840082218...|[0.47280585982957...|       1.0|
|    1|(90009,[4801,5100...|[-0.5003597348388...|[0.37745613350055...|       1.0|
|    1|(90009,[3313,4197...|[0.03523766410104...|[0.50880850458927...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+



0.5

# Linear Regression

In [None]:
df_age_analysis = get_spark_gender_dataframe_from_image_matrix(sp_face_features,
                                                                 AGE_INDEX)
df_age_analysis.show(5)


In [None]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=10, regParam=.3)

# Fit the model
lrModel = lr.fit(df_age_analysis)
# Print the coefficients and intercept for linear regression
# print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

In [None]:
# 1_our_faces_sparse.npz
fn_path = os.path.join(DATAFOLDER, '1_our_faces_sparse.npz')
b = np.load(fn_path)
data = b['data']
m_format = b['format']
shape = b['shape']
row = b['row']
col = b['col']
sp_face_features_me = sp.csr_matrix( (data,(row,col)), shape=shape ).todense()
# df_age_analysis_me = get_spark_gender_dataframe_from_image_matrix(sp_face_features_me,
#                                                                  AGE_INDEX)
# df_age_analysis_me.show(5)


In [None]:
coefs = lrModel.coefficients
np_coefs = coefs.toArray()
print(np_coefs[0:9000])
features = sp_face_features_me[:,1:90001].T
coefs_fmt = np_coefs[0:90000].reshape((1, -1)).T
print(features)
print(coefs_fmt)
preds = features.T @ coefs_fmt
# predictions = np.dot(coefs[0:90000], sp_face_features_me[:,1:90000])
print(preds)

# The Multilayer Perceptron approach

In [None]:
# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [90000, 10, 10, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(data_train.limit(1000))

# compute accuracy on the test set
result = model.transform(data_test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

In [None]:
sc.stop()