# Exploring Facial Features For Gender Recognition

In [3]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, log_loss, confusion_matrix
%matplotlib inline

DATAFOLDER = "/Users/snuffles753/Documents/NYU-GSAS/ds1004/term-project/data"

In [27]:
# Pyspark related imports
import time
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Matrices


spark = SparkSession.builder.appName("Python Spark SQL basic example2").getOrCreate()
sc = spark.sparkContext
sqlCtx = SQLContext(spark)

In [9]:
# Load the sparse matrices containing the image feature data
sp_face_features = None
first = True
for filename in os.listdir(os.path.join(DATAFOLDER, 'sparse-images/')):
    fn_path = os.path.join(DATAFOLDER, 'sparse-images/' + filename)
    b = np.load(fn_path)
    data = b['data']
    m_format = b['format']
    shape = b['shape']
    row = b['row']
    col = b['col']
    tmp = sp.csr_matrix( (data,(row,col)), shape=shape )
    if first:
        sp_face_features = sp.vstack((tmp,sp_face_features), format="csr")
    else:
        sp_face_features = tmp
        first = False
print(sp_face_features.shape)

(4000, 90003)


In [10]:
def get_spark_gender_dataframe_from_image_matrix(image_matrix):
    """
    Process the sparse scipy matrix with image features and return a spark dataframe with sparse vectors
    """
    VECTOR_LENGTH = 90000
    spark_rows_formatted = []
    skip_count = 0
    for i, row in enumerate(image_matrix):
        active_cols = row.nonzero()[1]
        if active_cols[0] == 0:
            active_cols = active_cols[1:-2]
        else:
            active_cols = active_cols[:-2]
        indexes = list(map(lambda x: (x, 1), active_cols))
        try:
            gender = int(image_matrix[i,90002])
            spark_rows_formatted.append( (gender, indexes) )
        except ValueError:
            skip_count += 1
    print("Note that {} images were skipped due to nan label.".format(str(skip_count)))
    mapped_f = map(lambda x: (x[0], Vectors.sparse(VECTOR_LENGTH, x[1][1:])), spark_rows_formatted)
    df_gender_analysis = spark.createDataFrame(mapped_f, schema=["label", "features"])
    return df_gender_analysis


In [11]:
df_gender_analysis = get_spark_gender_dataframe_from_image_matrix(sp_face_features)
df_gender_analysis.show(5)

Note that 42 images were skipped due to nan label.
+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|(90000,[4820,5922...|
|    1|(90000,[4191,5390...|
|    1|(90000,[3589,4591...|
|    0|(90000,[4681,5679...|
|    1|(90000,[4488,5488...|
+-----+--------------------+
only showing top 5 rows



In [12]:
# Prepare the training and test data
splits = df_gender_analysis.randomSplit([0.75, 0.25])
data_train = splits[0]
data_test = splits[1]
print("The training data has {} instances.".format(data_train.count()))
print("The test data has {} instances.".format(data_test.count()))

The training data has 2968 instances.
The test data has 990 instances.


In [None]:
# Modeling with scikit
model = LogisticRegression()
model.fit(data_train, labels_train2)
y_pred = model.predict_proba(data_test)[:, 1]
accuracy = accuracy_score(labels_test2, (y_pred > 0.5).astype(int))
logloss = log_loss(labels_test2, y_pred)
fpr, tpr, thresholds = roc_curve(labels_test2, y_pred)
roc_auc = auc(fpr, tpr)
metrics = {'Accuracy': accuracy, 'ROC AUC': roc_auc, 'Log Loss': logloss}
plt.plot(fpr, tpr, label='AUC = {0:.3f}'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')
plt.legend(loc="lower right")

# The Multilayer Perceptron approach

In [25]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3)

# Fit the model
lrModel = lr.fit(data_train)
trainingSummary = lrModel.summary
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

+---+-------------------+
|FPR|                TPR|
+---+-------------------+
|0.0|                0.0|
|0.0|0.01684088269454123|
|0.0|0.03368176538908246|
|0.0|0.05110336817653891|
|0.0|0.06794425087108014|
|0.0|0.08478513356562137|
|0.0| 0.1016260162601626|
|0.0|0.11846689895470383|
|0.0|0.13530778164924506|
|0.0| 0.1521486643437863|
|0.0|0.16898954703832753|
|0.0|0.18583042973286876|
|0.0|   0.20267131242741|
|0.0|0.21951219512195122|
|0.0|0.22067363530778164|
|0.0|0.23809523809523808|
|0.0|0.25551684088269455|
|0.0|0.27235772357723576|
|0.0|  0.289198606271777|
|0.0| 0.3060394889663182|
+---+-------------------+
only showing top 20 rows

areaUnderROC: 0.9999277595390034


In [28]:
predictions = lrModel.transform(data_test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)


0.5559371912279878

In [31]:
evaluator.getMetricName()


'areaUnderROC'