# Exploring Facial Features For Gender Recognition

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, log_loss, confusion_matrix
%matplotlib inline

# Folder holding the project data files. Set the DATAFOLDER environment variable
# to run the notebook on another machine; the original location is the fallback.
DATAFOLDER = os.environ.get("DATAFOLDER", "/Users/snuffles753/Documents/NYU-GSAS/ds1004/term-project/data")

In [2]:
# Pyspark related imports
import time
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# SparkSession must be imported explicitly; it is used to build the session below.
from pyspark.sql import SparkSession, SQLContext
from pyspark.mllib.linalg import Matrices

# getOrCreate() hands back the already running session when there is one.
spark = SparkSession.builder.appName("Python Spark SQL basic example2").getOrCreate()
sc = spark.sparkContext
# SQLContext takes the SparkContext as its first argument; the session is passed
# separately so the context is bound to the same session as `spark`.
sqlCtx = SQLContext(sc, sparkSession=spark)

In [3]:
# Load the sparse matrix containing the image feature data.
# The archive holds the values together with their row and column indices.
# The context manager closes the underlying .npz file once the arrays are read.
with np.load(os.path.join(DATAFOLDER, 'sparse_matrix.npz')) as b:
    data = b['data']
    m_format = b['format']
    shape = b['shape']
    row = b['row']
    col = b['col']
# The stored shape is an array; hand scipy a plain tuple.
sp_face_features = sp.csr_matrix((data, (row, col)), shape=tuple(shape))
print(sp_face_features.shape)

(33990, 90001)


In [4]:
def get_spark_dataframe_from_image_matrix(image_matrix, vector_length):
    """
    Build a Spark DataFrame of sparse feature vectors from a scipy sparse image matrix.

    Each row of ``image_matrix`` becomes one record ``(index, features)``:
    ``index`` is the row's position within ``image_matrix`` and ``features`` is a
    sparse vector of size ``vector_length`` in which every retained column is
    stored with the value 1 (the stored magnitudes are not carried over).

    Relies on the notebook-level ``spark`` session.
    """
    records = []
    for position, matrix_row in enumerate(image_matrix):
        nonzero_columns = matrix_row.nonzero()[1]
        # NOTE(review): the first non-zero column of every row is skipped.
        # Presumably column 0 carries a row id rather than a feature; if that id
        # can be 0 the first real feature is dropped instead - TODO confirm.
        pairs = [(column, 1) for column in nonzero_columns[1:]]
        records.append((position, Vectors.sparse(vector_length, pairs)))
    return spark.createDataFrame(records, schema=["index", "features"])


In [5]:
# Convert the first 100 rows of the feature matrix and preview the result.
n_feature_columns = sp_face_features.shape[1]
df_image_features = get_spark_dataframe_from_image_matrix(
    sp_face_features[0:100],
    n_feature_columns,
)
df_image_features.show()

+-----+--------------------+
|index|            features|
+-----+--------------------+
|    0|(90001,[4627,5527...|
|    1|(90001,[4114,5114...|
|    2|(90001,[2989,4001...|
|    3|(90001,[1996,3098...|
|    4|(90001,[4314,5215...|
|    5|(90001,[3095,4294...|
|    6|(90001,[4020,4818...|
|    7|(90001,[4307,5306...|
|    8|(90001,[3079,4081...|
|    9|(90001,[2691,3991...|
|   10|(90001,[3681,4483...|
|   11|(90001,[2198,3499...|
|   12|(90001,[2994,4092...|
|   13|(90001,[3198,4297...|
|   14|(90001,[2797,3401...|
|   15|(90001,[2801,3580...|
|   16|(90001,[2787,3889...|
|   17|(90001,[3000,3510...|
|   18|(90001,[3312,4312...|
|   19|(90001,[2692,3705...|
+-----+--------------------+
only showing top 20 rows



In [9]:
# Read the wiki metadata and keep the row index, file name and gender columns.
# The first CSV column is read as `_c0`; it is exposed as `row_index`.
file_map = (
    spark.read.csv("../data/wiki_data.csv", header=True, inferSchema=True)
    .withColumnRenamed("_c0", "row_index")
    .select("row_index", "file", "gender")
)
file_map.show()

+---------+--------------------+------+
|row_index|                file|gender|
+---------+--------------------+------+
|        0|cropped_10000217_...|   1.0|
|        1|cropped_10000548_...|   1.0|
|        2|cropped_100012_19...|   1.0|
|        3|cropped_10001965_...|   1.0|
|        4|cropped_10002116_...|   0.0|
|        5|cropped_10002702_...|   0.0|
|        6|cropped_10003541_...|   1.0|
|        7|cropped_100039_19...|   1.0|
|        8|cropped_10004113_...|   1.0|
|        9|cropped_10004122_...|   1.0|
|       10|cropped_10004299_...|   1.0|
|       11|cropped_1000456_1...|   1.0|
|       12|cropped_10004882_...|   1.0|
|       13|cropped_1000522_1...|   0.0|
|       14|cropped_10005261_...|   1.0|
|       15|cropped_10005947_...|   0.0|
|       16|cropped_1000684_1...|   1.0|
|       17|cropped_10006850_...|   1.0|
|       18|cropped_10007577_...|   1.0|
|       19|cropped_1000781_1...|   1.0|
+---------+--------------------+------+
only showing top 20 rows



In [None]:
# wiki_map = pd.read_csv(os.path.join(DATAFOLDER, 'wiki_data.csv'),
#                       names=['index2', 'f_name2', 'gender'],
#                        header=1)
# wiki_map['f_name2'] = wiki_map['f_name2'].str.replace('cropped_', '')
# wiki_map.head()



In [None]:
# NOTE(review): this cell looks stale. `wiki_map` is only defined in commented-out
# code, and `file_map` is created with spark.read.csv, while `.merge` and the
# 'f_name' / 'index' columns used here are pandas-style - TODO confirm or remove.
merged_with_wiki = file_map.merge(wiki_map, left_on='f_name', right_on='f_name2', how='left')
mapped = merged_with_wiki
# Discard the two index columns in place.
mapped.drop(['index', 'index2'], axis=1, inplace=True)
mapped.describe()

In [None]:
# Selecting with a list of columns keeps the labels two-dimensional (one column).
gender = mapped.loc[:, ['gender']].values
for label_summary in (gender.shape, gender):
    print(label_summary)

In [None]:
# `face_features` is never defined in this notebook; the feature matrix loaded
# above is `sp_face_features`, so slice that instead.
# Everything except the first column is kept.
# NOTE(review): presumably column 0 is a row id rather than a feature - confirm.
data = sp_face_features[:, 1:]
print(data.shape)
print(data)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    data,
    gender,
    test_size=0.20,
    random_state=42,
)
data_train, data_test, labels_train, labels_test = split_parts


In [None]:
print(data_test.shape)
# NOTE(review): nan_to_num turns NaN labels into 0, so samples with a missing
# gender end up labelled as class 0 - confirm this is intended.
labels_train2 = np.nan_to_num(labels_train)
labels_test2 = np.nan_to_num(labels_test)
# Report NaN presence before and after the replacement, train first, then test.
for label_array in (labels_train, labels_train2, labels_test, labels_test2):
    print(np.isnan(label_array).any())


In [None]:
# Modeling with scikit
model = LogisticRegression()
model.fit(data_train, labels_train2)

# Probability of the positive class is the second column of predict_proba.
class_probabilities = model.predict_proba(data_test)
y_pred = class_probabilities[:, 1]

# Threshold-free metrics first, then accuracy at the 0.5 cut-off.
fpr, tpr, thresholds = roc_curve(labels_test2, y_pred)
roc_auc = auc(fpr, tpr)
logloss = log_loss(labels_test2, y_pred)
hard_predictions = (y_pred > 0.5).astype(int)
accuracy = accuracy_score(labels_test2, hard_predictions)
metrics = dict(zip(['Accuracy', 'ROC AUC', 'Log Loss'], [accuracy, roc_auc, logloss]))

# ROC curve with the chance diagonal for reference.
plt.plot(fpr, tpr, label='AUC = {0:.3f}'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')
plt.legend(loc="lower right")

# The Multilayer Perceptron approach

In [None]:
# Pyspark related imports
import time
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# SparkSession must be imported explicitly; it is used to build the session below.
from pyspark.sql import SparkSession, SQLContext
from pyspark.mllib.linalg import Matrices

# getOrCreate() hands back the already running session when there is one.
spark = SparkSession.builder.appName("Python Spark SQL basic example2").getOrCreate()
sc = spark.sparkContext
# SQLContext takes the SparkContext as its first argument; the session is passed
# separately so the context is bound to the same session as `spark`.
sqlCtx = SQLContext(sc, sparkSession=spark)

In [None]:
#sqlCtx.createDataFrame(data,).show()
# rdd = sc.parallelize(data)
n_matrix_rows, n_matrix_cols = sp_face_features.shape
# NOTE(review): the CSR arrays are passed with the dimensions swapped. Spark's
# sparse matrix is column-oriented, so this presumably yields the transpose of
# sp_face_features - confirm that is the intent.
sparse_matrix = Matrices.sparse(
    n_matrix_cols,
    n_matrix_rows,
    sp_face_features.indptr,
    sp_face_features.indices,
    sp_face_features.data,
)


In [None]:
# `face_features` is never defined in this notebook; inspect the loaded sparse
# matrix instead. repr() gives a one-line summary rather than dumping the entries.
print(repr(sp_face_features))

In [None]:
# Scratch cell: builds a small Spark DataFrame of sparse vectors from the first
# 100 converted rows and times the DataFrame creation.
# NOTE(review): `convert_to_sparse_spark_format` and `face_features` are not
# defined anywhere in this notebook, so this cell fails on a fresh kernel -
# TODO confirm whether it should be removed or ported to
# get_spark_dataframe_from_image_matrix.
# import numpy
# print(rdd.take(1))
# print(sparse_matrix.toDense())
# df = spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), (0.0, 2.0, Vectors.sparse(1, [], []))],
#                            ["label", "weight", "features"])

# df.show()
test = convert_to_sparse_spark_format(sp_face_features)[0:100]
# test = np_face_features[0:100, 0:80000]
# Small dense array used only to compare shapes and types in the prints below.
arr = np.array([[2,3,4], [2,8,5], [2,3,6],[4,5,7]])
print(arr.shape)
print(arr)
print(face_features.shape)
# print(test)
print(type(arr))
print(type(test))
# First converted row without its leading entry.
print(test[0][1:])
# df = np.concatenate(arr).reshape(1000,-1)
start_time = time.time()
# dff = map(lambda x: (1, Vectors.sparse(90000, x)), test)
# Every row gets the constant label 1; the leading entry of each row is skipped.
dff = map(lambda x: (1, Vectors.sparse(90000, x[1:])), test)
# dff = map(lambda x: (int(x.getrow()), 2, test)
mydf = spark.createDataFrame(dff,schema=["label", "features"])
# The elapsed time covers building the DataFrame; no model is trained here.
print("Model took {} seconds".format((time.time() - start_time)))
mydf.show(5)

# df = map(lambda x: Vectors.dense(x), face_features)
# df2 = spark.createDataFrame(df,["features"])

In [None]:
import random
# NOTE(review): `convert_to_sparse_spark_format` is not defined in this notebook,
# and the labels are drawn at random (0 or 1) - presumably placeholders until the
# real gender labels are joined in; TODO confirm.
test = convert_to_sparse_spark_format(sp_face_features)[0:100]
# One (label, features) record per row; the leading entry of each row is skipped.
mapped_gender = [
    (random.randint(0, 1), Vectors.sparse(90000, row_pairs[1:]))
    for row_pairs in test
]
df_gender_analysis = spark.createDataFrame(mapped_gender, schema=["label", "features"])
df_gender_analysis.show(5)


In [None]:
# NOTE(review): df_gender_analysis is created with the columns "label" and
# "features"; no "features2" column is visible in this notebook, and the output
# column name "features" is already taken - TODO confirm this cell is still needed.
assembler = VectorAssembler(inputCols=["features2"], outputCol="features")
output = assembler.transform(df_gender_analysis)
# Keep just the label and the assembled feature vector.
formatted = output.select("label", "features")
# formatted  = formatted.limit(100)
formatted.show(5)

In [None]:
# Import the Spark estimator under an alias so it does not rebind the name
# `LogisticRegression`, which the first cell imports from scikit-learn and the
# scikit modeling cell relies on.
from pyspark.ml.classification import LogisticRegression as SparkLogisticRegression

lr = SparkLogisticRegression(maxIter=10, regParam=0.3)

# Fit the model
lrModel = lr.fit(df_gender_analysis)

# Print the coefficients and intercept for logistic regression
# (only the first coefficient is shown)
print("Coefficients: " + str(lrModel.coefficients[0]))
print("Intercept: " + str(lrModel.intercept))