In [1]:
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm, tqdm_notebook
from concurrent.futures import ThreadPoolExecutor
import os
import glob

# Importing Ratings Made by Human Rater to Train a Size Estimation Model

In [2]:
df = pd.read_excel("./data/WBD_dataset.xlsx")
df.head()

Unnamed: 0,asian_female (1).jpg - 1_Q1,asian_female (2).jpg - 2_Q1,asian_female (3).jpg - 3_Q1,asian_female (4).jpg - 4_Q1,asian_female (5).jpg - 5_Q1,asian_female (6).jpg - 6_Q1,asian_female (7).jpg - 7_Q1,asian_female (8).jpg - 8_Q1,asian_female (9).jpg - 9_Q1,asian_female (10).jpg - 10_Q1,...,white_male (192).jpg - 1459_Q1,white_male (193).jpg - 1460_Q1,white_male (194).jpg - 1461_Q1,white_male (195).jpg - 1462_Q1,white_male (196).jpg - 1463_Q1,white_male (197).jpg - 1464_Q1,white_male (198).jpg - 1465_Q1,white_male (199).jpg - 1466_Q1,white_male (200).jpg - 1467_Q1,white_male.jpg - 1468_Q1
0,5,5,5,5,4,4,4,4,5,3,...,4,4,5,4,5,5,5,2,5,7
1,6,6,5,6,5,5,4,3,5,2,...,3,4,5,4,3,5,4,2,5,6
2,5,4,3,6,2,5,1,2,4,1,...,3,4,5,3,2,5,4,2,5,6


In [3]:
correlations = df.T.corr().values

In [4]:
def cronbach_alpha(correlations):
    # Only use the lower diagonal correlations
    num_items = correlations.shape[0]
    lower_triangle = np.tril(correlations, k=-1)  # Use k=-1 to exclude diagonal
    sum_correlations = np.sum(lower_triangle)
    alpha = (num_items / (num_items - 1)) * (
        1 - (sum_correlations / (num_items * (num_items - 1) / 2))
    )

    return alpha


# Calculate and print Cronbach's alpha
alpha_coefficient = cronbach_alpha(correlations)
print(f"Cronbach's Alpha: {alpha_coefficient}")

Cronbach's Alpha: 0.41942349373446164


In [5]:
raterscores = df.values.mean(axis=0)

In [6]:
base = "https://psych.x10host.com/bmiestimator/allimages/"
fnames = [base + x.split(" - ")[0] for x in df.columns.tolist()]

# Download Images to Face Folder

In [17]:
%%time
def download_image(url, filename):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an exception for bad status codes

        with open(f"faces/{filename}.jpg", 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")

os.makedirs("faces", exist_ok=True)
with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust max_workers as needed
    futures = []
    for i, url in enumerate(fnames):
        futures.append(executor.submit(download_image, url, str(i) ))

    for future in tqdm(futures):
        future.result() # Check for exceptions during download

100%|██████████| 1468/1468 [03:29<00:00,  7.00it/s]

CPU times: total: 16.7 s
Wall time: 3min 29s





# Get Faces into Face List

In [26]:
faces = []
y = []

image_paths = glob.glob(os.path.join("faces", "*.jpg"))

for image_path in image_paths:
    try:
        # Extract the index from the filename
        index = int(
            os.path.basename(image_path)[:-4]
        )  # Remove '.jpg' and convert to int
        faces.append(image_path)
        y.append(raterscores[index])
    except (ValueError, IndexError) as e:
        print(f"Warning: Skipping file {image_path}: {e}")

# Modeling

## Install the DeepFace Library

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV
from scipy.stats import pearsonr
from deepface import DeepFace
import pickle




## Get Face Embeddings

In [2]:
def embed_face(fname):
    embedding_objs = DeepFace.represent(
        img_path=fname,
        model_name="Facenet512",
    )
    return embedding_objs[0]["embedding"]


def process_face(idx, fname):
    try:
        embedding = embed_face(fname)
        return (
            embedding,
            raterscores[idx],
        )  # Return embedding and corresponding y value
    except Exception as e:
        # print(f"Error processing {fname}: {e}")
        return None, None  # Return None in case of error

In [None]:
# Check if faces list is empty
if not faces:
    print(
        "Error: No face images found in the 'faces' directory. Please check the download process."
    )
else:
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_face, range(len(faces)), faces))

    embeddings_y = [
        (embedding, y_val)
        for embedding, y_val in results
        if embedding is not None and not np.isnan(y_val)
    ]

    # Check if embeddings_y is empty before unpacking
    if not embeddings_y:
        print(
            "Error: No face embeddings generated. Please check the embed_face function and image paths."
        )
    else:
        embeddings, y = zip(*embeddings_y)

25-01-03 17:03:47 - facenet512_weights.h5 will be downloaded...25-01-03 17:03:47 - facenet512_weights.h5 will be downloaded...
25-01-03 17:03:47 - facenet512_weights.h5 will be downloaded...
25-01-03 17:03:47 - facenet512_weights.h5 will be downloaded...

25-01-03 17:03:47 - facenet512_weights.h5 will be downloaded...25-01-03 17:03:47 - facenet512_weights.h5 will be downloaded...



Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facenet512_weights.h5
To: /root/.deepface/weights/facenet512_weights.h5
Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facenet512_weights.h5
To: /root/.deepface/weights/facenet512_weights.h5
  0%|          | 0.00/95.0M [00:00<?, ?B/s]
  0%|          | 0.00/95.0M [00:00<?, ?B/s][ADownloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facenet512_weights.h5
To: /root/.deepface/weights/facenet512_weights.h5
Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facenet512_weights.h5
To: /root/.deepface/weights/facenet512_weights.h5
Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facenet512_weights.h5
To: /root/.deepface/weights/facenet512_weights.h5
Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facenet512_weights.h5


## Train a Size Estimation Model

In [None]:
assert len(y) == len(embeddings), "y and embeddings lists must have the same length"

In [3]:
# Custom correlation coefficient scoring function
def correlation_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    return pearsonr(y, y_pred)[0]

In [None]:
# Assuming 'embeddings' and 'raterscores' are defined from the previous code
X = np.array(embeddings)
y = np.array(y)

# RidgeCV model
ridge_model = RidgeCV(alphas=np.logspace(-6, 6, 13))

# Cross-validation with the custom scorer
# Use y_filtered instead of the original y
scores = cross_val_score(
    ridge_model, X, y, cv=10, scoring=correlation_scorer, error_score="raise"
)

print(f"Cross-validated correlation scores: {scores}")
print(f"Mean correlation score: {np.mean(scores)}")

In [None]:
ridge_model.fit(X, y)

In [None]:
with open("ridge_model.pkl", "wb") as file:
    pickle.dump(ridge_model, file)

# Predict Demographics

In [8]:
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import os
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV
from scipy.stats import pearsonr
from deepface import DeepFace
import pickle

## Read in Prestige Data

In [9]:
df_prestige = pd.read_csv("./data/OccupationalPrestigeRatings.tab", sep="\t")
df_prestige["OPR Job Rating"] = pd.to_numeric(
    df_prestige["OPR Job Rating"], errors="coerce"
)
df_prestige = df_prestige[["OPR Job Title", "OPR Job Rating"]].dropna()
df_prestige.head()

Unnamed: 0,OPR Job Title,OPR Job Rating
0,Actor,69.52
1,Announcer,59.1
2,Announcer,59.1
3,Art Director,62.83
4,Athlete and/or Sports Competitor,71.03


In [10]:
jobtitles = df_prestige["OPR Job Title"].tolist()
prestige_values = df_prestige["OPR Job Rating"].astype(float).values

## Sentence Transformer model

In [11]:
from sentence_transformers import SentenceTransformer, util

In [12]:
# load in the all mini l6 v2 model
sentenceembeddingmodel = SentenceTransformer("all-MiniLM-L6-v2")
jobtitleembeddings = sentenceembeddingmodel.encode(jobtitles)

X = np.array(jobtitleembeddings)
y = prestige_values

In [13]:
def correlation_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    return pearsonr(y, y_pred)[0]

In [14]:
ridge_model_prestige = RidgeCV(alphas=np.logspace(-6, 6, 13))

# Cross-validation with the custom scorer
scores = cross_val_score(ridge_model_prestige, X, y, cv=10, scoring=correlation_scorer)

print(f"Cross-validated correlation scores: {scores}")
print(f"Mean correlation score: {np.mean(scores)}")

Cross-validated correlation scores: [0.67058648 0.85479236 0.79719832 0.85406499 0.7470677  0.88088967
 0.83667608 0.8123716  0.64683596 0.77048009]
Mean correlation score: 0.7870963261185712


In [15]:
ridge_model_prestige.fit(X, y)

In [16]:
with open("./models/ridge_model_prestige.pkl", "wb") as file:
    pickle.dump(ridge_model_prestige, file)

# Org Data - Get Prestige

In [17]:
def predict_demographics(url: str, index: int, ridge_model) -> dict:
    img_data = requests.get(url).content
    with open("temp.jpg", "wb") as handler:
        handler.write(img_data)

    data = {
        "image_src": url,
        "estimatedsize": None,
        "gender": None,
        "age": None,
        "race": None,
    }

    try:
        embedding = embed_face("temp.jpg")
        data["estimatedsize"] = ridge_model.predict([embedding])[0]

        analysis = DeepFace.analyze(
            img_path="temp.jpg", actions=["age", "gender", "race"], silent=True
        )
        # Check if analysis is a list and convert it to a dictionary if needed
        if isinstance(analysis, list):
            analysis = analysis[
                0
            ]  # Assuming the first element of the list contains the dictionary

        data.update(
            {
                "gender": analysis.get("dominant_gender"),
                "age": analysis.get("age"),
                "race": analysis.get("dominant_race"),
            }
        )

    except ValueError:
        print(f"Face not detected in image from URL: {url}")
    except ConnectionError:
        print(f"Error connecting to URL: {url}")
    finally:
        pd.DataFrame([data]).to_csv(
            "./data/predicted_demographics.csv",
            index=False,
            mode="a",
            header=(index == 0),
        )
        return data

In [18]:
def embed_face(fname):
    embedding_objs = DeepFace.represent(
        img_path=fname,
        model_name="Facenet512",
    )
    return embedding_objs[0]["embedding"]

In [19]:
org_data = pd.read_csv("./Companies/org_chart_data.csv")

In [13]:
# loaded_data = pd.read_csv("./data/predicted_demographics.csv")

# urls_to_leave_out = loaded_data["image_src"].to_list()
# urls_left = (
#     org_data[~org_data["image_src"].isin(urls_to_leave_out)]["image_src"]
#     .dropna()
#     .to_list()
# )

In [None]:
with open("./models/ridge_model.pkl", "rb") as file:
    ridge_model = pickle.load(file)

predicted_demographics_data = [
    predict_demographics(url, index, ridge_model)
    for index, url in tqdm_notebook(
        enumerate(
            org_data["image_src"]
            .dropna()
            .unique()
            # urls_left
        )
    )
]

In [20]:
# inferred_data_df = pd.DataFrame(predicted_demographics_data)
inferred_data_df = pd.read_csv("./data/predicted_demographics.csv")


inferred_data_df.shape

(6604, 5)

In [21]:
assert (
    inferred_data_df.drop_duplicates(subset=["image_src"]).shape
    == inferred_data_df.drop_duplicates().shape
), "Duplicate rows found in the inferred data"
inferred_data_df = inferred_data_df.drop_duplicates()

In [22]:
# Merge the original data with the inferred data based on the image_src column and drop rows with missing values (NaN) in the inferred data i.e. no demographic information
# The merge will keep only the rows with image_src values present in both DataFrames (inner join)
# Its a many-to-many merge because images are unique but they can appear more than once in the original data (e.g. same person in different companies (board position))

org_data_demographics = pd.merge(
    org_data,
    inferred_data_df.dropna(subset=["estimatedsize", "gender", "age", "race"]),
    on="image_src",
    how="inner",
    validate="many_to_many",
)


org_data_demographics.head()

Unnamed: 0,company,name,title,reports_to,org_status,image_src,estimatedsize,gender,age,race
0,google,Jung Kim,"Director, UX",Meriah Moulton,5.0,https://cdn.theorg.com/f6964631-afde-4023-80e3...,4.381319,Woman,31.0,asian
1,amazon,Mai le,"Vp, AWS",Matt Garman,3.0,https://cdn.theorg.com/70f0d92b-77a3-4fca-9024...,4.42283,Woman,41.0,asian
2,google,Kyle Schumacher,Head Of Brand Marketing - Gemini,Cassidy Morgan,5.0,https://cdn.theorg.com/d3b50948-bc92-47aa-a25f...,4.441792,Woman,35.0,white
3,disney,Tao Xiong,"Senior Director Of Research, Disney+ Hotstar",Erik Crouthamel,5.0,https://cdn.theorg.com/7e861b10-303e-4ed4-a1cb...,4.391356,Man,37.0,asian
4,disney,Vivan Kaul,"Principal Technical Program Manager, Disney En...",Dana Walden,3.0,https://cdn.theorg.com/27982d44-0c87-4140-bbc8...,4.43514,Man,30.0,middle eastern


In [27]:
org_data_demographics.shape == org_data_demographics.drop_duplicates(
    subset=["name", "image_src", "reports_to"]
).shape

True

In [28]:
org_data_demographics.to_csv("./data/org_data_with_demographics.csv", index=False)

## Predicting Occupational Prestige

In [29]:
jobtitles = org_data_demographics["title"].to_list()
jobembeddings = sentenceembeddingmodel.encode(jobtitles)
predicted_prestige = ridge_model_prestige.predict(jobembeddings)

In [30]:
org_data_demographics["prestige"] = predicted_prestige
org_data_demographics.head()

Unnamed: 0,company,name,title,reports_to,org_status,image_src,estimatedsize,gender,age,race,prestige
0,google,Jung Kim,"Director, UX",Meriah Moulton,5.0,https://cdn.theorg.com/f6964631-afde-4023-80e3...,4.381319,Woman,31.0,asian,63.591678
1,amazon,Mai le,"Vp, AWS",Matt Garman,3.0,https://cdn.theorg.com/70f0d92b-77a3-4fca-9024...,4.42283,Woman,41.0,asian,56.324716
2,google,Kyle Schumacher,Head Of Brand Marketing - Gemini,Cassidy Morgan,5.0,https://cdn.theorg.com/d3b50948-bc92-47aa-a25f...,4.441792,Woman,35.0,white,56.584945
3,disney,Tao Xiong,"Senior Director Of Research, Disney+ Hotstar",Erik Crouthamel,5.0,https://cdn.theorg.com/7e861b10-303e-4ed4-a1cb...,4.391356,Man,37.0,asian,66.12468
4,disney,Vivan Kaul,"Principal Technical Program Manager, Disney En...",Dana Walden,3.0,https://cdn.theorg.com/27982d44-0c87-4140-bbc8...,4.43514,Man,30.0,middle eastern,63.736621


In [31]:
org_data_demographics.to_csv("./data/org_data_with_prestige.csv", index=False)