In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_lfw_people

In [1]:
import os
root = os.getcwd()
root

'./fair_taucc/datasets/lfw'

In [2]:
lfw_people = fetch_lfw_people(min_faces_per_person=0, resize=0.4)

In [3]:
X = lfw_people.data  # Shape: (n_samples, n_features)
y = lfw_people.target  # This contains the labels (person IDs)
target_names = lfw_people.target_names  # Names corresponding to IDs

In [35]:
target_names

array(['AJ Cook', 'AJ Lamas', 'Aaron Eckhart', ..., 'Zumrati Juma',
       'Zurab Tsereteli', 'Zydrunas Ilgauskas'], dtype='<U35')

In [5]:
n_samples, n_features = X.shape
n_classes = target_names.shape[0]

print(f"Total dataset size:")
print(f"n_samples: {n_samples}")
print(f"n_features: {n_features}")
print(f"n_classes: {n_classes}")

Total dataset size:
n_samples: 13233
n_features: 1850
n_classes: 5749


In [6]:
h, w = lfw_people.images.shape[1:3]
print(f"Image shape: {h} x {w}")

Image shape: 50 x 37


In [22]:
lfw_home = os.path.join(os.path.expanduser('~'), 'scikit_learn_data', 'lfw_home')

# Initialize a list to store filenames
filenames = []

# Loop through each image index to construct filenames
for i in range(len(lfw_people.images)):
    person_name = lfw_people.target_names[lfw_people.target[i]]
    # Construct filename based on person name and index
    filename = os.path.join(lfw_home, person_name, f"{person_name}_{i:04d}.jpg")
    filenames.append(filename)

# Print some filenames
for filename in filenames[:10]:  # Display first 10 filenames
    print(filename)

./scikit_learn_data/lfw_home/Tim Curry/Tim Curry_0000.jpg
./scikit_learn_data/lfw_home/Mahmoud Abbas/Mahmoud Abbas_0001.jpg
./scikit_learn_data/lfw_home/Michel Charles Chretien/Michel Charles Chretien_0002.jpg
./scikit_learn_data/lfw_home/Abid Hamid Mahmud Al-Tikriti/Abid Hamid Mahmud Al-Tikriti_0003.jpg
./scikit_learn_data/lfw_home/Mona Locke/Mona Locke_0004.jpg
./scikit_learn_data/lfw_home/Barbara Bodine/Barbara Bodine_0005.jpg
./scikit_learn_data/lfw_home/John Baldacci/John Baldacci_0006.jpg
./scikit_learn_data/lfw_home/Michael Andretti/Michael Andretti_0007.jpg
./scikit_learn_data/lfw_home/Tony Parker/Tony Parker_0008.jpg
./scikit_learn_data/lfw_home/Mamdouh Habib/Mamdouh Habib_0009.jpg


In [26]:
files = np.array(filenames)
files.shape

(13233,)

In [48]:
files = np.array([os.path.basename(path) for path in files])
files

array(['Tim Curry_0000.jpg', 'Mahmoud Abbas_0001.jpg',
       'Michel Charles Chretien_0002.jpg', ..., 'Hugo Chavez_13230.jpg',
       'Ariel Sharon_13231.jpg', 'Junichiro Koizumi_13232.jpg'],
      dtype='<U44')

In [77]:
files = np.char.replace(files, ' ', '_')
files

array(['Tim_Curry_0000.jpg', 'Mahmoud_Abbas_0001.jpg',
       'Michel_Charles_Chretien_0002.jpg', ..., 'Hugo_Chavez_13230.jpg',
       'Ariel_Sharon_13231.jpg', 'Junichiro_Koizumi_13232.jpg'],
      dtype='<U44')

In [28]:
import os
root = os.getcwd() + "/lfw"
root

'./fair_taucc/datasets/lfw'

In [78]:
np.save(root + "/filenames.npy", files)

In [36]:
np.save(root + "/matrix.npy", X)
np.save(root + "/person_ids.npy", y)
np.save(root + "/person_names.npy", target_names)

In [41]:
females = np.loadtxt(root + "/female_names.txt", dtype=str)
females

array(['Claudia_Coslovich_0001.jpg', 'Allison_Searing_0001',
       'Elizabeth_Hill_0001.jpg', ..., 'Zoe_Ball_0001.jpg',
       'Zorica_Radovic_0001.jpg', 'Zumrati_Juma_0001.jpg'], dtype='<U40')

In [42]:
males = np.loadtxt(root + "/male_names.txt", dtype=str)
males

array(['Alfred_Ford_0001.jpg', 'Craig_Fitzgibbon_0001.jpg',
       'AnFernce_Negron_0001.jpg', ..., 'Zulfiqar_Ahmed_0001.jpg',
       'Zurab_Tsereteli_0001.jpg', 'Zydrunas_Ilgauskas_0001.jpg'],
      dtype='<U44')

In [101]:
females = np.unique(np.array([s[:-9] for s in females]))
males = np.unique(np.array([s[:-9] for s in males]))


In [137]:
import re
def rimuovi_suffix(stringa):
    return re.sub(r'_\d+\.jpg$', '', stringa)

In [138]:
files_modificate = [rimuovi_suffix(s) for s in files]
files_modificate

['Tim_Curry',
 'Mahmoud_Abbas',
 'Michel_Charles_Chretien',
 'Abid_Hamid_Mahmud_Al-Tikriti',
 'Mona_Locke',
 'Barbara_Bodine',
 'John_Baldacci',
 'Michael_Andretti',
 'Tony_Parker',
 'Mamdouh_Habib',
 'Steve_Shiver',
 'Zurab_Tsereteli',
 'John_Cusack',
 'Glenn_Rivers',
 'Patricia_Clarkson',
 'Vaclav_Havel',
 'James_Becker',
 'Wan_Yanhai',
 'Ozzie_Smith',
 'Zach_Safrin',
 'Robert_Gallo',
 'Chen_Shui-bian',
 'Hoda_Asfor',
 'Jeremy_Greenstock',
 'Silvio_Berlusconi',
 'Donald_Rumsfeld',
 'Carson_Daly',
 'Nestor_Kirchner',
 'Filippo_Volandri',
 'Miguel_Hakim',
 'Hassan_Wirajuda',
 'Julio_Cesar_Chavez',
 'Markus_Naslund',
 'Peter_Rasmussen',
 'Frank_Hilldrup',
 'Joe_Vandever',
 'Garry_Trudeau',
 'Amber_Tamblyn',
 'Mario_Dumont',
 'Luiz_Inacio_Lula_da_Silva',
 'John_Brady',
 'Rahul_Dravid',
 'Jennifer_Aniston',
 'Gilberto_Simoni',
 'Gerry_Adams',
 'Geoffrey_Rush',
 'Lindsay_Lohan',
 'Jean_Chretien',
 'Justin_Timberlake',
 'John_Starks',
 'John_Howard',
 'Bulent_Ecevit',
 'Minnie_Mendoza',
 'I

In [141]:
Sx = np.full(n_samples, 0)

In [142]:
for idx, name in enumerate(files_modificate):
    print(name)
    if name in females:
        print("F")
        Sx[idx] = 1

Tim_Curry
Mahmoud_Abbas
Michel_Charles_Chretien
Abid_Hamid_Mahmud_Al-Tikriti
Mona_Locke
F
Barbara_Bodine
F
John_Baldacci
Michael_Andretti
Tony_Parker
Mamdouh_Habib
Steve_Shiver
Zurab_Tsereteli
John_Cusack
Glenn_Rivers
Patricia_Clarkson
F
Vaclav_Havel
James_Becker
Wan_Yanhai
Ozzie_Smith
Zach_Safrin
Robert_Gallo
Chen_Shui-bian
Hoda_Asfor
F
Jeremy_Greenstock
Silvio_Berlusconi
Donald_Rumsfeld
Carson_Daly
Nestor_Kirchner
Filippo_Volandri
Miguel_Hakim
Hassan_Wirajuda
Julio_Cesar_Chavez
Markus_Naslund
Peter_Rasmussen
Frank_Hilldrup
Joe_Vandever
Garry_Trudeau
Amber_Tamblyn
F
Mario_Dumont
Luiz_Inacio_Lula_da_Silva
John_Brady
Rahul_Dravid
Jennifer_Aniston
F
Gilberto_Simoni
Gerry_Adams
Geoffrey_Rush
Lindsay_Lohan
F
Jean_Chretien
Justin_Timberlake
John_Starks
John_Howard
Bulent_Ecevit
Minnie_Mendoza
Iva_Majoli
F
Donald_Rumsfeld
Roh_Moo-hyun
Aaron_Peirsol
George_W_Bush
David_Spade
Alonzo_Mourning
Luiz_Inacio_Lula_da_Silva
John_Snow
Mike_Bryan
Bill_Clinton
George_W_Bush
Colin_Farrell
Pat_Burns
Ryan_

F
Steve_Allee
Colin_Powell
George_W_Bush
Jiang_Zemin
George_W_Bush
Juan_Pablo_Montoya
Carlos_Menem
Marissa_Jaret_Winokur
F
George_W_Bush
Naomi_Watts
F
Scott_Rudin
Arianna_Huffington
F
Greg_Rusedski
Vicente_Fox
Colin_Powell
Harvey_Weinstein
George_W_Bush
Recep_Tayyip_Erdogan
David_Beckham
Luiz_Inacio_Lula_da_Silva
Jacques_Chirac
Ricardo_Sanchez
Megawati_Sukarnoputri
F
Venus_Williams
F
Conan_OBrien
George_W_Bush
Jennifer_Reilly
F
George_W_Bush
Bob_Iger
Agnelo_Queiroz
Donald_Rumsfeld
Corinne_Coman
F
Jose_Theodore
John_Mayer
Alec_Baldwin
Michael_Shelby
Lucrecia_Orozco
F
Mahmoud_Abbas
Michael_Chang
Jan-Michael_Gambill
Albert_Costa
Brendan_Gaughan
Jacques_Chirac
George_W_Bush
Michael_McNeely
Ann_Veneman
F
Conrad_Black
Sarah_Jessica_Parker
F
Gerrit_Zalm
Saddam_Hussein
Shinya_Taniguchi
Michael_Friedman
Harry_Kalas
Frank_Hsieh
Gerhard_Schroeder
Bryan_Adams
Steven_Hatfill
Johnny_Depp
Lleyton_Hewitt
Felipe_Perez_Roque
Annette_Lu
F
Rita_Moreno
F
Milo_Maestrecampo
John_White
Condoleezza_Rice
F
Gary

George_W_Bush
Lleyton_Hewitt
Fernando_Henrique_Cardoso
Stipe_Mesic
Kifah_Ajouri
Jamie_Martin
Li_Peng
Mike_Sweeney
Chuck_Hagel
Charles_Moose
Robert_Mugabe
Robert_Duvall
Cristina_Saralegui
F
Cesar_Gaviria
Hans_Blix
Jiang_Zemin
Daniell_Sunjata
Fred_Thompson
Roger_Etchegaray
Nestor_Kirchner
Benjamin_McKenzie
Guillermo_Canas
Pervez_Musharraf
Dustin_Brown
Prince_Claus
Mike_Slive
Luis_Horna
Ariel_Sharon
Brenda_Magana
F
George_W_Bush
Nina_Pecari
F
Coretta_Scott_King
F
Tim_Conway
William_Ford_Jr
Catherine_Deneuve
F
Colin_Powell
Li_Ka-shing
Kelly_Clarkson
F
Lindsay_Davenport
F
Alan_Greenspan
Ben_Davis
John_Kerry
Harriet_Lessy
F
Paradorn_Srichaphan
Taha_Yassin_Ramadan
Matt_LeBlanc
Szu_Yu_Chen
F
Bud_Selig
Jose_Miguel_Aleman
Jack_Grubman
Amy_Smart
F
Marco_Pantani
Frank_Cassell
Leo_Mullin
Du_Qinglin
Colin_Powell
Mike_Weir
Dave_McNealey
Griffin_Colvin
Silvio_Berlusconi
Mitchell_Daniels
Andrew_Caldecott
Megawati_Sukarnoputri
F
Pupi_Avati
Mohammed_Al-Douri
Mahmoud_Abbas
Christopher_Reeve
Vicente_Fernan

F
Luiz_Inacio_Lula_da_Silva
Queen_Elizabeth_II
F
Andrew_Weissmann
Hans_Blix
Keanu_Reeves
Allyson_Felix
F
Thor_Pedersen
Lenny_Wilkens
Jean_Chretien
George_Robertson
Ian_Thorpe
Gerhard_Schroeder
Sergey_Lavrov
John_Negroponte
Ray_Allen
Colin_Powell
Phil_Morris
Edina_Batar
F
Robert_Duvall
Rafael_Ramirez
Natalia_Vodonova
F
James_Phelps
Giuseppe_Morchio
Alvaro_Noboa
Gil_de_Ferran
Michael_Chiklis
Tim_Lopes
Peter_Arnett
Elvis_Presley
Jackie_Chan
George_HW_Bush
Linda_Amicangioli
F
Magui_Serna
F
Tony_Blair
Bruce_Lunsford
Vidar_Helgesen
Luiz_Inacio_Lula_da_Silva
Colin_Powell
Michael_Kahn
Mohammad_Khatami
John_Ashcroft
Joe_Leonard
Joseph_Estrada
Penny_Lancaster
F
Gerhard_Schroeder
Serena_Williams
F
Jane_Kaczmarek
F
Ari_Fleischer
Oliver_Neuville
David_Beckham
Stephen_Glassroth
Yashwant_Sinha
Princess_Diana
F
Duncan_Fletcher
Fernando_Sanz
Carlos_Menem
Kalpana_Chawla
F
Helene_Eksterowicz
F
Filippo_Inzaghi
Serena_Williams
F
Vecdi_Gonul
Kamal_Kharrazi
Russell_Simmons
Elizabeth_Smart
F
Tony_Blair
Gustav

George_W_Bush
Hamid_Karzai
Nicholas_Byron
Tommy_Franks
Leonid_Kuchma
Kofi_Annan
Hosni_Mubarak
Alejandro_Toledo
Jim_Letten
Gerhard_Schroeder
Tom_Brady
Helmut_Panke
Robin_McLaurin_Williams
Anthony_Principi
Inam-ul-Haq
Angela_Merkel
F
Hu_Jintao
Jan-Michael_Gambill
Hu_Jintao
Diana_Krall
F
Elva_Hsiao
F
Jacques_Chirac
Bob_Stoops
Donald_Rumsfeld
Denise_Johnson
F
Sada_Jacobson
F
Alexandra_Stevenson
F
Michael_Moore
Alejandro_Toledo
Valdas_Adamkus
Hugo_Chavez
Rudy_Tomjanovich
George_W_Bush
Jean_Chretien
Hasan_Wirayuda
Hugo_Chavez
Donald_Rumsfeld
Donald_Rumsfeld
Dave_Ragone
Marcelo_Rios
Lee_Tae-sik
Donald_Rumsfeld
Dimitar_Berbatov
Florencia_Kirchner
F
Muhammad_Ali
Stanley_McChrystal
Julie_Gerberding
F
David_Beckham
Hugo_Chavez
John_Fenn
Bison_Dele
Julio_Iglesias_Jr
JJ_Redick
Erik_Morales
Vicente_Fox
Norm_Coleman
Junichiro_Koizumi
Tony_Blair
Heather_Mills
F
Jean-Claude_Trichet
Jeremy_Greenstock
Richard_Shelby
Sedigh_Barmak
Des_Brown
Jane_Riley
F
Ellen_Martin
F
Ronaldo_Luis_Nazario_de_Lima
Bill_Par

Tony_Bennett
Oswaldo_Paya
Ariel_Sharon
George_HW_Bush
Elena_Bovina
F
Tony_Blair
Erskine_Bowles
Lars_Von_Trier
Roman_Polanski
Jafar_Umar_Thalib
Peter_Camejo
David_Tornberg
John_Ashcroft
Franz_Fischler
Luiz_Inacio_Lula_da_Silva
Tony_Blair
Peggy_McGuinness
F
Gerhard_Schroeder
Carla_Del_Ponte
F
George_W_Bush
Alberto_Fujimori
Alberto_Ruiz_Gallardon
Serena_Williams
F
Terje_Roed-Larsen
Venus_Williams
F
Jean_Charest
George_W_Bush
Jason_Kidd
Bryan_Thomas
Mohammad_Hasanein
Eduard_Shevardnadze
Lleyton_Hewitt
Lorraine_Bracco
F
Madeleine_Albright
F
Colin_Farrell
Mariano_Zabaleta
Serena_Williams
F
Brad_Pitt
Jennifer_Capriati
F
Jelena_Dokic
F
Paula_Dobriansky
F
Hillary_Clinton
F
Paul_Patton
Patrick_McEnroe
Gray_Davis
Magui_Serna
F
Lesley_McCulloch
F
Joe_Torre
Joe_Mantello
Bill_Clancy
King_Abdullah_II
Alvaro_Uribe
Shirley_Jones
F
Carlos_Menem
Bernard_Landry
Colin_Powell
Alejandro_Toledo
Al_Pacino
George_W_Bush
Jacques_Chirac
Tom_Craddick
Guillermo_Canas
Malcolm_Jamal_Warner
Dudley_Rogers
Carlos_Moya
K

F
Tim_Henman
Nicholas_Byron
Angelo_Reyes
Christiane_Wulff
F
Michael_Doleac
Hubie_Brown
Zeng_Qinghong
Fidel_Castro
Robinson_Stevenin
Rob_Marshall
Eduard_Shevardnadze
Chang_Dae-whan
Gonzalo_Sanchez_de_Lozada
Dick_Armey
George_W_Bush
Intisar_Ajouri
F
Serena_Williams
F
Lindsay_Benko
F
Laura_Bush
F
Tom_Cruise
John_Travolta
Amer_al-Saadi
Mary-Kate_Olsen
F
Anjum_Hussain
Andrew_Weissmann
Atal_Bihari_Vajpayee
Ariel_Sharon
Chris_Bell
Klaus_Zwickel
Howard_Smith
George_W_Bush
Binyamin_Ben-Eliezer
Brian_Lara
Cameron_Diaz
F
Britney_Spears
F
Nasser_al-Kidwa
Vladimir_Putin
Billy_Crawford
George_McCloud
Hans_Blix
Luiz_Inacio_Lula_da_Silva
Norah_Jones
F
Hans_Blix
Colin_Powell
Tom_Harkin
Inocencio_Arias
Stepan_Demirchian
James_Blake
Hector_Babenco
Henry_Castellanos
Benazir_Bhutto
F
Mahmoud_Abbas
Mikhail_Wehbe
George_W_Bush
Colin_Powell
George_W_Bush
Ariel_Sharon
Dolly_Parton
F
Lleyton_Hewitt
Ridley_Scott
Pierre_Pettigrew
Scott_Dickson
Saeb_Erekat
Blas_Ople
Supachai_Panitchpakdi
Carrie-Anne_Moss
F
Mikhail

Eduardo_Duhalde
Habib_Rizieq
Mireya_Elisa_Moscoso_Rodriguez
F
Richard_Rodriguez
Jenny_Romero
F
Steve_Karsay
Rick_Perry
Michel_Therrien
George_Robertson
Sophia_Loren
F
Liza_Minnelli
F
Adam_Scott
Juan_Carlos_Ferrero
Colin_Powell
Roberto_Marinho
Yu_Shyi-kun
Roger_Federer
Kelvin_Sampson
Michael_Linscott
Colin_Powell
Lleyton_Hewitt
George_Lopez
Richard_Krajicek
Julianne_Moore
F
Mona_Rishmawi
F
Tony_Blair
Fidel_Castro
Serena_Williams
F
Jerome_Golmard
Martha_Lucia_Ramirez
F
Allyson_Felix
F
Dunn_Lampton
Abdel_Madi_Shabneh
George_W_Bush
David_Beckham
Jennifer_Capriati
F
Erin_Brockovich
F
Nina_Jacobson
F
Katie_Harman
F
Gregg_Popovich
Andrew_Fastow
Tom_Ridge
Matt_Damon
Jana_Henke
F
Leslie_Caldwell
F
Jason_Kidd
Colin_Powell
Jean_Chretien
Gray_Davis
Richard_Gere
Serena_Williams
F
Gerhard_Schroeder
Martin_Scorsese
Al_Gore
Walt_Harris
Tony_Blair
Wen_Jiabao
Kathleen_Kennedy_Townsend
F
Yusuf_Misbac
Phil_Mickelson
Jim_Bollman
Miguel_Contreras
JT_Snow
Jean_Chretien
Li_Zhaoxing
Kofi_Annan
John_Paul_II
Lar

F
Lucio_Gutierrez
Tony_Blair
Jason_Biggs
Joe_Mantello
Dean_Barkley
Arnold_Schwarzenegger
Carin_Koch
F
Hector_Mitelman
Hee-Won_Han
F
Gerry_Adams
John_Robbins
Ali_Naimi
Sergio_Vieira_De_Mello
Clara_Harris
F
Lou_Reed
Masahiko_Nagasawa
Abdullatif_Sener
Anette_Hosoi
F
Alex_Popov
Roger_Federer
Jan-Michael_Gambill
Bill_Graham
Pio_Laghi
George_W_Bush
Fidel_Castro
George_W_Bush
Derrick_Rodgers
Joe_Lieberman
Rebecca_Romijn-Stamos
F
Ray_Lucas
Holly_Hunter
F
Tony_Blair
Recep_Tayyip_Erdogan
Jorge_Alberto_Galindo
Junichiro_Koizumi
Paul_Hogan
Peter_Hunt
Danny_Elfman
Sarah_Hughes
F
Colin_Powell
Anastasia_Myskina
F
Jim_Hahn
Jose_Carreras
John_Howard
George_Maxwell_Richards
Vicente_Fox
Jackie_Chan
Tommy_Thompson
Lesley_McCulloch
F
Kim_Ryong-sung
Sarah_Jessica_Parker
F
John_Negroponte
Peter_Lundgren
Angelina_Jolie
F
George_W_Bush
Angelina_Jolie
F
Eduardo_Duhalde
Charles_Grassley
Sergei_Ivanov
Mariah_Carey
F
Paul_Bremer
Rudolph_Giuliani
Miguel_Contreras
Kurt_Warner
Jean_Chretien
William_Umbach
Jeffrey_Imm

In [155]:
num_males = np.where(Sx == 0)[0].shape
print(f"Number of males: {num_males}")

Number of males: (10268,)


In [154]:
num_females = np.where(Sx == 1)[0].shape
print(f"Number of females: {num_females}")

Number of females: (2965,)


In [152]:
np.save(root + "/gender.npy", Sx)