In [176]:
import pandas as pd
import numpy as np
import ast
from sklearn.cluster import KMeans, MiniBatchKMeans

In [177]:
filename = '../data/2_embeddings_raw_data.csv'

# Read the TSV file into a pandas DataFrame
df = pd.read_csv(filename, on_bad_lines='warn')

In [178]:
df['Embedding'] = df['Embedding'].apply(ast.literal_eval)
df['Embedding'] = df['Embedding'].apply(lambda x: [float(num) for num in x])

df

Unnamed: 0,auid,name,EthnicSeer,prop,lastname,firstname,Ethnea,Genni,SexMac,SSNgender,Highest_probF_ethnicity,Highest_probF_value,Embedding
0,9731334_2,Cameron 'Dale' Bass,ITA,0.653567,'Dale' Bass,Cameron,ENGLISH,M,mostly_male,-,ENGLISH,92.191,"[0.0027535639237612486, -0.006899471394717693,..."
1,2155715_1,Bert Hart,ENG,0.772359,Hart,Bert,DUTCH,M,male,M,DUTCH,87.200,"[-0.012196633964776993, -0.034759119153022766,..."
2,14609221_2,Esther Nolte- Hoen,GER,0.665081,Nolte- Hoen,Esther,GERMAN,F,female,F,HISPANIC,43.243,"[-0.02559061162173748, -0.02379501983523369, -..."
3,8101337_1,Ellen 't Hoen,CHI,0.665526,'t Hoen,Ellen,DUTCH,F,female,F,DUTCH,37.459,"[-0.014605682343244553, -0.030205124989151955,..."
4,9804785_2,Peter 't Hoen,GER,0.330864,'t Hoen,Peter,DUTCH,M,male,M,GERMAN,34.203,"[0.018846435472369194, -0.026805326342582703, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35887,9239360_5,Senda Mezghani,ARA,0.691154,Mezghani,Senda,ARAB,F,andy,F,HISPANIC,60.288,"[0.0009060240699909627, -0.012940743006765842,..."
35888,9258893_4,Raquelle Mesholam-Gately,FRN,0.671148,Mesholam-Gately,Raquelle,ENGLISH-HISPANIC,F,andy,F,HISPANIC,99.978,"[-0.009965583682060242, 0.001120477681979537, ..."
35889,12621069_2,Niki Messini-Nikolaki,RUS,0.946401,Messini-Nikolaki,Niki,GREEK,F,female,F,GREEK,96.862,"[-0.00857141800224781, -0.01564069651067257, 0..."
35890,3154416_5,Tahar Mestiri,FRN,0.376373,Mestiri,Tahar,ARAB,M,male,-,ARAB,83.593,"[-0.012485790997743607, -0.01225669402629137, ..."


In [244]:
# Remove some names with low probability of being in top ethnicities to have better class balance for clustering
top_ethnicities = ['HISPANIC', 'ENGLISH', 'INDIAN', 'ARAB', 'CHINESE', 'JAPANESE', 'NORDIC', 'SLAV']
df = df[~df["Highest_probF_ethnicity"].isin(top_ethnicities) | ((df["Highest_probF_ethnicity"].isin(top_ethnicities) & (df["Highest_probF_value"] > 95)))]

In [256]:
# ethnea_counts = df['Highest_probF_ethnicity'].value_counts()
# print(ethnea_counts.to_string())
# print(len(ethnea_counts))

In [247]:
# Convert the 'Embedding' column to a numpy array
X = np.array(df['Embedding'].tolist())

# Normalize the embeddings
X_normalized = X / np.linalg.norm(X, axis=1, keepdims=True)

print(len(X))

21813


In [321]:
%%time
# Apply K-means clustering
df_copy = df.copy()
kmeans = MiniBatchKMeans(n_clusters=150, random_state=0, batch_size=2048)
kmeans.fit(X_normalized)

y_kmeans = kmeans.predict(X_normalized)
print(y_kmeans[0:10])
print(len(y_kmeans))




[ 62  64 126  48 141 126 125  12  48 111]
21813
CPU times: user 7min 16s, sys: 5.45 s, total: 7min 22s
Wall time: 49.1 s


In [322]:
# Add 'Cluster' column to the copied DataFrame
df_copy['Cluster'] = y_kmeans

# Sort DataFrame by 'Cluster' column
df_sorted = df_copy[['firstname', 'Cluster', 'Highest_probF_ethnicity', 'Highest_probF_value', 'Genni']].sort_values(by='Cluster')
# Rename columns
df_sorted.rename(columns={'Highest_probF_ethnicity': 'Ethnicity', 'Highest_probF_value': 'Ethnicity Probability', 'Genni': 'Gender'}, inplace=True)
df_sorted['Group'] = list(zip(df_sorted['Ethnicity'], df_sorted['Gender']))

# Save to csv
csv_filename = '../data/3_clusters_150.csv'
df_sorted.to_csv(csv_filename, index=False)
print(f"Sorted embeddings saved to {csv_filename}.")

Sorted embeddings saved to ../data/3_clusters_150.csv.


In [323]:
df_sorted

Unnamed: 0,firstname,Cluster,Ethnicity,Ethnicity Probability,Gender,Group
32626,Ede,0,HUNGARIAN,38.357,F,"(HUNGARIAN, F)"
32601,Edyta,0,SLAV,99.779,F,"(SLAV, F)"
4454,Edzer,0,TURKISH,56.486,M,"(TURKISH, M)"
12373,Edvaldo,0,HISPANIC,99.158,M,"(HISPANIC, M)"
32630,Edina,0,HUNGARIAN,78.443,F,"(HUNGARIAN, F)"
...,...,...,...,...,...,...
10242,Duran,149,TURKISH,99.979,M,"(TURKISH, M)"
11618,Dennie,149,DUTCH,48.823,M,"(DUTCH, M)"
25325,Donnall,149,ENGLISH,99.934,M,"(ENGLISH, M)"
8669,Darek,149,SLAV,100.000,M,"(SLAV, M)"


In [324]:
## Creates a df with information about each cluster

cluster_sizes = df_sorted['Cluster'].value_counts().reset_index().rename(columns={'index': 'Cluster', 'Cluster': 'Size'})

# Find the group with the highest count for each cluster
highest_group = df_sorted.groupby('Cluster')['Group'].apply(lambda x: x.value_counts().idxmax()).reset_index(name='Group')

# Calculate the percentage of rows with the highest group for each cluster
highest_group_percent = df_sorted.groupby('Cluster')['Group'].apply(lambda x: (x.value_counts(normalize=True).max() * 100).round(2)).reset_index(name='Group_Acc')

# Merge the DataFrames
result_df = pd.merge(cluster_sizes, highest_group, on='Cluster')
result_df = pd.merge(result_df, highest_group_percent, on='Cluster')

result_df = result_df.sort_values(by='Group_Acc').reset_index(drop=True)

# Print the combined DataFrame
result_df


Unnamed: 0,Cluster,Size,Group,Group_Acc
0,35,146,"(KOREAN, F)",6.16
1,130,184,"(FRENCH, F)",7.61
2,68,217,"(ARAB, F)",8.76
3,38,180,"(AFRICAN, M)",8.89
4,24,146,"(ISRAELI, F)",8.90
...,...,...,...,...
145,75,151,"(JAPANESE, M)",94.04
146,118,109,"(JAPANESE, M)",95.41
147,37,137,"(CHINESE, M)",96.35
148,112,135,"(JAPANESE, F)",97.78


In [325]:
## Only takes clusters with high ethnicity/gender agreement

# Filter clusters with Ethnicity and Gender Acc > 50%
chosen_clusters = result_df[(result_df['Group_Acc'] > 50)]

# Only keep size of 10 or above
chosen_clusters = chosen_clusters[chosen_clusters['Size'] > 9]

# Print the filtered DataFrame
print(chosen_clusters.reset_index(drop=True).tail())
print("Average Cluster Size: " + str(chosen_clusters['Size'].mean()))

    Cluster  Size          Group  Group_Acc
60       75   151  (JAPANESE, M)      94.04
61      118   109  (JAPANESE, M)      95.41
62       37   137   (CHINESE, M)      96.35
63      112   135  (JAPANESE, F)      97.78
64       17   230    (INDIAN, M)      97.83
Average Cluster Size: 155.06153846153848


In [328]:
grouped_df = chosen_clusters.groupby(['Group'])
selected_rows_df = pd.DataFrame()

for group_id, (group_label, group_data) in enumerate(grouped_df):
    cluster_list = group_data['Cluster'].to_list()

    # Find all matching rows in 'results_df' with the same 'Group'
    # from a list of clusters
    matching_rows = df_sorted[
        (df_sorted['Group'] == group_label) &
        (df_sorted['Cluster'].isin(cluster_list))
    ]

    # Take a random 10 rows from the matching rows
    selected_rows = matching_rows.sample(n=10, random_state=32)

    # Concatenate the selected rows with the 'selected_rows_df' DataFrame
    selected_rows_df = pd.concat([selected_rows_df, selected_rows], ignore_index=True)


selected_rows_df

Unnamed: 0,firstname,Cluster,Ethnicity,Ethnicity Probability,Gender,Group
0,Kayode,54,AFRICAN,94.402,M,"(AFRICAN, M)"
1,Adeniyi,54,AFRICAN,99.562,M,"(AFRICAN, M)"
2,Kolawole,54,AFRICAN,95.009,M,"(AFRICAN, M)"
3,Adewale,54,AFRICAN,98.553,M,"(AFRICAN, M)"
4,Rotimi,54,AFRICAN,93.058,M,"(AFRICAN, M)"
...,...,...,...,...,...,...
285,Sabahattin,55,TURKISH,99.992,M,"(TURKISH, M)"
286,Sevket,55,TURKISH,99.975,M,"(TURKISH, M)"
287,Sahin,55,TURKISH,98.843,M,"(TURKISH, M)"
288,Ahmet,55,TURKISH,100.000,M,"(TURKISH, M)"


In [329]:
# Find any missing groups
missing_groups = []

unique_groups = list(chosen_clusters['Group'].unique())

ethnicity_gender_dict = {}

for ethnicity, gender in unique_groups:
    if ethnicity not in ethnicity_gender_dict:
        ethnicity_gender_dict[ethnicity] = set()
    ethnicity_gender_dict[ethnicity].add(gender)

missing_groups = []

for ethnicity, genders in ethnicity_gender_dict.items():
    if 'F' not in genders:
        missing_groups.append((ethnicity, 'F'))
    elif 'M' not in genders:
        missing_groups.append((ethnicity, 'M'))

print(missing_groups)

unique_groups_sorted = sorted(unique_groups + missing_groups)
for group in unique_groups_sorted:
    print(group)

[('KOREAN', 'F'), ('ROMANIAN', 'F'), ('NORDIC', 'F'), ('THAI', 'F'), ('AFRICAN', 'F'), ('DUTCH', 'F'), ('ISRAELI', 'F'), ('ENGLISH', 'M'), ('GERMAN', 'F')]
('AFRICAN', 'F')
('AFRICAN', 'M')
('ARAB', 'F')
('ARAB', 'M')
('CHINESE', 'F')
('CHINESE', 'M')
('DUTCH', 'F')
('DUTCH', 'M')
('ENGLISH', 'F')
('ENGLISH', 'M')
('FRENCH', 'F')
('FRENCH', 'M')
('GERMAN', 'F')
('GERMAN', 'M')
('GREEK', 'F')
('GREEK', 'M')
('HISPANIC', 'F')
('HISPANIC', 'M')
('INDIAN', 'F')
('INDIAN', 'M')
('ISRAELI', 'F')
('ISRAELI', 'M')
('ITALIAN', 'F')
('ITALIAN', 'M')
('JAPANESE', 'F')
('JAPANESE', 'M')
('KOREAN', 'F')
('KOREAN', 'M')
('NORDIC', 'F')
('NORDIC', 'M')
('ROMANIAN', 'F')
('ROMANIAN', 'M')
('SLAV', 'F')
('SLAV', 'M')
('THAI', 'F')
('THAI', 'M')
('TURKISH', 'F')
('TURKISH', 'M')


In [330]:
# Print 10 random first names from each ethnicity
for group in missing_groups:
    selected_rows = df_sorted[(df_sorted["Group"] == group) & (df_sorted['Ethnicity Probability'] > 90)].sample(n=10, replace=True)
    selected_rows_df = pd.concat([selected_rows_df, selected_rows], ignore_index=True)
    print(f"{group}: {selected_rows['firstname'].tolist()}")


('KOREAN', 'F'): ['Soyoung', 'Soung', 'Mee', 'Mijung', 'Marshae', 'Sujin', 'Jea', 'Seyeon', 'Yoonjung', 'Hosoon']
('ROMANIAN', 'F'): ['Camelia', 'Luminita', 'Georgeta', 'Mihaelina', 'Marioara', 'Viorela', 'Iuliana', 'Florina', 'Iuliana', 'Florina']
('NORDIC', 'F'): ['Juulia', 'Ellisiv', 'Ulrikke', 'Mirkka', 'Ashild', 'Liis', 'Sveinbjorg', 'Ingebjorg', 'Gunnel', 'Grete']
('THAI', 'F'): ['Podjanee', 'Achara', 'Pornpimon', 'Prapaporn', 'Peeranuch', 'Apiradee', 'Malinee', 'Suwimol', 'Melanee', 'Piyanart']
('AFRICAN', 'F'): ['Olukemi', 'Hiwot', 'Adwoa', 'Omolola', 'Olukemi', 'Hlengiwe', 'Thokozile', 'Liezl', 'Oluwakemi', 'Olajumoke']
('DUTCH', 'F'): ['Tineke', 'About', 'Anniek', 'Frouwke', 'Leontien', 'Ans', 'Dorien', 'Taqdees', 'Kristien', 'Balqees']
('ISRAELI', 'F'): ['Rakefet', 'Zipi', 'Pnina', 'Batsheva', 'Tziporah', 'Gali', 'Mazal', 'Hadassah', 'Ahava', 'Mirit']
('ENGLISH', 'M'): ['Hugh', 'Jordon', 'Bennett', 'Lindell', 'Curtis', 'Chet', 'Keene', 'Aidan', 'Sherwood', 'Bennett']
('GERMA

In [333]:
selected_rows_df.to_csv('../data/3_name_groups.csv')

In [332]:
print("Number of different groups: ", len(selected_rows_df['Group'].unique()))

Number of different groups:  38


In [207]:
# # OPTIONAL: Chart showing gender and ethnicity agreement

# ## Creates a df_sorted with information about each cluster

# cluster_sizes = df_sorted['Cluster'].value_counts().reset_index().rename(columns={'index': 'Cluster', 'Cluster': 'Size'})

# # Calculate the percentage of rows with the highest ethnicity for each cluster
# highest_ethnicity_percent = df_sorted.groupby('Cluster')['Ethnicity'].apply(lambda x: (x.value_counts(normalize=True).max() * 100).round(2)).reset_index(name='Ethnicity_Acc')

# # Calculate the percentage of rows with the highest gender for each cluster
# highest_gender_percent = df_sorted.groupby('Cluster')['Gender'].apply(lambda x: (x.value_counts(normalize=True).max() * 100).round(2)).reset_index(name='Gender_Acc')

# # Find the ethnicity with the highest count for each cluster
# highest_ethnicity = df_sorted.groupby('Cluster')['Ethnicity'].apply(lambda x: x.value_counts().idxmax()).reset_index(name='Ethnicity')

# # Find the gender with the highest count for each cluster
# highest_gender = df_sorted.groupby('Cluster')['Gender'].apply(lambda x: x.value_counts().idxmax()).reset_index(name='Genni')

# # Merge the DataFrames
# result_df = pd.merge(cluster_sizes, highest_ethnicity, on='Cluster')
# result_df = pd.merge(result_df, highest_ethnicity_percent, on='Cluster')
# result_df = pd.merge(result_df, highest_gender, on='Cluster')
# result_df = pd.merge(result_df, highest_gender_percent, on='Cluster')

# result_df = result_df.sort_values(by='Cluster').reset_index(drop=True)

# import matplotlib.pyplot as plt

# plt.figure(figsize=(8, 6))  # Set the figure size

# # Scatter plot
# plt.scatter(result_df['Ethnicity_Acc'], result_df['Gender_Acc'])

# # Add labels to the points
# for i in range(len(result_df)):
#     plt.text(result_df['Ethnicity_Acc'][i], result_df['Gender_Acc'][i], result_df['Ethnicity'][i])

# # Set the axis labels
# plt.xlabel('Ethnicity Accuracy')
# plt.ylabel('Gender Accuracy')

# # Set the title
# plt.title('Accuracy by Ethnicity and Gender')

# # Show the plot
# plt.show()
