In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.cluster import KMeans, MiniBatchKMeans

In [None]:
# Load the raw embeddings table. With on_bad_lines='warn', pandas warns about
# malformed CSV lines and skips them instead of raising.
filename = '../data/2_embeddings_raw_data.csv'
df = pd.read_csv(
    filepath_or_buffer=filename,
    on_bad_lines='warn',
)

In [None]:
def _parse_embedding(raw):
    """Return one embedding as a list of floats.

    `raw` is either the string form read from the CSV (a Python list literal)
    or an already-parsed sequence of numbers. Accepting both makes this cell
    safe to re-run: calling ast.literal_eval on an already-parsed list raises
    ValueError.
    """
    values = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return [float(num) for num in values]


# Parse the stringified embeddings and coerce every component to float
df['Embedding'] = df['Embedding'].apply(_parse_embedding)

df

In [None]:
# Stack the per-row embedding lists into one array (one row per DataFrame row)
X = np.array(df['Embedding'].tolist())

# Scale every row to unit L2 norm; keepdims=True keeps the norms as a column
# so the division broadcasts row-wise
row_norms = np.linalg.norm(X, axis=1, keepdims=True)
X_normalized = X / row_norms

# Number of embedded rows
print(X.shape[0])

In [None]:
%%time
# Apply K-means clustering
df_copy = df.copy()
kmeans = MiniBatchKMeans(n_clusters=120, random_state=0, batch_size=2048)
kmeans.fit(X_normalized)

y_kmeans = kmeans.predict(X_normalized)
print(y_kmeans[0:10])
print(len(y_kmeans))


In [None]:
# Attach each row's cluster label to the copied DataFrame
df_copy['Cluster'] = y_kmeans

# Keep the name, cluster and demographic columns, order the rows by cluster
# and give the demographic columns shorter names
kept_columns = ['firstname', 'Cluster', 'Highest_probF_ethnicity', 'Highest_probF_value', 'Genni']
renamed_columns = {
    'Highest_probF_ethnicity': 'Ethnicity',
    'Highest_probF_value': 'Ethnicity Probability',
    'Genni': 'Gender',
}
df_sorted = df_copy[kept_columns].sort_values(by='Cluster').rename(columns=renamed_columns)

# (Ethnicity, Gender) tuple per row, used as the group key by later cells
df_sorted['Group'] = list(df_sorted[['Ethnicity', 'Gender']].itertuples(index=False, name=None))

# Write the cluster-labelled table to disk
csv_filename = '../data/3_clusters_120.csv'
df_sorted.to_csv(csv_filename, index=False)
print(f"Sorted embeddings saved to {csv_filename}.")

In [None]:
# Display the cluster-sorted table (names, cluster id, ethnicity, gender, group)
df_sorted

In [None]:
# Disabled experiment: random sampling of 10 names per (Ethnicity, Gender) group without using clustering
# sampled_df = df_sorted.groupby(['Ethnicity', 'Gender']).apply(lambda x: x.sample(n=10, replace=True if len(x) < 10 else False)).reset_index(drop=True)
# sampled_df.to_csv('test.csv')

In [None]:
## Creates a df with information about each cluster

# Number of rows per cluster, as columns ['Cluster', 'Size'].
# rename_axis + reset_index(name=...) does not depend on how value_counts()
# names its result: pandas >= 2.0 already labels the index 'Cluster' and the
# values 'count', so renaming {'index': 'Cluster', 'Cluster': 'Size'} would
# turn the 'Cluster' key into 'Size' and break the merges below.
cluster_sizes = (
    df_sorted['Cluster']
    .value_counts()
    .rename_axis('Cluster')
    .reset_index(name='Size')
)

# Find the group with the highest count for each cluster
highest_group = df_sorted.groupby('Cluster')['Group'].apply(lambda x: x.value_counts().idxmax()).reset_index(name='Group')

# Calculate the percentage of rows with the highest group for each cluster
highest_group_percent = df_sorted.groupby('Cluster')['Group'].apply(lambda x: (x.value_counts(normalize=True).max() * 100).round(2)).reset_index(name='Group_Acc')

# Merge the dfs: one row per cluster with its size, dominant group and the
# share (in percent) of rows belonging to that group
result_df = pd.merge(cluster_sizes, highest_group, on='Cluster')
result_df = pd.merge(result_df, highest_group_percent, on='Cluster')

# Least homogeneous clusters first
result_df = result_df.sort_values(by='Group_Acc').reset_index(drop=True)

# Remove issue clusters.
# NOTE: 72 is a hard-coded cluster id; ids come from the fitted KMeans model,
# so re-check this value whenever the data or clustering parameters change.
result_df = result_df[result_df['Cluster'] != 72]

result_df


In [None]:
## Only takes clusters with high ethnicity/gender agreement

# A cluster is kept when its dominant group covers more than 50% of its rows
# and the cluster holds at least 10 rows
is_homogeneous = result_df['Group_Acc'] > 50
is_large_enough = result_df['Size'] > 9
chosen_clusters = result_df[is_homogeneous & is_large_enough]

# Show the tail of the selection and the mean size of the kept clusters
print(chosen_clusters.reset_index(drop=True).tail())
mean_cluster_size = chosen_clusters['Size'].mean()
print("Average Cluster Size: " + str(mean_cluster_size))

In [None]:
# Sample up to 10 names for every (Ethnicity, Gender) group that dominates at
# least one chosen cluster.
names_per_group = 10

# Group by the column label itself, not a one-element list: with ['Group'],
# pandas >= 2.0 yields 1-tuple keys while iterating, and the equality test
# against df_sorted['Group'] below would then match no rows.
grouped_df = chosen_clusters.groupby('Group')

# Collect the per-group samples and concatenate once after the loop
sampled_frames = []

for group_label, group_data in grouped_df:
    cluster_list = group_data['Cluster'].to_list()

    # Find all rows in 'df_sorted' that carry this 'Group' and sit in one of
    # the clusters dominated by it
    matching_rows = df_sorted[
        (df_sorted['Group'] == group_label) &
        (df_sorted['Cluster'].isin(cluster_list))
    ]

    # A chosen cluster may contribute fewer than 10 rows of its dominant group
    # (e.g. a 10-row cluster with 60% agreement), so cap the sample size
    # instead of letting sample() raise.
    n_rows = min(names_per_group, len(matching_rows))
    if n_rows < names_per_group:
        print(f"{group_label}: only {n_rows} matching rows available; sampling all of them.")

    # Take a random sample of the matching rows (fixed seed for reproducibility)
    sampled_frames.append(matching_rows.sample(n=n_rows, random_state=32))

# An empty selection stays an empty DataFrame, as before
selected_rows_df = pd.concat(sampled_frames, ignore_index=True) if sampled_frames else pd.DataFrame()


selected_rows_df

In [None]:
# Find any missing groups: for every ethnicity present in the chosen clusters,
# report each of the ('F', 'M') gender pairings that has no chosen cluster.
unique_groups = list(chosen_clusters['Group'].unique())

# Map each ethnicity to the set of genders it appears with
ethnicity_gender_dict = {}

for ethnicity, gender in unique_groups:
    ethnicity_gender_dict.setdefault(ethnicity, set()).add(gender)

# Check 'F' and 'M' independently: an ethnicity whose only gender label is
# neither 'F' nor 'M' is missing both pairings, which an if/elif would
# under-report by listing only 'F'.
missing_groups = [
    (ethnicity, gender)
    for ethnicity, genders in ethnicity_gender_dict.items()
    for gender in ('F', 'M')
    if gender not in genders
]

print(missing_groups)

# Full list of groups (present and missing), in sorted order
unique_groups_sorted = sorted(unique_groups + missing_groups)
for group in unique_groups_sorted:
    print(group)

In [None]:
# Print 10 random first names from each ethnicity with missing gender pair
names_per_group = 10

# The ENGLISH pairs are always sampled this way. Append them only when absent,
# so re-running the cell (or ENGLISH already being listed as missing) does not
# add duplicate entries that would sample the very same rows twice.
for english_group in [('ENGLISH', 'F'), ('ENGLISH', 'M')]:
    if english_group not in missing_groups:
        missing_groups.append(english_group)

# Collect the per-group samples and concatenate once after the loop
extra_frames = []

for group in missing_groups:
    # Candidates: rows of this group whose ethnicity probability exceeds 90
    candidates = df_sorted[(df_sorted["Group"] == group) & (df_sorted['Ethnicity Probability'] > 90)]

    # Cap the sample size so a group with fewer than 10 candidates does not
    # make sample() raise
    n_rows = min(names_per_group, len(candidates))
    if n_rows < names_per_group:
        print(f"{group}: only {n_rows} candidate rows available; sampling all of them.")

    selected_rows = candidates.sample(n=n_rows, replace=False, random_state=4)
    extra_frames.append(selected_rows)
    print(f"{group}: {selected_rows['firstname'].tolist()}")

# NOTE: this extends the existing `selected_rows_df`; running the cell again
# without rebuilding that frame first appends the same rows a second time.
selected_rows_df = pd.concat([selected_rows_df, *extra_frames], ignore_index=True)


In [None]:
# Persist the selected names; the DataFrame index is written as the first
# CSV column (to_csv default)
name_groups_csv = '../data/3_name_groups.csv'
selected_rows_df.to_csv(name_groups_csv)

In [None]:
# Report how many distinct (Ethnicity, Gender) groups ended up in the selection
distinct_groups = selected_rows_df['Group'].unique()
print("Number of different groups: ", len(distinct_groups))