In [2]:
import pandas as pd
import numpy as np
import ast
from sklearn.cluster import KMeans, MiniBatchKMeans




In [3]:
# Path to the raw embeddings table (a comma-separated CSV file).
filename = '../data/embeddings_raw_data.csv'

# Load the CSV into a pandas DataFrame. With on_bad_lines='warn', malformed
# rows are skipped and a warning is emitted instead of aborting the read.
df = pd.read_csv(filepath_or_buffer=filename, on_bad_lines='warn')

In [4]:
# Keep only rows whose 'Genni' value is not the "-" placeholder. The explicit
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignment below cannot trigger SettingWithCopyWarning.
df = df[df['Genni'] != "-"].copy()


def _parse_embedding(value):
    """Return one embedding as a list of Python floats.

    Accepts either the string literal stored in the CSV (e.g. "[0.1, -0.2]")
    or an already-parsed sequence. Handling both makes this cell safe to
    re-run: ast.literal_eval raises ValueError when handed a list.
    """
    if isinstance(value, str):
        value = ast.literal_eval(value)
    return [float(num) for num in value]


# Single pass over the column: parse the literal and coerce every entry to float.
df['Embedding'] = df['Embedding'].apply(_parse_embedding)

df

Unnamed: 0,auid,name,EthnicSeer,prop,lastname,firstname,Ethnea,Genni,SexMac,SSNgender,Highest_probF_ethnicity,Highest_probF_value,Embedding
0,13294057_2,aarne Rintala,IND,0.829189,Rintala,Aarne,NORDIC,M,male,M,NORDIC,99.973,"[-0.020803842693567276, 0.0041910335421562195,..."
1,18124682_1,otho ROSS,GER,0.612647,ROSS,Otho,GERMAN,M,male,M,INDIAN,46.995,"[-0.001662388676777482, -0.01729852892458439, ..."
2,9858962_2,Ernesta Parisi,ITA,0.815189,Parisi,Ernesta,ITALIAN,F,female,F,ITALIAN,44.376,"[-0.023890763521194458, 0.0015228496631607413,..."
3,7475844_2,Lui Forni,ITA,0.875418,Forni,Lui,ITALIAN-CHINESE,M,andy,M,CHINESE,82.237,"[-0.00348041788674891, -0.010045161470770836, ..."
4,8423332_6,marjoleine Dekker,GER,0.429062,Dekker,Marjoleine,DUTCH,F,female,-,DUTCH,96.696,"[-0.012544992379844189, -0.004416018724441528,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95312,13501857_1,korkmaz Altug,GER,0.702783,Altug,Korkmaz,TURKISH,M,male,-,TURKISH,65.933,"[0.005675738211721182, -0.02606821246445179, 0..."
95316,5450086_2,dirkingo Wolfrum,GER,0.908264,Wolfrum,Dirkingo,GERMAN,M,andy,-,JAPANESE,75.179,"[-0.020973743870854378, 0.007845391519367695, ..."
95327,14663758_1,Manikandan,IND,0.615995,S,Manikandan,INDIAN,M,andy,-,INDIAN,84.538,"[-0.0020628455094993114, -0.005263116210699081..."
95331,16634083_2,Aekaterini Sakellariou,ENG,0.914353,Sakellariou,Aekaterini,GREEK,F,andy,-,GREEK,97.934,"[-0.0036378998775035143, -0.004696441814303398..."


In [5]:
%%time
# Stack the per-row embedding lists into one 2-D array (one row per name).
# dtype=float makes ragged or non-numeric embeddings fail loudly here instead
# of silently producing an object array.
X = np.array(df['Embedding'].tolist(), dtype=float)

# L2-normalise each row. An all-zero embedding has norm 0, and dividing by it
# would produce a NaN row that breaks the clustering step; such rows are
# divided by 1 instead, which leaves them as zero vectors.
row_norms = np.linalg.norm(X, axis=1, keepdims=True)
X_normalized = X / np.where(row_norms == 0, 1.0, row_norms)

print(len(X))

35879
CPU times: user 1.34 s, sys: 142 ms, total: 1.48 s
Wall time: 1.48 s


In [7]:
# Count rows per 'Highest_probF_ethnicity' label (value_counts orders by
# descending frequency), then print the full table followed by the number of
# distinct labels.
ethnea_counts = df['Highest_probF_ethnicity'].value_counts()
print(ethnea_counts.to_string(), len(ethnea_counts), sep='\n')

HISPANIC      4748
ENGLISH       4298
INDIAN        3822
ARAB          3242
CHINESE       2308
JAPANESE      2162
NORDIC        2122
SLAV          2006
GERMAN        1455
FRENCH        1423
TURKISH       1401
ITALIAN       1245
DUTCH         1192
GREEK          814
ISRAELI        797
AFRICAN        751
KOREAN         708
THAI           583
BALTIC         306
ROMANIAN       206
HUNGARIAN      200
INDONESIAN      32
VIETNAMESE      31
CARIBBEAN       23
MONGOLIAN        1
25


In [46]:
%%time
# Cluster counts to sweep; one CSV of cluster-labelled names is written per value.
n_clusters_list = [50, 100, 500, 1000]

# Columns written to each CSV, in output order (loop-invariant, so defined once).
export_columns = ['firstname', 'Cluster', 'Highest_probF_ethnicity', 'Highest_probF_value', 'Genni']

# Fail fast if the feature matrix and the DataFrame are out of sync (e.g. a
# cell was re-run out of order). Otherwise the mismatch only surfaces at the
# 'Cluster' assignment below, after a full model fit has already been paid for.
if len(X_normalized) != len(df):
    raise ValueError(
        f"X_normalized has {len(X_normalized)} rows but df has {len(df)}; "
        "re-run the embedding cells before clustering."
    )

for n_clusters in n_clusters_list:
    # Work on a copy so the cluster labels never leak into the shared DataFrame.
    df_copy = df.copy()

    # Fit mini-batch K-means on the normalised embeddings; random_state is
    # fixed so repeated runs give the same clustering.
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=2048)
    kmeans.fit(X_normalized)

    # Assign every row to its nearest centroid and show a small sanity sample.
    y_kmeans = kmeans.predict(X_normalized)
    print(y_kmeans[0:10])
    print(len(y_kmeans))

    # Labels are a plain array, so they attach to the rows by position.
    df_copy['Cluster'] = y_kmeans

    # Group rows by cluster id. kind='stable' keeps the source row order
    # inside each cluster; the default sort gives no such guarantee.
    df_sorted = df_copy[export_columns].sort_values(by='Cluster', kind='stable')

    # Export sorted DataFrame to CSV file
    csv_filename = f'../data/3_clusters_{n_clusters}.csv'
    df_sorted.to_csv(csv_filename, index=False)

    print(f"Sorted embeddings saved to {csv_filename}.")


