In [9]:
import pandas as pd
import numpy as np
import ast
from sklearn.cluster import KMeans

In [10]:
# Location of the comma-separated file holding one embedding per name.
filename = '../data/embeddings.csv'

# Load the CSV into a DataFrame; malformed rows are skipped with a warning
# instead of aborting the read.
df = pd.read_csv(filepath_or_buffer=filename, on_bad_lines='warn')

In [11]:
def _parse_embedding(raw):
    """Return one embedding as a list of floats.

    ``raw`` is either the text stored in the CSV (a Python list literal) or
    an already-parsed sequence of numbers.
    """
    # Only strings go through literal_eval: it rejects anything that is not a
    # string or AST node, so re-running this cell on already-parsed lists
    # would otherwise raise ValueError.
    values = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return [float(num) for num in values]


# Parse the 'Embedding' column in a single pass; safe to run more than once.
df['Embedding'] = df['Embedding'].apply(_parse_embedding)

df

Unnamed: 0,auid,name,EthnicSeer,prop,lastname,firstname,Ethnea,Genni,SexMac,SSNgender,Highest_probF_ethnicity,Highest_probF_value,Embedding
0,9731334_2,Cameron 'Dale' Bass,ITA,0.653567,'Dale' Bass,Cameron,ENGLISH,M,mostly_male,-,ENGLISH,92.191,"[0.002752959029749036, -0.006888952571898699, ..."
1,2155715_1,Bert Hart,ENG,0.772359,Hart,Bert,DUTCH,M,male,M,DUTCH,87.200,"[-0.012196633964776993, -0.034759119153022766,..."
2,7867892_1,Leen Hart,ENG,0.980865,Hart,Leen,DUTCH,-,male,F,DUTCH,98.679,"[0.00720670260488987, -0.011822231113910675, 0..."
3,14609221_2,Esther Nolte- Hoen,GER,0.665081,Nolte- Hoen,Esther,GERMAN,F,female,F,HISPANIC,43.243,"[-0.02559061162173748, -0.02379501983523369, -..."
4,8101337_1,Ellen 't Hoen,CHI,0.665526,'t Hoen,Ellen,DUTCH,F,female,F,DUTCH,37.459,"[-0.014605682343244553, -0.030205124989151955,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,10974234_3,Veeramuthu Balakrishnan,IND,1.000000,Balakrishnan,Veeramuthu,INDIAN,-,andy,-,INDIAN,97.739,"[-0.026633543893694878, -0.02088879607617855, ..."
3996,5532244_1,Vallath Balakrishnan,IND,0.997178,Balakrishnan,Vallath,INDIAN,-,andy,-,INDIAN,73.630,"[-0.011875685304403305, -0.009170440025627613,..."
3997,11506617_2,Rengarajan Balamurugan,IND,0.996938,Balamurugan,Rengarajan,INDIAN,M,andy,-,INDIAN,74.383,"[0.0070220064371824265, 0.0025403141044080257,..."
3998,10426449_1,Balamurugan Sampathkumar,IND,0.999982,Sampathkumar,Balamurugan,INDIAN,M,andy,-,INDIAN,79.123,"[-0.01752883940935135, -0.017201565206050873, ..."


In [12]:
# Stack the per-row embedding lists into one array
# (assumes every embedding has the same length, giving a 2-D matrix).
X = np.array(df['Embedding'].tolist())

# Scale every embedding to unit L2 norm. An all-zero vector has norm 0 and
# would become NaN under plain division (which KMeans rejects), so a zero
# norm is replaced by 1 and that row simply stays all-zero.
norms = np.linalg.norm(X, axis=1, keepdims=True)
X_normalized = X / np.where(norms == 0, 1.0, norms)

# Apply K-means clustering with a fixed seed so the run is repeatable.
kmeans = KMeans(n_clusters=36, random_state=0)
kmeans.fit(X_normalized)

# Distance from every sample to each of the fitted cluster centres.
distances = kmeans.transform(X_normalized)
print(distances)

# Index of the closest cluster centre for every sample.
y_kmeans = kmeans.predict(X_normalized)
print(y_kmeans[0:10])
print(len(y_kmeans))



[[0.55672985 0.49156912 0.49729583 ... 0.47458441 0.48275444 0.47161205]
 [0.54949415 0.50336317 0.50057728 ... 0.48176613 0.5257838  0.45489831]
 [0.54640057 0.50411595 0.50556675 ... 0.48179563 0.51497566 0.47982215]
 ...
 [0.38307962 0.49611513 0.49669702 ... 0.49076456 0.46039051 0.49809855]
 [0.37955405 0.5431036  0.5095641  ... 0.5296735  0.50367174 0.54935152]
 [0.54263133 0.50550633 0.51426677 ... 0.50215622 0.52964048 0.50410912]]
[13 13  8  6  6 12  8 31 12  6]
4000


In [13]:
# Store each row's K-means label (y_kmeans, predicted from the embeddings taken
# from df) in a new 'Cluster' column.
df['Cluster'] = y_kmeans


In [14]:
# Columns kept in the exported per-name cluster report.
report_columns = ['firstname', 'Cluster', 'Highest_probF_ethnicity', 'Highest_probF_value', 'Genni']

# Select the report columns and order the rows by cluster label.
df_sorted = df.loc[:, report_columns].sort_values('Cluster')

# Write the report to disk, leaving out the DataFrame index.
df_sorted.to_csv('../data/embeddings_sorted.csv', index=False)

print("Sorted embeddings saved to embeddings_sorted.csv.")

Sorted embeddings saved to embeddings_sorted.csv.


In [15]:
# First names of the rows assigned to cluster label 0
# (the variable says "cluster_1", but the label selected is 0).
in_cluster_zero = df['Cluster'] == 0
cluster_1_names = df.loc[in_cluster_zero, 'firstname']
print(cluster_1_names)

753            Adaikkalam
854             Kunamneni
970           Thirugnanam
1323                Ajith
1324               Ajitha
1325         Valaparambil
1526           Akulapalli
1695           Kannayiram
1703              Alamelu
2060               Alluri
2464            Kadirvelu
2494         Anandakathir
2506           Muthuswamy
2518            Anbalagan
2519               Asokan
2521             Anbarasu
2697               Angadi
2699            Angamuthu
2748            Anilkumar
2759           Muragundla
2760           Yerramilli
2773            Annamalai
2921             Appasamy
2923               Appavu
2927            Appuswamy
2963           Aranapakam
2969                 Arul
3017              Aravind
3133         Palaniyappan
3139              Arjunan
3161       Arunmozhiarasi
3190             Arulandu
3245          Arulkumaran
3247             Arumugam
3249           Nagalingam
3251          Arumugasamy
3252            Arumugham
3255          Arunachalam
3257    Arun

In [16]:
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt

# Project the embeddings to 2-D for plotting. Note this is fitted on the raw
# matrix X, not on the unit-normalised X_normalized that K-means was fitted on.
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
vis_dims2 = tsne.fit_transform(X)

# Split the projection into its two coordinate columns.
x = vis_dims2[:, 0]
y = vis_dims2[:, 1]

# Draw every cluster label present in df.Cluster. A fixed list of four colours
# would silently leave out every cluster with a label of 4 or higher, so one
# colour per label is sampled from a colormap instead.
cluster_labels = df.Cluster.to_numpy()
cluster_ids = np.unique(cluster_labels)
colormap = plt.get_cmap("nipy_spectral")
palette = [tuple(rgba) for rgba in colormap(np.linspace(0, 1, len(cluster_ids)))]

for category, color in zip(cluster_ids, palette):
    in_cluster = cluster_labels == category
    xs = x[in_cluster]
    ys = y[in_cluster]
    plt.scatter(xs, ys, color=color, alpha=0.3)

    # Mark the mean position of the cluster's points in the projection.
    avg_x = xs.mean()
    avg_y = ys.mean()

    plt.scatter(avg_x, avg_y, marker="x", color=color, s=100)

# The title does not depend on the cluster, so it is set once after the loop.
plt.title("Clusters identified visualized in language 2d using t-SNE")

KeyboardInterrupt: 