In [3]:
import pandas as pd
import configparser
import openai

# Load the API credentials from the shared configuration file.
CONFIG_PATH = '../config.ini'
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that result so a wrong path fails with a
# clear message instead of a confusing NoSectionError on the lookup below.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        f"Configuration file not found or unreadable: {CONFIG_PATH}"
    )

# The key is read from the [API] section, option OPENAI_API_KEY.
openai.api_key = config.get('API', 'OPENAI_API_KEY')

In [4]:
# Path of the input CSV file, relative to the notebook's working directory.
filename = '../data/first_names_0-4000.csv'

# Parse the comma-separated file into a DataFrame. Malformed lines are
# reported as warnings and skipped instead of aborting the load.
df = pd.read_csv(filepath_or_buffer=filename, on_bad_lines='warn')

In [5]:
# Show the loaded DataFrame as the cell's output to inspect its columns and size.
df

Unnamed: 0,auid,name,EthnicSeer,prop,lastname,firstname,Ethnea,Genni,SexMac,SSNgender,Highest_probF_ethnicity,Highest_probF_value
0,9731334_2,Cameron 'Dale' Bass,ITA,0.653567,'Dale' Bass,Cameron,ENGLISH,M,mostly_male,-,ENGLISH,92.191
1,2155715_1,Bert Hart,ENG,0.772359,Hart,Bert,DUTCH,M,male,M,DUTCH,87.200
2,7867892_1,Leen Hart,ENG,0.980865,Hart,Leen,DUTCH,-,male,F,DUTCH,98.679
3,14609221_2,Esther Nolte- Hoen,GER,0.665081,Nolte- Hoen,Esther,GERMAN,F,female,F,HISPANIC,43.243
4,8101337_1,Ellen 't Hoen,CHI,0.665526,'t Hoen,Ellen,DUTCH,F,female,F,DUTCH,37.459
...,...,...,...,...,...,...,...,...,...,...,...,...
4141,13079875_2,Tamas Banyasz,ENG,0.548975,Banyasz,Tamas,HUNGARIAN,M,male,M,HUNGARIAN,93.765
4142,12316049_1,Banza Baya,SPA,0.312835,Baya,Banza,HISPANIC,-,andy,-,HISPANIC,88.393
4143,16360893_1,Nadja Erika Banziger-Tobler,GER,0.988877,Erika Banziger-Tobler,Nadja,GERMAN,F,female,F,GERMAN,86.357
4144,8645296_1,Qing Bao Tian,CHI,0.999304,Bao Tian,Qing,CHINESE,-,andy,F,CHINESE,93.511


In [6]:
# First names of the leading 4000 rows, as a plain Python list.
names = df['firstname'].iloc[:4000].tolist()

# Quick sanity check: show the first ten entries.
print(names[0:10])


['Cameron', 'Bert', 'Leen', 'Esther', 'Ellen', 'Peter', 'Geert', 'Andrea', 'Max', 'Carolyn']


In [7]:
# Number of names sent per request. The embeddings endpoint accepts a list of
# inputs, so batching replaces one HTTP round trip per name with one per batch.
EMBEDDING_BATCH_SIZE = 500
EMBEDDING_MODEL = "text-embedding-ada-002"

# Embedding vectors, in the same order as `names`.
embeddings = []

# Retrieve embeddings batch by batch
for start in range(0, len(names), EMBEDDING_BATCH_SIZE):
    batch = names[start:start + EMBEDDING_BATCH_SIZE]
    response = openai.Embedding.create(
        input=batch,
        model=EMBEDDING_MODEL
    )
    # Each returned item carries the position of its input within the batch;
    # sort on it so the vectors line up with `names` regardless of the order
    # in which the response lists them.
    ordered = sorted(response['data'], key=lambda item: item['index'])
    embeddings.extend(item['embedding'] for item in ordered)

# Every name must have exactly one vector; fail loudly otherwise rather than
# letting a misaligned list propagate.
if len(embeddings) != len(names):
    raise RuntimeError(
        f"Expected {len(names)} embeddings but received {len(embeddings)}"
    )


In [8]:
# Take an independent copy of the leading 4000 rows so `df` stays untouched.
subset_df = df.iloc[:4000].copy()

# Attach the embedding vectors as a new 'Embedding' column, one per row.
subset_df['Embedding'] = embeddings[:4000]

# Plain-text dump of the augmented DataFrame.
print(subset_df)

# Persist the result as CSV, omitting the row index.
subset_df.to_csv(path_or_buf='../data/embeddings.csv', index=False)

            auid                      name EthnicSeer      prop      lastname  \
0      9731334_2       Cameron 'Dale' Bass        ITA  0.653567   'Dale' Bass   
1      2155715_1                 Bert Hart        ENG  0.772359          Hart   
2      7867892_1                 Leen Hart        ENG  0.980865          Hart   
3     14609221_2        Esther Nolte- Hoen        GER  0.665081   Nolte- Hoen   
4      8101337_1             Ellen 't Hoen        CHI  0.665526       't Hoen   
...          ...                       ...        ...       ...           ...   
3995  10974234_3   Veeramuthu Balakrishnan        IND  1.000000  Balakrishnan   
3996   5532244_1      Vallath Balakrishnan        IND  0.997178  Balakrishnan   
3997  11506617_2    Rengarajan Balamurugan        IND  0.996938   Balamurugan   
3998  10426449_1  Balamurugan Sampathkumar        IND  0.999982  Sampathkumar   
3999   8422397_3             Chenera Balan        RUS  0.444481         Balan   

        firstname    Ethnea