In [7]:
import pandas as pd
import configparser
import openai

# Load configuration file.
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so check that list to fail with a clear message instead
# of a confusing NoSectionError further down.
CONFIG_PATH = '../config.ini'
config = configparser.ConfigParser()
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Could not read configuration file: {CONFIG_PATH}")

# The OpenAI key is read from the [API] section, option OPENAI_API_KEY
openai.api_key = config.get('API', 'OPENAI_API_KEY')

In [8]:
# Path to the input data (a comma-separated file, relative to the notebook)
filename = '../data/to_fix.csv'

# Parse the CSV into a pandas DataFrame; on_bad_lines='warn' makes pandas emit a
# warning for malformed rows instead of raising an error
df = pd.read_csv(
    filepath_or_buffer=filename,
    on_bad_lines='warn',
)

In [9]:
# Show the loaded DataFrame as the cell's output (the notebook renders a truncated view)
df

Unnamed: 0,auid,name,EthnicSeer,prop,lastname,firstname,Ethnea,Genni,SexMac,SSNgender,Highest_probF_ethnicity,Highest_probF_value
0,9731334_2,Cameron 'Dale' Bass,ITA,0.653567,'Dale' Bass,Cameron,ENGLISH,M,mostly_male,-,ENGLISH,92.191
1,2155715_1,Bert Hart,ENG,0.772359,Hart,Bert,DUTCH,M,male,M,DUTCH,87.200
2,7867892_1,Leen Hart,ENG,0.980865,Hart,Leen,DUTCH,-,male,F,DUTCH,98.679
3,14609221_2,Esther Nolte- Hoen,GER,0.665081,Nolte- Hoen,Esther,GERMAN,F,female,F,HISPANIC,43.243
4,8101337_1,Ellen 't Hoen,CHI,0.665526,'t Hoen,Ellen,DUTCH,F,female,F,DUTCH,37.459
...,...,...,...,...,...,...,...,...,...,...,...,...
1997,5539805_1,Alison Moore,ENG,0.973204,Moore,Alison,ENGLISH,F,female,F,ENGLISH,98.446
1998,2591432_4,Gerasimos Alivizatos,RUS,0.625319,Alivizatos,Gerasimos,GREEK,M,male,M,GREEK,98.341
1999,16921340_2,Tufik Alizade Sani,ARA,0.961992,Alizade Sani,Tufik,ARAB,M,andy,-,ARAB,77.443
2000,17914518_3,Azin Alizadeh Asl,ARA,0.947498,Alizadeh Asl,Azin,ARAB,F,female,-,ARAB,99.801


In [10]:
# Collect the first names of the first 1000 rows as a plain Python list
first_name_column = df.loc[:, 'firstname']
names = first_name_column.iloc[:1000].to_list()

# Quick sanity check: show the first ten names
print(names[0:10])


['Cameron', 'Bert', 'Leen', 'Esther', 'Ellen', 'Peter', 'Geert', 'Andrea', 'Max', 'Carolyn']


In [11]:
# Retrieve an embedding for every entry in `names`.
# The embeddings endpoint accepts a list of inputs, so names are sent in batches
# instead of one HTTP request per name, which greatly reduces the number of
# round trips while producing the same list of vectors.
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_BATCH_SIZE = 100  # presumably well below the API's per-request input limit; verify against the API reference

# List to store embeddings, kept in the same order as `names`
embeddings = []

for start in range(0, len(names), EMBEDDING_BATCH_SIZE):
    batch = names[start:start + EMBEDDING_BATCH_SIZE]
    response = openai.Embedding.create(
        input=batch,
        model=EMBEDDING_MODEL
    )
    # Each returned item records the position of its input within the batch;
    # sort on it so the embeddings line up one-to-one with `names`.
    ordered_items = sorted(response['data'], key=lambda item: item['index'])
    embeddings.extend(item['embedding'] for item in ordered_items)

# Fail loudly here if the API returned fewer or more vectors than names
assert len(embeddings) == len(names), "embedding count does not match name count"


In [12]:
# Take an independent copy of the first 1000 rows so the original frame is untouched
subset_df = df.iloc[:1000].copy()

# Attach the embeddings (at most the first 1000) as a new 'Embedding' column
first_embeddings = embeddings[:1000]
subset_df['Embedding'] = first_embeddings

# Show the augmented DataFrame as plain text
print(subset_df)

# Persist the result as CSV, leaving out the row index
subset_df.to_csv(path_or_buf='../data/embeddings.csv', index=False)

           auid                   name EthnicSeer      prop      lastname  \
0     9731334_2    Cameron 'Dale' Bass        ITA  0.653567   'Dale' Bass   
1     2155715_1              Bert Hart        ENG  0.772359          Hart   
2     7867892_1              Leen Hart        ENG  0.980865          Hart   
3    14609221_2     Esther Nolte- Hoen        GER  0.665081   Nolte- Hoen   
4     8101337_1          Ellen 't Hoen        CHI  0.665526       't Hoen   
..          ...                    ...        ...       ...           ...   
995   8296003_1     Luigi Lazzari Agli        ITA  0.999036  Lazzari Agli   
996  10750176_2         Marlene Aglony        GER  0.397017        Aglony   
997  15564501_6  Habibatou Diallo-Agne        FRN  0.998308   Diallo-Agne   
998     69097_2      Giancarlo Agnelli        ITA  0.999509       Agnelli   
999  15853199_2              Agnes Ban        ENG  0.461761           Ban   

     firstname           Ethnea Genni       SexMac SSNgender  \
0      Came