In [40]:
import pandas as pd
import configparser
import openai

# Load configuration file holding the OpenAI API key.
CONFIG_PATH = '../config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so check the result here instead of failing on the
# next line with a less helpful NoSectionError.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Could not read configuration file: {CONFIG_PATH}")

openai.api_key = config.get('API', 'OPENAI_API_KEY')

In [41]:
# Location of the raw first-names dataset (a comma-separated file).
filename = '../data/1_first_names_raw_data.csv'

# Load the CSV into a pandas DataFrame; malformed lines trigger a warning
# rather than aborting the read.
df = pd.read_csv(filepath_or_buffer=filename, on_bad_lines='warn')

In [43]:
# Preview the loaded DataFrame (the notebook renders only the leading and trailing rows).
df

Unnamed: 0,auid,name,EthnicSeer,prop,lastname,firstname,Ethnea,Genni,SexMac,SSNgender,Highest_probF_ethnicity,Highest_probF_value
0,9731334_2,Cameron 'Dale' Bass,ITA,0.653567,'Dale' Bass,Cameron,ENGLISH,M,mostly_male,-,ENGLISH,92.191
1,2155715_1,Bert Hart,ENG,0.772359,Hart,Bert,DUTCH,M,male,M,DUTCH,87.200
2,14609221_2,Esther Nolte- Hoen,GER,0.665081,Nolte- Hoen,Esther,GERMAN,F,female,F,HISPANIC,43.243
3,8101337_1,Ellen 't Hoen,CHI,0.665526,'t Hoen,Ellen,DUTCH,F,female,F,DUTCH,37.459
4,9804785_2,Peter 't Hoen,GER,0.330864,'t Hoen,Peter,DUTCH,M,male,M,GERMAN,34.203
...,...,...,...,...,...,...,...,...,...,...,...,...
35887,9239360_5,Senda Mezghani,ARA,0.691154,Mezghani,Senda,ARAB,F,andy,F,HISPANIC,60.288
35888,9258893_4,Raquelle Mesholam-Gately,FRN,0.671148,Mesholam-Gately,Raquelle,ENGLISH-HISPANIC,F,andy,F,HISPANIC,99.978
35889,12621069_2,Niki Messini-Nikolaki,RUS,0.946401,Messini-Nikolaki,Niki,GREEK,F,female,F,GREEK,96.862
35890,3154416_5,Tahar Mestiri,FRN,0.376373,Mestiri,Tahar,ARAB,M,male,-,ARAB,83.593


In [44]:
# Extract the first-name column as a plain Python list and peek at the first ten entries.
names = df.loc[:, 'firstname'].to_list()
print(names[:10])


['Cameron', 'Bert', 'Esther', 'Ellen', 'Peter', 'Geert', 'Andrea', 'Max', 'Carolyn', 'Stephen']


In [47]:
# Run once (uncomment the line below) to create an empty Embedding column; re-running it would discard any embeddings already fetched.
# df['Embedding'] = None

In [48]:
# Function to get the embedding for each firstname
def get_embedding(row):
    """Return the embedding for ``row['firstname']``, reusing a stored one if present.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` yielded by ``DataFrame.iterrows``)
        Must provide ``'firstname'``; may provide ``'Embedding'`` holding a
        previously fetched embedding.

    Returns
    -------
    The stored ``row['Embedding']`` when it holds a value. Otherwise the
    embedding returned by ``openai.Embedding.create`` for the first name, or
    ``None`` if that call raises (the error is printed, not re-raised).
    """
    # .get() tolerates a row without an 'Embedding' entry (column not created yet).
    stored = row.get('Embedding')

    # "Not computed yet" can appear as None or as NaN (pandas' missing-value
    # marker). pd.isna is applied to scalars only: on a list-valued embedding it
    # would return an element-wise array instead of a single boolean.
    is_missing = stored is None or (pd.api.types.is_scalar(stored) and pd.isna(stored))
    if not is_missing:
        return stored

    try:
        response = openai.Embedding.create(
            input=row['firstname'],
            model="text-embedding-ada-002"
        )
        return response['data'][0]['embedding']
    except Exception as e:
        # Best-effort: report the failure and leave this row empty so that a
        # later pass over the data can retry it.
        print(f"Error in API call for '{row['firstname']}': {str(e)}")
        return None


In [51]:
# Create the target column if it is not there yet, so this cell works on a fresh
# kernel without a separate manual setup step. Existing values are left alone.
if 'Embedding' not in df.columns:
    df['Embedding'] = None

# Fill the column row by row. get_embedding returns the stored value for rows
# that already have one, so only rows without an embedding trigger an API call.
for index, row in df.iterrows():
    df.at[index, 'Embedding'] = get_embedding(row)


In [52]:
# Count how many rows ended up with an embedding (i.e. are neither None nor NaN).
not_null_count = df['Embedding'].notna().sum()

print("Number of not null values in the 'Embedding' column:", not_null_count)

Number of not null values in the 'Embedding' column: 35892


In [53]:
# Persist the DataFrame, including the new Embedding column, without the row index.
# NOTE: list-valued cells are written to CSV as their string form, so the
# Embedding column will need parsing when the file is read back.
df.to_csv(path_or_buf='../data/2_embeddings_raw_data.csv', index=False)

In [55]:
# Display the DataFrame again, now including the Embedding column.
df

Unnamed: 0,auid,name,EthnicSeer,prop,lastname,firstname,Ethnea,Genni,SexMac,SSNgender,Highest_probF_ethnicity,Highest_probF_value,Embedding
0,9731334_2,Cameron 'Dale' Bass,ITA,0.653567,'Dale' Bass,Cameron,ENGLISH,M,mostly_male,-,ENGLISH,92.191,"[0.0027535639237612486, -0.006899471394717693,..."
1,2155715_1,Bert Hart,ENG,0.772359,Hart,Bert,DUTCH,M,male,M,DUTCH,87.200,"[-0.012196633964776993, -0.034759119153022766,..."
2,14609221_2,Esther Nolte- Hoen,GER,0.665081,Nolte- Hoen,Esther,GERMAN,F,female,F,HISPANIC,43.243,"[-0.02559061162173748, -0.02379501983523369, -..."
3,8101337_1,Ellen 't Hoen,CHI,0.665526,'t Hoen,Ellen,DUTCH,F,female,F,DUTCH,37.459,"[-0.014605682343244553, -0.030205124989151955,..."
4,9804785_2,Peter 't Hoen,GER,0.330864,'t Hoen,Peter,DUTCH,M,male,M,GERMAN,34.203,"[0.018846435472369194, -0.026805326342582703, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35887,9239360_5,Senda Mezghani,ARA,0.691154,Mezghani,Senda,ARAB,F,andy,F,HISPANIC,60.288,"[0.0009060240699909627, -0.012940743006765842,..."
35888,9258893_4,Raquelle Mesholam-Gately,FRN,0.671148,Mesholam-Gately,Raquelle,ENGLISH-HISPANIC,F,andy,F,HISPANIC,99.978,"[-0.009965583682060242, 0.001120477681979537, ..."
35889,12621069_2,Niki Messini-Nikolaki,RUS,0.946401,Messini-Nikolaki,Niki,GREEK,F,female,F,GREEK,96.862,"[-0.00857141800224781, -0.01564069651067257, 0..."
35890,3154416_5,Tahar Mestiri,FRN,0.376373,Mestiri,Tahar,ARAB,M,male,-,ARAB,83.593,"[-0.012485790997743607, -0.01225669402629137, ..."
