In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Destination of the merged voter file; it also acts as a cache marker, so the
# merge is skipped whenever this path already exists.
file_path = 'data/full_name.txt'

# Folder whose regular files are concatenated into `file_path`.
directory = 'data/20220621_VoterDetail'
if not os.path.exists(file_path):
    # Collect the regular files in the directory (sub-directories are skipped).
    source_paths = [
        os.path.join(directory, filename) for filename in os.listdir(directory)
    ]
    source_paths = [path for path in source_paths if os.path.isfile(path)]

    # Only create the output when there is something to merge.
    if source_paths:
        # Write under a temporary name and move it into place at the end, so an
        # interrupted run never leaves a truncated file that the existence
        # check above would later mistake for a finished merge.
        partial_path = file_path + '.partial'
        # Open the output once for the whole merge instead of once per line.
        with open(partial_path, 'w') as merged:
            for f in source_paths:
                with open(f, 'r') as file:
                    merged.writelines(file)
        os.replace(partial_path, file_path)
else:
    print(f'{file_path} already exists')

data/full_name.txt already exists


In [2]:
# CSV copy of the merged text file; it is only generated when missing.
data_file_path = 'data/full_name.csv'
if os.path.exists(data_file_path):
    print(f'{data_file_path} already exists')
else:
    # Parse the merged file as tab-separated with no header row, then save it
    # comma-separated without the row index.
    read_data = pd.read_csv(file_path, sep='\t', header=None)
    read_data.to_csv(data_file_path, index=False)

data/full_name.csv already exists


In [3]:
# Load the cached CSV treating every line as data (no header), then discard
# the row labelled 0, i.e. the first line of the file.
df = pd.read_csv(data_file_path, header=None).drop(index=0)

In [4]:
# Source column label -> descriptive name, listed in the order the columns
# should appear in the reduced frame.
selected_columns = {4: 'first_name', 2: 'last_name', 20: 'race_code'}

# Keep only those columns, then relabel them.
df = df.filter(items=list(selected_columns), axis=1)
df.columns = list(selected_columns.values())

In [5]:
# Race labels indexed by the integer code stored in `race_code`.
# Positions 0 and 8 carry an empty label.
races = ['', 'aian', 'api', 'black', 'hisp', 'white', 'other', 'multiracial', '', 'unknown']

# Translate the codes with one vectorised dictionary lookup instead of indexing
# the list once per row in Python. Plain list indexing would also accept
# negative codes silently (races[-1] is 'unknown'); the dictionary only matches
# the codes 0-9.
race = df['race_code'].map(dict(enumerate(races)))

# Codes without a label come back from `map` as NaN. Stop here and report the
# offending values rather than letting unlabeled rows flow into later cells.
unmapped_codes = df.loc[race.isna(), 'race_code'].unique()
if len(unmapped_codes) > 0:
    raise ValueError(f'Unexpected race_code values: {unmapped_codes.tolist()}')

df['race'] = race

In [6]:
# Discard records that lack a first name or a last name
df.dropna(subset=['first_name', 'last_name'], how='any', inplace=True)
print(f'Size after dropping missing first and last names: {df.shape}')

# Discard records whose race label is 'unknown'
is_unknown_race = (df['race'] == 'unknown').to_numpy()
df.drop(index=df.index[is_unknown_race], inplace=True)
print(f'Size after dropping unknown: {df.shape}')

# Discard records where the last name is shorter than two characters
has_short_last_name = (df['last_name'].str.len() < 2).to_numpy()
df.drop(index=df.index[has_short_last_name], inplace=True)
print(f'Size after dropping one letter last names: {df.shape}')

Size after dropping missing first and last names: (15454908, 4)
Size after dropping unknown: (15009183, 4)
Size after dropping one letter last names: (14933273, 4)


In [7]:
# Normalise both name parts: trim surrounding whitespace and title-case them
for name_column in ('first_name', 'last_name'):
    df[name_column] = df[name_column].str.strip().str.title()

# Character class matching anything that is NOT a-z, A-Z, an apostrophe,
# a space or a hyphen; every match is removed from the combined name.
disallowed_chars = "[^a-zA-Z' -]"

# Full name = "<first> <last>" with the disallowed characters stripped out
df['full_name'] = (
    (df['first_name'] + ' ' + df['last_name'])
    .str.replace(disallowed_chars, '', regex=True)
)

# Keep only the columns used from here on
df = df.filter(items=['full_name', 'race_code', 'race'])
df.head()

Unnamed: 0,full_name,race_code,race
1,Daniel Sengbusch,5,white
2,Eric Kemlage,5,white
3,Lori Carter,5,white
4,Rebecca Yates,5,white
5,Electa Waddell,5,white


In [8]:
# First split: hold out 20% of the rows, stratified on race_code.
train_df, rest_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['race_code'],
)
# Second split: divide the held-out rows in half (validation / test),
# again stratified on race_code.
val_df, test_df = train_test_split(
    rest_df,
    test_size=0.5,
    random_state=42,
    stratify=rest_df['race_code'],
)

# Renumber each split from zero, discarding the original row labels.
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)

for split_name, split in (('Train', train_df), ('Validation', val_df), ('Test', test_df)):
    print(f'{split_name} size: {split.shape}')

Train size: (11946618, 3)
Validation size: (1493327, 3)
Test size: (1493328, 3)


In [9]:
# Number of distinct full names per race label in the training split
train_df.groupby('race')[['full_name']].nunique()

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
aian,37387
api,225691
black,1229670
hisp,1393074
multiracial,66758
other,219876
white,4768185


In [10]:
# Number of distinct full names per race label in the validation split
val_df.groupby('race')[['full_name']].nunique()

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
aian,4765
api,31062
black,185608
hisp,218757
multiracial,8503
other,28647
white,776584


In [11]:
# Number of distinct full names per race label in the test split
test_df.groupby('race')[['full_name']].nunique()

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
aian,4777
api,30994
black,185704
hisp,218237
multiracial,8512
other,28637
white,775622


In [12]:
# Write each split to its own CSV file, leaving out the row index
split_outputs = (
    (train_df, 'data/full_name1.csv'),
    (val_df, 'data/val_full_name1.csv'),
    (test_df, 'data/test_full_name1.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)