In [1]:
import pandas as pd
import numpy as np

# Load the raw senate roster and discard the two columns not used in this notebook.
adf = pd.read_csv("../senate_info.csv").drop(["Unnamed: 0", "API URL"], axis=1)

# Generational suffixes to pull out of "Name". "III" is listed ahead of "II"
# so the longer numeral is tried first by the regex alternation.
suffix_pattern = r'(Jr\.|III|IV|II|Sr\.)'

# Store the suffix in its own column, then blank it out of "Name" so the
# later steps do not pick it up a second time.
adf['Suffix'] = adf['Name'].str.extract(suffix_pattern)
adf['Name'] = adf['Name'].str.replace(suffix_pattern, '', regex=True)

# A nickname is either wrapped in double quotes or in parentheses.
# Same approach as the suffix: capture it, then remove it from "Name".
nickname_pattern = r'("[^"]+"|\([^)]+\))'
adf["Nickname"] = adf["Name"].str.extract(nickname_pattern)
adf["Name"] = adf["Name"].str.replace(nickname_pattern, "", regex=True)

# Break what is left of "Name" apart on commas. The pieces are stored as
# "Last", "First_temp" and "Suffix_temp"; the two *_temp columns are working
# columns for the cells further down.
name_parts = adf["Name"].str.split(",", expand=True)
adf[["Last", "First_temp", "Suffix_temp"]] = name_parts

# Strip the wrapping characters (parentheses, double quotes) and any comma
# from the captured nickname, one literal character at a time.
nickname = adf["Nickname"]
for wrapper_char in ("(", ")", '"', ","):
    nickname = nickname.str.replace(wrapper_char, "", regex=False)
adf["Nickname"] = nickname

# If you need to check a specific row
#adf.loc[779]


In [2]:
# Sanity check: tally how often each extracted suffix occurs.
suffix_counts = adf["Suffix"].value_counts()
suffix_counts

Jr.    20
III     3
IV      1
Name: Suffix, dtype: int64

In [3]:
# Sanity check: tally how often each extracted nickname occurs.
nickname_counts = adf["Nickname"].value_counts()
nickname_counts

Dick    1
Dee     1
Jake    1
Name: Nickname, dtype: int64

In [4]:
# Tokenise "First_temp" on single spaces into a scratch frame with one column
# per token. In the displayed result column 0 is blank and the first name
# sits in column 1.
first_temp = adf["First_temp"]
name_split = first_temp.str.split(pat=" ", expand=True)

name_split

Unnamed: 0,0,1,2,3
0,,Ted,,
1,,Elizabeth,,
2,,Sheldon,,
3,,Todd,,
4,,Peter,,
...,...,...,...,...
418,,Judd,,
419,,Rod,,
420,,Phil,,
421,,Albert,,


In [5]:
# Look at whatever landed in the last token column of name_split.
# The expectation is that only the first three columns carry real data.
fourth_token = name_split[3]
fourth_token[fourth_token.notnull()].tolist()

['']

In [6]:
# name_split has been checked, so copy its token columns into the main frame:
# column 1 becomes "First_Actual" and column 2 becomes "Middle".
# Column 3 is intentionally not copied (a "Middle_2" column is not needed).
for target_column, token_position in {"First_Actual": 1, "Middle": 2}.items():
    adf[target_column] = name_split[token_position]

In [7]:
# Before dropping "Suffix_temp", confirm it holds nothing worth keeping
# (only blank / whitespace leftovers are expected).
suffix_temp_counts = adf["Suffix_temp"].value_counts()
suffix_temp_counts

      22
       1
Name: Suffix_temp, dtype: int64

In [8]:
# Remove the two working columns, then convert cells that are exactly the
# empty string into NaN so they count as missing.
adf = adf.drop(columns=["First_temp", "Suffix_temp"]).replace("", np.nan)

In [9]:
# Eyeball every populated "Middle" value (missing entries are skipped).
adf["Middle"].dropna().tolist()

['F.',
 'D.',
 'G.',
 'R.',
 'E.',
 'C.',
 'E.',
 'J.',
 'S.',
 'M.',
 'Ray',
 'W.',
 'K.',
 'Wood',
 'E.',
 'J.',
 'P.',
 'A.',
 'M.',
 'Moore',
 'R.',
 'L.',
 'Boyd',
 'A.',
 'F.',
 'T.',
 'C.',
 'J.',
 'W.',
 'M.',
 'D.',
 'E.',
 'M.',
 'Steven',
 'Patrick',
 'P.',
 'S.',
 'Grattan',
 'Rodham',
 'E.',
 'R.',
 'C.',
 'A.',
 'V.',
 'B.',
 'E.',
 'R.',
 'A.',
 'D.',
 'P.',
 'V.',
 'G.',
 'M.',
 'F.',
 'F.',
 'L.',
 'Glenn',
 'F.',
 'H.',
 'R.',
 'B.',
 'C.',
 'E.',
 'S.',
 'G.',
 'D.',
 'J.',
 'S.',
 'W.',
 'B.',
 'A.',
 'M.',
 'E.',
 'E.',
 'C.',
 'T.',
 'J.',
 'K.',
 'Lloyd',
 'S.',
 'H.',
 'B.',
 'V.',
 'D.',
 'S.',
 'A.',
 'H.',
 'H.',
 'B.',
 'O.',
 'S.',
 'H.',
 'Patrick',
 'E.',
 'B.',
 'M.',
 'F.',
 'J.',
 'M.',
 'J.',
 'W.',
 'A.',
 'L.',
 'McC.',
 'G.',
 'G.',
 'B.',
 'I.',
 'D.',
 'R.',
 'F.',
 'Robert',
 'M.',
 'Landon',
 'K.',
 'Bennett',
 'W.',
 'K.',
 'M.',
 'K.',
 'Bailey',
 'H.',
 'J.',
 'E.',
 'L.',
 'F.',
 'Ichiye',
 'D.',
 'O.',
 'G.',
 'K.',
 'A.',
 'W.',
 'P.',
 '

In [11]:
# Write the cleaned frame next to the source file.
# to_csv is called with its defaults, so the row index is written as well.
cleaned_csv_path = "../senate_info_name_cleaned.csv"
adf.to_csv(cleaned_csv_path)