# Chaining Methods in Pandas

For a data scientist who uses Python in their workflow, there is a high probability that they use **pandas**. When using pandas, it is normal to have code that looks like this:

In [17]:
import pandas as pd
import re

In [23]:
# Get dataset: read the CSV served at the URL below straight into a
# DataFrame (pd.read_csv fetches it, so this cell needs network access).
csv_url = 'https://opendata.socrata.com/api/views/d2kg-fyem/rows.csv?accessType=DOWNLOAD'
data = pd.read_csv(csv_url)
# Show the first rows to inspect the columns that were loaded.
data.head()

Unnamed: 0,FIPS code,Jurisdiction,Division,Precincts,Total Registration,Make,Model,Equipment Type,VVPAT,Accessible Use,Early Voting,Absentee Ballots,Polling Place,State
0,200000000,Alaska State,,441,574441,,,Hand Counted Paper Ballots,,No,Yes,Yes,Yes,
1,200000000,Alaska State,,441,574441,Premier/Diebold (Dominion),AccuVote TSX,DRE-Touchscreen,Yes,Yes,Yes,No,No,
2,200000000,Alaska State,,441,574441,Premier/Diebold (Dominion),AccuVote OS,Optical Scan,,No,Yes,Yes,Yes,
3,100000000,Alabama State,,2527,2986782,,,,,No,No,No,No,
4,100100000,Autauga County,,17,33806,Election Systems & Software,AutoMARK,Ballot Marking Device or System,,Yes,No,No,No,


The dataset contains the locations of electronic voting machines in the United States. Suppose we process the dataset as follows:

In [24]:
def tidy_string(string):
    """Return ``string`` lower-cased, with non-word runs collapsed to ``_``.

    Every run of one or more characters that are not word characters
    (letters, digits or underscore) is replaced by a single underscore,
    e.g. ``'FIPS code'`` becomes ``'fips_code'``.
    """
    string = string.lower()
    # Raw string literal: in a plain literal '\w' is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r'[^\w]+', '_', string)

# Step-by-step version: every line reassigns or mutates `processed_data`.
# Work on a copy so the raw `data` frame is left untouched.
processed_data = data.copy()
# Normalise each column label with tidy_string ('FIPS code' -> 'fips_code').
processed_data.columns = map(tidy_string, processed_data.columns)
# Turn the current index into a regular 'index' column.
processed_data = processed_data.reset_index()
# 'location' is the first whitespace-separated word of the jurisdiction,
# e.g. 'Alaska State' -> 'Alaska'.
processed_data['location'] = processed_data['jurisdiction'].str.split().str.get(0)
# Where 'state' is missing, fall back to the 'location' value of that row.
processed_data['state'] = processed_data['state'].fillna(processed_data['location'])
processed_data.head()

Unnamed: 0,index,fips_code,jurisdiction,division,precincts,total_registration,make,model,equipment_type,vvpat,accessible_use,early_voting,absentee_ballots,polling_place,state,location
0,0,200000000,Alaska State,,441,574441,,,Hand Counted Paper Ballots,,No,Yes,Yes,Yes,Alaska,Alaska
1,1,200000000,Alaska State,,441,574441,Premier/Diebold (Dominion),AccuVote TSX,DRE-Touchscreen,Yes,Yes,Yes,No,No,Alaska,Alaska
2,2,200000000,Alaska State,,441,574441,Premier/Diebold (Dominion),AccuVote OS,Optical Scan,,No,Yes,Yes,Yes,Alaska,Alaska
3,3,100000000,Alabama State,,2527,2986782,,,,,No,No,No,No,Alabama,Alabama
4,4,100100000,Autauga County,,17,33806,Election Systems & Software,AutoMARK,Ballot Marking Device or System,,Yes,No,No,No,Autauga,Autauga


We can improve this by using two pandas features, `pipe` and `assign`, to write the same processing as a single method chain:

In [28]:
def fillna(df, missing, filled):
    """Return a copy of ``df`` with gaps in one column filled from another.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    missing : column label
        Column whose missing values should be filled.
    filled : column label
        Column supplying the replacement value for each row.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``missing`` has its missing entries replaced
        by the value of ``filled`` in the same row.
    """
    # Copy first: a helper used with ``DataFrame.pipe`` should not change the
    # frame it receives, otherwise the caller's data is altered as a hidden
    # side effect of running the chain.
    result = df.copy()
    result[missing] = result[missing].fillna(result[filled])
    return result

# Chained version of the same processing: each call returns a DataFrame that
# the next call operates on, so no intermediate reassignment is needed.
processed_data = (data
    # Start from a copy of the raw frame.
    .copy()
    # rename applies tidy_string to every column label.
    .rename(columns=tidy_string)
    # Turn the current index into a regular 'index' column.
    .reset_index()
    # The lambda receives the frame as it is at this point in the chain, so
    # it sees the already-renamed 'jurisdiction' column.
    .assign(location=lambda x: x['jurisdiction'].str.split().str.get(0))
    # pipe calls fillna(<frame>, 'state', 'location') and continues with
    # whatever it returns.
    .pipe(fillna, 'state', 'location')
)

processed_data.head()

Unnamed: 0,index,fips_code,jurisdiction,division,precincts,total_registration,make,model,equipment_type,vvpat,accessible_use,early_voting,absentee_ballots,polling_place,state,location
0,0,200000000,Alaska State,,441,574441,,,Hand Counted Paper Ballots,,No,Yes,Yes,Yes,Alaska,Alaska
1,1,200000000,Alaska State,,441,574441,Premier/Diebold (Dominion),AccuVote TSX,DRE-Touchscreen,Yes,Yes,Yes,No,No,Alaska,Alaska
2,2,200000000,Alaska State,,441,574441,Premier/Diebold (Dominion),AccuVote OS,Optical Scan,,No,Yes,Yes,Yes,Alaska,Alaska
3,3,100000000,Alabama State,,2527,2986782,,,,,No,No,No,No,Alabama,Alabama
4,4,100100000,Autauga County,,17,33806,Election Systems & Software,AutoMARK,Ballot Marking Device or System,,Yes,No,No,No,Autauga,Autauga
