Download a list of commissioners from the Office of Advisory Neighborhood Commissions

In [1]:
import os
os.chdir('../')

In [2]:
import time
import requests
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import fuzz

from scripts.common import hash_dataframe, current_timestamp
from scripts.data_transformations import list_commissioners

pd.set_option('display.max_rows', 500)

In [3]:
ancs = pd.read_csv('data/ancs.csv')
ancs = ancs[ancs.redistricting_year == 2022].copy()

In [4]:
comm_dict = {}

for idx, anc in tqdm(ancs.iterrows(), total=len(ancs)):
    r = requests.get(anc.dc_oanc_link)
    
    table_list = pd.read_html(r.text, converters={'SMD': str})

    if 'SMD' in table_list[0].columns:
        comm_dict[anc.anc_id] = table_list[0].copy()
    elif anc.and_id == 'anc_5F_2022':
        # First row of this ANC is not correct and should be dropped
        # comm_dict[anc.anc_id] = table_list[0].drop().copy()
        print('5F')
        bad = table_list[0].copy()
    else:
        print(f'Warning: ANC {anc.anc_id} table not present on OANC site.')
        print(table_list)
    
    time.sleep(2)

 63%|███████████████████████████                | 29/46 [01:00<00:35,  2.06s/it]

[      0                    1                                             2  \
0   SMD                 Name                                       Address   
1  5F01           Tony Hurst                          Washington, DC 20002   
2  5F02            Aru Sahni                          Washington, DC 20002   
3  5F03    Patricia Williams  401 Edgewood Street NE  Washington, DC 20002   
4  5F04          Mark Galvan                          Washington, DC 20002   
5  5F05    Jennifer Anderson         117 U Street NE  Washington, DC 20002   
6  5F06  Joe Bishop-Henchman         415 W Street NE  Washington, DC 20002   
7  5F07    Sylvia M. Pinkney          34 R Street NE  Washington, DC 20002   

              3                  4  
0         Phone              Email  
1  202-725-6087  [email protected]  
2           NaN  [email protected]  
3  202-709-9375  [email protected]  
4  202-455-5615  [email protected]  
5  202-935-2703  [email protected]  
6  202-599-0929  [email protected]  


100%|███████████████████████████████████████████| 46/46 [01:35<00:00,  2.07s/it]


In [5]:
comm_official = pd.concat(comm_dict).reset_index().rename(columns={'level_0': 'anc_id'}).drop(columns='level_1')

In [6]:
comm_official['smd_id'] = 'smd_2022_' + comm_official['SMD']

In [7]:
comm_official['is_vacant'] = comm_official.Name.str.lower()== 'vacant'
comm_official['is_chairperson'] = comm_official.Name.str.lower().str.contains('chairperson|chairpeson')

In [8]:
comm_official['official_name'] = (
    comm_official.Name
    .str.replace(' Chairperson', '')
    .str.replace(' Chairpeson', '')
    .str.strip()
)

In [9]:
comm_official['oanc_hash_id'] = hash_dataframe(comm_official, ['SMD', 'official_name'])

In [10]:
# Compare official to OpenANC
comm_openanc = list_commissioners(status='current')
comm = pd.merge(comm_official, comm_openanc, how='left', on='smd_id')

comm['official_name'] = comm['official_name'].fillna('Vacant')
comm['commissioner_name'] = comm['commissioner_name'].fillna('Vacant')
comm['name_score'] = comm.apply(lambda x: fuzz.ratio(x.official_name, x.commissioner_name), axis=1)

comm[['smd_id', 'official_name', 'commissioner_name', 'name_score']].sort_values(by='name_score')

Unnamed: 0,smd_id,official_name,commissioner_name,name_score
286,smd_2022_7E03,Vacant,Beverly F. Smith,9
93,smd_2022_3G06,Peter Gosselin,Vacant,10
88,smd_2022_3G01,Lisa R. Gore,Vacant,11
92,smd_2022_3G05,Peter Lynch,Vacant,12
19,smd_2022_1C01,Vacant,Howard Bauleke,20
257,smd_2022_7B02l,Jamaal Maurice Pearsall,Vacant,21
90,smd_2022_3G03,James Nash,Vacant,25
141,smd_2022_4A07,Vacant,Carolyn C. Steptoe,25
91,smd_2022_3G04,Michael Zeldin,Vacant,30
89,smd_2022_3G02,Bruce Sherman,Vacant,32


In [11]:
columns_to_save = [
    'smd_id'
    , 'Name'
    , 'official_name'
    , 'is_vacant'
    , 'is_chairperson'
    , 'oanc_hash_id'
]

filename = 'data/oanc/commissioners_{}.csv'.format(current_timestamp().strftime('%Y-%m-%d'))

comm_official[columns_to_save].to_csv(filename, index=False)

KeyError: "['name_score'] not in index"

In [None]:
# Confirm that there is one chairperson per ANC
comm_official['is_chairperson_int'] = comm_official['is_chairperson'].astype(int)
num_chairs = comm_official.groupby('anc_id').is_chairperson_int.sum()
num_chairs.sum() == comm_official.anc_id.nunique()