In [1]:
import camelot.io as camelot
import warnings
import pandas as pd
import re
import numpy as np
from unidecode import unidecode
from pypdf import PdfReader

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


## Rosters

### Senate

In [10]:
special_cases = {
    "7,8": {
        'party_check': True,
    },
    '15,16': {
        'select_a': [0, 5, 9, 10, 11, 12, 13],
        'combine_a': [(0, 1), (5, 6)],
        'select_b': [0, 1, 13, 14, 15, 16, 17],
        'combine_b': [(1, 2), (17, 18, 19, 20, 21, 22, 23, 25)],
        'party_check': True
    },
    "21,22": {
        'select_a': [0, 10, 29, 30, 31, 32, 33],
        'combine_a': [(0, 1)],
        'select_b': [0, 10, 13, 14, 15, 16, 17],
        'combine_b': [(10, 11)],
        'party_check': False
    },
    "23,24": {
        'select_a': [0, 10, 25, 26, 27, 28, 29],
        'combine_a': [(0, 1), (10, 11)],
        'select_b': [0, 10, 13, 14, 15, 16, 17],
        'combine_b': [(10, 11), (17, 18)],
        'party_check': False
    },
    "25,26": {
        'select_a': [0, 9, 28, 29, 30, 31, 32],
        'select_b': [0, 6, 16, 17, 18, 19, 20],
        'combine_b': [(0, 1), (6, 7, 9, 15), (20, 21, 22)],
        'party_check': False
    }
}

In [11]:
def text_clean(word, name=False):
    if name:
        pattern = r'[^A-Za-z\s,]'
    else:
        pattern = r'[^A-Za-z\s,\'-]'
    return re.sub(pattern, '', word)

def row_overlap(t):
    missing_counties = t.loc[(t['Name'] == '') & (t['Counties'] != '')].sort_index(ascending=False).index
    for i in missing_counties:
        t.loc[i-1, 'Counties'] = t.loc[i-1, 'Counties'] + t.loc[i, 'Counties']
    return t

def senate_table_1(pages, special_params=None, version=1):
    tables = camelot.read_pdf("Senate_Rosters.pdf", pages=pages, flavor='stream')
    t1 = tables[0].df
    t2 = tables[1].df
    if (special_params is None) or (pages == '7,8'):
        if version == 1:
            t1 = t1.loc[((t1[0] == '') & (t1[6] != '')) | (t1[0] != ''), :6]
            t2 = t2.loc[((t2[0] == '') & (t2[6] != '')) | (t2[0] != ''), :6]
            col_names = ['Name', 'Occupation', 'Party', 'District No.', 'Seat No.', 'Phone', 'Counties']
        elif version == 2:
            t1 = t1.loc[((t1[0] == '') & (t1[5] != '')) | (t1[0] != ''), :5]
            t2 = t2.loc[((t2[0] == '') & (t2[5] != '')) | (t2[0] != ''), :5]
            col_names = ['Name', 'Occupation', 'Party', 'District No.', 'Seat No.' 'Counties']
    else:
        if 'combine_a' in special_params[pages].keys():
            for combo in special_params[pages]['combine_a']:
                t1[combo[0]] = t1[[c for c in combo]].apply(lambda x: ' '.join(x), axis=1)
        if 'combine_b' in special_params[pages].keys():
            for combo in special_params[pages]['combine_b']:
                t2[combo[0]] = t2[[c for c in combo]].apply(lambda x: ' '.join(x), axis=1)
        t1 = t1[special_params[pages]['select_a']]
        t2 = t2[special_params[pages]['select_b']]
        col_names = ['Name', 'Occupation', 'Party', 'District No.', 'Seat No.', 'Phone', 'Counties']
        if pages == '25,26':
            t2.iloc[29, 0] = ''
            t2.iloc[27, 0] = t2.iloc[27, 0] + ' Martinez'

    t1.columns = col_names
    t2.columns = col_names
    t1.reset_index(drop=True, inplace=True)
    t2.reset_index(drop=True, inplace=True)
    if pages == "25,26":
        for i, row in t2.iterrows():
            if "\n." not in row['Occupation']:
                t2.loc[i, 'Occupation'] = t2.loc[i, 'Occupation'] + ' ' + t2.loc[i, 'Occupation']
        t2['Occupation'] = t2['Occupation'].apply(lambda x: re.sub(r'\n.', '', x))
        t2.loc[t2['Name'].str.startswith('Weber'), 'Occupation'] = 'Physician'
    for t in [t1, t2]:
        for col in ['Occupation', 'Counties']:
            t[col] = t[col].apply(lambda x: text_clean(x, name=False))
        t['Name'] = t['Name'].apply(lambda x: text_clean(x, name=True))
    t1 = row_overlap(t1)
    t2 = row_overlap(t2)

    t1['Name'] = t1['Name'].apply(lambda x: x.strip())
    t2['Name'] = t2['Name'].apply(lambda x: x.strip())
    if special_params is not None:
        if special_params[pages]['party_check']:
            pstring_1 = " ".join(p for p in t1['Party'] if ("Party" not in p) and (p != ' ') and (p != ''))
            pstring_2 = " ".join(p for p in t2['Party'] if ("Party" not in p) and (p != ' ') and (p != ''))
            p_series1 = pd.Series(pstring_1.split(' '))
            p_series2 = pd.Series(pstring_2.split(' '))

    t1 = t1.loc[(t1['Name'] != '') & (t1['Name'] != 'Name')& (t1['Name'] != ' ') & (t1['Name'] != 'Vacancy')].reset_index(drop=True)
    t2 = t2.loc[(t2['Name'] != '') & (t2['Name'] != 'Name') & (t2['Name'] != ' ') & (t2['Name'] != 'Vacancy')].reset_index(drop=True)
    if special_params is not None:
        if special_params[pages]['party_check']:
            t1['Party'] = p_series1
            t2['Party'] = p_series2

    t = pd.concat([t1, t2])
    t['pages'] = pages
    return t

In [None]:
sc = []
for i in np.arange(1, 27, 2):
    pages = f"{i},{i+1}"

    if pages in special_cases:
        sc.append(senate_table_1(pages, special_cases, version=1))
    else:
        try:
            sc.append(senate_table_1(pages, special_params=None, version=1))
        except:
            sc.append(senate_table_1(pages, special_params=None, version=2))
t = pd.concat(sc)
# t.to_csv("senate_roster.csv", index=False)

### Assembly

In [2]:
def name_clean(name):
    wd = str(unidecode(name))
    if wd.endswith("Jr"):
        wd = re.sub(r"\,\s*(J|S)r\.*\s*$", "", wd)
    if wd.endswith("III"):
        wd = re.sub(r"\,\s*I{2,}\s*$", "", wd)
    clean = re.sub(r"\-", " ", wd)
    clean = re.sub(r"[^a-zA-Z\,/\s]", "", clean).strip()
    return clean


def assembly_name_cleaner(name):
    try:
        name2 = name_clean(name)
        first, last = first_last_name(name2)
        return first, last
    except:
        name1 = re.sub(r"\s{2,}", " ", name).strip()
        name2 = re.sub(r"(?<=[a-z])\s+(?=[A-Z][a-z]+)", ", ", name1)
        name3 = name_clean(name2)
        first, last = first_last_name(name3)
        return first, last


def assembly_cleaner(df, i):
    df["Term"] = f"{2000 + i}-{2000 + i + 1}"
    df["Occupation"] = df["Occupation"].apply(word_cleaner)
    df["Party"] = df["Party"].str.strip()
    df_clear = df.loc[~df["Name"].isin(["", " ", "Name"])]
    df_clear[["First", "Last"]] = (
        df_clear["Name"].apply(assembly_name_cleaner).apply(pd.Series)
    )

    return df_clear.reset_index(drop=True)

def word_cleaner(word, name=False):
    wd = unidecode(word)
    if name:
        clean = re.sub(r"[^a-zA-Z\,/]", "", str(wd))
        clean2 = re.sub(r"[-(?:\s{2,})]", " ", clean)
    else:
        clean = re.sub(r"[^a-zA-Z\,/]", "", str(wd))
        clean2 = re.sub(r"\s{2,}", " ", clean)
    return clean2.strip()


def first_last_name(name):
    if (name.strip(" ") == "Vacancy") or "Vacancy" in name:
        return "Vacancy", "Vacancy"

    if name.endswith("Jr"):
        name = re.sub(r"\,\s*(J|S)r\.*\s*$", "", name)
    if name.endswith("III"):
        name = re.sub(r"\,\s*III*\s*$", "", name)

    try:
        last, first = name.split(", ")
        return first, last
    except ValueError:
        try:
            last, first = name.split(",")
            return last, first
        except:
            try:
                last, first = name.split(" ")
                return first, last
            except:
                return '', ''

In [3]:
def process_candidate_data(t, i):

    if len(t[0].df.columns) <= 9:
        t_a = t[0].df.iloc[:, :5]
        t_b = t[1].df.iloc[:, :5]
        t_a.columns = ["Name", "Occupation", "Party", "District No.", "Seat No."]
        t_b.columns = ["Name", "Occupation", "Party", "District No.", "Seat No."]
    elif len(t[0].df.columns) == 45:
        t_a = t[0].df.iloc[:, [0, 10, 11, 39, 40, 41]]
        t_a.loc[:, 10] = t_a.loc[:, 10] + " " + t_a.loc[:, 11]

        t_a = t_a.drop(columns=[11])
        t_a.columns = ["Name", "Occupation", "Party", "District No.", "Seat No."]
        t_b = t[1].df.iloc[:, [0, 8, 9, 28, 29, 30]]
        t_b.loc[:, 8] = t_b.loc[:, 8] + " " + t_b.loc[:, 9]
        t_b = t_b.drop(columns=[9])
        t_b.columns = ["Name", "Occupation", "Party", "District No.", "Seat No."]
    elif len(t[0].df.columns) == 29:
        t_a = t[0].df.iloc[:, [0, 1, 7, 8, 23, 24, 25]]
        t_a.loc[:, 0] = t_a.loc[:, 0] + " " + t_a.loc[:, 1]
        t_a.loc[:, 7] = t_a.loc[:, 7] + " " + t_a.loc[:, 8]
        t_a = t_a.drop(columns=[1, 8])
        t_a.columns = ["Name", "Occupation", "Party", "District No.", "Seat No."]
        t_b = t[1].df.iloc[:, [0, 1, 15, 48, 49, 50]]
        t_b.iloc[16, 1] = "Valladres, Suzette Martinez"
        t_b.iloc[17, 0] = ""
        t_b.loc[:, 0] = t_b.loc[:, 0] + " " + t_b.loc[:, 1]
        t_b = t_b.drop(columns=[1])
        t_b.columns = ["Name", "Occupation", "Party", "District No.", "Seat No."]
    elif len(t[0].df.columns) == 26:
        t_a = t[0].df.iloc[:, [0, 6, 7, 20, 21, 22]]
        t_a.loc[:, 6] = t_a.loc[:, 6] + " " + t_a.loc[:, 7]
        t_a = t_a.drop(columns=[7])
        t_a.columns = ["Name", "Occupation", "Party", "District No.", "Seat No."]
        if len(t[1].df.columns) == 30:
            t_b = t[1].df.iloc[:, [0, 6, 7, 24, 25, 26]]
            t_b.loc[:, 6] = t_b.loc[:, 6] + " " + t_b.loc[:, 7]
            t_b = t_b.drop(columns=[7])
        else:
            t_b = t[1].df.iloc[:, [0, 7, 40, 41, 42]]
        t_b.columns = ["Name", "Occupation", "Party", "District No.", "Seat No."]
    table = pd.concat([t_a, t_b]).reset_index(drop=True)
    table = assembly_cleaner(table, i)

    positions = t[2].df.iloc[:, :2].rename(columns={0: "Position", 1: "Name"})
    positions = positions.loc[~positions["Name"].isin(['', ' ', 'Name'])]
    positions['Position'] = positions['Position'].apply(word_cleaner)
    positions[['First', 'Last']] = positions['Name'].apply(assembly_name_cleaner).apply(pd.Series)
    positions = positions.loc[~positions['Position'].isin(['ChiefClerk', 'ChiefSergeantatArms'])].drop(columns=['Name'])
    tale = table.merge(positions, on=['First', 'Last'], how='left').drop(columns=['Name'])
    return tale

In [4]:
assembly_roster = pd.DataFrame(columns=['Occupation', 'Party', 'District No.', 'Seat No.', 'Term', 'First', 'Last', 'Position'])
for i in np.arange(1, 27, 2):
    page_numbers = f"{i},{i+1}"
    t = camelot.read_pdf('Assembly_Rosters.pdf', pages=page_numbers, flavor='stream')
    table = process_candidate_data(t, i)
    assembly_roster = pd.concat([assembly_roster, table], ignore_index=True)

In [9]:
camelot.read_pdf('Assembly_Rosters.pdf', pages="1,2", flavor='stream')[0].df

Unnamed: 0,0,1,2,3,4,5,6,7
0,,,,District,Seat,Office,Local,
1,Name,Occupation,Party,No.,No.,,,District Office Mailing Address
2,,,,,,No.,Phone,
3,"Aanestad, Sam\n. . . . . . . . . . . . . . . ....",Oral Surgeon\n. . . . . . . . . . . . . . . . ...,R,3,60,4144,319-2003,"350 Crown Point Circle, Suite 150, Grass Valle..."
4,"Alquist, Elaine\n. . . . . . . . . . . . . . ....",Businesswoman/Educator\n. . . . . . . . . .,D,22,15,3120,319-2022,"100 Paseo De San Antonio, Room 319, San Jose 9..."
5,"Aroner, Dion\n. . . . . . . . . . . . . . . . ...",State Social Services Specialist\n. . . . .,D,14,8,2163,319-2014,"918 Parker Street, Suite A–13, Berkeley 94710"
6,"Ashburn, Roy\n. . . . . . . . . . . . . . . . ...",Legislator\n. . . . . . . . . . . . . . . . . ...,R,32,65,4167,319-2032,"1200 Truxtun Avenue, Suite 120, Bakersfield 93301"
7,"Bates, Patricia\n. . . . . . . . . . . . . . ....",Legislator\n. . . . . . . . . . . . . . . . . ...,R,73,64,6031,319-2073,"30012 Ivy Glenn Drive, Suite 120, Laguna Nigue..."
8,"Bogh, Russ 1\n. . . . . . . . . . . . . . . . ...",Businessman/Legislator\n. . . . . . . . . . .,R,65,53,2002,319-2065,"34932 Yucaipa Boulevard, Yucaipa 92399"
9,"Briggs, Mike\n. . . . . . . . . . . . . . . . ...",Legislator\n. . . . . . . . . . . . . . . . . ...,R,29,49,2130,319-2029,"83 East Shaw Avenue, Suite 202, Fresno 93710"


In [13]:
assembly_roster.to_csv("assembly_roster.csv", index=False)

## Senate Committees

In [3]:
senate_reader = PdfReader('Senate_Committees.pdf')

In [4]:
def early_format_sen_coms(page, page_num):
    senate_coms = {}
    t = unidecode(page)
    t_ = re.sub(r'(?:(?:MEMBERSHIPS)|(?:Continued)|(?:continued))(?=[\nA-Za-z])', '. ', t)
    if page_num in [0, 3, 5,6, 7]:
        matches = [m for m in re.finditer(r'(?<=\.)\s*([A-Z][A-Za-z\s\']+)(?=\s*--\(\d+\)--\s*)', str(t_))]
    else:
        matches = [m for m in re.finditer(r'(?<=\.)\s*\n([A-Z][A-Za-z\s\'-]+)(?=\s*--\(\s*\d+\s*\)--\s*)', str(t_))]
    for i, m in enumerate(matches):
        start_ = m.start()
        end_ = matches[i+1].start() - 1 if i < len(matches)-1 else len(str(t_))
        string_ = str(t_)[start_:end_].split(r'--')
        name = re.sub(r'(?<=\w)\n(?=\w)', '', string_[0]).strip()
        committees = string_[2].strip().split(';')
        senate_coms[name] = {'committees': {}}
        for committee in committees:
            committee_ = re.sub(r'\n', ' ', committee)
            position_match = re.search(r'((?:\s*C\s*o\s*-\s*C\s*h\s*a\s*i\s*r\s*)|(?:(?:\s*C\s*o)*\s*C\s*h\s*a\s*i\s*r)|(?:\s*V\s*ic\s*e\s*-*C\s*hai\s*r)|(?:(?:\s*R\s*e\s*p\s*u\s*b\s*l\s*i\s*c\s*a\s*n)|(?:\s*D\s*e\s*m\s*o\s*c\s*r\s*a\s*t\s*i\s*c\s*)\s*A\s*l\s*ter\s*nate))', committee_)
            if position_match:
                position = position_match.group(1).strip()
                replace = rf'\(\s*{re.escape(position)}\s*\)'
                committee_ = re.sub(replace, '', committee_).strip()
            else:
                position = 'Member'
                committee_ = committee_.strip()
            committee__ = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', committee_).strip()
            committee___ = re.sub(r'(?<=[a-z])\s+(?=[a-z]{1,2}\b)', '', committee__).strip()

            senate_coms[name]['committees'].update({committee___: position})
    return senate_coms

In [5]:
def later_format_sen_coms(page, page_num, year, leftovers=None):
    senate_coms = {}
    holding = None
    t = unidecode(page)
    t_ = re.sub(r'(\n*SENATOR.*COMMITTEE.*(?:\n\s*(?:AND)*.*COUNCIL).*MEMBERSHIPS)', '', t)
    t__ = re.sub(r'Boar\nds', 'Boards', t_)
    if page_num == 24:
        matches = [m for m in re.finditer(r'--[Cc]ontinued\s*\n*([A-Z][A-Za-z\s\']+)(?=\s*--continued)', str(t__))]
    elif page_num == 32:
        matches = [m for m in re.finditer(r'(?:Continued)*\n\s*([A-Z][A-Za-z\s\']+)\n', str(t__))]
    elif page_num == 46:
        matches = [m for m in re.finditer(r'\n([A-Z][\w\s]+)\s*\n(?:(?:S\s*t\s*a\s*n\s*d\s*i\s*n\s*g)|(?:\s*S\s*e\s*l\s*e\s*c\s*t))\s*\n*\s*C\s*o\s*m\s*m\s*i\s*t\s*t\s*e\s*e\s*s\s*--\(\s*\w+\s*\)', str(t__))]
    else:
        matches = [m for m in re.finditer(r'\n([A-Z][\w\s\']+)(?:\s*-*-*\s*continued\s*)*\n[\w\s,]+--\(\s*\w+\s*\)', str(t__))]
    for i, match in enumerate(matches):
        end = matches[i+1].start() - 1 if i < len(matches)-1 else len(str(t__))
        if leftovers is None or i > 0:
            if page_num == 32:
                name_ = re.search(r'\n(.*)\n', match.group()).group().strip()
            else:
                name_ = re.search(r'\n(.*)\s*\n(?=Standing\s*\n*\s*C\s*om\s*m\s*i\s*t\s*t\s*e\s*es*)', match.group()).group().strip()
        else:
            if page_num == 24 or page_num == 30:
                name_ = match.group(1).strip()
            else:
                name_ = re.search(r'\n(.*)(?=\s*-*-*(?:[C|c]ontinued\s*)|\n*[A-Z](?:.*\n*\s*C\s*o\s*m\s*m\s*i\s*t\s*t\s*e\s*es*)]*)', match.group()).group().strip()
                name_ = re.sub(r'--\s*(?:[C|c]ontinued)*', '', name_).strip()
        _start = match.end()
        coms = str(t__)[_start:end]
        clean = [c for c in coms.split('--') if re.search(r'\([1-9]+\)', c) is None]
        cleaned = [re.sub(r'\n*(?:\s*S\s*e\s*l\s*e\s*c\s*t\s* C\s*o\s*m\s*m\s*i\s*t\s*t\s*e\s*e\s*s)|(?:S\s*u\s*b\s*c\s*o\s*m\s*m\s*i\s*t\s*t\s*e\s*e\s*s)|(?:J\s*o\s*i\s*n\s*t C\s*o\s*m\s*m\s*i\s*t\s*t\s*e\s*e\s*s)|(?:B\s*o\s*a\s*r\s*d\s*s, C\s*o\s*m\s*m\s*i\s*s\s*s\s*i\s*o\s*n\s*s, a\s*n\s*d C\s*o\s*u\s*n\s*c\s*i\s*l\s*s)', ' ', c) for c in clean]
        cleaned_ = re.sub(r'(?<!No)(?<=[a-z\s\)])\.(?=[\s\n]*[A-Z]*)', '; ', ''.join(cleaned), flags=re.UNICODE)
        committees = cleaned_.split(';')
        senate_coms[name_] = {'committees': {}}
        if not (leftovers is None or i > 0):
            if name_ in leftovers:
                senate_coms[name_]['committees'].update(leftovers[name_]['committees'])
        for j, committee in enumerate(committees):
            committee_ = re.sub(r'\n', ' ', committee)
            position_match = re.search(r'((?:\s*C\s*o\s*-\s*C\s*h\s*a\s*i\s*r\s*)|(?:(?:\s*C\s*o)*\s*C\s*h\s*a\s*i\s*r)|(?:\s*V\s*ic\s*e\s*-*C\s*hai\s*r)|(?:(?:\s*R\s*e\s*p\s*u\s*b\s*l\s*i\s*c\s*a\s*n)|(?:\s*D\s*e\s*m\s*o\s*c\s*r\s*a\s*t\s*i\s*c\s*)\s*A\s*l\s*ter\s*nate))', committee_)
            if position_match:
                position = position_match.group(1).strip()
                replace = rf'\(\s*{re.escape(position)}\s*\)'
                committee__ = re.sub(replace, '', committee_).strip()
            else:
                position = 'Member'
                committee__ = committee_.strip()
            committee___ = re.sub(r'\(\s*\d+\s*\)*|(?<=\w)-(?=\s*\w)', '', committee__).strip()
            committee____ = re.sub(r'\s{2,}', ' ', committee___).strip()
            committee____ = committee____.replace('(Serves Ex Officioon all Standing and Joint Committees)', '')
            if j == len(committees) - 1:
                committee____ = re.sub(r'(?<=[^No])\s*\.\s*.*$', '', committee____)

            if committee____ != '':
                if 'Rules' in senate_coms[name_]['committees'].keys() and 'Rules' in committee____:
                    senate_coms[name_]['committees'].update({f"Joint Committee on {committee____}": position})
                else:
                    if committee____ == "Select Committees2020 United States Census":
                        committee____ = "Select Committee on 2020 United States Census"
                    if re.sub(r'\s*\(\s*\d+\s*\)\s*', '', committee____) != '':
                        senate_coms[name_]['committees'].update({committee____: position})

        if i == len(matches) - 1:
            if ((year == 2010) and not any(['Joint Committee' in c for c in clean])) or ((year != 2010) and not any(['Boards, Commissions, and Councils' in c for c in clean])):
                holding = {name_: senate_coms[name_]}
                senate_coms.pop(name_)
    return holding, senate_coms


In [6]:
current_year = 2002
senator_committees = {}
leftover = None
year_values = []
for i, page_raw in enumerate(senate_reader.pages):
    try:
        hold = leftover
        leftover = None
        page = page_raw.extract_text()
        if (i <= 8) or (i >= 49):
            page_ = early_format_sen_coms(page, i)
        else:
            leftover, page_ = later_format_sen_coms(page, i, current_year, hold)
        keys = [k for k in page_.keys()]

        if (keys[0].startswith('A')) and i != 0:
            senator_committees[str(current_year)] = {k: v for d in year_values for k, v in d.items()}
            year_values = [page_]
            current_year += 2
        elif i == len(senate_reader.pages) - 1:
            year_values.append(page_)
            senator_committees[str(current_year)] = dict((k, v) for d in year_values for k, v in d.items())
        else:
            year_values.append(page_)
    except:
        print(i)


In [7]:
senators = []
for term, politicians in senator_committees.items():
    for politician, details in politicians.items():
        for committee, position in details['committees'].items():
            senators.append({'term': term, 'politician': politician, 'committee': committee, 'position': position})
senators_committees = pd.DataFrame(senators)
senators_committees['term'] = senators_committees['term'].apply(lambda x: f"{int(x)-1}-{x}")

In [8]:
senators_committees = senators_committees.loc[(senators_committees['committee'] != "(Serves Ex Officioon all Standing and Joint Committees)") & (senators_committees['committee'] != "(4)")]

In [9]:
senators_committees.loc[senators_committees['committee'].str.contains('(Serves Ex Officioon)'), 'committee'] = senators_committees.loc[senators_committees['committee'].str.contains('(Serves Ex Officioon)'), 'committee'].apply(lambda x: x.split('.')[0])

  senators_committees.loc[senators_committees['committee'].str.contains('(Serves Ex Officioon)'), 'committee'] = senators_committees.loc[senators_committees['committee'].str.contains('(Serves Ex Officioon)'), 'committee'].apply(lambda x: x.split('.')[0])


In [10]:
senators_committees.to_csv('senators_committees.csv', index=False)

#### Assembly Committees

In [11]:
assembly_reader = PdfReader('assembly_committees.pdf')

In [12]:
def assembly_committees(page, year, page_num):
    t = unidecode(page)
    assembly_coms = {}
    y = year
    matches = [m for m in re.finditer(r'(?:(?:[Cc]ontinued)|\.)\s*\n\s*([A-Z]+[\w+,\'\-\s]*)\s*--', str(t))]
    if (matches[0].group(1).strip().startswith('A')) or (matches[0].group(1).strip().startswith('B')):
        if page_num < 74:
            term = re.search(r'(\d{4}-\s*\d{2,})\s*R\s*E\s*G\s*U\s*L\s*A\s*R', str(t))
            if term is not None:
                term = f"20{term.group(1).split('-')[1].strip()}"
                if term != year:
                    y = term
        else:
            term = re.search(r'[A-Z]+\s*\d+,*\s*(\d+)\s*\n', str(t))
            if term is not None:
                if len(term.group(1)) > 4:
                    term_ = term.group(1)[:4]
                else:
                    term_ = term.group(1)
                if term_ != year:
                    if term_ == '2025':
                        y = '2026'
                    else:
                        y = term_
    for i, m in enumerate(matches):
        end = matches[i+1].start() if i < len(matches)-1 else len(str(t))
        start = m.end()
        c_string = str(t)[start:end].strip()
        name = m.group(1).strip()
        assembly_coms[name] = {'committees': {}}
        committees = c_string.split(';')
        for committee in committees:
            committee_ = re.sub(r'\n', ' ', committee)
            position_match = re.search(r'((?:\s*C\s*o\s*-\s*C\s*h\s*a\s*i\s*r\s*)|(?:(?:\s*C\s*o)*\s*C\s*h\s*a\s*i\s*r)|(?:\s*V\s*ic\s*e\s*-*C\s*hai\s*r)|(?:(?:\s*R\s*e\s*p\s*u\s*b\s*l\s*i\s*c\s*a\s*n)|(?:\s*D\s*e\s*m\s*o\s*c\s*r\s*a\s*t\s*i\s*c\s*)\s*A\s*l\s*ter\s*nate))', committee_)
            if position_match:
                position = position_match.group(1).strip()
                replace = rf'\(\s*{re.escape(position_match.group())}\s*\)'
                committee__ = re.sub(replace, '', committee_).strip()
            else:
                position = 'Member'
                committee__ = committee_.strip()
            if re.search(r'\.\s*-+$', committee__) is not None:
                committee__ = re.sub(r'\.\s*-+$', '', committee__).strip()
            if committee__ != '':
                assembly_coms[name]['committees'].update({committee__: position})
    return y, assembly_coms

In [13]:
assembly_coms = {}
term_values = []
year = '2002'
for i, page in enumerate(assembly_reader.pages):
    p = page.extract_text()
    year, v = assembly_committees(p, year, i)
    if year not in assembly_coms.keys():
        assembly_coms[year] = v
    else:
        assembly_coms[year].update(v)

In [14]:
assembly = []
for term, politician in assembly_coms.items():
    for politician, details in politician.items():
        for committee, position in details['committees'].items():
            assembly.append({'term': term, 'politician': politician, 'committee': committee, 'position': position})
assembly_committees = pd.DataFrame(assembly)
assembly_committees['term'] = assembly_committees['term'].apply(lambda x: f"{int(x)-1}-{x}")

In [15]:
assembly_committees.to_csv('assembly_committees.csv', index=False)