In [55]:
import camelot.io as camelot
import warnings
import pandas as pd
import re
import numpy as np
from unidecode import unidecode
from pypdf import PdfReader

# Silence pandas' SettingWithCopyWarning for the whole notebook: the cleaning
# helpers defined in later cells assign into frames that were produced by
# slicing other frames (e.g. `df_a[pair[0]] = ...` in `table_cleaner`).
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

## Rosters

In [6]:
# Per-page-pair overrides for roster pages that the default extraction cannot
# handle. Keys are the "first,second" page strings built in the roster loop.
#
# Meaning of each field, as consumed by `table_scraper` / `table_cleaner`:
#   'flavor'      -> passed to `camelot.read_pdf(..., flavor=...)`.
#   '0_x', '1_x'  -> first row position kept from table 0 / table 1
#                    (`.iloc[x:, ...]`).
#   '0_y', '1_y'  -> column positions kept from table 0 / table 1.
#   'combine'     -> keyed by table ('0' / '1'); each tuple lists column labels
#                    whose text is joined with spaces into the first label, the
#                    remaining labels are then dropped.
#   'checks'      -> column name(s) handed to `missing_combiner`. Either a flat
#                    list (used for whichever table is combined) or a two-item
#                    list of lists, one per table.
special_cases = {
    '11,12': {
        'flavor': 'network',
        '0_x': 3,
        '0_y': [0, 2, 3, 4, 5, 6, 7],
        '1_x': 3,
        '1_y': [0, 1, 2, 3, 4, 5],
        'combine': {'0': [(2, 3)]},
        'checks': ['Occupation']
    },
    '15,16': {
        'flavor': 'stream',
        '0_x': 4,
        '0_y': [0, 1, 6, 7, 10, 11, 12, 14],
        '1_x': 0,
        '1_y': [0, 1, 2, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 25],
        'combine': {'0': [(0,1), (6,7)], '1': [(1, 2), (17, 18, 19, 20, 21, 22, 23, 25)]},
        'checks': [['Name', 'Occupation'], ['Occupation']]
    },
    '21,22': {
        'flavor': 'stream',
        '0_x': 4,
        '0_y': [0, 1, 10, 29, 30, 31, 33],
        '1_x': 0,
        '1_y': [0, 10, 11, 13, 14, 15, 17],
        'combine': {'0': [(0,1)], '1': [(10, 11)]},
        'checks': [['Name'], ['Occupation']]
    },
    '23,24': {
        'flavor': 'stream',
        '0_x': 4,
        '0_y': [0, 1, 10, 11, 25, 26, 27, 29],
        '1_x': 0,
        '1_y': [0, 12, 13, 25, 26, 27, 29],
        'combine': {'0': [(0,1), (10, 11)], '1': [(12, 13)]},
        'checks': [['Name', 'Occupation'], ['Occupation']]
    },
    '25,26': {
        'flavor': 'stream',
        '0_x': 4,
        '0_y': [0, 9, 28, 29, 30, 32],
        '1_x': 0,
        '1_y': [0, 1, 6, 7, 10, 11, 12, 14, 15, 16],
        'combine': {'1': [(0, 1), (6, 7), (14, 15, 16)]},
        'checks': ['Name', 'Occupation']
    }
}

In [7]:
def word_cleaner(word, name=False):
    """Transliterate ``word`` to ASCII and drop characters outside a whitelist.

    Letters, commas, whitespace and ``/`` always survive; hyphens survive only
    when ``name`` is true. Surrounding whitespace is stripped from the result.

    NOTE: a later cell in this notebook defines another ``word_cleaner`` with
    different rules; once that cell has run it replaces this definition.
    """
    ascii_text = str(unidecode(word))
    disallowed = r'[^a-zA-Z\,\s/\-]' if name else r'[^a-zA-Z\,\s/]'
    return re.sub(disallowed, '', ascii_text).strip()

def column_cleaner(df):
    """Turn one parsed six-column roster table into one clean row per member.

    Steps, in order:
      1. Name the six columns and clean the ``Name`` column (``df`` itself is
         modified by this step and by step 2).
      2. Starting from every row whose ``Name`` and ``Counties`` are both
         empty, walk upward: county text found on nameless rows is collected
         and appended to the nearest row above that has a name.
      3. Keep only rows that have both a name and counties.
      4. Split ``Party`` / ``District No.`` cells that contain the values of
         neighbouring rows ("D R", "33 34") back onto those rows.
      5. Reduce ``District No.`` / ``Seat No.`` to digits and clean the
         ``Occupation`` / ``Counties`` text.

    Parameters
    ----------
    df : pandas.DataFrame
        Exactly six columns of strings, in roster order.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index.
    """
    df.columns = ['Name', 'Occupation', 'Party', 'District No.', 'Seat No.', 'Counties']
    df['Name'] = df['Name'].apply(word_cleaner, name=True)

    # Anchors are fully blank rows (no name, no counties), processed bottom-up.
    # NOTE(review): a nameless row that carries county text is only folded in
    # when such a blank row sits somewhere below it - confirm that is intended.
    county_skips = df.loc[(df['Name'] == '') & (df['Counties'] == '')].index.sort_values(ascending=False).tolist()
    for ind in county_skips:
        leftover = df.loc[ind, 'Counties']
        c = ind - 1
        # Stop at the top of the frame instead of indexing a missing label;
        # anything gathered from nameless rows only would be dropped in step 3
        # anyway.
        while c in df.index:
            if df.loc[c, 'Name'].strip() == '':
                leftover = df.loc[c, 'Counties'] + ' ' + leftover
                df.loc[c, 'Counties'] = ''
                c -= 1
            else:
                df.loc[c, 'Counties'] = df.loc[c, 'Counties'] + ' ' + leftover
                df.loc[ind, 'Counties'] = ''
                break

    cleaned = df.loc[(df['Name'] != '') & (df['Counties'] != '')].reset_index(drop=True)
    cleaned['Party'] = cleaned['Party'].apply(lambda x: re.sub(r'[^a-zA-Z\s]+', '', x))

    # A cell such as "D R" holds the previous row's party followed by this
    # row's; three tokens also include the next row's. Split on any whitespace
    # so stray or doubled spaces and newlines do not produce empty tokens.
    party_skips = cleaned.loc[(~cleaned['Party'].isin(['D', 'R'])) & (cleaned['Party'] != ''), 'Party']
    for i, p in party_skips.items():
        missed = p.split()
        if len(missed) in (2, 3):
            cleaned.loc[i, 'Party'] = missed[1]
            # Only write to neighbours that exist; assigning to a missing label
            # through .loc would append a new, mostly-empty row.
            if i - 1 in cleaned.index:
                cleaned.loc[i - 1, 'Party'] = missed[0]
            if len(missed) == 3 and i + 1 in cleaned.index:
                cleaned.loc[i + 1, 'Party'] = missed[2]
        elif len(missed) <= 1:
            # Single value with surrounding whitespace (or whitespace only).
            cleaned.loc[i, 'Party'] = missed[0] if missed else ''

    # Same idea for district numbers: "33 34" -> previous row 33, this row 34.
    district_skips = cleaned.loc[cleaned['District No.'].apply(lambda x: re.search(r'\s+', str(x)) is not None), 'District No.']
    for i, d in district_skips.items():
        d_real = str(d).split()
        if len(d_real) == 2:
            cleaned.loc[i, 'District No.'] = d_real[1]
            if i - 1 in cleaned.index:
                cleaned.loc[i - 1, 'District No.'] = d_real[0]

    cleaned['District No.'] = cleaned['District No.'].apply(lambda x: re.sub(r'[^0-9]+', '', str(x)))
    cleaned['Seat No.'] = cleaned['Seat No.'].apply(lambda x: re.sub(r'[^0-9]+', '', str(x)))

    for col in ['Occupation', 'Counties']:
        cleaned[col] = cleaned[col].apply(word_cleaner)

    return cleaned

def missing_combiner(col, df):
    """Fold continuation rows of ``col`` into the row directly above them.

    A continuation row is one whose ``Name`` is empty while ``col`` is not.
    Rows are handled bottom-up, so a run of continuation rows cascades into the
    first row above the run. ``df`` is modified in place.

    A continuation row at the very top has nothing above it and is left
    untouched rather than raising ``KeyError``.

    Parameters
    ----------
    col : str
        Column to repair; must be a unique column label of ``df``.
    df : pandas.DataFrame
        Frame with a ``Name`` column and string values in ``col``.

    Returns
    -------
    pandas.Series
        The repaired column, ``df[col]``.
    """
    col_pos = df.columns.get_loc(col)
    # Snapshot taken up front: rows are selected by their original content.
    overflow = ((df['Name'] == '') & (df[col] != '')).to_numpy()
    # Positional access keeps "the row above" well defined whatever the labels.
    for pos in range(len(df) - 1, 0, -1):
        if overflow[pos]:
            df.iloc[pos - 1, col_pos] = df.iloc[pos - 1, col_pos] + ' ' + df.iloc[pos, col_pos]
            df.iloc[pos, col_pos] = ''
    return df[col]

def _merge_split_columns(df, groups, checks, side):
    """Re-join split text columns of one table, name the six roster columns, and fold continuation rows."""
    for group in groups:
        keep = group[0]
        df[keep] = df[list(group)].apply(lambda row: " ".join(row.dropna().astype(str)), axis=1)
        extras = [label for label in group if label != keep]
        df = df[[label for label in df.columns if label not in extras]].copy()

    df.columns = ['Name', 'Occupation', 'Party', 'District No.', 'Seat No.', 'Counties']

    # 'checks' is a single-item list, a per-table pair of lists, or a flat
    # list applied as a whole.
    if len(checks) == 1:
        to_fill = checks[0]
    elif isinstance(checks[side], list):
        to_fill = checks[side]
    else:
        to_fill = checks
    if isinstance(to_fill, str):
        to_fill = [to_fill]
    for check in to_fill:
        df[check] = missing_combiner(check, df)
    return df


def table_cleaner(table, p=False, params=None):
    """Clean the two tables parsed from a page pair and stack them.

    Parameters
    ----------
    table : sequence
        Parsed tables; items 0 and 1 must expose a DataFrame as ``.df``.
    p : bool
        False uses the default layout (table 0 from row 3 on, columns
        0, 1, 2, 3, 4, 6 of both tables). True uses ``params``.
    params : dict, optional
        Required when ``p`` is true. Reads ``'0_x'``, ``'0_y'``, ``'1_x'``,
        ``'1_y'`` for the row/column selection, and ``'combine'`` plus
        ``'checks'`` for re-joining split columns (see ``special_cases``).

    Returns
    -------
    pandas.DataFrame
        Table 0's cleaned rows followed by table 1's, with a fresh index.
    """
    if not p:
        df_a = table[0].df.iloc[3:, [0, 1, 2, 3, 4, 6]].reset_index(drop=True)
        df_b = table[1].df.iloc[:, [0, 1, 2, 3, 4, 6]].copy()
    else:
        # Work on explicit copies so later assignments never target a slice of
        # the parser's own frames.
        df_a = table[0].df.iloc[params['0_x']:, params['0_y']].copy()
        df_b = table[1].df.iloc[params['1_x']:, params['1_y']].copy()
        combine = params.get('combine', {})
        if '0' in combine:
            df_a = _merge_split_columns(df_a, combine['0'], params['checks'], 0)
        if '1' in combine:
            df_b = _merge_split_columns(df_b, combine['1'], params['checks'], 1)

    clean_b = column_cleaner(df_b).reset_index(drop=True)
    clean_a = column_cleaner(df_a).reset_index(drop=True)
    return pd.concat([clean_a, clean_b]).reset_index(drop=True)

def table_scraper(pages, p=False, params=None, pdf_path='senator_people.pdf'):
    """Read roster tables from a PDF page range and return them cleaned.

    Parameters
    ----------
    pages : str
        Page specification passed to ``camelot.read_pdf`` (e.g. ``"1,2"``).
    p : bool
        When true, ``params`` supplies the extraction flavor and layout;
        otherwise the ``'stream'`` flavor and the default layout are used.
    params : dict, optional
        Layout overrides; must contain ``'flavor'`` when ``p`` is true.
    pdf_path : str
        PDF to read. Defaults to the Senate roster file this notebook uses.

    Returns
    -------
    pandas.DataFrame
        The result of ``table_cleaner`` on the parsed tables.
    """
    flavor = params['flavor'] if p else 'stream'
    tables = camelot.read_pdf(pdf_path, pages=pages, flavor=flavor)
    return table_cleaner(tables, p=p, params=params)

In [8]:
# Build the Senate roster: each odd page i and the page after it form one
# table pair, labelled with the term "{2000+i}-{2000+i+1}".
# Frames are collected and concatenated once; growing a DataFrame inside the
# loop re-copies every earlier row on each pass.
roster_frames = []
for i in np.arange(1, 27, 2):
    page_numbers = f"{i},{i+1}"
    # Page pairs listed in `special_cases` need their own extraction settings.
    pars = special_cases.get(page_numbers)
    if pars is not None:
        tab = table_scraper(page_numbers, p=True, params=pars)
    else:
        tab = table_scraper(page_numbers, p=False)
    term = f'{2000 + i}-{2000 + i + 1}'
    tab['Term'] = term
    roster_frames.append(tab)
senator_roster = pd.concat(roster_frames)

In [None]:
# Renumber rows 0..n-1 across all terms; the per-term tables were stacked with
# their own 0-based indexes. Reassign rather than mutate in place.
senator_roster = senator_roster.reset_index(drop=True)

Unnamed: 0,Name,Occupation,Party,District No.,Seat No.,Counties,Term
0,"Ackerman, Dick",Business Lawyer,R,33,4066,Orange,2001-2002
1,"Alarcon, Richard",Fulltime Legislator,D,20,4035,Los Angeles,2001-2002
2,"Alpert, Dede",Fulltime Legislator,D,39,5050,San Diego,2001-2002
3,"Battin, Jim",Businessman,R,37,3074,"Imperial, Riverside, San Diego",2001-2002
4,"Bowen, Debra",Public Law Attorney,D,28,4040,Los Angeles,2001-2002
...,...,...,...,...,...,...,...
515,"Valladares, Suzette",,R,23,7140,"Los Angeles, San Bernardino",2025-2026
516,"Wahab, Dr Aisha",Businesswoman,D,10,8530,"Alameda, Santa Clara",2025-2026
517,"Weber-Pierson, Dr Akilah",Physician,D,39,7310,San Diego,2025-2026
518,"Wiener, Scott D",Legislator/Attorney,D,11,8620,"San Francisco, San Mateo",2025-2026


In [10]:
def name_clean(name):
    """Normalise a member name to plain ASCII letters, commas, slashes and spaces.

    A trailing generational suffix introduced by a comma (", Jr", ", Sr",
    ", Jr.", ", II", ", III", ...) is removed, hyphens become spaces, and every
    other character outside the whitelist is dropped.

    The suffix patterns are applied unconditionally. Gating them on
    ``endswith("Jr")`` / ``endswith("III")`` meant "Sr", a dotted "Jr." and
    "II" were never stripped even though the patterns were written for them.
    """
    wd = str(unidecode(name))
    wd = re.sub(r"\,\s*(J|S)r\.*\s*$", "", wd)
    wd = re.sub(r"\,\s*I{2,}\s*$", "", wd)
    clean = re.sub(r"\-", " ", wd)
    clean = re.sub(r"[^a-zA-Z\,/\s]", "", clean).strip()
    return clean


def assembly_name_cleaner(name):
    """Split a raw roster name into ``(first, last)``.

    The name is first cleaned and split as-is. If that yields nothing
    (``first_last_name`` signals failure by returning two empty strings, it
    does not raise) or fails, a second attempt collapses repeated whitespace
    and inserts a comma between a lower-case letter and a following
    capitalised word before splitting again.

    Returns
    -------
    tuple of (str, str)
        ``(first, last)``; both empty when neither attempt can split the name.
    """
    try:
        first, last = first_last_name(name_clean(name))
    except Exception:
        # Fall through to the repair attempt below; it re-raises if the input
        # itself is unusable.
        first, last = '', ''
    if first or last:
        return first, last

    name1 = re.sub(r"\s{2,}", " ", name).strip()
    name2 = re.sub(r"(?<=[a-z])\s+(?=[A-Z][a-z]+)", ", ", name1)
    return first_last_name(name_clean(name2))


def assembly_cleaner(df, i):
    """Label, tidy and name-split one term's Assembly roster.

    ``df`` gets ``Term`` set to "{2000+i}-{2000+i+1}", ``Occupation`` cleaned
    and ``Party`` stripped (these three writes modify ``df`` itself). Rows whose
    ``Name`` is empty, a single space, or the header text "Name" are then
    dropped and ``First`` / ``Last`` columns are added.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with a fresh index.
    """
    df["Term"] = f"{2000 + i}-{2000 + i + 1}"
    df["Occupation"] = df["Occupation"].apply(word_cleaner)
    df["Party"] = df["Party"].str.strip()

    # Copy so the new columns land on an independent frame, not a slice.
    df_clear = df.loc[~df["Name"].isin(["", " ", "Name"])].copy()

    # Plain lists also work when no rows survive the filter, where expanding
    # through `.apply(pd.Series)` yields a frame with no columns to assign.
    parsed = [assembly_name_cleaner(raw_name) for raw_name in df_clear["Name"]]
    df_clear["First"] = [first for first, _ in parsed]
    df_clear["Last"] = [last for _, last in parsed]

    return df_clear.reset_index(drop=True)

def word_cleaner(word, name=False):
    """Reduce ``word`` to ASCII letters, commas and slashes.

    This redefines the ``word_cleaner`` from the earlier roster cell. Unlike
    that version, the first pass here also removes whitespace and hyphens, so
    multi-word input comes back run together.

    The second pass is kept exactly as written:
      * ``name=True``: the pattern is a character class; of the characters that
        survive the first pass it can only match commas, which become spaces.
      * ``name=False``: it targets runs of whitespace, none of which remain
        after the first pass.
    NOTE(review): confirm whether removing all whitespace is intended.
    """
    letters_only = re.sub(r"[^a-zA-Z\,/]", "", str(unidecode(word)))
    second_pass = r"[-(?:\s{2,})]" if name else r"\s{2,}"
    return re.sub(second_pass, " ", letters_only).strip()


def first_last_name(name):
    """Split a "Last, First" style name into ``(first, last)``.

    * Any name containing "Vacancy" yields ``("Vacancy", "Vacancy")``.
    * A trailing comma-introduced suffix (", Jr", ", Sr.", ", II", ", III")
      is removed first.
    * The name is then split on ", ", then ",", then a single space; the first
      separator that produces exactly two parts wins, and the parts are read
      as last name followed by first name.

    Returns
    -------
    tuple of (str, str)
        ``(first, last)``, or ``('', '')`` when no separator gives two parts.
    """
    if "Vacancy" in name:
        return "Vacancy", "Vacancy"

    # Applied unconditionally so "Sr", "Jr." and "II" are covered too.
    name = re.sub(r"\,\s*(J|S)r\.*\s*$", "", name)
    name = re.sub(r"\,\s*III*\s*$", "", name)

    for separator in (", ", ",", " "):
        parts = name.split(separator)
        if len(parts) == 2:
            last, first = parts
            # Every separator returns the same (first, last) order.
            return first, last
    return '', ''

In [11]:
def _shape_roster_table(raw, keep, merges=(), patches=()):
    """Select columns of one parsed table, patch cells, re-join split text columns, and apply the five roster column names."""
    frame = raw.iloc[:, keep].copy()
    # Patches are positional (row, column) within the selected columns and are
    # applied before any columns are joined.
    for row_pos, col_pos, value in patches:
        frame.iloc[row_pos, col_pos] = value
    for target, extra in merges:
        frame[target] = frame[target] + " " + frame[extra]
    frame = frame.drop(columns=[extra for _, extra in merges])
    frame.columns = ["Name", "Occupation", "Party", "District No.", "Seat No."]
    return frame


def process_candidate_data(t, i):
    """Build one term's Assembly roster, with leadership positions attached.

    Parameters
    ----------
    t : sequence
        Parsed tables for a page pair; items 0 and 1 hold the roster halves
        and item 2 the positions table, each exposing a DataFrame as ``.df``.
    i : int
        Odd page number of the pair; the term label is derived from it.

    Returns
    -------
    pandas.DataFrame
        Roster rows (without ``Name``) left-joined to positions on
        ``First`` / ``Last``.

    Raises
    ------
    ValueError
        If table 0 has a column count that none of the known layouts covers.
    """
    first_half, second_half = t[0].df, t[1].df
    width = len(first_half.columns)

    # The layout is recognised by how many columns the parser produced.
    if width <= 9:
        t_a = _shape_roster_table(first_half, slice(5))
        t_b = _shape_roster_table(second_half, slice(5))
    elif width == 45:
        t_a = _shape_roster_table(first_half, [0, 10, 11, 39, 40, 41], merges=[(10, 11)])
        t_b = _shape_roster_table(second_half, [0, 8, 9, 28, 29, 30], merges=[(8, 9)])
    elif width == 29:
        t_a = _shape_roster_table(first_half, [0, 1, 7, 8, 23, 24, 25], merges=[(0, 1), (7, 8)])
        # Hand-patched cells for this layout.
        # NOTE(review): "Valladres" may be a misspelling of "Valladares" (the
        # Senate roster output spells it that way) - confirm against the PDF.
        t_b = _shape_roster_table(
            second_half,
            [0, 1, 15, 48, 49, 50],
            merges=[(0, 1)],
            patches=[(16, 1, "Valladres, Suzette Martinez"), (17, 0, "")],
        )
    elif width == 26:
        t_a = _shape_roster_table(first_half, [0, 6, 7, 20, 21, 22], merges=[(6, 7)])
        if len(second_half.columns) == 30:
            t_b = _shape_roster_table(second_half, [0, 6, 7, 24, 25, 26], merges=[(6, 7)])
        else:
            t_b = _shape_roster_table(second_half, [0, 7, 40, 41, 42])
    else:
        raise ValueError(
            f"Unrecognised roster layout: table 0 has {width} columns (page pair starting at {i})"
        )

    table = pd.concat([t_a, t_b]).reset_index(drop=True)
    table = assembly_cleaner(table, i)

    positions = t[2].df.iloc[:, :2].rename(columns={0: "Position", 1: "Name"})
    positions = positions.loc[~positions["Name"].isin(['', ' ', 'Name'])].copy()
    positions['Position'] = positions['Position'].apply(word_cleaner)
    parsed = [assembly_name_cleaner(raw_name) for raw_name in positions['Name']]
    positions['First'] = [first for first, _ in parsed]
    positions['Last'] = [last for _, last in parsed]
    # The two excluded labels are matched in the space-free spelling that the
    # cleaned 'Position' values are compared against.
    positions = positions.loc[~positions['Position'].isin(['ChiefClerk', 'ChiefSergeantatArms'])].drop(columns=['Name'])

    return table.merge(positions, on=['First', 'Last'], how='left').drop(columns=['Name'])

In [12]:
# Build the Assembly roster one page pair (= one term) at a time, then stack
# the per-term frames in a single concat instead of growing a DataFrame inside
# the loop. Resulting columns: Occupation, Party, District No., Seat No., Term,
# First, Last, Position.
assembly_frames = []
for i in np.arange(1, 27, 2):
    page_numbers = f"{i},{i+1}"
    t = camelot.read_pdf('assembly_people.pdf', pages=page_numbers, flavor='stream')
    table = process_candidate_data(t, i)
    assembly_frames.append(table)
assembly_roster = pd.concat(assembly_frames, ignore_index=True)

## Senate Committees

In [538]:
# Open the Senate committee-membership PDF; its pages are parsed one by one
# in the cells below.
senate_reader = PdfReader('senators_committees.pdf')

In [539]:
def early_format_sen_coms(page):
    """Parse one page of the "NAME--(n)--Committee; Committee; ..." layout.

    Parameters
    ----------
    page : str
        Text extracted from one PDF page.

    Returns
    -------
    dict
        ``{name: {'committees': {committee: position}}}``. ``position`` is the
        role found in the entry (Chair, Vice Chair, Co-Chair, Republican /
        Democratic Alternate) or ``'Member'`` when none is found.
    """
    senate_coms = {}
    t = unidecode(page)
    # Drop the running page header.
    t_ = re.sub(r'\n*SENATOR.*COMMITTEE.*MEMBERSHIPS(?:.*Continued)*', '', t)
    text = str(t_)
    # An entry starts at a capitalised name that is followed by "--(digits)--".
    matches = [m for m in re.finditer(r'\n([A-Z][A-Za-z\s\']+)(?=\s*--\(\d+\)--\s*)', text)]
    for i, m in enumerate(matches):
        start_ = m.start()
        end_ = matches[i+1].start() - 1 if i < len(matches)-1 else len(text)
        # Pieces: [name, "(n)", committee list, ...]
        string_ = text[start_:end_].split(r'--')
        name = string_[0].strip()
        committees = string_[2].strip().split(';')
        senate_coms[name] = {'committees': {}}
        for committee in committees:
            committee_ = re.sub(r'\n', ' ', committee)
            # The party alternation is grouped so that "Republican Alternate"
            # is captured whole; ungrouped, a bare "Republican" matched and the
            # parenthetical was left inside the committee name.
            position_match = re.search(r'((?:Co-Chair)|(?:(?:Co)*\s*C\s*hair)|(?:\s*V\s*ice\s*Chair)|(?:(?:Republican|Democratic)\s*Alternate))', committee_)
            if position_match:
                position = position_match.group(1).strip()
                replace = rf'\({re.escape(position)}\)'
                committee_ = re.sub(replace, '', committee_).strip()
            else:
                position = 'Member'
                committee_ = committee_.strip()
            # Repair spacing lost in extraction: split "aB" into "a B", then
            # re-attach a stray one- or two-letter lower-case fragment.
            committee__ = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', committee_).strip()
            committee___ = re.sub(r'(?<=[a-z])\s+(?=[a-z]{1,2}\b)', '', committee__).strip()

            senate_coms[name]['committees'].update({committee___: position})
    return senate_coms

In [540]:
def later_format_sen_coms(page, year, leftovers=None):
    """Parse one page of the sectioned layout (Standing / Select / Joint ...).

    Parameters
    ----------
    page : str
        Text extracted from one PDF page.
    year : int
        Session year being parsed; only used to choose which section marks an
        entry as complete (see Returns).
    leftovers : dict, optional
        The ``holding`` value returned for the previous page. When given, the
        first entry on this page is read as a possible continuation and, if its
        name is a key of ``leftovers``, starts from those committees.

    Returns
    -------
    tuple
        ``(holding, senate_coms)``. ``senate_coms`` maps
        ``name -> {'committees': {committee: position}}``. ``holding`` is
        ``None``, or a one-item dict with the page's last entry when none of
        its pieces contains 'Boards, Commissions, and Councils'
        ('Joint Committee' when ``year == 2010``); that entry is then removed
        from ``senate_coms``.
    """
    senate_coms = {}
    holding = None
    t = unidecode(page)
    # Drop the running page header and mend a word broken across lines.
    t_ = re.sub(r'(\n*SENATOR.*COMMITTEE.*(?:\n\s*(?:AND)*.*COUNCIL).*MEMBERSHIPS(?:.*Continued)*)', '', t)
    t__ = re.sub(r'Boar\nds', 'Boards', t_)
    text = str(t__)
    matches = [m for m in re.finditer(r'\n([A-Z][\w\s\']+)(?:-*-*continued\s*)*\n[\w\s,]+--\(\s*\w+\s*\)', text)]
    for i, match in enumerate(matches):
        end = matches[i+1].start() - 1 if i < len(matches)-1 else len(text)
        if leftovers is None or i > 0:
            name_ = re.search(r'\n(.*)\s*\n(?=Standing\n* C\s*ommittees*)', match.group()).group().strip()
        else:
            # First entry after a held-back one: the heading may carry a
            # "--continued" marker, which is removed from the name.
            name_ = re.search(r'\n(.*)(?=\s*-*-*(?:continued\s*)|\n*[A-Z](?:.* Committees*)]*)', match.group()).group().strip()
            name_ = re.sub(r'--\s*(?:[Cc]ontinued)*', '', name_).strip()

        _start = match.end()
        coms = text[_start:end]
        # Discard the "(n)" pieces between "--" markers, blank out section
        # titles, then turn sentence-ending periods into ';' separators.
        clean = [c for c in coms.split('--') if re.search(r'\([1-9]+\)', c) is None]
        cleaned = [re.sub(r'\n*(?:Select Committees*)|(?:Subcommittees*)|(?:Joint Committees*)|(?:Boards, Commissions, and Councils)', ' ', c) for c in clean]
        cleaned_ = re.sub(r'(?<!No)(?<=[a-z\s])\.(?=[\s\n][A-Z]*)', '; ', ''.join(cleaned), flags=re.UNICODE)

        committees = cleaned_.split(';')
        senate_coms[name_] = {'committees': {}}
        if not (leftovers is None or i > 0):
            if name_ in leftovers:
                senate_coms[name_]['committees'].update(leftovers[name_]['committees'])
        for committee in committees:
            committee_ = re.sub(r'\n', ' ', committee)
            # The party alternation is grouped so that "Republican Alternate"
            # is captured whole; ungrouped, a bare "Republican" matched and the
            # parenthetical was left inside the committee name.
            position_match = re.search(r'((?:Co-Chair)|(?:(?:Co)*\s*Chair)|(?:V\s*ice\s*Chair)|(?:(?:Republican|Democratic)\s*Alternate))', committee_)
            if position_match:
                position = position_match.group(1).strip()
                # Escape the captured text before reusing it as a pattern.
                replace = rf'\({re.escape(position)}\)'
                committee__ = re.sub(replace, '', committee_).strip()
            else:
                position = 'Member'
                committee__ = committee_.strip()
            committee___ = re.sub(r'\(\s*\d+\s*\)|(?<=\w)-(?=\s*\w)', '', committee__).strip()
            committee____ = re.sub(r'\s{2,}', ' ', committee___).strip()
            if committee____ != '':
                # A second committee mentioning "Rules" is stored under a
                # "Joint ..." key when the member already has a plain 'Rules'.
                if 'Rules' in senate_coms[name_]['committees'].keys() and 'Rules' in committee____:
                    senate_coms[name_]['committees'].update({f"Joint {committee____}": position})
                else:
                    senate_coms[name_]['committees'].update({committee____: position})

        if i == len(matches) - 1:
            # The last entry lacks the section that closes an entry, so hand it
            # back as `holding` for the next page.
            if ((year == 2010) and not any(['Joint Committee' in c for c in clean])) or ((year != 2010) and not any(['Boards, Commissions, and Councils' in c for c in clean])):
                holding = {name_: senate_coms[name_]}
                senate_coms.pop(name_)
    return holding, senate_coms


In [541]:
# Walk the Senate committee PDF page by page and group the parsed entries by
# session year. Page indexes 0-8 and 49+ use the "early" layout, the rest the
# sectioned "later" layout. A page (other than the first) whose first entry
# starts with 'A' is taken as the start of the next session, two years on.
current_year = 2002
senator_committees = {}
leftover = None
year_values = []
for i, page_raw in enumerate(senate_reader.pages):
    hold = leftover
    leftover = None
    page = page_raw.extract_text()
    if (i <= 8) or (i >= 49):
        page_ = early_format_sen_coms(page)
    else:
        leftover, page_ = later_format_sen_coms(page, current_year, hold)

    # An entry held back from the previous page is merged only if this page
    # opens with the same name. Otherwise keep it with the pages collected so
    # far, so it is not lost.
    if hold:
        orphaned = {
            k: v for k, v in hold.items()
            if k not in page_ and not (leftover and k in leftover)
        }
        if orphaned:
            year_values.append(orphaned)

    keys = [k for k in page_.keys()]

    # `keys` is empty for a page with no recognisable entries; treat such a
    # page as a plain continuation rather than indexing into nothing.
    if keys and (keys[0].startswith('A')) and i != 0:
        senator_committees[str(current_year)] = {k: v for d in year_values for k, v in d.items()}
        year_values = [page_]
        current_year += 2
    else:
        year_values.append(page_)

# Flush whatever is still pending after the last page, so the final session is
# stored even when the last page is the one that started it.
if leftover:
    year_values.append(leftover)
if year_values:
    senator_committees[str(current_year)] = dict((k, v) for d in year_values for k, v in d.items())


In [542]:
# Flatten {year: {politician: {'committees': {committee: position}}}} into one
# record per (term, politician, committee).
senators = [
    {'term': term, 'politician': politician, 'committee': committee, 'position': position}
    for term, politicians in senator_committees.items()
    for politician, details in politicians.items()
    for committee, position in details['committees'].items()
]
senators_committees = pd.DataFrame(senators)
# Year keys such as "2002" become term labels such as "2001-2002".
senators_committees['term'] = senators_committees['term'].apply(lambda x: f"{int(x)-1}-{x}")

In [545]:
# Manual correction: one parsed committee value has a politician's name fused
# onto it. Restore the committee name, then add that politician's own
# 2025-2026 rows by hand.
fused = senators_committees['committee'] == 'Public Safety. Smallwood-Cuevas'
senators_committees.loc[fused, 'committee'] = 'Public Safety'

manual_assignments = zip(
    ['Labor, Public Employment and Retirement', 'Budget and Fiscal Review', 'Business, Professions, and Economic Development', 'Governmental Organization'],
    ['Chair', 'Member', 'Member', 'Member'],
)
manual_rows = [
    {'term': '2025-2026', 'politician': 'Smallwood-Cuevas', 'committee': c, 'position': p}
    for c, p in manual_assignments
]
senators_committees = pd.concat([senators_committees, pd.DataFrame(manual_rows)], ignore_index=True)


In [547]:
# Persist the long-format Senate table (term, politician, committee, position)
# without the row index.
senators_committees.to_csv('senators_committees.csv', index=False)

## Assembly Committees

In [306]:
# Open the Assembly committee-membership PDF; its pages are parsed one by one
# in the cells below.
assembly_reader = PdfReader('assembly_committees.pdf')

In [None]:
def assembly_committees(page, year, page_num):
    """Parse one page of Assembly committee memberships.

    Parameters
    ----------
    page : str
        Text extracted from one PDF page.
    year : str
        Session year carried over from the previous page (e.g. ``'2002'``).
    page_num : int
        Zero-based page index; selects which header format is searched for the
        session year (a "YYYY-YY REGULAR" header below index 74, a dated header
        from 74 on).

    Returns
    -------
    tuple
        ``(y, assembly_coms)``. ``y`` is ``year``, or the year read from the
        header when the page's first entry starts with 'A' or 'B' and a header
        is found. ``assembly_coms`` maps
        ``name -> {'committees': {committee: position}}`` and is empty for a
        page with no recognisable entries.
    """
    t = unidecode(page)
    text = str(t)
    assembly_coms = {}
    y = year
    # An entry is a capitalised line ending in "--" that follows "SESSION", a
    # period, or "Continued".
    matches = [m for m in re.finditer(r'(?:(?:SESSION)|\.|(?:Continued))\s*\n([A-Z].*)\s*--', text) if 'COMMITTEE MEMBERSHIP' not in m.group()]
    if not matches:
        # Nothing to parse on this page; keep the year that was passed in.
        return y, assembly_coms

    first_name = matches[0].group(1).strip()
    if first_name.startswith('A') or first_name.startswith('B'):
        if page_num < 74:
            term = re.search(r'(\d{4}-\s*\d{2,})\s*R\s*E\s*G\s*U\s*L\s*A\s*R', text)
            if term is not None:
                # Use the last two digits of the end year so that both "2001-02"
                # and "2001-2002" give "2002".
                term = f"20{term.group(1).split('-')[1].strip()[-2:]}"
                if term != year:
                    y = term
        else:
            term = re.search(r'[A-Z]+\s*\d+,*\s*(\d+)\s*\n', text)
            if term is not None:
                if len(term.group(1)) > 4:
                    term_ = term.group(1)[:4]
                else:
                    term_ = term.group(1)
                if term_ != year:
                    # A page dated 2025 is filed under 2026.
                    # NOTE(review): other years are used as-is - confirm.
                    if term_ == '2025':
                        y = '2026'
                    else:
                        y = term_

    for i, m in enumerate(matches):
        end = matches[i+1].start() if i < len(matches)-1 else len(text)
        start = m.end()
        c_string = text[start:end].strip()
        name = m.group(1).strip()
        assembly_coms[name] = {'committees': {}}
        committees = c_string.split(';')
        for committee in committees:
            committee_ = re.sub(r'\n', ' ', committee)
            position_match = re.search(r'\(((?:Co-Chair)|(?:(?:Co)*\s*Chair\s*)|(?:\s*V\s*ice\s*Chair\s*)|(?:Republican\s*Alternate\s*)|(?:Democratic\s*Alternate\s*))\)', committee_)
            if position_match:
                position = position_match.group(1).strip()
                replace = rf'\({re.escape(position)}\)'
                committee__ = re.sub(replace, '', committee_).strip()
            else:
                position = 'Member'
                committee__ = committee_.strip()
            # Drop a trailing ". --" style terminator left at the end of a list.
            if re.search(r'\.\s*-+$', committee__) is not None:
                committee__ = re.sub(r'\.\s*-+$', '', committee__).strip()
            if committee__ != '':
                assembly_coms[name]['committees'].update({committee__: position})
    return y, assembly_coms

In [405]:
# Parse every page of the Assembly committee PDF and pool the entries per
# session year; the year returned for one page is passed on to the next.
assembly_coms = {}
term_values = []
year = '2002'
for i, page in enumerate(assembly_reader.pages):
    p = page.extract_text()
    year, v = assembly_committees(p, year, i)
    # First page of a year creates its bucket; later pages add to it.
    assembly_coms.setdefault(year, {}).update(v)

In [549]:
# Flatten {year: {politician: {'committees': {committee: position}}}} into one
# record per (term, politician, committee).
assembly = [
    {'term': term, 'politician': politician, 'committee': committee, 'position': position}
    for term, politicians in assembly_coms.items()
    for politician, details in politicians.items()
    for committee, position in details['committees'].items()
]
# NOTE: this name is also the name of the page-parsing function defined in an
# earlier cell; once this line runs, that function is no longer reachable
# until its cell is re-run.
assembly_committees = pd.DataFrame(assembly)
# Year keys such as "2002" become term labels such as "2001-2002".
assembly_committees['term'] = assembly_committees['term'].apply(lambda x: f"{int(x)-1}-{x}")

In [550]:
# Persist the long-format Assembly table (term, politician, committee,
# position) without the row index.
assembly_committees.to_csv('assembly_committees.csv', index=False)