# Web data fetching/scraping demo

An example (not particularly simple or well coded) of how to pull data from a website: in this case, the statewide 2020 Montana candidates who filed with the Montana Commissioner of Political Practices.

Adapted from the Montana Free Press 2020 campaign finance app, https://montanafreepress.org/apps/montana-2020/

Original code at https://github.com/eidietrich/mt-2020-election/blob/master/scrapers/state-finance-reports/

In [15]:
import pandas as pd
# Library for making HTTP network requests, see https://requests.readthedocs.io/en/master/
import requests 

In [18]:
def get_2020_statewide_candidates():
    """
    Return the 2020 statewide candidates that are still active.

    Builds the search form (election year '2020', candidate type 'SW', every
    other field blank), hands it to get_candidate_list_cleaned, and keeps only
    the records whose 'candidateStatusDescr' is 'Active', 'Reopened' or
    'Amended'.
    """
    election_year = '2020'
    statewide_code = 'SW'  # statewide
    # Field order mirrors the search form; blank values mean "no filter".
    form = dict(
        lastName='',
        firstName='',
        middleInitial='',
        electionYear=election_year,
        candidateTypeCode=statewide_code,
        officeCode='',
        countyCode='',
        partyCode='',
    )

    # Statuses that count as an active candidacy
    keep_statuses = ['Active', 'Reopened', 'Amended']
    return [
        record
        for record in get_candidate_list_cleaned(form)
        if record['candidateStatusDescr'] in keep_statuses
    ]


def get_candidate_list(search_data, timeout=30):
    """
    Run a candidate search on the Montana campaign tracker site and return the result rows.

    The lookup takes two requests on the same session: the search form is
    POSTed first, then the results listing is fetched with a GET.
    NOTE(review): presumably the server ties the listing to the session that
    submitted the search - confirm against the site's behavior.

    Args:
        search_data: dict of search form fields (e.g. 'electionYear',
            'candidateTypeCode'); sent as the POST form body.
        timeout: seconds to wait on each request before requests gives up.

    Returns:
        The value stored under 'aaData' in the JSON response.

    Raises:
        requests.HTTPError: if either request comes back with an error status.
    """
    candidate_search_url = 'https://camptrackext.mt.gov/CampaignTracker/public/searchResults/searchCandidates'
    # Kept as a single-line literal so no newline/indentation leaks into the query string.
    # The query string asks for up to 400 rows (iDisplayLength=400) starting at row 0.
    candidate_list_url = 'https://camptrackext.mt.gov/CampaignTracker/public/searchResults/listCandidateResults?sEcho=1&iColumns=9&sColumns=&iDisplayStart=0&iDisplayLength=400&mDataProp_0=checked&mDataProp_1=candidateName&mDataProp_2=electionYear&mDataProp_3=candidateStatusDescr&mDataProp_4=c3FiledInd&mDataProp_5=candidateAddress&mDataProp_6=candidateTypeDescr&mDataProp_7=officeTitle&mDataProp_8=resCountyDescr&sSearch=&bRegex=false&sSearch_0=&bRegex_0=false&bSearchable_0=true&sSearch_1=&bRegex_1=false&bSearchable_1=true&sSearch_2=&bRegex_2=false&bSearchable_2=true&sSearch_3=&bRegex_3=false&bSearchable_3=true&sSearch_4=&bRegex_4=false&bSearchable_4=true&sSearch_5=&bRegex_5=false&bSearchable_5=true&sSearch_6=&bRegex_6=false&bSearchable_6=true&sSearch_7=&bRegex_7=false&bSearchable_7=true&sSearch_8=&bRegex_8=false&bSearchable_8=true&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=false&bSortable_1=true&bSortable_2=true&bSortable_3=true&bSortable_4=false&bSortable_5=false&bSortable_6=true&bSortable_7=true&bSortable_8=true&_=1549556234613'

    # The context manager closes the session's connections even if a request fails.
    with requests.Session() as session:
        search_response = session.post(candidate_search_url, data=search_data, timeout=timeout)
        search_response.raise_for_status()

        list_response = session.get(candidate_list_url, timeout=timeout)
        # Fail with a clear HTTP error rather than a confusing JSON decode error.
        list_response.raise_for_status()
        return list_response.json()['aaData']

def get_candidate_list_cleaned(search_data):
    """
    Fetch candidates via get_candidate_list and trim each record to a flat dict.

    Each returned dict carries the candidate id, name, last name (taken from
    the nested 'personDTO' entry), party, election year, county of residence,
    office title and status description.
    """
    return [
        {
            'candidateId': raw['candidateId'],
            'candidateName': raw['candidateName'],
            'candidateLastName': raw['personDTO']['lastName'],
            'partyDescr': raw['partyDescr'],
            'electionYear': raw['electionYear'],
            'resCountyDescr': raw['resCountyDescr'],
            'officeTitle': raw['officeTitle'],
            'candidateStatusDescr': raw['candidateStatusDescr'],
            # The raw record reportedly holds more (home address, phone, etc.)
        }
        for raw in get_candidate_list(search_data)
    ]



In [23]:
# Fetch the active 2020 statewide candidates as a list of dicts (one dict per candidate)
candidates = get_2020_statewide_candidates()
# Convert the list of dicts to a DataFrame
df = pd.DataFrame(candidates)

In [24]:
# Show the resulting DataFrame (sample output follows)
df

Unnamed: 0,candidateId,candidateName,candidateLastName,partyDescr,electionYear,resCountyDescr,officeTitle,candidateStatusDescr
0,16074,"Arntzen, Elsie",Arntzen,Republican,2020,Yellowstone,Superintendent of Public Instruction,Active
1,16068,"Bennett, Bryce",Bennett,Democrat,2020,Missoula,Secretary of State,Amended
2,15937,"Bennion, Jon",Bennion,Republican,2020,Jefferson,Attorney General,Amended
3,16010,"Black, Michael G",Black,Non-Partisan,2020,Lewis & Clark,Supreme Court Justice No. 05,Active
4,16070,"Cooney, Mike R",Cooney,Democrat,2020,Lewis & Clark,Governor,Amended
5,16179,"Crum, Jolene",Crum,Republican,2020,Gallatin,Exploratory,Active
6,16062,"Downing, Troy B",Downing,Republican,2020,Gallatin,State Auditor,Reopened
7,15939,"Dudik, Kimberly",Dudik,Democrat,2020,Missoula,Attorney General,Active
8,15938,"Fox, Tim",Fox,Republican,2020,Jefferson,Governor,Active
9,16009,"Gianforte, Greg",Gianforte,Republican,2020,Gallatin,Governor,Active


In [None]:
# Write the DataFrame to a CSV file, leaving out the row index.
# to_csv is a DataFrame method, not a top-level pandas function.
df.to_csv('data/web-scraping-result-mt-2020-candidates.csv', index=False)