In [26]:
from tika import parser
import re
import pandas as pd

In [27]:
#Gets the raw text from the PDF.
raw = parser.from_file('../../data/Safety-Net-Clinic-Referral-Directory-2019.pdf')
raw_text = raw['content']

In [28]:
#Split the PDF into pages.
pages =  raw_text.split("Milwaukee County Safety Net Clinic Referral Directory")

In [29]:
#Clean the pages based on regex.
# 1. January 2019 and page number from beginning and end. Use newline as the marker.
# 2. Remove •
# 3. Information is accurate - text
# 4. Resource Directory at impactinc.org - text
# 5. Update multiple \n as just one \n.
for page_count in range(0, len(pages)):
    #Find new-line charachter and get the substring.
    page = pages[page_count]
    position = page.find('\n')
    page = page[position+1:]
    
    #Find the header information line marker to remove header lines.
    #The header and footer both contains the same type of info line.
    position = page.find('dial 2-1-1.')
    if position != -1:
        page = page[position+11:]
        #Remove the footer information as well.
        position = page.find('Information is accurate as of November 2018')
        page = page[:position]
    
    #Remove the January-2019 footer.
    position = page.find('January 2019   |')
    page = page[:position]
    
    #Remove •
    page = page.replace('•','')
    
    #Remove multiple lines
    re.sub(r'\n\s*\n', '\n\n', page)
    page = page.replace('\n\n', '\n')
    
    pages[page_count] = page

In [30]:
#Remove the index and reference pages.
pages = pages[1:68]

In [31]:
page_df = pd.DataFrame(pd.np.empty((0, 15)))

In [32]:
page_df.columns = ['Clinic', 'Location', 'Bus Routes', 'Cross Streets', 'Hours', 'Language', 'Patient Eligibility', 'Special Populations Served', 'Payer Population Served', 'Payment Practices', 'Required Documents', 'Service Description', 'Clinical Services', 'Routine Services', 'Clinic Type']
page_df


Unnamed: 0,Clinic,Location,Bus Routes,Cross Streets,Hours,Language,Patient Eligibility,Special Populations Served,Payer Population Served,Payment Practices,Required Documents,Service Description,Clinical Services,Routine Services,Clinic Type


In [34]:
#Separate each information chunk.
for page_count in range(0,len(pages)):
    print(page_count)
    page = pages[page_count]
    page_row = []
    
    anchor_strings = ['Location', 'Bus Routes:', 'Cross Streets:', 'Hours:', 
                      'Languages:', 'Patient Eligibility:', 'Special Populations Served:', 'Payer Populations Served:',
                     'Payment Practices:', 'Required Documents:', 'General Description of Services', 
                      'Clinical Services', 'Routine Services']
    clinic_types = ['STD/HIV','Behavioral Health', 'Medical', 'Dental', 'Substance Use', 'Physical Therapy',
                     'Family Planning']
    
    anchor_count = 0
    for anchor in anchor_strings:
        anchor_location = page.find(anchor)
        if anchor_location == -1:
            page_row.append('')
        else:
            value_string = page[:anchor_location]
            value_string = value_string.replace('\n',';')
            page_row.append(value_string)
            
            #Update the page.
            page = page[anchor_location:]
            page = page.replace(anchor,'')
            anchor_count = anchor_count + 1
    
    for clinic_type in clinic_types:
        anchor_location = page.find(clinic_type)
        
        if anchor_location != -1:
            #Update the services type
            value_string = page[:anchor_location]
            value_string = value_string.replace('\n',';')
            page_row.append(value_string)
            #Add the clinic type    
            value_string = page[anchor_location:]
            value_string = value_string.replace('\n',';')
            page_row.append(value_string)
            
    if len(page_row) > 15:
        page_row = page_row[0:15]
        
    if len(page_row) < 15:
        for count in range(0,15-len(page_row)):
            page_row.append('')
        
#     if page_count == 20:  
#         for count in range(0, len(page_row)):
#            print(str(count) + page_row[count])
    failed_items=[]
    try:
        page_df.loc[page_count] = page_row
    except:
        #page_df.loc[page_count] = [str(page_count) for _ in range(15)]
        print('Error',page_count)
        print(page_row)
        print(len(page_row))
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66


In [35]:
page_df.to_csv('clinic_data.csv')

In [36]:
page_df.to_json('clinic_data.json')

In [37]:
page_df

Unnamed: 0,Clinic,Location,Bus Routes,Cross Streets,Hours,Language,Patient Eligibility,Special Populations Served,Payer Population Served,Payment Practices,Required Documents,Service Description,Clinical Services,Routine Services,Clinic Type
0,;AIDS Resource Center of Wisconsin;ARCW Medica...,";820 N Plankinton Avenue, Milwaukee, WI 53203;...","14, 15, 19, 30, 57, and GoldLine;",N Plankinton Avenue and W Wells Street;,Monday – Friday 8:00 am - 4:00 pm; Call to sc...,"English, Spanish, Language Line;",; All ages; Clinic will assist with insuranc...,All individuals diagnosed with HIV;,Commercial Insurance; Medicaid / ;BadgerCare;...,Clinic bills 3rd party payer;,Proof of HIV;,;The AIDS Resource Center of Wisconsin is home...,; Behavioral Health Services ;(for patients w...,;(for patients with HIV); Flu Shots; HIV; I...,STD/HIV;;
1,;Angel of Hope Clinic;,";209 W Orchard Street, Milwaukee, WI 53204;(41...","15, 56, and GreenLine ;",S 2nd Street and W Orchard Street ;,Monday – Wednesday 8:30 am – 4:30 pm; Thursda...,"English, Spanish, Language Line;",; Ages 6 months and older; Clinic will assis...,Homeless;,Charity Program; Commercial Insur-;ance; Medi...,Sliding fee scale;,Photo ID; proof of income; insurance card;,;The Angel of Hope Clinic provides a full rang...,; Health Insurance ;Navigation Services; Lab...,; Diabetes; Dietary/Nutrition Services; Flu Sh...,Medical ;Free or Charitable Clinic;;
2,;Ascension Columbia St. Mary’s ;Ebenezer Heal...,";3132 N Dr Martin Luther King Drive, Milwaukee...","19, 60;",N Dr Martin Luther King Drive and W Auer Avenue;,Wednesday and Thursday 9:00 am – 4:30 pm; Wal...,"English, Language Line;",; All ages; Clinic does not provide primary ca...,Pregnant Women; People with ;chronic disease ...,Commercial Insurance; Medicaid / ;BadgerCare;...,Completely free to all users;,Call to inquire;,,;Ascension Columbia St. Mary’s Ebenezer Health...,; Health Information ;and Referral;,Medical ;Free or Charitable Clinic;;
3,;Ascension Columbia St. Mary’s ;Family Health ...,";1121 E. North Avenue, Milwaukee, WI 53212 ;(4...","14, 21;",N Humboldt Boulevard and E North Avenue;,Monday 9:00am – 5:00pm; Tuesday and Wednesday...,"English, Spanish, Language Line;",; All ages; Clinic will assist with insurance ...,Pregnant Women;,Commercial Insurance; Medicaid / ;BadgerCare;...,Clinic bills third party payer;,Photo ID; insurance card;,";Provides primary care services to all ages, f...",; Behavioral Health Services; Therapy; Dental...,; Dementia Screening; Diabetes Education and ...,Medical ;Free or Charitable Clinic;;
4,;Ascension Seton Dental Clinic;,";1730 S 13th Street, Milwaukee, WI 53204 ;(414...","17, 19, and 54 ;",S 13th Street and W Forest Home Avenue south ...,Monday – Thursday 7:30 am – 4:00 pm; ;Friday...,"English, Spanish;",; All ages; Clinic will assist with insurance ...,Homeless; Pregnant Women;,Medicaid / BadgerCare; Uninsured;,Clinic bills 3rd party payer;,"Proof of income, proof of address, photo ID;",;The clinic provides emergency dental services...,,;,Dental Health Services; Health Insurance ;Na...
5,;Ascension St. Ben’s Clinic;,";1027 N 9th Street, Milwaukee, WI 53233 ;(414)...","12, 31, 33, 80 ;",N 9th and W State Streets;,Monday-Thursday 9:00 am – 4:00 pm; ;Friday 9...,"English, clinic has access to translators thr...",; Adults 18 – 65; Clinic will assist with insu...,Homeless;,Medicaid / BadgerCare; Medicare; ;Uninsured;,Sliding fee scale; clinic bills 3rd party payer;,Call to inquire;,;Provides primary health care services to men ...,; Dental Service Referrals; Medication Servic...,; Diabetes; Flu Shots; Immunizations ;other ...,Medical ;Free or Charitable Clinic;;
6,;Aurora Family Service — Family Counseling Cli...,";3200 W Highland Boulevard, Milwaukee WI 53208...","27, 31, 35, and PurpleLine;",N 32nd Street and W Highland Boulevard;,Monday – Thursday 9:00 am – 8:00 pm; Friday 9...,"English, Spanish, Language Line;",,; All ages; Clinic will assist with insurance...,Commercial Insurance; Medicaid / ;BadgerCare;...,Sliding fee scale;,Insurance card (if applicable);,";Aurora Family Service is a safe, caring, and ...",,;,Behavioral Health Services; Therapy (Outpat...
7,;Clinical Services; Behavioral Health Services...,";130 W Bruce Street, Suite 400, Milwaukee WI 5...","15, 19, 23, BlueLine, and GreenLine ;",S 2nd and W Bruce Streets;,Monday – Thursday 9:00 am - 6:00 pm; ;Friday...,"English, Spanish, Language Line;",,; Adults 18 and older; Clinic serves survivors...,Uninsured;,Completely free to all users;,None;,,,;The Aurora Healing Center on Bruce Street pro...,Behavioral Health;;
8,;Aurora Walker’s Point Community Clinic;,";130 W Bruce Street, Suite 200, Milwaukee, WI ...","15, 19, 23, BlueLine, and GreenLine;",Two blocks north of S 2nd Street and W Nation...,Monday – Thursday 7:30 am – 5:00 pm; Friday 8...,"English, Spanish, Language Line;",; All ages - Focus is on ages 18 years old a...,Homeless; Undocumented People; ;Spanish Speak...,Uninsured;,Sliding fee scale;,None;,;Urgent and Primary Care for uninsured (best f...,; Behavioral Health Services; Therapy; Health...,; Diabetes; Dietary/Nutrition Services; Flu Sh...,Medical ;Free or Charitable Clinic;;
9,;Brady East Sexually Transmitted Disease Clinic;,";1240 E Brady Street, Milwaukee, WI 53202;(414...","14, 30, 30X, GoldLine, and GreenLine;",E Brady Street and N Arlington Place;,Monday – Tuesday 6:00 pm - 8:00 pm; Call to s...,English;,,; Ages 14 and over;,Uninsured;,Free to all users;,Call to inquire;,;BESTD Clinic provides: anonymous and confiden...,; Medication Services ; Primary Care;,; HIV; Immunizations ;other than Flu; STD;,"STD/HIV;As of December 19, 2018, this clinic h..."
