In [5]:
"""
- clone repo - 
cd 30122-project-truth-inquery
poetry install
poetry run python truth_inquery/crawler
"""
import lxml.html
import pandas as pd
import re
import scrapelib
from lxml import etree
import time
from collections import defaultdict


from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
from urllib.request import build_opener, HTTPCookieProcessor
#from truth_inquery.analysis_model.states import STATES


#### Constants #####

LIMIT = 25
PATTERN = r'[\[0-9()="?!}{<>.,~`@#$%&*^_+:;|\]\\\/]'
s = scrapelib.Scraper(retry_attempts=0, retry_wait_seconds=0)
opener = build_opener(HTTPCookieProcessor())
CPCIN = "truth_inquery/data/CPC_"
CPCOUT = "truth_inquery/output/CPC_state_clinics.csv"

STATES = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}

name_to_abbrev = {v: k for k, v in STATES.items()}



In [22]:
def csv_extract(input_file):
    """
    Loads CPC csv input file and extracts data from columns

    Inputs:
        input_file (str): File path to .csv file

    Returns: dataframe with standardized zip and url columns
    """
    f = pd.read_csv(input_file)
    df = f[f['Website'].notna()]
    df = df[['Name ', 'Zip Code', 'State', 'Website']]

    # Rename columns, front-fill zipcode with zeros.
    df = df.rename(columns={'Name ':'name', 'Zip Code':'zip', 'State':'state', 'Website':'website_s'})
    df['zip'] = df['zip'].astype(str)
    df['zip'] = df['zip'].str.zfill(5)
    df['website_s'] = df['website_s']
    df.loc[:, ['url']] = df['website_s'].str.strip().apply(lambda x: x.split(' ')[0])
    
    
    return df

In [7]:
california_cpc_df = csv_extract("CPC_California (CA).csv")


In [8]:
def clean_df(df):
    """
    Takes in pandas dataframe, replace NaN with 0 and 
    generate new column sum 'count' across all columns

    Input: pandas dataframe
    Returns token,count pandas dataframe
    """
    output = df.fillna(0)
    output['count'] = output.sum(axis=1)
    output = output[['count']]
    return output


In [9]:
from urllib.parse import urlparse

def remove_after_domain(url):
    pattern = r'(https?://[^\s/]+)'
    match = re.match(pattern, url)
    if match:
        return match.group(0)
    else:
        return url

In [10]:
def get_root(url):
    """
    Extracts html root from url if possible

    Inputs:
        url (str): URL

    Returns: set of URLs otherwise None
    """
    try:
        response = s.get(remove_after_domain(url), timeout=5)
        root = lxml.html.fromstring(response.text)
        return root
    except:
        return None

In [11]:
# TEST 1
remove_after_domain("https://obria.org/locations/anaheim-ca/")

'https://obria.org'

In [12]:
# TEST 2
lst_urls = get_root("https://wrcathens.org")
#print(lst_urls.text_content())
print(type(lst_urls))

<class 'lxml.html.HtmlElement'>


In [13]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

html = urllib.request.urlopen("https://wrcathens.org/").read()
print(text_from_html(html))




                      Home    About    Services    Your Choices    Get Involved    Donate    Contact                          Home    About    Services    Your Choices    Get Involved    Donate    Contact                            SCHEDULE AN APPOINTMENT            SCHEDULE AN APPOINTMENT            SCHEDULE AN APPOINTMENT            SCHEDULE AN APPOINTMENT                 Schedule an Appointment     WE’RE HERE TO HELP. The Women’s Resource Center in Athens has supported local families for over 30 years through pregnancy and parenting related services. We offer free, compassionate, and confidential help for both women and men as they make family planning decisions.     Call any time: 256-233-5775 727-D West Market Street Athens, AL 35611 (On Fitness Way) Hours of operation We will be closed on June 28th & 29th & July 4th-6th Phone lines will remain open for emergency calls Tuesday 10am-4:00pm | Wednesday 10am-2pm | Thursday 10am-4:00pm            Pregnant?         Considering Abortion

  texts = soup.findAll(text=True)


In [14]:
def crawlss(url, limit):

    """
    This function starts at the base URL of a CPC and
    crawls through each embedded url, scraping the html structure of the main 
    website and the texts of the main and the rest of the urls.

    Parameters:
        * url: url of CPC's main website
        * limit: integer with the maximum number of embedded urls visited
    """
    root = get_root(url)
    
    if root is None:
        return None
    
    # Tokenize the base URL (input) and identify unique sub-urls
    dct = {}
    dct['url'] = str(url)
    dct['html_structure'] = s.get(remove_after_domain(url), timeout=5).text
    try:
        dct['text_base'] = text_from_html(opener.open(url).read())
    except:
        pass

    urls = set(root.xpath('//a/@href'))
  
    urls_visited = 0 
    for u in urls:
        subroot = get_root(u)
        if subroot is not None:
            urls_visited += 1
            
            try:
                dct['text_url'+str(urls_visited)] = text_from_html(opener.open(u).read())
            except urllib.error.HTTPError:
                pass
            

        if urls_visited == limit:
            break

    # Return df and number of urls visited including base
    df = pd.DataFrame(dct, index=[0])
    return df, urls_visited

#crawlss("https://reallifecpc.org/", 20)



In [15]:
def network_crawl(urllst, limit=LIMIT):
    """
    Crawls each URL in list and creates a pandas dataframe with the texts and
    the html structure

    Inputs
        - urllst: (list of strings) URLs 
        - limit: (int) max num of hrefs to crawl for each URL

    Returns: pandas dataframe and dictionary with number of urls visited
    """
    # Accumulators 
    df = pd.DataFrame()
    results = defaultdict(dict)

    # Crawl each URL
    for b, b_url in enumerate(urllst):

        print("Base URL", b, "crawling")
        crawldf = crawlss(b_url, limit)
        # If root is none skip
        if crawldf is None:
            continue

        # Unpack output of crawl and add to data accumulators
        nextdf, urls_visited = crawldf
        df = pd.concat([df,nextdf])
        

        results[b+1] = {'url': b_url, 'urls_visited': urls_visited}
        print("Finished")


    return df.reset_index().drop(['index'], axis=1), results


In [16]:
# TEST 3
url_list = ["https://optionspc.org/", "https://pregnantnowwhat.org/", "https://arkadelphiapregnancy.com/", "https://hannahprc.com/"]
df_test = network_crawl(url_list)
#df_test

Base URL 0 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 1 crawling
Base URL 2 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 3 crawling


  texts = soup.findAll(text=True)


Finished


In [17]:
# TEST 4
df_test[0]
#lxml.html.fromstring(df_test[0].loc[0,"html_structure"])
#type(list(df_test[0].loc[:, "html_structure"])[0])
#df_test[0].to_csv("dataframe_test")

Unnamed: 0,url,html_structure,text_base,text_url1,text_url3,text_url4,text_url5,text_url6,text_url7,text_url8,text_url11,text_url12,text_url13,text_url14,text_url15,text_url16,text_url17,text_url2
0,https://optionspc.org/,"\r\n<!doctype html>\r\n<html lang=""en-US"">\r\n...",Call Cabot Call Beebe Call J...,Call Cabot Call Beebe...,Call Cabot Call Beebe Call J...,Call Cabot Call Beebe Call J...,Call Cabot Call Beebe Call J...,Call Cabot Call Beebe Call J...,Call Cabot Call Beebe Call J...,Call Cabot Call Beebe Call J...,Call Cabot Call Beebe Call J...,Call Cabot Call Beebe Call J...,Call Cabot Call Beebe Call J...,Notice 登录后才能继续。 登录 Facebook 登录后才能继续。 登录 忘记帐户？...,...,Call Cabot Call Beebe Call J...,Call Cabot Call Beebe Call J...,
1,https://arkadelphiapregnancy.com/,"<!DOCTYPE html>\n <html itemscope itemtype=""h...",...,...,...,Notice 登录后才能继续。 登录 Facebook 登录后才能继续。 登录 忘记帐户？...,,,,,,,,,,,,Notice 登录后才能继续。 登录 Facebook 登录后才能继续。 登录 忘记帐户？...
2,https://hannahprc.com/,<!DOCTYPE HTML>\n<html>\n<head>\n\t<meta chars...,Need help? 870-862-1317 Donate Ser...,888-910-9995 PORTFOLIO SER...,,Donate Our Purpose What is Addiction?...,,,,,,,,,,,,...


In [23]:
# TEST 5
urls = california_cpc_df['url'].tolist()[0:30]
df_cal = network_crawl(urls)
#df_cal[0].isna().sum()

Base URL 0 crawling
Base URL 1 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 2 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 3 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 4 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 5 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 6 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 7 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 8 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 9 crawling
Base URL 10 crawling


  texts = soup.findAll(text=True)
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Finished
Base URL 11 crawling
Base URL 12 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 13 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 14 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 15 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 16 crawling
Finished
Base URL 17 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 18 crawling
Finished
Base URL 19 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 20 crawling
Base URL 21 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 22 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 23 crawling
Base URL 24 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 25 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 26 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 27 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 28 crawling


  texts = soup.findAll(text=True)


Finished
Base URL 29 crawling


  texts = soup.findAll(text=True)


Finished


In [19]:
# TEST 6
df_cal[0]

Unnamed: 0,url,html_structure,text_url1,text_url2,text_url3,text_url4,text_url5,text_url6,text_url7,text_url8,...,text_url17,text_url18,text_url19,text_url20,text_url21,text_url22,text_url23,text_url24,text_url25,text_base
0,https://www.obria.org/locations/anaheim-ca/,"\t\t\n\t\t<!DOCTYPE html>\n<html lang=""en"">\n<...",Donate  ...,Donate  ...,Donate  ...,Donate  ...,Donate  ...,Donate  ...,Donate  ...,Donate  ...,...,Donate  ...,Donate  ...,Donate  ...,Donate  ...,Donate  ...,Donate  ...,Donate  ...,Donate  ...,Donate  ...,
1,https://cdohope.org/,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",Facebook Door Of H...,Facebook Door Of H...,...,Facebook Door Of H...,Facebook Door Of H...,Facebook Door Of H...,Facebook Door Of H...,Facebook Door Of H...,...,Facebook Door Of H...,Facebook Door Of H...,,,,,,,,Facebook Door Of H...
2,http://www.treeoflifepsc.com/contact,"<!doctype html>\n<html xmlns:og=""http://opengr...",Menu ...,...,...,,,,,,...,,,,,,,,,,
3,http://wehelpyou.org/,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en-US""\...",SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,SUPPORT US CONTACT US ...,,,SUPPORT US CONTACT US ...
4,https://pregnancycenterbarstow.com/,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n \...",...,...,...,登录 Facebook 登录 忘记帐户？ 或 新建帐户 中文(简体) English (U...,...,...,...,,...,,,,,,,,,,...
5,http://birthright.org/brentwood,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",Birthright 24/7 HE...,...,...,Birthright 24/7 HE...,...,...,Birthright 24/7 HE...,Birthright 24/7 HE...,...,,,,,,,,,,
6,https://www.womensresourceclinic.org/,"<!DOCTYPE html>\n<html lang=""en-GB"" dir=""ltr"">...",You need to enable JavaScript to run this app...,(530) 897-6101 24 Hour H...,GET STARTED TODAY |...,,,,,,...,,,,,,,,,,(530) 897-6101 24 Hour H...
7,http://birthright.org/chico,"<!DOCTYPE html>\n<html lang=""en-US"">\n<head>\n...",Birthright 24/7 HE...,...,...,Birthright 24/7 HE...,...,...,Birthright 24/7 HE...,Birthright 24/7 HE...,...,,,,,,,,,,
8,http://birthlineofsandiego.org/,"<!DOCTYPE html>\n\n<html class=""no-js"" lang=""e...",,,,...,,?OUֵі�b8?g� �rJ]R�5�b�^}V6�!LhY�A��[�C�*\�%�...,,Print Editions The Southern Cross The Offi...,...,,Print Editions The Southern Cross The Offi...,,...,,...,,Love Another Mother This Mother's ...,...,
9,http://www.graceelliottcenter.org/,<!DOCTYPE html>\n<html>\n\t<head>\n\t\t<!-- <h...,Grace Elliott Center 1 Home Services A...,PayPal requires JavaScript for many of its fea...,New York Los Angeles Houston S...,Sign in Use your Google Account Email or phone...,,,,,...,,,,,,,,,,Grace Elliott Center 1 Home Services A...


In [20]:
def crawling_to_df(state_dict):

    """
    This function starts from a dictionary of state abbreviations and names
    take each state's csv with the CPC urls and generates a csv with the texts 
    and html structures that is exported. 

    Parameters:
        * state_dict: dictionary with the state's abbreviated and complete name
    """

    for stabb, name in state_dict.items():
        # Crawl CPC urls
        try:
            CPCinput = CPCIN + name + " (" + stabb + ").csv"
            CPCoutput = CPCOUT.replace("state", stabb)
            df = csv_extract(CPCinput)
        except FileNotFoundError:
            print(stabb, "file does not exist")
            continue

        # CPC URLs and zip codes
        # Crawl these
        urls = df['url'].tolist()[0:15]

        print("Crawling CPCs in", name)
        network_crawl(urls, LIMIT)[0].to_csv(CPCoutput)