In [585]:
## Import Packages and Libraries ##

# Web parsing, scraping, etc.
import bs4 as bs # BeautifulSoup4 
import urllib3
import re
import requests # HTTP parser
import html5lib

# DataFrames and math
import pandas as pd
import numpy as np

# Output related packages 
import pprint as pp

# read-in and write-out
import csv

In [586]:
# make it run on py2 and py3 (__future__ imports belong before any other statement)
from __future__ import division, print_function

# stretch Jupyter coding blocks to fit screen
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Data Mining II
This  notebook is intended to perform the following processes:

    2.1 Read-in batched csv files and perform content extraction.

    2.2 The notebook uses beautifulsoup to extract paragraph content of each url in batch.

    2.3 Content extraction is written out as a matching csv file for future concatenation/merging

___
### **Begin Data Mining II:** Per-url, article content extraction


__2.1 Read-in batched csv files and perform content extraction.__

In [587]:
# Load one batch of raw article metadata. The preview below shows the columns
# author, description, publisher, source_url, timeStamp and title.
df = pd.read_csv('rawData5.csv')

In [588]:
df.tail(3)

Unnamed: 0,author,description,publisher,source_url,timeStamp,title
2493,Openstack.org,https://blueprints.launchpad.net/nova/+spec/pl...,Openstack.org,http://specs.openstack.org/openstack/nova-spec...,2018-04-17 00:00:00+00:00,Placement Forbidden Traits
2494,Tony Sheng,Just as bitcoin is a promise for sound money 1...,Tonysheng.com,https://www.tonysheng.com/sound-digital-goods,2018-04-17 00:00:00+00:00,Sound digital goods
2495,157640,The Coinbase Webhooks API notifications servic...,Programmableweb.com,https://www.programmableweb.com/api/coinbase-w...,2018-04-17 00:00:00+00:00,Coinbase Webhooks


__2.2 Use beautifulsoup to extract paragraph content of each url in batch.__

__List of known path errors -- for validate_site -- is above the function call for batch_control__ 

In [589]:
def validate_site(site, ignore):
    """
    Checks a single url against a list of known error-causing fragments.

    Parameters
    ----------
    site : url to check. It is converted with str() before parsing, so a
        non-string value does not raise.
    ignore : iterable of str. Each entry is a fragment of a path or of a host name.

    Returns
    -------
    '' (falsy) when any fragment occurs in the url's host name or path, i.e. the
    url is flagged for removal; otherwise `site` itself, unchanged.
    """
    from urllib.parse import urlparse

    # Note: urlparse requires a string argument
    info = urlparse(str(site))

    # The ignore list mixes path fragments ('.mp4', 'PD209.html') with host names
    # ('www.wikitimes.net'). For a url that carries a scheme the host is found in
    # .netloc rather than .path, so both parts are searched.
    searched = info.netloc + info.path

    if any(rm in searched for rm in ignore):
        return('')       # flagged for removal
    return(site)

In [590]:
def get_content(site, timeout=30):
    """
    Downloads a single url and returns its paragraph tags.

    Parameters
    ----------
    site : url to download
    timeout : seconds to wait for the server before giving up. Without a timeout
        a single unresponsive host would block the whole batch indefinitely.

    Returns
    -------
    The result of BeautifulSoup's find_all('p') on the downloaded page.

    Raises
    ------
    Any exception raised by requests.get (including a timeout) is not caught here.
    """

    src = requests.get(site, timeout=timeout).content   # raw bytes of the response body
    soup = bs.BeautifulSoup(src, 'lxml')                 # parsed with the lxml parser
    body = soup.find_all('p')                            # every paragraph tag '<p>' in the page
    return(body)

In [591]:
def extract_from_content(body):
    """
    Flattens the paragraphs collected by get_content into one string.

    Reads the .text attribute of every element of body and joins the pieces
    with a tab character (the tab makes later splitting easier).
    Returns the joined string; an empty body gives ''.
    """

    return("\t".join(paragraph.text for paragraph in body))

In [592]:
def get_article(site):
    """
    Returns the article text for one url, or a marker string when there is none.

    The url is first screened by validate_site against the module-level `ignore`
    list; a flagged url yields 'Access Limit Met'. Otherwise the page is fetched
    with get_content and flattened with extract_from_content; an empty result
    yields '403 Forbidden'.
    """

    checked = validate_site(site, ignore)
    if not checked:                                 # '' means the url was flagged
        return('Access Limit Met')

    text = extract_from_content(get_content(checked))
    return(text if text else '403 Forbidden')       # empty text means no paragraphs were found

In [593]:
def get_text(df):
    """
    Extracts the article text for a single row of the raw dataframe.

    Parameters
    ----------
    df : one row, read through df['author'] and df['source_url']

    Returns
    -------
    dict with two lists:
        'url'     -- filled only for rows authored by the aggregator
                     'Ml-implode.com'; holds the url the text was taken from
        'content' -- one entry, the value returned by get_article

    For aggregator rows the aggregator page is downloaded and the link inside
    the element with id "LIJIT_title" is followed. When that element, its <a>
    tag or its href is missing, the original source_url is used instead of
    raising, so one changed page cannot abort a whole batch.
    """

    content = []
    url = []
    author = df['author']
    link = df['source_url']

    if author == 'Ml-implode.com':
        r = requests.get(link, timeout=30)                     # timeout: never hang on the aggregator
        soup = bs.BeautifulSoup(r.text, 'html.parser')
        source = soup.find(id="LIJIT_title")                   # manually found -- may differ moving forward
        anchor = source.find('a') if source is not None else None
        target = anchor.get('href') if anchor is not None else None
        if target:
            link = target                                      # the url the aggregator points to
        url.append(link)
        content.append(get_article(link))                      # send resolved url out for extraction
    else:
        content.append(get_article(link))

    return({'url':url, 'content':content})

In [594]:
def combine_df(df1, df2):
    """
    Merges the extraction result df2 into a copy of the row df1.

    df1 is left untouched. The copy gets a 'contents' entry holding the
    concatenation of df2['content']. When the row's author is the aggregator
    'Ml-implode.com', its 'source_url' is replaced by the concatenation of
    df2['url'].
    Returns the modified copy.
    """

    import copy

    merged = copy.deepcopy(df1)
    merged['contents'] = "".join(df2['content'])     # list of strings -> single string

    # known aggregator: point the row at the url the aggregator referenced
    if df1['author'] == 'Ml-implode.com':
        merged['source_url'] = "".join(df2['url'])   # 'list' item -> 'str' item

    return(merged)

In [595]:
def replace_author_name(df):
    """
    Returns the network location (host) of a url.

    The argument is converted with str() before parsing and only the netloc
    part is kept, e.g. 'https://www.example.com/a/b' -> 'www.example.com'.
    """

    from urllib.parse import urlparse

    parts = urlparse(str(df))     # urlparse requires a string
    return(parts.netloc)          # host only -- i.e. excludes the path

In [596]:
# Replace the aggregator's name in the author / publisher columns
def rm_false_author(df,start):
    """
    Replaces the aggregator name 'Ml-implode.com' in the author feature.

    Parameters
    ----------
    df : pandas DataFrame with 'author', 'source_url' and 'publisher' columns
    start : kept so existing callers keep working; no longer used. The rows to
        fix are located by their author value. Writing to the label `start`
        missed the row whenever the frame's own index did not contain it.

    For every row whose author is the aggregator, both author and publisher
    are set to the host name of that row's source_url.

    Returns
    -------
    The same dataframe object, modified in place (unchanged when no row matches).
    """

    aggregated = df['author'] == 'Ml-implode.com'

    if aggregated.any():
        hosts = df.loc[aggregated, 'source_url'].map(replace_author_name)
        df.loc[aggregated, 'author'] = hosts        # real source instead of the aggregator
        df.loc[aggregated, 'publisher'] = hosts     # replaces instances of incorrect publisher name
    return(df)

__List of known publishers and aggregators -- for stitch_df -- is a cell above the function call for batch_control__ 

In [616]:
def stitch_df(df, start, halt, init, discard):
    """
    Runs the content extraction for one batch and returns the preprocessed rows.

    Parameters
    ----------
    df : pandas DataFrame holding the rows of the batch
    start, halt : batch boundaries; rows start+init .. halt+init-1 are processed
    init : offset added to start/halt to obtain the row labels used with df.loc
    discard : list of author values; rows with such an author are skipped

    Returns
    -------
    A new DataFrame, indexed 0..n-1, with one row per processed article: the
    original columns plus 'contents'. Empty when every row was skipped.

    Notes
    -----
    Every row is tested against the whole discard list, and labels that are not
    present in df are skipped. The earlier nested loop advanced `start` without
    re-testing the new row and could step past the end of the batch.
    """

    import copy
    df = copy.deepcopy(df)           # private copy, the caller's frame is never touched
    data = pd.DataFrame()
    start += init
    halt += init
    author = df['author']

    # displays progress
    print("Batching range:", start+1,"-", halt)

    for label in range(start, halt):

        # halt may reach past the rows that were actually supplied
        if label not in df.index:
            continue

        ### Known bogus publisher or aggregator: the row is left out ###
        if author.loc[label] in discard:
            continue

        # copy of the row to prevent mutability
        df1 = copy.deepcopy(df.loc[label])
        df2 = get_text(df1)

        # append without keeping the original row label (pd.concat replaces the
        # DataFrame.append method, which newer pandas versions no longer provide)
        new_row = combine_df(df1, df2).to_frame().T
        if len(data):
            datum = pd.concat([data, new_row], ignore_index = True)
        else:
            datum = new_row.reset_index(drop = True)

        ## minor clean up ##
        data = rm_false_author(datum, label)                           # replaces invalid author entry

    return(data)

__Batch Control and out-to-csv fail safe__

In [617]:
def batch_control(df, batch, init, discard, out_path='riskEx_df5.csv'):
    """
    Master control: extracts article content batch by batch and saves after every batch.

    Parameters
    ----------
    df : pandas DataFrame of articles to process
    batch : number of rows per batch (must be positive)
    init : offset handed to stitch_df, which adds it to the batch positions
    discard : list of author values handed to stitch_df
    out_path : csv file that is created by the first batch and extended by every
        later one; defaults to the file name used throughout the notebook

    Writing after every batch means finished batches are not lost when a later
    one fails. Does not have a Return.

    Raises
    ------
    ValueError when batch is not positive (the loop could never finish).
    """

    import os

    if batch <= 0:
        raise ValueError("batch must be a positive number of rows")

    start = 0
    terminate = len(df)
    print("Number of articles to be extracted:",terminate)
    print("Total number of batches:", int(np.ceil(terminate/batch)),"\n")

    while start < terminate:

        # Batch control: the last batch is clipped to the end of the dataframe.
        # (The earlier `start + terminate` overshot for every clipped batch
        # except one starting at 0.)
        halt = min(start + batch, terminate)

        temp = df[start:halt]
        temp_out = stitch_df(temp, start, halt, init, discard)

        ### Creates/updates csv file to ensure processed batches are not lost due to bugs or errors ###
        # An explicit existence test is used: with the earlier try/except, an I/O
        # error while reading or rewriting an existing file fell through to the
        # "create" branch and overwrote the file.
        if os.path.isfile(out_path):
            print('\t Updating existing csv file')
            temp_in = pd.read_csv(out_path)
            temp_out = pd.concat([temp_in, temp_out], ignore_index=True)
            temp_out.to_csv(out_path, index_label = False)
            print('\t Updated batch saved to csv')
        else:
            print("\t Creating initial csv file")
            temp_out.to_csv(out_path, index_label = False)
            print('\t Initial batch saved to csv')
        ### ------------------------------------------------------------------------------------- ###

        start += batch         # Batch increment

    print("\n*******************\n PROJECT COMPLETED\n*******************")

__2.3 Write out content as csv file__

___
---
### __NOTE__ 
__To ensure no data is overwritten, set the initializer to the same value as the beginning of the dataframe slice before running batch_control.__ 

Failing to do so will at best throw an error, and at worst overwrite your data. 
___
---

In [623]:
##### KEY: set 'init' to the label of the first row of the dataframe slice being passed #####
#  stitch_df adds 'init' to the batch positions and looks rows up by label, so     #
#  'project' and 'init' must start at the same value; otherwise data may be lost   #

print("\nProcessing",len(df),"articles in DataFrame.")
batch_size = 100   # rows per batch handed to batch_control
lhs = 2400         # first row of the slice to process
rhs = 7000         # not referenced by the other visible cells
end = len(df)      # not referenced by the other visible cells


# Initialize project and init (both derived from lhs so they cannot drift apart)
init = lhs 
project = df[lhs:]      # Approximate time for every 1000 articles:  ~25:00 
##### ------------------------------------------------------------------------ #####


Processing 2496 articles in DataFrame.


__Lists of common error causing paths and publishers__

In [619]:
# list of error causing url fragments
# validate_site looks for each entry as a substring of the parsed url; get_article
# reads this list by its global name `ignore` and returns 'Access Limit Met' for a match
ignore = ['mp3', 'rlslog', 'mo4ch', 'rex-silentium-bitcoin-suppressor', 'PD209.html',
          'booking.com_considers_adding_flights_rules_out_bitcoin_acceptance', 'PD207.html',
         'VL201.html', 'www.wikitimes.net', '1016879623', '.mp4', '1016330206', 'photos',
         'General-Tech', 'energy', '5a78bbe5e4b00f94fe93fd8d', '4885392.htm', '.ece', '166075/',
         'www.thetruthaboutcars.com', 'www.techishaky.com', 'www.anroed.com', 'dogecoin-wallet.html',
         '4884023.htm', 'node/4249884', 'Visconti.html', 'a20180410PD210.html', 'whatthehack.life', 
         'ac7d222e4b07a3485e49e16', 'a20180330PD206.html', 'a20180326PD205.html', 'forty-seven',
         'no-news-today', '1021452297']

In [620]:
# Known bogus publishers, and aggregators.
# List tells stitch_df to dispose of rows whose 'author' value equals one of the following
# (stitch_df compares the author column only; urls are not checked against this list)

discard = ['PA', 'https://holdernews.com', 'Blogspotpoint.com', 'Connor Madsen', 
           'Pressreleasepoint.com', 'Pewinternet.org', 'www.pewinternet.org', 
           'Jeremy Hellstrom', 'Ben Potter', 'Steph Willems', 'Thetruthaboutcars.com', 
           'Adam Tonge', 'lookout', 'noreply@blogger.com', 'Dan Tan']

__Extract Content__

In [624]:
batch_control(project, batch_size, init, discard)  # execute batch extraction and write out

Number of articles to be extracted: 96
Total number of batches: 1 

Batching range: 2401 - 2496
	 Updating existing csv file
	 Updated batch saved to csv

*******************
 PROJECT COMPLETED
*******************


__Finally:__ The following few cells carry out some minor preprocessing

In [625]:
# Manual check of the csv file that batch_control has been building up
riskEx = pd.read_csv('riskEx_df5.csv')
# info() prints its report itself; wrapping it in print() adds the 'None' seen in the output
print(riskEx.info())
# number of distinct values in the description column
print(len(riskEx['description'].unique()))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2495 entries, 0 to 2494
Data columns (total 7 columns):
author         2482 non-null object
contents       2495 non-null object
description    2482 non-null object
publisher      2495 non-null object
source_url     2495 non-null object
timeStamp      2495 non-null object
title          2495 non-null object
dtypes: object(7)
memory usage: 155.9+ KB
None
2275


__Remove rows flagged for removal during preprocessing__

In [626]:
# drop duplicates: rows are compared on 'description' only, keeping the first occurrence
# (riskEx is modified in place)
# NOTE(review): the count printed afterwards is of unique source_url values, not of
# descriptions -- confirm which column the de-duplication is meant to use
riskEx.drop_duplicates(['description'], keep='first', inplace = True)
print(len(riskEx['source_url'].unique()))

2272


In [None]:
# create list of labels previously flagged for removal
# get_article writes exactly two marker strings into 'contents':
#   '403 Forbidden'    -- the page yielded no paragraph text
#   'Access Limit Met' -- the url matched the ignore list
rm_403 = riskEx[riskEx['contents'] == '403 Forbidden'].index.values.tolist()
rm_acc_denied = riskEx[riskEx['contents'] == 'Access Limit Met'].index.values.tolist()

# list of all labels to be removed, sorted for iteration
# (the undefined name `rm_forbidden` was dropped; it made this cell raise NameError)
remove_rows = sorted(rm_403 + rm_acc_denied)
print("Total unique rows after preprocessing:", len(riskEx)-len(remove_rows))

In [628]:
# Use remove_rows to remove observation from dataframe
use_riskEx = riskEx.drop(remove_rows)

In [629]:
use_riskEx.info()
print(len(use_riskEx['source_url'].unique()))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2060 entries, 0 to 2494
Data columns (total 7 columns):
author         2048 non-null object
contents       2060 non-null object
description    2060 non-null object
publisher      2060 non-null object
source_url     2060 non-null object
timeStamp      2060 non-null object
title          2060 non-null object
dtypes: object(7)
memory usage: 128.8+ KB
2057


__Writing to disk__

In [630]:
## RUN AFTER ENTIRE DF IS COMPLETELY EXTRACTED ##
# saves dataframe as a preprocessed and cleaned (slightly) DataFrame
use_riskEx.to_csv('use_riskEx5.csv', index_label = False)

__confirming everything was performed as expected__

In [637]:
df2 = pd.read_csv('use_riskEx5.csv')
print(df2.info())
print(len(df2['description'].unique()))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2060 entries, 0 to 2494
Data columns (total 7 columns):
author         2048 non-null object
contents       2060 non-null object
description    2060 non-null object
publisher      2060 non-null object
source_url     2060 non-null object
timeStamp      2060 non-null object
title          2060 non-null object
dtypes: object(7)
memory usage: 128.8+ KB
None
2060


In [632]:
print(df2['timeStamp'].head(3))
print(df2['timeStamp'].tail(3))

0    2018-04-23 23:59:35+00:00
1    2018-04-23 23:55:00+00:00
3    2018-04-23 23:49:29+00:00
Name: timeStamp, dtype: object
2492    2018-04-17 00:00:00+00:00
2493    2018-04-17 00:00:00+00:00
2494    2018-04-17 00:00:00+00:00
Name: timeStamp, dtype: object


#### Completed Process: 
##### riskEx_df dataframes contain the extracted data with no preprocessing performed.
    They include more information, but there may be repeated values, or urls that are inaccessible
    
##### use_riskEx dataframes represent various dataframes of extracted features, their values, and the text content of each article's body.
    The files have undergone preprocessing and duplicates have been removed. Also gone, unfortunately, are the descriptions of rows with urls that were not accessible using BS4

### **End Data Mining II:** Per-url, article content extraction
___