In [31]:
from urllib.parse import urlparse

In [32]:
import networkx
import pandas as pd
from bs4 import BeautifulSoup
from os import walk
import glob

In [33]:
# Root directory of the scraped data set. The trailing "/" is required:
# url_dataframe() and generate_edge_pairs() append file names to this string
# by plain concatenation, and get_datapath_len() counts its "/"-separated parts.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
data_path = "/home/proctortc/Documents/OtherData/SPLCDataDive/"

In [34]:
def calculate_pagerank(data_path):
    """Run the whole pipeline and return the URL table with a pagerank column.

    Steps, in order: load the raw URL list found under ``data_path``, add a
    ``hosts`` column derived from column 0, create a graph with one node per
    row, collect (origin, target) edges from the scraped pages, and join the
    resulting PageRank scores onto the table.
    """
    urls = add_hosts(url_dataframe(data_path), 0)
    graph = initialize_graph(urls)
    edges = generate_edge_pairs(data_path, urls)
    return add_pagerank(urls, edges, graph)

In [55]:
def add_pagerank(urldf, edge_tups, page_graph):
    """Add ``edge_tups`` to ``page_graph`` and join PageRank scores onto ``urldf``.

    ``page_graph`` is modified in place (the edges are added to it). The
    scores are keyed by graph node, so they line up with ``urldf`` through its
    index; the new column is called ``"pagerank"``.
    """
    page_graph.add_edges_from(edge_tups)
    scores = networkx.pagerank(page_graph)
    return urldf.join(pd.Series(scores, name="pagerank"))

In [36]:
def url_dataframe(data_path):
    """Load ``RawUrls.txt`` from ``data_path`` into a DataFrame.

    The file is read with ``header=None``, so its columns are labelled with
    integers starting at 0.

    Parameters
    ----------
    data_path : str or os.PathLike
        Directory containing ``RawUrls.txt``. A trailing path separator is
        optional.

    Returns
    -------
    pandas.DataFrame
    """
    import os

    # os.path.join copes with a missing trailing separator (and with
    # pathlib.Path), where plain string concatenation would build a bad path.
    return pd.read_csv(os.path.join(data_path, "RawUrls.txt"), header=None)
    

In [37]:
def get_datapath_len(data_path):
    """Return how many "/"-separated pieces ``data_path`` splits into.

    Empty pieces count too, so a leading or trailing "/" each add one.
    """
    # N separators always yield N + 1 pieces.
    return data_path.count("/") + 1

In [38]:
def get_all_netlocs(soup):
    """Return an iterator over the hostnames of every link found in ``soup``."""
    hrefs = get_all_links(soup)
    return get_netlocs(hrefs)

In [39]:
def get_all_links(soup):
    """Return a generator over the ``href`` value of every ``<a>`` tag in ``soup``.

    Anchors that carry no ``href`` attribute are skipped.
    """
    anchors = soup.find_all('a')
    return (anchor.attrs['href'] for anchor in anchors if 'href' in anchor.attrs)
    

In [40]:
def get_hostname(x):
    """Return the hostname component of URL ``x`` as parsed by ``urlparse``.

    The result is ``None`` when the URL has no network location (for example
    a relative link).
    """
    parsed = urlparse(x)
    return parsed.hostname

In [53]:
def get_netlocs(hrefs):
    """Return a generator over the hostname of each non-``None`` entry of ``hrefs``.

    ``None`` hrefs are dropped before parsing; a hostname that parses to
    ``None`` is still yielded.
    """
    return (get_hostname(link) for link in hrefs if link is not None)

In [42]:
def get_netlocs_file_list(base, none, file_list):
    """Collect the set of link hostnames found in the ``.html`` files of one directory.

    The three positional parameters match the tuple shape yielded by
    ``os.walk``, which is how the caller supplies them (``*walk_entry``).

    Parameters
    ----------
    base : str
        Directory containing the files.
    none : object
        Unused; present only so a walk tuple can be unpacked into the call.
    file_list : iterable of str
        File names inside ``base``. Only names whose last "."-separated part
        is ``html`` are parsed.

    Returns
    -------
    set or None
        Union of the hostnames linked from all parsed files, or ``None`` when
        ``file_list`` contains no ``.html`` file. The set may contain ``None``
        for links without a hostname.
    """
    netlocs = set()
    parsed_any = False
    for file in file_list:
        if file.split(".")[-1] != "html":
            continue
        # Parse inside a ``with`` block so the handle is closed right away;
        # the original left every opened file for the garbage collector.
        with open(base + "/" + file) as handle:
            soup = BeautifulSoup(handle, 'lxml')
        netlocs.update(get_all_netlocs(soup))
        parsed_any = True
    # Keep the "no html files -> None" contract that the caller filters on.
    return netlocs if parsed_any else None
    

In [43]:
def get_origin_site(base, none, file_list, datapath_len=7):
    """Return the path component of ``base`` at position ``datapath_len``.

    ``base`` is split on "/" and indexed directly, so an index past the end
    raises ``IndexError``. ``none`` and ``file_list`` are ignored; they exist
    so that a three-element walk tuple can be unpacked into the call.
    """
    components = base.split("/")
    return components[datapath_len]

In [44]:
def generate_edge_pairs(data_path, url_df):
    """Build the list of (origin_id, target_id) edges from the scraped pages.

    Every entry matched by ``data_path + "URLScrape/*"`` is walked. For each
    directory visited, the origin site name is taken from the directory path
    and the linked hostnames are collected from its ``.html`` files. Origin
    and targets are then translated to row indices of ``url_df``.

    Parameters
    ----------
    data_path : str
        Data root; concatenated directly with ``"URLScrape/*"``, so it must
        end with a path separator.
    url_df : pandas.DataFrame
        URL table used by ``target_urls_to_tuple_list`` to map hostnames to
        row indices.

    Returns
    -------
    list of tuple
        ``(origin_id, target_id)`` pairs in which neither id is ``None``.
    """
    site_dirs = glob.glob(data_path + "URLScrape/*")
    datapath_len = get_datapath_len(data_path)

    origin_and_targets = []
    for site_dir in site_dirs:
        for walk_entry in walk(site_dir):
            try:
                origin = get_origin_site(*walk_entry, datapath_len=datapath_len)
                netlocs = get_netlocs_file_list(*walk_entry)
            except UnicodeError:
                # Deliberate best-effort: skip this directory. Presumably the
                # error comes from reading/decoding one of its HTML files.
                continue
            # None means the directory held no .html files at all.
            if netlocs is not None:
                origin_and_targets.append((origin, netlocs))

    edge_lists = [target_urls_to_tuple_list(origin, netlocs, url_df=url_df)
                  for origin, netlocs in origin_and_targets]

    # target_urls_to_tuple_list already drops unknown targets, but the origin
    # id can still be None when the origin site is not in url_df. The original
    # filter only re-checked the target, letting (None, target) edges through;
    # networkx does not accept None as a node.
    return [(origin_id, target_id)
            for origin_id, target_id in flatten_list(edge_lists)
            if origin_id is not None and target_id is not None]

In [45]:
def add_hosts(url_dataframe, url_column):
    """Add a ``hosts`` column holding the hostname of each URL in ``url_column``.

    The frame is modified in place and the same object is returned.
    """
    url_dataframe['hosts'] = url_dataframe[url_column].apply(get_hostname)
    return url_dataframe

In [51]:
def initialize_graph(url_data_frame):
    """Create a graph with one node per index label of ``url_data_frame``.

    No edges are added here.
    """
    # NOTE(review): networkx.Graph is undirected, so the (origin, target)
    # edges added later lose their direction -- confirm this is intended.
    graph = networkx.Graph()
    graph.add_nodes_from(url_data_frame.index.tolist())
    return graph

In [47]:
def url_index(netloc, urls):
    """Return the index label of the first row of ``urls`` whose ``hosts`` equals ``netloc``.

    Returns ``None`` when ``netloc`` is ``None`` or when no row matches.
    """
    if netloc is None:
        return None
    matching_labels = urls.loc[urls['hosts'] == netloc].index
    if len(matching_labels) == 0:
        return None
    return matching_labels[0]

In [48]:
def target_urls_to_tuple_list(origin_page, target_list, url_df=None):
    """Pair the origin's row index with the row index of every known target.

    Both ``origin_page`` and each entry of ``target_list`` are looked up in
    ``url_df`` via ``url_index``. Targets whose lookup yields ``None`` are
    left out; the origin id is used as returned, even if it is ``None``.
    """
    origin_id = url_index(origin_page, url_df)
    pairs = []
    for target in target_list:
        target_id = url_index(target, url_df)
        if target_id is not None:
            pairs.append((origin_id, target_id))
    return pairs

In [49]:
def flatten_list(li):
    """Concatenate the items of every sub-iterable of ``li`` into one list.

    Only one level of nesting is removed.
    """
    flat = []
    for sublist in li:
        flat.extend(sublist)
    return flat