# Erik Macik

# Assignment 1: Crawling and Indexing

## Define useful methods
### Recursive crawling method, robot parser method, and stop word reader

In [1]:
import string
from collections import Counter
from urllib import request
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

from bs4 import BeautifulSoup

# Crawl beginning at the given URL
def crawl(url, stop_words,
          robots_url='http://www.cs.utep.edu/makbar/A3/robots.txt',
          file_name='macik_erik.txt'):
    """Crawl recursively from ``url`` and write a word index to a file.

    Args:
        url: Seed URL where the crawl starts.
        stop_words: Collection of words to leave out of the index.
        robots_url: Location of the robots.txt file whose rules restrict
            the crawl. Defaults to the assignment's robots.txt.
        file_name: Path of the results file; it is overwritten.
    """
    print('Starting crawl at', url, '...\n')

    # Parse robots.txt
    allow, disallow = get_parsed_robots(robots_url)

    # The context manager closes (and flushes) the results file even when
    # the crawl aborts with an exception part-way through.
    with open(file_name, 'w') as results_file:
        # Begin recursive crawling with an empty visited set at depth 0
        crawl_r(url, stop_words, set(), 0, results_file, allow, disallow)

    print('\nSaved results to', file_name)

# Helper method to recursively crawl websites
def crawl_r(url, stop_words, visited, depth, results_file, allow, disallow):
    """Index the page at ``url``, then recurse into its unvisited links.

    Args:
        url: Absolute URL of the page to fetch.
        stop_words: Collection of words to leave out of the index.
        visited: Set of URLs already crawled; updated in place.
        depth: Recursion depth, used to indent progress and result lines.
        results_file: Open, writable text file receiving one line per page.
        allow: Allow rules from robots.txt (accepted but not consulted).
        disallow: Disallow rules from robots.txt; a link containing any of
            them as a substring is not followed.
    """
    indent = '\t' * depth

    # Record progress
    print(indent + 'Crawling', url, '...')

    # Read page contents
    page = request.urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(page, 'html.parser')

    # Collect the unique link targets on this page as absolute URLs.
    # urljoin returns an already-absolute href unchanged, so no separate
    # check for '://' is needed. Anchors without an href are skipped.
    links = set()
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href:
            links.add(urljoin(url, href))

    # Get the page text and strip punctuation from it
    raw = soup.get_text()
    raw = raw.translate(str.maketrans('', '', string.punctuation))

    # Count whole-word occurrences. Counting tokens (rather than substring
    # matches in the raw text) keeps e.g. "art" from being counted inside
    # "start". Counter preserves first-occurrence order.
    word_counts = Counter(
        word for word in raw.split() if word not in stop_words
    )

    # Write this page's index line: "<url> -> word::count word::count ..."
    entries = ''.join(
        word + '::' + str(count) + ' ' for word, count in word_counts.items()
    )
    results_file.write(indent + url + ' -> ' + entries + '\n')

    # Add url to visited to prevent looping
    visited.add(url)

    # Follow each link unless it was already crawled or robots.txt forbids
    # it. The visited check is done per iteration because earlier recursive
    # calls in this loop add to the shared set.
    for link in links:
        if link in visited:
            print(indent + link + ' has already been visited.')
        elif any(rule in link for rule in disallow):
            print(indent + link + ' not allowed by robots.txt.')
        else:
            crawl_r(link, stop_words, visited, depth + 1, results_file,
                    allow, disallow)


# Get the robots.txt file from the given link
def get_parsed_robots(url):
    """Fetch a robots.txt file and return its Allow and Disallow values.

    Blank lines, comments, lines without a ``field: value`` shape, and
    directives with an empty value are ignored. Field names are matched
    case-insensitively. User-agent groups are not distinguished: rules from
    every group are collected together.

    Args:
        url: Location of the robots.txt file.

    Returns:
        A tuple ``(allow, disallow)`` of lists of path strings, in file order.
    """
    page = request.urlopen(url).read().decode('utf8')
    allow = []
    disallow = []
    for line in page.splitlines():
        # Drop a trailing "# comment" and surrounding whitespace
        line = line.split('#', 1)[0].strip()
        if not line:
            continue

        field, separator, value = line.partition(':')
        if not separator:
            continue

        field = field.strip().lower()
        value = value.strip()
        # An empty value carries no path; skipping it also keeps an empty
        # string out of the lists, where a substring test would match
        # every URL.
        if not value:
            continue

        if field == 'allow':
            allow.append(value)
        elif field == 'disallow':
            disallow.append(value)

    return allow, disallow

# Read stop words from file into set
def read_stop_words(file_name):
    """Read stop words from a text file, one per line, into a set.

    Args:
        file_name: Path of the stop-word file.

    Returns:
        A set containing each line with surrounding whitespace stripped.
    """
    # The context manager closes the file once the set has been built
    with open(file_name) as stop_word_file:
        return set(line.strip() for line in stop_word_file)

## Begin crawling at the seeded url

In [2]:
# Seed page where the crawl starts
url = 'http://www.cs.utep.edu/makbar/A3/A2.html'
# Words read from this local file are excluded from the per-page word counts
stop_words = read_stop_words('stop_word.txt')
# Crawl from the seed; crawl() prints progress and writes the index to a file
crawl(url, stop_words)

Starting crawl at http://www.cs.utep.edu/makbar/A3/A2.html ...

Crawling http://www.cs.utep.edu/makbar/A3/A2.html ...
	Crawling http://www.cs.utep.edu/makbar/A3/A2/A23.html ...
	Crawling http://www.cs.utep.edu/makbar/A3/Ae.html ...
	http://www.cs.utep.edu/makbar/A3/A4/t3.html not allowed by robots.txt.
		Crawling http://www.cs.utep.edu/makbar/A3/t2.html ...
		Crawling http://www.cs.utep.edu/makbar/A3/t1.html ...
	Crawling http://www.cs.utep.edu/makbar/A3/A2/A22.html ...
		Crawling http://www.cs.utep.edu/makbar/A3/A2/A31.html ...
			Crawling http://www.cs.utep.edu/makbar/A3/Ad.html ...
			http://www.cs.utep.edu/makbar/A3/A2/A22.html has already been visited.
				Crawling http://www.cs.utep.edu/makbar/A3/E5.html ...
				Crawling http://www.cs.utep.edu/makbar/A3/E1.html ...
		http://www.cs.utep.edu/makbar/A3/E5.html has already been visited.
	Crawling http://www.cs.utep.edu/makbar/A3/A2/A21.html ...
http://www.cs.utep.edu/makbar/A3/Ad.html has already been visited.
	Crawling http://www.cs