# In [8]:
import requests
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

"""
Create a module containing a class: TextComparer with the following methods:
1. init(self, url_list)
2. download(url,filename) raises NotFoundException when url returns 404
3. multi_download() uses threads to download multiple urls as text and stores filenames as a property
4. iter() returns an iterator
5. next() returns the next filename (and stops when there are no more)
6. urllist_generator() returns a generator to loop through the urls
7. avg_vowels(text) - a rough estimate on readability returns average number of vowels in the words of the text
8. hardest_read() returns the filename of the text with the highest vowel score (use all the cpu cores on the computer for this work).

"""
"""Class used to compare books / files"""
class NotFoundException(Exception):
    """Raised by ``TextComparer.download`` when a URL answers with HTTP 404."""


class TextComparer:
    """Download texts from URLs and compare them by average vowels per word.

    The instance is its own iterator: iterating over it yields the entries of
    ``self.filenames`` in list order.
    """

    # "Y" is counted as a vowel; a frozenset gives O(1) membership tests.
    _VOWELS = frozenset("AEIOUY")

    def __init__(self, url_list=None):
        """Store the URL list and start with an empty filename list.

        Args:
            url_list: URLs to download. Defaults to a fresh empty list
                (``None`` sentinel avoids sharing one list between instances).
        """
        self.url_list = [] if url_list is None else url_list
        self.filenames = []
        # Initialised here so next() works even before iter() was called.
        self.index = 0

    def download(self, url, filename):
        """Download ``url`` and write the response body to ``<filename>.txt``.

        On success the resulting filename is appended to ``self.filenames``.

        Args:
            url: Address to fetch with ``requests.get``.
            filename: Base name of the target file; ``".txt"`` is appended.

        Raises:
            NotFoundException: If the response status code is 404.
        """
        filename = str(filename) + ".txt"

        response = requests.get(url)
        if response.status_code == 404:
            print("Error")
            raise NotFoundException(url)

        with open(filename, "wb") as file_download:
            # iter_content takes ``chunk_size``; any other keyword is a TypeError.
            for chunk in response.iter_content(chunk_size=1024):
                file_download.write(chunk)

        self.filenames.append(filename)

    def multi_download(self):
        """Download every URL in ``self.url_list`` using one thread per URL.

        Files are named after the URL's position in the list ("0.txt", ...).
        Does nothing when the URL list is empty.

        Raises:
            NotFoundException: Re-raised from ``download`` for a 404 response.
        """
        url_count = len(self.url_list)
        if url_count == 0:
            # ThreadPoolExecutor rejects max_workers <= 0 with ValueError.
            return

        with ThreadPoolExecutor(url_count) as executor:
            # Draining the result iterator re-raises exceptions from the
            # workers; without it failed downloads would go unnoticed.
            list(executor.map(self.download, self.url_list, range(url_count)))

    def __iter__(self):
        """Reset the cursor and return ``self`` as the iterator."""
        self.index = 0
        return self

    def __next__(self):
        """Return the next filename.

        Raises:
            StopIteration: When every entry of ``self.filenames`` was returned.
        """
        if self.index >= len(self.filenames):
            raise StopIteration
        current = self.filenames[self.index]
        self.index += 1
        return current

    def urllist_generator(self):
        """Yield the URLs in ``self.url_list`` one at a time."""
        yield from self.url_list

    def avg_vowels(self, filename):
        """Compute the average number of vowels per whitespace-separated word.

        Args:
            filename: Path of the text file to analyse.

        Returns:
            Tuple ``(score, filename)`` where ``score`` is rounded to five
            decimals; ``score`` is ``0.0`` when the file contains no words.
        """
        # Decode explicitly instead of relying on the platform default;
        # undecodable bytes are replaced rather than aborting the analysis.
        with open(filename, encoding="utf-8", errors="replace") as input_file:
            words = input_file.read().split()

        if not words:
            # Avoid ZeroDivisionError on empty or whitespace-only files.
            return 0.0, filename

        vowels = self._VOWELS
        number_of_vowels = sum(
            1 for word in words for letter in word if letter.upper() in vowels
        )
        return round(number_of_vowels / len(words), 5), filename

    def hardest_read(self):
        """Return the filename with the highest ``avg_vowels`` score.

        The scores are computed in a process pool sized to the CPU count.
        On a tie the file that appears first in ``self.filenames`` wins.

        Returns:
            The winning filename, or ``None`` when ``self.filenames`` is empty.
        """
        if not self.filenames:
            return None

        workers = multiprocessing.cpu_count()
        with ProcessPoolExecutor(workers) as executor:
            results = list(executor.map(self.avg_vowels, self.filenames))

        # max() keeps the first of several equal maxima.
        _, hardest = max(results, key=lambda result: result[0])
        return hardest


# In [None]:
# Project Gutenberg plain-text books used for the demo run.
urlList = [
    "https://www.gutenberg.org/files/1232/1232-0.txt",
    "https://www.gutenberg.org/files/1342/1342-0.txt",
    "https://www.gutenberg.org/files/84/84-0.txt",
    "https://www.gutenberg.org/files/11/11-0.txt",
    "https://www.gutenberg.org/files/64812/64812-0.txt",
    "https://www.gutenberg.org/files/2701/2701-0.txt",
    "https://www.gutenberg.org/files/1661/1661-0.txt",
    "https://www.gutenberg.org/files/1952/1952-0.txt",
    "https://www.gutenberg.org/files/1260/1260-0.txt",
    "https://www.gutenberg.org/files/64317/64317-0.txt",
]


# hardest_read() starts worker processes; the guard keeps the demo from being
# re-executed when a worker process imports this module.
if __name__ == "__main__":
    tc = TextComparer(urlList)

    # Download all the txt from the url list.
    tc.multi_download()

    # Avg vowels on text 0.
    print(tc.avg_vowels("0.txt"))

    # Get the hardest read = file with the highest average vowels per word.
    print(tc.hardest_read())
