# Model Functions

This notebook contains the functions that are needed by the production model of the web application.

## Text Scraping

This function is used for scraping all the text that is contained within a given web page.

In [5]:
# Import libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
from pandas import DataFrame
from datetime import datetime


def extract_text(urls, export_as_file=True):
    '''

    Function to extract the visible text from a list of web pages.

    Each page is downloaded, boilerplate elements (head, scripts, styles,
    navigation, tables, forms, ...) are removed, and the remaining text is
    stripped of blank lines.

    @urls: The list of website url (a single url string is also accepted)
    @export_as_file: Boolean; when True the texts are written to a
        timestamped CSV file, when False they are returned instead

    return: DataFrame with one 'texts' column (one row per url) when
        export_as_file is False, otherwise None

    '''


    def remove_blank_lines(paragraph):
        '''

        Function to remove extra blank lines in a paragraphs.

        @paragraph: A string that may contain empty or whitespace-only lines

        return: The same text without blank lines; every kept line ends
            with a newline character

        '''

        return ''.join(
            line + '\n'
            for line in paragraph.split('\n')
            if line.strip() != ''
        )


    def clean_table_data(soup):
        '''

        Function to clean the table.

        @soup: HTML page object from BeautifulSoup

        return: Clean string containing table data, each piece followed
            by a single space

        '''

        table_elements = [
            'table',
            'thead',
            'tbody',
            'tfoot',
            'tr',
            'th',
            'td',
        ]

        table_data = soup.find_all(table_elements, string=True)

        return ''.join(data.get_text() + ' ' for data in table_data)


    def delete_elements(soup, elements):
        '''

        Function to delete some elements.

        @soup: HTML page object from BeautifulSoup
        @elements: List of tags to delete

        return: BeautifulSoup object without deleted elements (the soup
            is modified in place)

        '''

        for element in soup(elements):
            element.decompose()

        return soup

    # List of elements to remove
    elements = [
        'head',
        'script',
        'style',
        'header',
        'nav',
        'table',
        'form',
        'input',
        'button',
        'footer',
    ]

    # A bare string would otherwise be iterated character by character
    if isinstance(urls, str):
        urls = [urls]

    # Collect the texts first and build the dataframe once at the end;
    # DataFrame.append no longer exists in pandas 2.x
    texts = []

    # Loop over the list of urls
    for url in urls:
        # The context manager closes the HTTP response once it is read
        with urlopen(url) as response:
            page = response.read()

        soup = BeautifulSoup(page, 'html.parser')

        # NOTE(review): table text is currently not part of the result
        # (tables are dropped below); clean_table_data(soup) can be used
        # here if it should be kept - TODO confirm
        soup = delete_elements(soup, elements)

        # Fetch the text from the soup and clean it
        text = soup.get_text().strip()
        texts.append(remove_blank_lines(text))

    df = DataFrame({'texts': texts})

    # Export dataframe
    if export_as_file:
        filename = 'dataset_{:%Y%m%d_%H%M%S}.csv'.format(datetime.utcnow())
        path = r'/datasets/' + filename
        df.to_csv(path, index=False, header=False)
        return None

    return df


In [6]:
# Smoke test: scrape one page and display the dataframe that comes back
url = [
    'https://simple.wikipedia.org/wiki/Zeus',
]
scraped = extract_text(url, export_as_file=False)
print(scraped)

texts
0  Zeus\nFrom Wikipedia, the free encyclopedia\nJ...
