# Model Functions

This notebook contains the functions needed by the production model of the web application.

## Text Scraping

This function scrapes all the text contained in a given website.

In [10]:
# Import libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
from pandas import DataFrame
from datetime import datetime


def extract_text(urls, export_as_file=True):
    '''

    Function to extract the text from a list of websites.

    Every page is downloaded, parsed with BeautifulSoup ('html.parser'),
    stripped of the tags listed in ELEMENTS (tables included) and reduced
    to its non-blank text lines.

    @urls: The list of website url
    @export_as_file: Boolean to export the text result as a file

    return: DataFrame with a single 'texts' column (one row per url) when
        export_as_file is False; None when the result is written to a
        timestamped CSV file under /datasets/ instead

    '''


    def remove_blank_lines(paragraph):
        '''

        Function to remove extra blank lines in a paragraph.

        @paragraph: A string that may contain empty or whitespace-only lines

        return: The same text with only non-blank lines, each one
            terminated by a newline

        '''

        return ''.join(
            line + '\n'
            for line in paragraph.split('\n')
            if line.strip() != ''
        )


    def delete_elements(soup, elements):
        '''

        Function to delete some elements.

        @soup: HTML page object from BeautifulSoup
        @elements: List of tags to delete

        return: BeautifulSoup object without deleted elements

        '''

        for element in soup(elements):
            element.decompose()

        return soup

    # List of elements to remove
    ELEMENTS = [
        'head',
        'script',
        'style',
        'header',
        'nav',
        'table',
        'form',
        'input',
        'button',
        'footer'
    ]

    # Collect the cleaned text of every page; the dataframe is built once
    # at the end (DataFrame.append no longer exists in pandas >= 2.0)
    texts = []

    # Loop over the list of urls
    for url in urls:
        # Close the HTTP response as soon as the body has been read
        with urlopen(url) as response:
            page = response.read()

        soup = BeautifulSoup(page, 'html.parser')

        # Drop the unwanted elements; table content is discarded here too,
        # since 'table' is part of ELEMENTS
        soup = delete_elements(soup, ELEMENTS)

        # Fetch the text from the soup and clean it
        text = soup.get_text().strip()
        texts.append(remove_blank_lines(text))

    df = DataFrame(texts, columns=['texts'])

    if not export_as_file:
        return df

    # Export dataframe to a timestamped (UTC) CSV file
    filename = 'dataset_{:%Y%m%d_%H%M%S}.csv'.format(datetime.utcnow())
    path = r'/datasets/' + filename
    df.to_csv(path, index=False, header=False)


In [11]:
# Smoke test: scrape a single page and print the resulting dataframe
# instead of exporting it to a file
url = [
    'https://simple.wikipedia.org/wiki/Zeus',
]
print(extract_text(urls=url, export_as_file=False))

texts
0  Zeus\nFrom Wikipedia, the free encyclopedia\nJ...


## Finetuning Model
This function fine-tunes the model on the latest dataset.

In [None]:
# Uncomment the code below to run in Google Colaboratory
# %tensorflow_version 1.x
# !pip install gpt_2_simple

In [None]:
# Import libraries
import gpt_2_simple as gpt2


def finetune_model(dataset, model_name='124M', learning_rate=0.0001):
    '''

    Function to finetune the model with gpt_2_simple. The run is named
    'trained_model' and its checkpoints are written to the 'checkpoint'
    folder.

    @dataset: Path to the training data (CSV)
    @model_name: The name of the model: 124M, 355M, etc.
    @learning_rate: The learning rate of the model

    return: None

    '''

    # What to train on and which base model to use
    data_options = {
        'dataset': dataset,
        'model_name': model_name,
        'model_dir': 'models',
        'combine': 50000,
    }

    # How the optimisation itself is configured
    training_options = {
        'steps': -1,
        'batch_size': 1,
        'learning_rate': learning_rate,
        'accumulate_gradients': 5,
        'multi_gpu': False,
        'use_memory_saving_gradients': False,
        'only_train_transformer_layers': False,
        'optimizer': 'adam',
    }

    # Where the run is restored from and saved to
    checkpoint_options = {
        'restore_from': 'latest',
        'run_name': 'trained_model',
        'checkpoint_dir': 'checkpoint',
        'save_every': 500,
        'max_checkpoints': 1,
        'overwrite': True,
    }

    # Progress reporting and sample generation during training
    reporting_options = {
        'sample_every': 250,
        'sample_length': 1023,
        'sample_num': 1,
        'print_every': 50,
    }

    session = gpt2.start_tf_sess()

    gpt2.finetune(
        session,
        **data_options,
        **training_options,
        **checkpoint_options,
        **reporting_options
    )


## Generating Text
This function generates text based on input from the users.

In [None]:
def generate_text(outline_to_length):
    '''

    Function to generate the text, one paragraph per outline entry.

    The finetuned run 'trained_model' (stored in the 'checkpoint' folder)
    is loaded into a fresh TensorFlow session before generating, and the
    session is closed again before the function returns.

    @outline_to_length: A 2D array containing the list of outline and the length desired
        [[outline, length],
        [outline, length],
        [outline, length]]

    return: List of generated text; every generated paragraph is followed
        by a separate double-newline entry

    '''

    RUN_NAME = 'trained_model'
    CHECKPOINT_DIR = 'checkpoint'

    # Initialize TensorFlow session on a clean default graph, so loading
    # the checkpoint does not collide with variables left over from an
    # earlier call in the same process.
    # NOTE(review): relies on gpt2.reset_session clearing the default graph
    # and returning a new session - confirm against the installed
    # gpt_2_simple version.
    sess = gpt2.reset_session(gpt2.start_tf_sess())

    try:
        # The weights have to be restored into this session before
        # gpt2.generate can sample from the model
        gpt2.load_gpt2(
            sess,
            run_name=RUN_NAME,
            checkpoint_dir=CHECKPOINT_DIR
        )

        # Create an empty list to store the generated paragraphs
        essay = []

        # Loop over the list
        for record in outline_to_length:
            prefix = record[0]  # The first sentence of the paragraph
            length = record[1]  # The length of the paragraph (max: 1023)

            text = gpt2.generate(
                sess,
                run_name=RUN_NAME,
                checkpoint_dir=CHECKPOINT_DIR,
                model_name=None,
                model_dir='models',
                sample_dir='samples',
                return_as_list=True,  # Return as list of string
                truncate=None,
                destination_path=None,
                sample_delim='\n' + '=' * 20 + '\n\n',
                prefix=prefix,
                seed=None,
                nsamples=1,  # Number of sample to be generated
                batch_size=1,
                length=length,
                temperature=0.7,
                top_k=0,
                top_p=0.0,
                include_prefix=True
            )

            essay += text

            # Add double newline
            essay.append('\n\n')

        return essay
    finally:
        # Release the session's resources even when generation fails
        sess.close()
