# Install Requirements
Let's get this out of the way up front!

In [None]:
!pip install wikipedia --quiet
!pip install spacy --quiet
!pip install pysbd --quiet
!pip install tensorflow-gpu==1.15.0 --quiet #--force-reinstall
!pip install gpt2-client==2.1.5 --quiet --no-dependencies #--force-reinstall 

# Download Wikipedia Articles
First, we need a corpus of relatively clean data. Wikipedia is crowd-sourced and written in modern English. Therefore we can trust that it is a good source of semantically, syntactically, and rhetorically sound text.

In [1]:
!pip install wikipedia
import wikipedia

keywords = ['india', 'ocean', 'astronomy', 'economics', 'economy', 'earth', 'english', 'bacon', 'egg', 'dinosaur', 'rabbit', 'america', 'usa']  # todo: maybe come up with a cool way to automatically create topic search terms.


def save_article(title, article):
    with open('wiki_' + title + '.txt', 'w', encoding='utf-8') as outfile:
        outfile.write(article)
        
        
for keyword in keywords:
    try:
        search = wikipedia.search(keyword)
        for result in search:
            article = wikipedia.page(result)
            #print(result, article.url)
            save_article(result, article.content)
    except Exception as oops:
        #print(oops)
        continue
print('Done saving articles!')

Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-cp36-none-any.whl size=11686 sha256=c89dceb546ccd22889c9d7a30247ecd11d3ad71dffd355d1bb2ff7aa127ed527
  Stored in directory: /root/.cache/pip/wheels/87/2a/18/4e471fd96d12114d16fe4a446d00c3b38fb9efcb744bd31f4a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0




  lis = BeautifulSoup(html).find_all('li')


# Parse Articles
The articles need to be split up into usable chunks. This uses regex to identify the section headers and split each article into single lines of text for each section. Furthermore, it looks at the number of word characters vs other characters to identify those sections that likely contain text instead of tables or other data.

In [2]:
import os 
import re

result = list()



for file in os.listdir('.'):
    if not 'wiki_' in file:
        continue
    #print(file)
    with open(file, 'r', encoding='utf-8') as infile:
        text = infile.read()
    sections = re.split(r'={2,}.{0,80}={2,}', text)
    for section in sections:
        try:
            trimmed = section.strip()
            wordchars = re.findall(r'\w', trimmed)
            ratio = len(wordchars) / len(trimmed)
            if ratio > 0.80:
                final = re.sub(r'\s+', ' ', trimmed)
                result.append(final)
            # it seems like a ratio of greater than 80% word chars is ideal
        except:
            continue
    
print('Wikipedia sections parsed:', len(result))
with open('wikiparsed.txt', 'w', encoding='utf-8') as outfile:
    for line in result:
        outfile.write(line+'\n')

Wikipedia sections parsed: 1125


# Split Sentences
For the sake of simplicity, we don't want to go overboard and evaluate entire paragraphs. We want to only train on individual sentences. So let's use SpaCy and PYSBD (Python Sentence Boundary Detector) to split the corpus into sentences.

In [3]:
#!pip install spacy
#!pip install pysbd
import spacy
from pysbd.utils import PySBDFactory

nlp = spacy.blank('en')
nlp.add_pipe(PySBDFactory(nlp))
infile = 'wikiparsed.txt'
outfile = 'wikisentences.txt'
result = list()


with open('wikiparsed.txt', 'r', encoding='utf-8') as infile:
    lines = infile.readlines()
for line in lines:
    doc = nlp(line)
    #print('Parsing line:', line[0:80])
    for sent in list(doc.sents):
        result.append(sent)
        #print(sent)
#print('Sentences found:', len(result))
with open('wikisentences.txt', 'w', encoding='utf-8') as file:
    for line in result:
        if str(line) == '':
            continue
        file.write(str(line)+'\n')
print(outfile, 'saved!')

Collecting pysbd
[?25l  Downloading https://files.pythonhosted.org/packages/26/db/95bd39a94eae9a5149bfde3d27760fb3595a35e11a9a01f6e97288132475/pysbd-0.3.3-py3-none-any.whl (67kB)
[K     |████████████████████████████████| 71kB 2.3MB/s 
[?25hInstalling collected packages: pysbd
Successfully installed pysbd-0.3.3
<_io.TextIOWrapper name='wikisentences.txt' mode='w' encoding='utf-8'> saved!


# Generate Gibberish v1
We have a great source of sentences that are semantically, syntactically, and rhetorically sound. The simplest way to generate gibberish, then, would be to scramble these sentences! For this first version, we want words, just all mixed up. This will create good training data because the samples will contain the same exact words as the sound sentences but out of order.

In [4]:
from random import shuffle, seed


infile = 'wikisentences.txt'
outfile = 'wikiscrambled.txt'
result = list()


def scramble_sentence(sentence):
    sentence = sentence.strip()
    split = sentence.split()
    shuffle(split)
    return ' '.join(split)


seed()
with open(infile, 'r', encoding='utf-8') as file:
    lines = file.readlines()
for line in lines:
    line = line.strip()
    if line == '':
        continue
    scrambled = scramble_sentence(line)
    result.append(scrambled)
    #print('Scrambled sentence:', scrambled[0:100])
with open(outfile, 'w', encoding='utf-8') as file:
    for line in result:
        file.write(line+'\n')
print(outfile, 'saved!')        

wikiscrambled.txt saved!


# Generate Gibberish v2
This step may not be necessary but I'd like to be able to detect utter nonsense as well. So let's scramble all the characters in each sentence completely. I figure it's better to show the model random noise as well as random words.

In [5]:
from random import shuffle, seed


infile = 'wikisentences.txt'
outfile = 'wikiscrambled2.txt'
result = list()


def scramble_sentence(sentence):
    sentence = sentence.strip()
    sentence = list(sentence)
    shuffle(sentence)
    return ''.join(sentence)


seed()
with open(infile, 'r', encoding='utf-8') as file:
    lines = file.readlines()
for line in lines:
    line = line.strip()
    if line == '':
        continue
    scrambled = scramble_sentence(line)
    result.append(scrambled)
    #print('Scrambled sentence:', scrambled[0:100])
with open(outfile, 'w', encoding='utf-8') as file:
    for line in result:
        file.write(line+'\n')
print(outfile, 'saved!')

wikiscrambled2.txt saved!


# Compile Training Corpus
Let's build a training corpus that we can feed to GPT2! We need to bake the label directly into each line. 

In [6]:
from random import sample, seed

files = [
('wikisentences.txt', 'Clean'), 
('wikiscrambled2.txt', 'Gibberish'), 
('wikiscrambled.txt', 'Gibberish')
]


result = list()

max_samples = 100

corpus = 'corpus.txt' 



for file in files:
    with open(file[0], 'r', encoding='utf-8') as infile:
        lines = infile.readlines()
    for line in lines:
        line = line.strip()
        if line == '':
            continue
        line = '// %s || %s' % (line, file[1])
        result.append(line)
        #print(file, line[0:80])

seed()
subset = sample(result, max_samples)

with open(corpus, 'w', encoding='utf-8') as outfile:
    for line in subset:
        outfile.write(line+'\n\n')
print(corpus, 'saved!')

corpus.txt saved!


# Fine Tune GPT2!
This is where the rubber meets the road! Let's see if we can finetune a GPT-2 model!

In [None]:
#gast==0.2.2
#tensorboard<1.16.0,>=1.15.0
#!pip install tensorflow-gpu==1.15.0 --force-reinstall
#!pip install gpt2-client==2.1.5 --force-reinstall --no-dependencies

from gpt2_client import GPT2Client


gpt2 = GPT2Client('345M')  # options: 117M, 345M, 774M, or 1558M
gpt2.load_model(force_download=False) 

corpus = 'corpus.txt'

result = gpt2.finetune(corpus, return_text=True)
print(result)

[1 | 107.26] loss=5.38 avg=5.38
[2 | 200.27] loss=5.26 avg=5.32
