<a href="https://colab.research.google.com/github/daveshap/GibberishDetector/blob/main/WikipediaDataBuilder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Articles

In [None]:
!pip install wikipedia --quiet
!pip install spacy --quiet
!pip install pysbd --quiet

import wikipedia

# Folder on the mounted Google Drive where downloaded articles are written.
gdrive_dir = '/content/drive/My Drive/WikiData/'

# Search terms; each one is passed to the Wikipedia search below.
# todo: come up with a cool way to automatically create topic search terms
keywords = [
  'india', 'ocean', 'astronomy', 'economics', 'economy', 'earth',
  'english', 'bacon', 'egg', 'dinosaur', 'rabbit', 'america', 'usa',
  'congress', 'virus', 'George Clooney', 'knowledge', 'Buddha',
]

def save_article(filepath, content):
  """Write `content` to `filepath` as UTF-8 text, printing the path first.

  An existing file at `filepath` is overwritten (the file is opened in 'w' mode).
  """
  print('Saving:', filepath)
  with open(filepath, mode='w', encoding='utf-8') as handle:
    handle.write(content)

# Download every search hit for every keyword and save it under gdrive_dir.
# This is a best-effort crawl: failures are reported and skipped, never fatal.
for keyword in keywords:
  #print('Searching Wikipedia for keyword:', keyword)
  try:
    search = wikipedia.search(keyword)
  except Exception as oops:
    print('Search failed for %r: %s' % (keyword, oops))
    continue
  for result in search:
    # Errors are handled per article so one bad page (e.g. a title that
    # cannot be resolved to a single page) does not drop the remaining
    # results for this keyword.
    try:
      article = wikipedia.page(result)
      # A title may contain '/', which would otherwise be interpreted as a
      # directory separator and make the open() fail.
      filepath = '%sarticle_%s.txt' % (gdrive_dir, result.replace('/', '_'))
      save_article(filepath, article.content)
    except Exception as oops:
      print('Skipping %r: %s' % (result, oops))
print('Done saving articles!')

# Parse Articles
The articles need to be split up into usable chunks. This uses regex to identify the section headers and split each article into single lines of text for each section. Furthermore, it looks at the number of word characters vs other characters to identify those sections that likely contain text instead of tables or other data.

In [None]:
import os 
import re

# Collected section texts, one whitespace-normalised string per kept section.
result = list()
gdrive_dir = '/content/drive/My Drive/WikiData/'
outfile = '%sparsed_sections.txt' % gdrive_dir

for file in os.listdir(gdrive_dir):
  #print(file)
  # Only files whose name contains 'article_' are parsed; everything else
  # in the folder (including this step's own output) is ignored.
  if not 'article_' in file:
    continue
  with open(gdrive_dir + file, 'r', encoding='utf-8') as infile:
    text = infile.read()
  # Split on header lines: two or more '=', up to 80 characters of title,
  # then two or more '=' again.
  sections = re.split(r'={2,}.{0,80}={2,}', text)
  for section in sections:
    trimmed = section.strip()
    # An empty section has no ratio to compute (it would divide by zero).
    if not trimmed:
      continue
    wordchars = re.findall(r'\w', trimmed)
    ratio = len(wordchars) / len(trimmed)
    # it seems like a ratio of greater than 80% word chars is ideal
    if ratio > 0.80:
      # Collapse every whitespace run so each section fits on a single line.
      final = re.sub(r'\s+', ' ', trimmed)
      result.append(final.strip())

print('Wikipedia sections parsed:', len(result))
with open(outfile, 'w', encoding='utf-8') as file:
  for line in result:
    file.write(line+'\n')

# Split Sentences
For the sake of simplicity, we don't want to go overboard and evaluate entire paragraphs; we only want to train on individual sentences. So let's use spaCy and pySBD (Python Sentence Boundary Disambiguation) to split the corpus into sentences.

In [None]:
import spacy
from pysbd.utils import PySBDFactory

# Blank English pipeline with pySBD as the only component, used purely for
# sentence boundary detection.
# NOTE(review): passing a component object to add_pipe() is the spaCy v2
# calling convention; spaCy v3 expects a registered component name. Confirm
# which spaCy version the unpinned `pip install spacy` resolves to.
nlp = spacy.blank('en')
nlp.add_pipe(PySBDFactory(nlp))
gdrive_dir = '/content/drive/My Drive/WikiData/'
infile = '%sparsed_sections.txt' % gdrive_dir
outfile = '%swiki_sentences.txt' % gdrive_dir
# Sentence texts (plain strings), in input order.
result = list()

with open(infile, 'r', encoding='utf-8') as file:
  lines = file.readlines()

print('Lines of text:', len(lines))
for line in lines:
  doc = nlp(line)
  for sent in doc.sents:
    # Each input line still carries its trailing newline, so a sentence's
    # text can end in '\n' or consist of whitespace only. Strip it here so
    # the output holds exactly one non-empty sentence per line.
    text = str(sent).strip()
    if text:
      result.append(text)

print('Sentences found:', len(result))
with open(outfile, 'w', encoding='utf-8') as file:
  for line in result:
    file.write(line+'\n')
print(outfile, 'saved!')

Lines of text: 1623
Sentences found: 14901
/content/drive/My Drive/WikiData//wiki_sentences.txt saved!


# Generate Gibberish v1 - Word Salad
Shuffle all the words in each sentence to make a word salad.

In [None]:
from random import shuffle, seed

# Locations for the word-salad pass: sentences in, shuffled sentences out.
gdrive_dir = '/content/drive/My Drive/WikiData/'
infile = gdrive_dir + 'wiki_sentences.txt'
outfile = gdrive_dir + 'shuffled_words.txt'
# Accumulates one scrambled sentence per input sentence.
result = []

def scramble_sentence(sentence):
  """Return the words of `sentence` in random order, joined by single spaces.

  Words are the whitespace-separated tokens of the sentence; leading,
  trailing and repeated whitespace is discarded by the split.
  """
  words = sentence.split()
  shuffle(words)
  return ' '.join(words)

# seed() without an argument re-initialises the random generator, so each
# run produces a different shuffle.
seed()
with open(infile, 'r', encoding='utf-8') as file:
  lines = file.readlines()
# Blank lines carry no sentence and are dropped before scrambling.
result.extend(scramble_sentence(text) for text in map(str.strip, lines) if text)
with open(outfile, 'w', encoding='utf-8') as file:
  file.writelines(entry + '\n' for entry in result)
print(outfile, 'saved!')

/content/drive/My Drive/WikiData/shuffled_words.txt saved!


# Generate Gibberish v2 - Random Characters
Shuffle all characters.

In [None]:
from random import shuffle, seed

# Locations for the character-shuffle pass: sentences in, scrambled text out.
gdrive_dir = '/content/drive/My Drive/WikiData/'
infile = gdrive_dir + 'wiki_sentences.txt'
outfile = gdrive_dir + 'shuffled_characters.txt'
# Accumulates one character-scrambled string per input sentence.
result = []

def scramble_sentence(sentence):
  """Return the characters of the stripped `sentence` in random order.

  Surrounding whitespace is removed first; interior spaces are shuffled
  along with every other character.
  """
  chars = list(sentence.strip())
  shuffle(chars)
  return ''.join(chars)

# seed() without an argument re-initialises the random generator, so each
# run produces a different shuffle.
seed()
with open(infile, 'r', encoding='utf-8') as file:
  lines = file.readlines()
for raw in lines:
  text = raw.strip()
  # Blank lines carry no sentence; only non-empty text is scrambled.
  if text:
    result.append(scramble_sentence(text))
with open(outfile, 'w', encoding='utf-8') as file:
  file.write(''.join(item + '\n' for item in result))
print(outfile, 'saved!')

/content/drive/My Drive/WikiData/shuffled_characters.txt saved!


# Generate Gibberish v3 - Needle in Haystack
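Replace one or two words in each sentence with random words. This kind of gibberish is much harder to detect. It needs a source of random words: something like WordNet, which includes a dictionary, would work; the code below draws them from a plain word list file (`10k_words.txt`).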

In [None]:
import re
from random import sample,seed

# Locations for the needle-in-haystack pass: sentences in, lightly altered sentences out.
gdrive_dir = '/content/drive/My Drive/WikiData/'
infile = gdrive_dir + 'wiki_sentences.txt'
outfile = gdrive_dir + 'mild_gibberish.txt'
# Accumulates one altered sentence per input sentence.
result = []

# Pool of replacement words read from 10k_words.txt (presumably one word per
# line; verify against the actual file). The encoding is given explicitly,
# like every other open() in this notebook, and blank lines are dropped so
# an empty string can never be picked as a replacement word.
with open('%s10k_words.txt' % gdrive_dir, 'r', encoding='utf-8') as file:
  all_words = [word.strip() for word in file if word.strip()]

def random_word():
  """Return one randomly chosen entry of `all_words`, stripped of whitespace.

  The global random generator is used as-is and is deliberately not
  re-seeded here, so a seed set by the caller stays in effect across calls.
  """
  # strip() removes any surrounding whitespace, such as a trailing newline.
  return sample(all_words, 1)[0].strip()

def replace_one_word(sentence):
  """Return `sentence` with exactly one randomly chosen word replaced.

  A word is a run of `\\w` characters. The replacement comes from
  `random_word()`. Only the single chosen occurrence is substituted; other
  occurrences of the same word, and longer words that merely contain it,
  are left untouched.

  The sentence is returned unchanged when it is shorter than 21 characters
  or contains no word characters at all.
  """
  if len(sentence) < 21:
    return sentence
  matches = list(re.finditer(r'\w+', sentence))
  # Nothing to replace (e.g. a line made only of punctuation).
  if not matches:
    return sentence
  target = sample(matches, 1)[0]
  rando = random_word()
  # Splice by position rather than str.replace(), which would rewrite every
  # occurrence of the text, including inside other words.
  return sentence[:target.start()] + rando + sentence[target.end():]

#sentence = 'the wheel weaves as the wheel wills'
#replace_one_word(sentence)

with open(infile, 'r', encoding='utf-8') as file:
  lines = file.readlines()
for raw in lines:
  text = raw.strip()
  # Blank lines carry no sentence and are skipped.
  if not text:
    continue
  # Two replacement passes per sentence; the second runs on the output of the first.
  result.append(replace_one_word(replace_one_word(text)))
with open(outfile, 'w', encoding='utf-8') as file:
  file.writelines(entry + '\n' for entry in result)
print(outfile, 'saved!')

/content/drive/My Drive/WikiData/mild_gibberish.txt saved!
