In [29]:
from transformers import pipeline, set_seed
import pandas as pd
import requests, sys, webbrowser,xml
import numpy as np
import bs4

### Pipeline usage to perform text generation, using the Hugging Face `transformers` package. Citation is below:
@article{Wolf2019HuggingFacesTS,
  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Jamie Brew},
  journal={ArXiv},
  year={2019},
  volume={abs/1910.03771}
}

In [141]:
def startPipeline():
    """Create the ``fill-mask`` pipeline backed by ``bert-base-uncased``.

    Returns:
        The object returned by ``pipeline('fill-mask', model='bert-base-uncased')``.
    """
    return pipeline('fill-mask', model='bert-base-uncased')

In [143]:
# Build the fill-mask pipeline once and bind it to `nlp`.
nlp = startPipeline()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def generateText(generator,intro_sequence:str, num_words = 5)->str:
    """Extend ``intro_sequence`` using ``generator`` and return the generated text.

    Args:
        generator: Callable invoked as
            ``generator(intro_sequence, max_length=..., num_return_sequences=1)``;
            its result is indexed with ``[0]`` and read with ``.get("generated_text")``.
        intro_sequence: Text handed to the generator as the starting sequence.
        num_words: Amount added to ``len(intro_sequence)`` to form ``max_length``.

    Returns:
        The ``"generated_text"`` value of the first result (``None`` if that key
        is absent).

    NOTE(review): ``len(intro_sequence)`` is a character count, so ``max_length``
    is characters plus ``num_words`` -- confirm this is the unit the generator
    expects.
    """
    length_cap = len(intro_sequence) + num_words
    candidates = generator(intro_sequence, max_length=length_cap, num_return_sequences=1)
    return candidates[0].get("generated_text")

In [4]:
def getText(url = "https://www.keepinspiring.me/famous-quotes/", timeout = 10):
    """Download ``url`` and return its ``<div class="author-quotes">`` elements.

    Args:
        url: Page to fetch; defaults to the famous-quotes page.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the notebook indefinitely.

    Returns:
        The result of ``soup.find_all("div", class_='author-quotes')``.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` on an error status code.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # The quote blocks on this particular site are marked with this class.
    return soup.find_all("div", class_ = 'author-quotes')
# Fetch the quote elements from the default URL when this cell runs.
text = getText()

## Game List: 
1) **Mad Libs** 

2) **Find the real quote**

3) **How well do you know your favorite song?**
    

## Word Processing Below
Scraping the data from our quote website and cleaning it up

In [5]:
def processing_div(tag):
    """Strip the quote container markup from ``tag``.

    Removes every occurrence of ``<div class="author-quotes">`` and then of
    ``</div>`` from the given string and returns the result.
    """
    for markup in ('<div class="author-quotes">', "</div>"):
        tag = tag.replace(markup, "")
    return tag

def processing_span(tag):
    """Strip the author-name span markup from ``tag``.

    Removes every occurrence of ``<span class="quote-author-name">`` and then
    of ``</span>`` from the given string and returns the result.
    """
    for markup in ('<span class="quote-author-name">', "</span>"):
        tag = tag.replace(markup, "")
    return tag

def cleaner(table):
    """Drop rows whose ``quote`` text still contains a ``<`` character.

    The div/span helpers only remove the expected markup; anything else that
    slips through (for example an ad inserted into the page) still carries a
    tag, so those rows are discarded.

    Args:
        table: DataFrame with a ``quote`` column of strings.

    Returns:
        The rows of ``table`` whose quote contains no ``<``, with their
        original index labels.
    """
    # Build the mask in one pass instead of growing an array with np.append.
    # The explicit bool dtype matters for an empty table: an empty mask must
    # still be interpreted as a row filter.
    has_markup = np.array(["<" in quote for quote in table.get("quote")], dtype=bool)
    return table[~has_markup]

In [7]:
# Put the scraped results into a DataFrame under the `quote` column.
table = pd.DataFrame().assign(quote = text)

def tableProcess(table):
    """Clean the scraped ``quote`` column and split it into quote and author.

    Steps, in order:
      1. convert every ``quote`` entry to ``str``;
      2. strip the div and span markup (``processing_div``, ``processing_span``);
      3. drop rows that still contain markup (``cleaner``);
      4. derive ``author`` with ``authors`` and replace ``quote`` with the
         result of ``removeAuthors``.

    NOTE: ``authors`` and ``removeAuthors`` are defined in a cell that appears
    after this one in the file; that cell must have been executed before this
    function is called.

    Args:
        table: DataFrame with a ``quote`` column.

    Returns:
        A new DataFrame with the cleaned ``quote`` column and an ``author`` column.
    """
    as_text = table.get("quote").apply(str)
    stripped = as_text.apply(processing_div).apply(processing_span)
    kept = cleaner(table.assign(quote = stripped))
    # Both new columns are derived from the same pre-split quote text.
    raw_quotes = kept.get("quote")
    return kept.assign(author = raw_quotes.apply(authors), quote = raw_quotes.apply(removeAuthors))
# Run the cleaning steps on the scraped table, rebind `table` to the result, and display it.
table = tableProcess(table)
table

Unnamed: 0,quote,author
0,“You know you’re in love when you can’t fall a...,– Dr. Suess
1,"“I’m selfish, impatient and a little insecure....",– Marilyn Monroe
2,“Get busy living or get busy dying.”,– Stephen King
3,“The first step toward success is taken when y...,– Mark Caine
5,“Twenty years from now you will be more disapp...,– Mark Twain
6,“When I dare to be powerful – to use my streng...,– Audre Lorde
8,“A successful man is one who can lay a firm fo...,– David Brinkley
10,“I can’t give you a sure-fire formula for succ...,-Herbert Bayard Swope
11,“Would you like me to give you a formula for s...,– Thomas J. Watson
12,"“It is hard to fail, but it is worse never to ...",– Theodore Roosevelt


In [6]:
def authors(quote):
    """Return the attribution text that follows the closing smart quote.

    The string is split on the closing smart quote (``”``) and the segment
    right after the first one is returned, exactly as it appears in the text
    (no trimming).

    Args:
        quote: Combined quote-and-author string.

    Returns:
        The text after the first ``”`` (up to the next ``”`` if there is one),
        or ``""`` when the string contains no ``”`` at all.
    """
    pieces = quote.split("”")
    # Without a closing smart quote there is no attribution segment; return an
    # empty string instead of failing with IndexError on unexpected input.
    return pieces[1] if len(pieces) > 1 else ""

def removeAuthors(quote):
    """Return the quote portion of ``quote`` with its closing smart quote.

    Everything before the first ``”`` is kept and a ``”`` is appended, restoring
    the mark consumed by the split. If the input has no ``”``, the whole string
    is returned with ``”`` appended.
    """
    body, _, _ = quote.partition("”")
    return body + "”"

## Random Word Replacement to be used in each individual Turing game

In [277]:
import random
def replace_words_at_random(generator, word_arr: [str], num_words: int):
    """Replace randomly chosen words with the generator's fill-in predictions.

    For each chosen position the word is replaced by ``[MASK]``, the words are
    joined with spaces and passed to ``generator``; the ``'sequence'`` of its
    first result (with ``[CLS]``/``[SEP]`` removed and stripped) becomes the
    new sentence, which is re-split on spaces before the next position is
    processed.

    Args:
        generator: Callable invoked as ``generator(sentence)``; the result is
            indexed with ``[0]`` and read with ``.get('sequence')``.
        word_arr: Words of the sentence. The list is not modified.
        num_words: How many positions to replace. Values larger than
            ``len(word_arr)`` are reduced to ``len(word_arr)`` and negative
            values are treated as 0, since more unique positions cannot exist.

    Returns:
        ``(text, indices)``: the resulting sentence and the array of chosen
        positions. When nothing is replaced, ``text`` is the input words joined
        by spaces and ``indices`` is an empty integer array.
    """
    # Work on a copy so the caller's list is left untouched.
    words = list(word_arr)

    # Asking for more unique positions than exist can never be satisfied.
    num_words = max(0, min(num_words, len(words)))

    # Result used when no replacement happens (previously `text` was unbound).
    text = " ".join(words)
    if num_words == 0:
        return text, np.array([], dtype=int)

    #  Generate num_words random indices
    indices = randomGeneration(num_words, 0, len(words))
    print(indices)

    for i in indices:
        # The generator's output may split into fewer words than before, so an
        # index can fall off the end; skip it rather than call the generator
        # on a sentence that contains no mask.
        if i >= len(words):
            continue
        words[i] = '[MASK]'

        # The whole sentence is passed so the model has context for the mask.
        join = " ".join(words)
        print(join)

        # Fill the mask and continue from the model's version of the sentence.
        text = (generator(join)[0].get('sequence').replace("[CLS]", '').replace('[SEP]', '')).strip()
        words = text.split(" ")

    return text, indices

def randomGeneration(num_words, lowNum, highNum):
    """Return ``num_words`` distinct random integers from ``[lowNum, highNum)``.

    Args:
        num_words: How many distinct values to draw.
        lowNum: Inclusive lower bound.
        highNum: Exclusive upper bound.

    Returns:
        A sorted NumPy integer array of ``num_words`` unique values.

    Raises:
        ValueError: If ``num_words`` is negative or exceeds the number of
            integers in the range (such a request can never be satisfied and
            would otherwise loop forever).
    """
    if num_words < 0:
        raise ValueError("num_words must be non-negative")
    if num_words > highNum - lowNum:
        raise ValueError("cannot draw more unique values than the range contains")
    if num_words == 0:
        return np.array([], dtype=int)

    # Sampling without replacement gives unique values in a single draw;
    # sorting keeps the ascending order that np.unique used to produce.
    picks = np.random.choice(np.arange(lowNum, highNum), size=num_words, replace=False)
    return np.sort(picks)

## Wikipedia Search Below

Start at a user-given Wikipedia page, randomly follow a given number of links from there, and scrape 5 sentences from the final page.

In [344]:
def wikiSearch():
    """Ask for a topic, fetch its Wikipedia page, and print the scraped elements.

    The topic and page count come from ``inp``; the page count is read but not
    used here. The URL is built with ``construct_wiki_url`` and the page content
    is obtained from ``getWikiText``.
    """
    topic, _neighbor_pages = inp()
    print(getWikiText(construct_wiki_url(topic)))

def inp():
    """Prompt on stdout for a topic and a neighbor-page count (temporary stand-in).

    Returns:
        ``(topic, pages)`` -- both exactly as typed; ``pages`` is returned as
        the raw input string, not converted to a number.
    """
    print("Please enter a topic")
    topic = str(input())
    print('Please enter the number of neighbor pages: ')
    return topic, input()

def construct_wiki_url(url_topic : str):
    """Build the English Wikipedia article URL for a user-supplied topic.

    Surrounding whitespace is stripped, spaces become underscores, and the
    title is percent-encoded so characters such as ``#``, ``?`` or ``%`` are
    not interpreted as URL syntax.

    Args:
        url_topic: Topic text as entered by the user.

    Returns:
        ``'https://en.wikipedia.org/wiki/'`` followed by the encoded title.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    title = url_topic.strip().replace(" ", "_")
    # Leave punctuation that commonly appears literally in article paths
    # (parentheses, commas, slashes, colons, ...) unescaped so ordinary
    # titles yield the same URL as before.
    return 'https://en.wikipedia.org/wiki/' + quote(title, safe=";@$!*(),/~:")
# Run the interactive lookup. wikiSearch calls getWikiText, which is defined in a
# cell that appears later in this file, so that cell must already have been run.
wikiSearch()

Please enter a topic
Mark
Please enter the number of neighbor pages: 
3
[<p><b>Mark</b> may refer to:
</p>, <li class="toclevel-1 tocsection-1"><a href="#Currency"><span class="tocnumber">1</span> <span class="toctext">Currency</span></a>
<ul>
<li class="toclevel-2 tocsection-2"><a href="#German"><span class="tocnumber">1.1</span> <span class="toctext">German</span></a></li>
</ul>
</li>, <li class="toclevel-2 tocsection-2"><a href="#German"><span class="tocnumber">1.1</span> <span class="toctext">German</span></a></li>, <li class="toclevel-1 tocsection-3"><a href="#People"><span class="tocnumber">2</span> <span class="toctext">People</span></a>
<ul>
<li class="toclevel-2 tocsection-4"><a href="#Names"><span class="tocnumber">2.1</span> <span class="toctext">Names</span></a></li>
</ul>
</li>, <li class="toclevel-2 tocsection-4"><a href="#Names"><span class="tocnumber">2.1</span> <span class="toctext">Names</span></a></li>, <li class="toclevel-1 tocsection-5"><a href="#Places"><span clas

In [343]:
def getWikiText(url, timeout = 10):
    """Download a page and return its paragraph and list-item elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the notebook indefinitely.

    Returns:
        A list holding the ``<p>`` elements found inside the
        ``<div class="mw-parser-output">`` container followed by every ``<li>``
        element of the whole page. If the container is absent, only the
        ``<li>`` elements are returned.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` on an error status code.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    content = soup.find("div", class_ = 'mw-parser-output')
    # `find` returns None when the container is missing; treat that as
    # "no paragraphs" instead of failing with AttributeError.
    paragraphs = content.find_all('p') if content is not None else []
    return list(paragraphs) + list(soup.find_all('li'))