In [6]:
from transformers import pipeline, set_seed
import pandas as pd
import requests, sys, webbrowser,xml
import numpy as np
import bs4
import re

### Pipeline usage to perform test generation, using the hugging face package transformers. Citation is below:
@article{Wolf2019HuggingFacesTS,
  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R'emi Louf and Morgan Funtowicz and Jamie Brew},
  journal={ArXiv},
  year={2019},
  volume={abs/1910.03771}
}

In [1]:
# Starting up pipeline
def startPipeline():
    generator = pipeline('fill-mask', model='bert-base-uncased')
    return generator

In [None]:
nlp = startPipeline()

In [None]:
# Given a number letters and some sequence, returns the sequence with generated attached words
def generateText(generator,intro_sequence:str, num_words = 5)->str:
    # Imported random text generation
    text = generator(intro_sequence, max_length=len(intro_sequence) + num_words, num_return_sequences=1)[0].get("generated_text")
    return text

In [None]:
def getText(url = "https://www.keepinspiring.me/famous-quotes/" ):
    # Requesting data from url, finding specialized tags for this particular website
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    text  = soup.find_all("div", class_ = 'author-quotes')  
    return text
text = getText()

## Game List: 
1) **Mad Libs** 

2) **Find the real quote**

3) **How well do you know your favorite song?**
    

## Word Processing Below
Scraping the data from our quote website and cleaning it up

In [6]:
# Obtainin author from quote
def authors(quote):
    return quote.split("”")[1]

#Removing author and adding lost smartquote
def removeAuthors(quote):
    return (quote.split("”")[0] + ("”"))

In [7]:
#Tag processing functions to clean up nasty html formatting, replaces div tags
def processing_div(tag):
    return tag.replace('<div class="author-quotes">', "").replace("</div>", "")

#span processing, replaces span tag
def processing_span(tag):
    return tag.replace("<span class=\"quote-author-name\">", "").replace("</span>", "")

# Checks for tags that have yet to be removed, not given standard format. Reasoning - we don't know when ads will pop up
def cleaner(table):
    arr = np.array([])
    for i in table.get("quote"):
        arr = np.append(arr,("<" in i))
    clean = table[(arr != 1)]
    return clean

In [8]:
table = pd.DataFrame().assign(quote = text)

#Formatting, processing, and splitting quotes and authors
def tableProcess(table):
    table = table.assign(quote = table.get("quote").apply(str))
    table = table.assign(quote = table.get("quote").apply(processing_div).apply(processing_span))
    table = cleaner(table)
    table = table.assign(author  = table.get("quote").apply(authors), quote = table.get("quote").apply(removeAuthors))
    return table
table = tableProcess(table)
table

Unnamed: 0,quote,author
0,“You know you’re in love when you can’t fall a...,– Dr. Suess
1,"“I’m selfish, impatient and a little insecure....",– Marilyn Monroe
2,“Get busy living or get busy dying.”,– Stephen King
3,“The first step toward success is taken when y...,– Mark Caine
5,“Twenty years from now you will be more disapp...,– Mark Twain
...,...,...
83,“The dream crossed twilight between birth and ...,– T. S. Eliot
84,“Don’t think. Thinking is the enemy of creativ...,– Ray Bradbury
86,“The power of imagination makes us infinite.”,– John Muir
88,“Originality is nothing but judicious imitation.”,– Voltaire


## Random Word Replacement to be used in each individual Turing game

In [7]:
import random
def replace_words_at_random(generator, word_arr: [str], num_words: int):
    
    #  Generate num_words random indices
    indices = randomGeneration(num_words, 0, len(word_arr))
    print(indices)
    # Place word masks at each of the randomly chosen indices
    # Fill in each mask with the language model
    for i in indices:
        #Ensuring array is not overaccessed
        if(i < len(word_arr)):
            word_arr[i] = '[MASK]'
        
        #Adding together everything leading up to string so as to add some context into the model
        join  = " ".join(word_arr)
        print(join)

        # Generate the next word
        text = (generator(join)[0].get('sequence').replace("[CLS]", '').replace('[SEP]', '')).strip()
        word_arr = text.split(" ")

    return text, indices 

# Generates a desired number of unique random digits in a certain range
def randomGeneration(num_words, lowNum, highNum):
    random_digits = np.unique(np.random.randint(low = lowNum, high  = highNum, size = num_words))
    while(len(random_digits) < num_words):
        random_digits = np.append(random_digits, np.random.randint(low = lowNum, high = highNum, size = num_words-len(random_digits)))
        random_digits = np.unique(random_digits)
    return random_digits

## Wikipedia Search Below

Start at user given wikipedia page, randomly click a certain number of links from there and scrape 5 sentences from the final page

In [14]:
# Obtains text from a wikipedia url
def getWikiText(url):
    # Requesting data from url, finding specialized tags for this particular website
    res = requests.get(url)
    res.raise_for_status()
    
    #Attaching soup object to page text, obtaining text in paragraphs
    soup = bs4.BeautifulSoup(res.text, "lxml")
    text = ""
    for paragraph in soup.find_all('p'):
        text+= paragraph.text
        
    # Formatting the string so that it looks normal
    text = re.sub(r'\[[0-9]*\]','', text)
    text= re.sub(r'\s+', ' ', text)
    text = text.lower()
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [15]:
# Iterates through wikipedia pages
def wiki_search():
    topic, pages = inp()
    valid_url = construct_wiki_url(topic)
    textSoup = getWikiText(valid_url)
    retString = textSoup.split(' ')
    print(retString)
    return retString

    
    
# Loop through connected topics on wikipedia to find a "landing page", then return the url of that landing page
"""def looping_wiki_search(base_topic, neighbor_pages):
    current_topic, pages = inp()
    for i in range(pages): 
        url = construct_wiki_url(current_topic)
        current_topic ... get some new topic from url"""
        
        
# A function which accepts a wikipedia URL and scrapes a "story" from the text on that page
def scrape_wikipedia_story(wiki_url: str) -> str:
    pass

# Temporary dummy input
def inp():
    print("Please enter a topic")
    val = str(input())
    print('Please enter the number of neighbor pages: ')
    pages = input()
    return val, pages

# Takes a given user topic and constructs a valid wikipedia url
def construct_wiki_url(url_topic : str):
    return ('https://en.wikipedia.org/wiki/' + url_topic.strip().replace(" ","_"))

In [16]:
wiki_search()

Please enter a topic
zeus
Please enter the number of neighbor pages: 
3
['', 'zeus[a]', 'is', 'the', 'sky', 'and', 'thunder', 'god', 'in', 'ancient', 'greek', 'religion,', 'who', 'rules', 'as', 'king', 'of', 'the', 'gods', 'of', 'mount', 'olympus.', 'his', 'name', 'is', 'cognate', 'with', 'the', 'first', 'element', 'of', 'his', 'roman', 'equivalent', 'jupiter.', 'his', 'mythologies', 'and', 'powers', 'are', 'similar,', 'though', 'not', 'identical,', 'to', 'those', 'of', 'indo-european', 'deities', 'such', 'as', 'jupiter,', 'perkūnas,', 'perun,', 'indra,', 'dyaus', 'and', 'thor.', 'zeus', 'is', 'the', 'child', 'of', 'cronus', 'and', 'rhea,', 'the', 'youngest', 'of', 'his', 'siblings', 'to', 'be', 'born,', 'though', 'sometimes', 'reckoned', 'the', 'eldest', 'as', 'the', 'others', 'required', 'disgorging', 'from', "cronus's", 'stomach.', 'in', 'most', 'traditions,', 'he', 'is', 'married', 'to', 'hera,', 'by', 'whom', 'he', 'is', 'usually', 'said', 'to', 'have', 'fathered', 'ares,', 'hebe,

['',
 'zeus[a]',
 'is',
 'the',
 'sky',
 'and',
 'thunder',
 'god',
 'in',
 'ancient',
 'greek',
 'religion,',
 'who',
 'rules',
 'as',
 'king',
 'of',
 'the',
 'gods',
 'of',
 'mount',
 'olympus.',
 'his',
 'name',
 'is',
 'cognate',
 'with',
 'the',
 'first',
 'element',
 'of',
 'his',
 'roman',
 'equivalent',
 'jupiter.',
 'his',
 'mythologies',
 'and',
 'powers',
 'are',
 'similar,',
 'though',
 'not',
 'identical,',
 'to',
 'those',
 'of',
 'indo-european',
 'deities',
 'such',
 'as',
 'jupiter,',
 'perkūnas,',
 'perun,',
 'indra,',
 'dyaus',
 'and',
 'thor.',
 'zeus',
 'is',
 'the',
 'child',
 'of',
 'cronus',
 'and',
 'rhea,',
 'the',
 'youngest',
 'of',
 'his',
 'siblings',
 'to',
 'be',
 'born,',
 'though',
 'sometimes',
 'reckoned',
 'the',
 'eldest',
 'as',
 'the',
 'others',
 'required',
 'disgorging',
 'from',
 "cronus's",
 'stomach.',
 'in',
 'most',
 'traditions,',
 'he',
 'is',
 'married',
 'to',
 'hera,',
 'by',
 'whom',
 'he',
 'is',
 'usually',
 'said',
 'to',
 'have