In [22]:
from transformers import pipeline, set_seed
import pandas as pd
import requests, sys, webbrowser,xml
import numpy as np
import bs4
import re
import random

### Pipeline usage to perform text generation, using the Hugging Face `transformers` package. Citation is below:
@article{Wolf2019HuggingFacesTS,
  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Jamie Brew},
  journal={ArXiv},
  year={2019},
  volume={abs/1910.03771}
}

In [148]:
# Build the language-model pipeline used by the word-replacement games
def startPipeline():
    """Create and return a `fill-mask` pipeline backed by `bert-base-uncased`."""
    return pipeline('fill-mask', model='bert-base-uncased')

In [149]:
# Instantiate the fill-mask pipeline once; a later cell passes `nlp` in as the generator
nlp = startPipeline()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Given a number of words and an intro sequence, returns the sequence with generated text attached
def generateText(generator,intro_sequence:str, num_words = 5)->str:
    """Run `generator` on `intro_sequence` and return the first result's "generated_text".

    Args:
        generator: Callable invoked as
            `generator(text, max_length=..., num_return_sequences=1)`; it must
            return a sequence whose first item supports `.get("generated_text")`.
        intro_sequence: Prompt handed to the generator.
        num_words: Added to `len(intro_sequence)` to form the `max_length` budget.

    Returns:
        The "generated_text" entry of the first returned candidate.

    NOTE(review): the budget adds a character count (`len(intro_sequence)`) to a
    word count; confirm which unit the generator's `max_length` expects.
    """
    length_budget = len(intro_sequence) + num_words
    candidates = generator(intro_sequence, max_length=length_budget, num_return_sequences=1)
    first_candidate = candidates[0]
    return first_candidate.get("generated_text")

In [None]:
def getText(url = "https://www.keepinspiring.me/famous-quotes/", timeout = 10):
    """Download `url` and return every `<div class="author-quotes">` element in it.

    Args:
        url: Page to scrape; defaults to the quotes page.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The result of `soup.find_all("div", class_='author-quotes')`.

    Raises:
        Whatever `requests.get` raises on connection failure or timeout, and
        the HTTP error raised by `raise_for_status()` on a 4xx/5xx response.
    """
    # Requesting data from url, finding specialized tags for this particular website
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    return soup.find_all("div", class_ = 'author-quotes')
# Scrape the default quotes page; `text` holds the matched div tags returned by getText()
text = getText()

## Game List: 
1) **Mad Libs** 

2) **Find the real quote**

3) **How well do you know your favorite song?**
    

## Word Processing Below
Scraping the data from our quote website and cleaning it up

In [6]:
# Obtaining author from quote
def authors(quote):
    """Return the attribution that follows the quote's closing smart quote (”).

    The split happens at the LAST ” so that a quotation nested inside the
    quote body does not get mistaken for the end of the quote.

    Args:
        quote: Text of the form `“...” – Author`.

    Returns:
        Everything after the final ” (not stripped), or "" when the text has
        no closing smart quote at all.
    """
    _, closing_quote, attribution = quote.rpartition("”")
    # Without a closing quote there is nothing that can be attributed
    return attribution if closing_quote else ""

#Removing author and keeping the closing smartquote
def removeAuthors(quote):
    """Return the quote body, up to and including its final closing smart quote (”).

    The cut is made at the LAST ” so a quotation nested inside the body is
    kept intact rather than truncating the quote at the first ”.

    Args:
        quote: Text of the form `“...” – Author`.

    Returns:
        The text through the final ”. When the text contains no ” the whole
        text is returned with a ” appended.
    """
    body, closing_quote, _ = quote.rpartition("”")
    if not closing_quote:
        # No closing quote found: keep everything and add the missing smart quote
        return quote + "”"
    return body + closing_quote

In [7]:
#Tag processing helpers that clean up leftover html formatting; this one drops the div wrapper
def processing_div(tag):
    """Remove the `<div class="author-quotes">` opener and every `</div>` from `tag`."""
    for markup in ('<div class="author-quotes">', "</div>"):
        tag = tag.replace(markup, "")
    return tag

#span processing: drops the author-name span wrapper
def processing_span(tag):
    """Remove the `<span class="quote-author-name">` opener and every `</span>` from `tag`."""
    without_opener = tag.replace("<span class=\"quote-author-name\">", "")
    return without_opener.replace("</span>", "")

# Checks for tags that have yet to be removed, not given standard format. Reasoning - we don't know when ads will pop up
def cleaner(table):
    """Drop every row whose "quote" text still contains a "<".

    Args:
        table: DataFrame with a "quote" column of strings.

    Returns:
        The rows of `table` whose quote has no "<" in it; the original index
        labels are kept.
    """
    # Build the mask in one pass; growing an array with np.append copies it every iteration
    has_markup = np.array(["<" in quote for quote in table["quote"]], dtype=bool)
    return table[~has_markup]

In [8]:
# Seed a one-column frame ("quote") from `text`, the tags scraped by getText()
table = pd.DataFrame().assign(quote = text)

#Formatting, processing, and splitting quotes and authors
def tableProcess(table):
    """Turn a frame of scraped quote tags into clean `quote` / `author` columns.

    Steps, in order: stringify each "quote" entry, strip the div and span
    markup, drop rows that `cleaner` rejects, then split what remains into
    the quote body (`removeAuthors`) and its attribution (`authors`).

    Args:
        table: DataFrame with a "quote" column.

    Returns:
        A new DataFrame whose "quote" column holds the quote body and whose
        "author" column holds the attribution.
    """
    as_strings = table.get("quote").apply(str)
    without_tags = as_strings.apply(processing_div).apply(processing_span)
    kept_rows = cleaner(table.assign(quote = without_tags))

    # Both output columns are derived from the same cleaned text
    cleaned_quotes = kept_rows.get("quote")
    return kept_rows.assign(
        author = cleaned_quotes.apply(authors),
        quote = cleaned_quotes.apply(removeAuthors),
    )
# Clean the scraped frame, then display it as the cell's last expression
table = tableProcess(table)
table

Unnamed: 0,quote,author
0,“You know you’re in love when you can’t fall a...,– Dr. Suess
1,"“I’m selfish, impatient and a little insecure....",– Marilyn Monroe
2,“Get busy living or get busy dying.”,– Stephen King
3,“The first step toward success is taken when y...,– Mark Caine
5,“Twenty years from now you will be more disapp...,– Mark Twain
...,...,...
83,“The dream crossed twilight between birth and ...,– T. S. Eliot
84,“Don’t think. Thinking is the enemy of creativ...,– Ray Bradbury
86,“The power of imagination makes us infinite.”,– John Muir
88,“Originality is nothing but judicious imitation.”,– Voltaire


## Random Word Replacement to be used in each individual Turing game

In [183]:
def replace_words_at_random(generator, word_arr: [str], num_words: int, difficulty = 1):
    """Replace randomly chosen words of `word_arr` with language-model predictions.

    One position at a time is swapped for '[MASK]', the whole text is handed
    to `generator`, and the prediction at rank `3 - difficulty` becomes the
    new working text. The caller's list is left untouched.

    Args:
        generator: Callable taking the masked text and returning a sequence of
            candidates, each supporting `.get('sequence')`.
        word_arr: Words of the source text.
        num_words: How many positions to replace.
        difficulty: Selects which candidate is used: index `3 - difficulty`
            of the generator's result (index 2 for the default).
            NOTE(review): assumes the generator returns enough candidates for
            that index - confirm against the pipeline's configuration.

    Returns:
        `(text, indices)`: the rewritten text as one string, and the indices
        produced by `randomGeneration`. When no position is replaced the
        text is simply the original words joined by spaces.
    """
    #  Generate num_words random indices
    indices = randomGeneration(num_words, 0, len(word_arr))

    # Work on a copy so the caller's list is never mutated
    words = list(word_arr)
    # Defined up front so the return value exists even when nothing is replaced
    text = " ".join(words)

    for i in indices:
        # The word count can change after a prediction is re-split, so an index
        # may fall off the end; skip it instead of querying without a mask
        if i >= len(words):
            continue
        words[i] = '[MASK]'

        # The whole text is passed along so the model has context for the mask
        masked_text = " ".join(words)

        # Fill the mask and keep the candidate selected by `difficulty`
        candidates = generator(masked_text)
        chosen = candidates[3 - difficulty].get('sequence')
        text = chosen.replace("[CLS]", '').replace('[SEP]', '').strip()
        words = text.split(" ")

    return text, indices

# Generates a desired number of unique random digits in a certain range
def randomGeneration(num_words, lowNum, highNum):
    """Draw `num_words` distinct integers from [lowNum, highNum), sorted ascending.

    Args:
        num_words: How many distinct values to draw.
        lowNum: Smallest value that may be drawn (inclusive).
        highNum: Upper bound (exclusive).

    Returns:
        A sorted NumPy array of `num_words` unique integers.

    Raises:
        ValueError: if `num_words` is negative or larger than the number of
            integers in the range. Retrying until enough unique values turn
            up would never finish in that case.
    """
    available = max(highNum - lowNum, 0)
    if num_words < 0 or num_words > available:
        raise ValueError(
            "cannot draw %d unique values from a range of %d" % (num_words, available)
        )
    if num_words == 0:
        return np.array([], dtype=int)

    # Sampling without replacement guarantees uniqueness in a single draw
    picked = np.random.choice(np.arange(lowNum, highNum), size=num_words, replace=False)
    return np.sort(picked)

## Wikipedia Search Below

Start at a user-given Wikipedia page, randomly follow a certain number of links from there, and scrape 5 sentences from the final page.

In [19]:
# Obtains text from a wikipedia url
def getWikiText(url, timeout = 10):
    """Download `url` and return the text of its `<p>` elements, normalised.

    Normalisation: bracketed markers such as `[1]` are removed, the text is
    lower-cased, digits are replaced by spaces, and every run of whitespace
    is collapsed to a single space.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The normalised paragraph text as one string.

    Raises:
        Whatever `requests.get` raises on connection failure or timeout, and
        the HTTP error raised by `raise_for_status()` on a 4xx/5xx response.
    """
    # Requesting data from url, finding specialized tags for this particular website
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()

    # Attaching soup object to page text, obtaining text in paragraphs
    soup = bs4.BeautifulSoup(res.text, "lxml")

    # Known gap: only <p> elements are read, so list items (a substantial part
    # of many wikipedia pages) are not included
    text = "".join(paragraph.text for paragraph in soup.find_all('p'))

    # Remove each bracketed marker on its own. A greedy `.*` would also delete
    # all the prose between the first and the last marker on a line
    text = re.sub(r'\[[^\]\n]*\]', '', text)
    text = text.lower()
    # Digits become spaces; the final pass collapses every whitespace run
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [108]:
# Iterates through wikipedia pages
def wiki_search():
    """Ask the user for a topic and a page count, then run the page-hopping search.

    Returns:
        The `(words, topic)` pair produced by `looping_wiki_search`.
    """
    start_topic, page_count = inp()
    # The count comes back from inp() as text, so convert it before looping
    landing_words, landing_topic = looping_wiki_search(start_topic, int(page_count))
    return landing_words, landing_topic

# Randomly selecting the next topic to be searched for
def selectNextTopic(text: [str])-> str:
    """Pick a random word from `text` to use as the next topic.

    Args:
        text: Candidate words.

    Returns:
        One element of `text`. With two or more words the first word is never
        chosen (the random index starts at 1); a single-word list returns
        that word.

    Raises:
        ValueError: if `text` is empty.
    """
    if len(text) == 0:
        raise ValueError("cannot select a topic from an empty word list")
    if len(text) == 1:
        # randint(1, 1) has an empty range and would raise; the only word wins
        return text[0]
    random_number = np.random.randint(1, len(text))
    return text[random_number]

# Turns hyphens and spaces into underscores for url purposes, then removes all punctuation that could break the url
def replacePunctuation(text):
    """Return `text` reduced to word characters, with '-' and ' ' turned into '_'.

    Surrounding whitespace is stripped first; afterwards every character that
    is not a word character (letter, digit or underscore) is removed.
    """
    # One translation pass maps both separators onto the underscore
    underscored = text.strip().translate(str.maketrans("- ", "__"))
    return re.sub(r'\W', '', underscored)
    
    
# Hop through wikipedia topics to reach a "landing page", then return the words of that landing page
def looping_wiki_search(topic, neighbor_pages):
    """Visit `neighbor_pages + 1` pages, each time jumping to a random word of the current page.

    Args:
        topic: Starting topic.
        neighbor_pages: Number of hops to make after the starting page.

    Returns:
        `(words, topic)`: the space-split text of the last page fetched and
        the topic that led to it. If the loop never runs, `words` is "".
    """
    page_words = ""
    page_url = construct_wiki_url(topic)
    for hop in np.arange(neighbor_pages+1):
        print(hop)
        page_words = getWikiText(page_url).split(" ")

        # The last page fetched is the landing page: nothing further to pick
        if hop >= neighbor_pages:
            continue

        # Choose a random word, make it url-safe, and point at its page
        topic = replacePunctuation(selectNextTopic(page_words))
        page_url = construct_wiki_url(topic)

    return page_words, topic
        

# Temporary dummy input
def inp():
    """Prompt for a topic and a neighbor-page count; return both as entered.

    Returns:
        `(topic, pages)`: the topic as a string and the page count exactly as
        `input()` returned it.
    """
    print("Please enter a topic")
    topic = str(input())
    print('Please enter the number of neighbor pages: ')
    return topic, input()

# Takes a given user topic and constructs a valid wikipedia url
def construct_wiki_url(url_topic : str):
    """Build the English-Wikipedia article URL for `url_topic`.

    The topic is lower-cased, stripped, and its spaces become underscores.
    It is then percent-encoded, so characters such as '?', '#' or '%' stay
    part of the article title instead of turning into a query string or a
    fragment.

    Args:
        url_topic: Human-readable topic, e.g. "New York".

    Returns:
        The article URL as a string.

    NOTE(review): the title is lower-cased as before - confirm that the
    target pages resolve when requested in lower case.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    title = url_topic.lower().strip().replace(" ", "_")
    # These characters are legal in a URL path and common in titles, so leave them readable
    return 'https://en.wikipedia.org/wiki/' + quote(title, safe="/:(),'")

In [110]:
# Interactive demo: prompts for a topic and a page count, then shows the (words, topic) result
wiki_search()

Please enter a topic
may
Please enter the number of neighbor pages: 
8
0
1
2
3
4
5
6
7
8


(['producer', 'or', 'producers', 'may', 'refer', 'to:', ''], 'produced')

## Last Scrape for Song Game: Scraping Genius

In [115]:
import lyricsgenius

In [187]:
# Takes artist and song name in terms of string and obtains lyrics
def obtainLyrics(artist: str, song: str, access_token: str = None):
    """Fetch a song's lyrics from Genius and return them as a list of words.

    Args:
        artist: Artist name to search for.
        song: Song title to search for.
        access_token: Genius API token. When omitted, the
            `GENIUS_ACCESS_TOKEN` environment variable is used, so the
            credential does not have to live in the notebook.

    Returns:
        The lyrics split on single spaces, with bracketed tags (e.g.
        `[Verse 1]`) removed and newlines turned into spaces, cut off after
        510 items.

    Raises:
        RuntimeError: if no API token is available.
        ValueError: if the artist or the song cannot be found.
    """
    import os  # local import keeps this cell self-contained

    token = access_token or os.environ.get("GENIUS_ACCESS_TOKEN")
    if not token:
        raise RuntimeError(
            "No Genius API token: pass access_token or set GENIUS_ACCESS_TOKEN"
        )

    genius = lyricsgenius.Genius(token)
    # NOTE(review): the None checks assume lyricsgenius returns None when a
    # search has no match - confirm against the installed version
    found_artist = genius.search_artist(artist, max_songs=3, sort="title")
    if found_artist is None:
        raise ValueError("Genius returned no artist for %r" % (artist,))
    found_song = genius.search_song(song, found_artist.name)
    if found_song is None:
        raise ValueError("Genius returned no song %r by %r" % (song, found_artist.name))

    #String processing
    # Remove each bracketed tag on its own. A greedy `.*` would also delete the
    # lyrics between the first and the last tag on a line
    song_lyrics = re.sub(r'\[[^\]\n]*\]', '', found_song.lyrics)
    song_lyrics = re.sub(r'\n+', ' ', song_lyrics)
    song_lyrics = song_lyrics.split(" ")

    #Making sure songs do not exceed maximum threshold
    return song_lyrics[:510]

# Master control for obtaining and processing lyrics
# To do: preserve tags, ie [verse 1], and keep word replacement model from touching it
def lyrics(artistName, songName):
    """Return the processed lyric words for `songName` by `artistName` via `obtainLyrics`."""
    return obtainLyrics(artistName, songName)

In [189]:
# Fetch one song on its own; the return value is not stored
obtainLyrics('Toto', 'Africa')
# Replace 90 randomly chosen words of a second song using the `nlp` pipeline, with difficulty -1.
# NOTE(review): this rebinds `text`, which an earlier cell used for the scraped quote tags.
text, indices = replace_words_at_random(nlp, obtainLyrics('Yung Gravy', '1 Thot 2 Thot Red Thot Blue Thot'), 90, -1)

Searching for songs by Toto...

Song 1: "21st Century Blues"
Song 2: "2 Hearts"
Song 3: "99"

Reached user-specified song limit (3).
Done. Found 3 songs.
Searching for "Africa" by Toto...
Done.
1349


['',
 'I',
 'hear',
 'the',
 'drums',
 'echoing',
 'tonight',
 'But',
 'she',
 'hears',
 'only',
 'whispers',
 'of',
 'some',
 'quiet',
 'conversation',
 "She's",
 'coming',
 'in,',
 '12:30',
 'flight',
 'Her',
 'moonlit',
 'wings',
 'reflect',
 'the',
 'stars',
 'that',
 'guide',
 'me',
 'towards',
 'salvation',
 'I',
 'stopped',
 'an',
 'old',
 'man',
 'along',
 'the',
 'way',
 'Hoping',
 'to',
 'find',
 'some',
 'old',
 'forgotten',
 'words',
 'or',
 'ancient',
 'melodies',
 'He',
 'turned',
 'to',
 'me',
 'as',
 'if',
 'to',
 'say',
 '"Hurry',
 'boy,',
 "it's",
 'waiting',
 'there',
 'for',
 'you"',
 "It's",
 'gonna',
 'take',
 'a',
 'lot',
 'to',
 'drag',
 'me',
 'away',
 'from',
 'you',
 "There's",
 'nothing',
 'that',
 'a',
 'hundred',
 'men',
 'or',
 'more',
 'could',
 'ever',
 'do',
 'I',
 'bless',
 'the',
 'rains',
 'down',
 'in',
 'Africa',
 'Gonna',
 'take',
 'some',
 'time',
 'to',
 'do',
 'the',
 'things',
 'we',
 'never',
 'had',
 'The',
 'wild',
 'dogs',
 'cry',
 'out',