In [None]:
from pprint import pprint

## Quill stuff

### Deltas

In [None]:
test_delta = '''{"ops":[
  { "insert": "#" },
  { "insert": "!", "attributes": { "bold": "true" }}
]}'''

In [None]:
import json
def read_Delta(Delta):
    '''Decodes a Quill Delta and prints every value found in its operations.

    Args:
        Delta: a JSON string or an already-decoded dict. Either way the
            top-level object must have an "ops" key holding a list of
            operation dicts, e.g. {"insert": "#"}.

    Returns:
        The decoded Delta dict (the very same object when a dict was passed).

    Raises:
        ValueError: if a string is passed that is not valid JSON.
        KeyError: if the decoded object has no "ops" key.
    '''
    if isinstance(Delta, str):
        Delta = json.loads(Delta)

    for op_dict in Delta["ops"]:
        # Every key's value is printed, not only the "insert" payload.
        for value in op_dict.values():
            print(value)
    return Delta

read_Delta(test_delta)

## spaCy and textacy

In [None]:
import spacy
import textacy
nlp = spacy.load('en')

def to_spacy_doc(raw_doc):
    '''Runs the module-level spaCy pipeline `nlp` on raw_doc and returns the parsed document.'''
    spacy_doc = nlp(raw_doc)
    return spacy_doc

def to_textacy_doc(raw_doc):
    '''Converts a raw string (or an existing spaCy doc) into a textacy doc.

    Args:
        raw_doc: a raw text string, a spaCy Doc, or a textacy Doc.

    Returns:
        A textacy Doc. Strings are first parsed with the module-level `nlp`
        pipeline; a spaCy Doc is wrapped as-is; a textacy Doc is returned
        unchanged.
    '''
    # The check must look at the argument itself. Testing a freshly parsed
    # dummy string is always true and costs a parse on every call.
    if isinstance(raw_doc, textacy.doc.Doc):
        return raw_doc
    if isinstance(raw_doc, spacy.tokens.doc.Doc):
        return textacy.Doc(raw_doc)
    return textacy.Doc(nlp(raw_doc))

### Sample input documents

In [None]:
### Setup testing documents
# Relative path: resolved against the notebook's working directory.
doc_path = 'corpus_data/comp_ling.txt'

# Read the whole sample file into a single string.
with open(doc_path) as f:
    input_doc = f.read()

# Parse the raw text with spaCy. NOTE: later cells rebind `doc` to other objects.
doc=to_spacy_doc(input_doc)

autism_abstract = '''The purpose of this research is to identify a subtype of autism called Developmental Verbal Dyspraxia (DVD).  DVD is a motor-speech problem, disabling oral-motor movements needed for speaking. The first phase of the project involves a screening interview where we identify DVD and Non-DVD kids.  We also use home videos to validate answers on the screening interview.  The final phase involves home visits where we use several assessments to confirm the child’s diagnosis and examine the connection between manual and oral motor challenges. By identifying DVD as a subtype of Autism, we will eliminate the assumption that all Autistics have the same characteristics. This will allow for more individual consideration of Autistic people and may direct future research on the genetic factors in autism.'''
history_abstract = '''This project involves discovering how the American Revolution was remembered during the nineteenth century.  The goal is to show that the American Revolution was memorialized by the actions of the United States government during the 1800s. This has been done by examining events such as the Supreme Court cases of John Marshall and the Nullification Crisis. Upon examination of these events, it becomes clear that John Marshall and John Calhoun (creator of the Doctrine of Nullification) attempted to use the American Revolution to bolster their claims by citing speeches from Founding Fathers. Through showing that the American Revolution lives on in memory, this research highlights the importance of the revolution in shaping the actions of the United States government.'''
games_abstract = '''The study is to show how even a “sport” video game can incorporate many types of learning, to call attention to what might be overlooked as significant forms of learning, and to understand and take advantage of the opportunities video games afford as more deliberate learning environments. The aspects explored are the skills and techniques required to be successful in the game, the environment that skaters skate in, the personal vs. group identity that is shown through the general appearance of the skater, and the values and icons that the game teaches players. We are finding that sport video games support learning; we hope to find how one learns about oneself as a learner from playing.'''

raw_abstracts = [autism_abstract, history_abstract, games_abstract]
abstracts = list(map(to_textacy_doc, raw_abstracts))

#### spaCy parse as Pandas Dataframe

In [None]:
### view how spacy parses doc
import pandas as pd

# One tuple of lexical attributes per token of the current `doc`.
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in doc]

# Column labels follow the order of the tuple fields above.
df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])

# Show the flag columns as 'Yes' / blank instead of True / False.
df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
                                       .applymap(lambda x: u'Yes' if x else u''))
                                               
df

### SpaCy Vector Functions

In [None]:
def similar_to_target_by_brown_cluster(words_to_compare, target_word):
    '''Returns the words whose vocabulary `cluster` value matches target_word's.

    Args:
        words_to_compare: iterable of word strings to test.
        target_word: the word whose cluster is the reference.

    Returns:
        A list with the members of words_to_compare, in input order, whose
        cluster differs from the target's by less than 1.
    '''
    # The target's cluster does not change during the scan, so look it up once.
    target_cluster = nlp.vocab[target_word].cluster
    return [word for word in words_to_compare
            if abs(nlp.vocab[word].cluster - target_cluster) < 1]

similar_to_target_by_brown_cluster(list(nlp.vocab.strings), 'show')

## Textacy

In [None]:
doc = input_doc

### Get words

In [None]:
def get_words(doc):
    '''Lists the word tokens textacy extracts from doc using its default filters.

    A plain string is converted to a textacy document first; anything else
    is handed to textacy unchanged.
    '''
    textacy_doc = to_textacy_doc(doc) if type(doc) is str else doc
    return list(textacy.extract.words(textacy_doc))

#print(get_words(input_doc))

def get_content_words(doc):
    '''Lists the words of doc with stop words and punctuation filtered out (numbers kept).

    Anything that is not already a textacy document is converted first.
    '''
    if isinstance(doc, textacy.doc.Doc):
        textacy_doc = doc
    else:
        textacy_doc = to_textacy_doc(doc)
    word_filters = dict(filter_stops=True,
                        filter_punct=True,
                        filter_nums=False,
                        include_pos=None,
                        exclude_pos=None,
                        min_freq=1)
    return list(textacy.extract.words(textacy_doc, **word_filters))
print(get_content_words(doc))

#### Extract named entities

In [None]:
def get_named_entities(doc):
    '''Returns the named entities textacy extracts from doc, as a list.'''
    return list(textacy.extract.named_entities(doc))

def extract_named_entities(text):
    '''Given a text document, extracts named entities using spaCy and builds a dict of metadata for each.

    Args:
        text: raw text, parsed here with to_spacy_doc().

    Returns:
        A defaultdict keyed by entity text. Each value is a dict with
        'label' (the entity type label) and 'text' (the entity text).
        When the same text occurs more than once, the last occurrence wins.

    Example Entity Type Labels:
    ORGANIZATION  Georgia-Pacific Corp., WHO
    PERSON        Eddy Bonte, President Obama
    LOCATION      Murray River, Mount Everest
    DATE          June, 2008-06-29
    TIME          two fifty a m, 1:30 p.m.
    MONEY         175 million Canadian Dollars, GBP 10.40
    PERCENT       twenty pct, 18.75 %
    FACILITY      Washington Monument, Stonehenge
    GPE           South East Asia, Midlothian
    '''
    # Imported here because the notebook only imports defaultdict in a later
    # cell, so calling this on a fresh kernel would otherwise raise NameError.
    from collections import defaultdict

    doc = to_spacy_doc(text)
    named_entities = defaultdict(dict)
    for ent in doc.ents:
        # The Wikipedia URL lookup was disabled in the original code, so no
        # 'url' key is ever set.
        named_entities[ent.text].update(label=ent.label_, text=ent.text)

    return named_entities
extract_named_entities("Paul is a man. Jane is a woman.")

#### Readability Stats

In [None]:
def get_readability_stats(doc):
    '''Returns textacy's readability statistics for doc.

    Input that is not already a textacy document is converted first.
    '''
    textacy_doc = doc if isinstance(doc, textacy.doc.Doc) else to_textacy_doc(doc)
    return textacy.text_stats.TextStats(textacy_doc).readability_stats

get_readability_stats(doc)

In [None]:
def sentences_readability(sentences):
    '''Computes readability stats for each sentence, passing str(sentence) to get_readability_stats.

    Returns a list with one stats object per sentence; any exception
    propagates to the caller.
    '''
    stats_per_sentence = []
    for sent in sentences:
        stats_per_sentence.append(get_readability_stats(str(sent)))
    return stats_per_sentence

#sentences_readability(sentences)

### Get Sentences

In [None]:
def get_sentences(doc):
    '''Returns the sentences of doc (its `sents`) as a list.

    Input that is not already a textacy document is converted first.
    '''
    textacy_doc = doc if isinstance(doc, textacy.doc.Doc) else to_textacy_doc(doc)
    return [sentence for sentence in textacy_doc.sents]

sentences = get_sentences(doc)

### Get Key Terms

#### Get key terms from semantic network

In [None]:
def get_semantic_key_terms(doc, top_n_terms=10, filtered=True):
    '''Gets key terms from textacy's semantic-network key term extraction.

    Args:
        doc: a textacy document, or anything to_textacy_doc() accepts.
        top_n_terms: maximum number of terms to return.
        filtered: when True, keep only terms weighted at least half as
            much as the heaviest term.

    Returns:
        A list of at most top_n_terms term strings, in the order textacy
        returned them. Empty when textacy finds no key terms.
    '''
    if not isinstance(doc, textacy.doc.Doc):
        doc = to_textacy_doc(doc)
    term_prob_pairs = textacy.keyterms.key_terms_from_semantic_network(doc)

    # Nothing extracted (e.g. an empty document): indexing the first pair
    # would raise IndexError.
    if not term_prob_pairs:
        return []

    # term[0] is the word, term[1] is its keyterm-ness.
    if filtered:
        # Take the real maximum rather than assuming the first pair is the heaviest.
        max_keyterm_weight = max(term[1] for term in term_prob_pairs)
        terms = [term for term in term_prob_pairs
                 if term[1] >= 0.5 * max_keyterm_weight]
    else:
        terms = term_prob_pairs

    return [term[0] for term in terms[:top_n_terms]]

get_semantic_key_terms(to_textacy_doc(raw_abstracts[0]))

#### Extract Keyterms with SGRank

In [None]:
#textacy.keyterms.sgrank(doc)

#### Extract Keyterms with TextRank

In [None]:
#textacy.keyterms.textrank(doc)

### Extract acronyms and their definitions

In [None]:
doc = abstracts[0]
sample_abstract = raw_abstracts[0]

In [None]:
textacy.extract.acronyms_and_definitions(to_textacy_doc(sample_abstract))

### Extract semantic chunks

In [None]:
list(textacy.extract.noun_chunks(doc))

In [None]:
### Semi-structured statements
doc = abstracts[1]
print(doc)
list(textacy.extract.semistructured_statements(doc, "project", cue=u'involve'))

In [None]:
### Extract Subject-Verb-Object Triples
list(textacy.extract.subject_verb_object_triples(doc))

### Extract tokens

In [None]:
def extract_pos_tagged_sents_from_corpus(textacy_corpus):
    '''Collects the `pos_tagged_text` of every document in the corpus.

    Returns one entry per document, in corpus order. Per the original
    description, each entry is a list of sentences made of (token, POS) tuples.
    '''
    tagged_docs = []
    for corpus_doc in textacy_corpus:
        tagged_docs.append(corpus_doc.pos_tagged_text)
    return tagged_docs

In [None]:
import itertools
def extract_verbs(doc):
    '''Returns the tokens whose POS tag starts with "V", in document order.

    Input that is not already a textacy document is converted first.
    '''
    textacy_doc = doc if isinstance(doc, textacy.doc.Doc) else to_textacy_doc(doc)
    verbs = []
    # pos_tagged_text is nested per sentence; walk it instead of flattening.
    for tagged_sentence in textacy_doc.pos_tagged_text:
        for token, pos in tagged_sentence:
            if pos.startswith("V"):
                verbs.append(token)
    return verbs
print(extract_verbs(doc))

def bag_of_words(doc, as_strings=True):
    '''Returns the document's bag of words (word:count pairs) via textacy's to_bag_of_words.

    as_strings is forwarded unchanged; input that is not already a textacy
    document is converted first.
    '''
    textacy_doc = doc if isinstance(doc, textacy.doc.Doc) else to_textacy_doc(doc)
    word_counts = textacy_doc.to_bag_of_words(as_strings=as_strings)
    return word_counts
      
pprint(bag_of_words(doc))

## Wikipedia Data

### Wikipedia wrapper (API)

In [None]:
import wikipedia
from wikipedia import DisambiguationError, PageError, RedirectError

def get_wiki_page(search_string, summary=False, content=False):
    '''Returns results from searching for search_string with the wikipedia wrapper library.
       Note: Makes a web request.

    Args:
        search_string: page title / search term.
        summary: when True, include the page summary under "summary".
        content: when True, include the full page text under "content".

    Returns:
        A dict with "url" and "title" (plus the optional keys above), or
        None when the lookup fails for any reason.
    '''
    try:
        page = wikipedia.page(search_string)
        page_data = {"url": page.url,
                     "title": page.title}
        if content:
            page_data["content"] = page.content  # Full text content of page.
        if summary:
            page_data["summary"] = page.summary  # Summary section only.

    except DisambiguationError as e:
        # Naively choose the first option. Give up if there is none, or if
        # it is the term just tried, which would recurse forever.
        if not e.options or e.options[0] == search_string:
            return None
        # Forward the flags: the retry used to drop them, so disambiguated
        # pages came back without their content/summary.
        return get_wiki_page(e.options[0], summary=summary, content=content)
    except Exception:
        # Deliberate best-effort: callers treat None as "no page found".
        return None

    return page_data


In [None]:
test_doc = "London"

In [None]:
import bleach

def set_link_title(attrs, new=False):
    '''Link callback that stamps a fixed title attribute onto attrs.

    attrs is modified in place and also returned; `new` is accepted but unused.
    '''
    title_key = (None, u'title')
    attrs.update({title_key: u'AI-provided Link'})
    return attrs

def linkify(string):
    '''Calls bleach.linkify.
    Converts urls in the input string into links, passing set_link_title as
    the link callback.
    Returns a string of HTML.

    Raises:
        TypeError: if the input is not a string.
    '''
    if not isinstance(string, str):
        raise TypeError("input should be a string")

    # The original built an unused `Linker(...)` from a name that was never
    # imported (NameError), then linkified without the callback. Passing the
    # callback to bleach.linkify does what that line was meant to do.
    return bleach.linkify(string, callbacks=[set_link_title])

def create_hyperlink(url, display_text, attrs=""):
    '''Builds an HTML anchor tag as a string.

       attrs is an optional string of tag attributes placed after href.
       Example call:
       create_hyperlink('www.google.com', 'Google',
       ... attrs = 'class=link_class title="Custom Title"')'''
    fields = {'link': url, 'attrs': attrs, 'text': display_text}
    return '<a href="{link}" {attrs}>{text}</a>'.format(**fields)

create_hyperlink('www.google.com', 'Google', attrs='class=link_class title="Custom Title"')

In [None]:
### HTML-related functions
def create_hyperlink(url, display_text, attrs=""):
    '''Builds an HTML anchor tag as a string.

       NOTE(review): this repeats the definition from the previous cell and
       replaces it when this cell runs.

       attrs is an optional string of tag attributes placed after href.
       create_hyperlink('www.google.com', 'Google',
       ... attrs = 'class=link_class title="Custom Title"')'''
    anchor_template = '<a href="{link}" {attrs}>{text}</a>'
    anchor_html = anchor_template.format(text=display_text, link=url, attrs=attrs)
    return anchor_html

def linkify_entity(ent_dict):
    '''Operates on extracted named entities. Returns HTML string.

    Args:
        ent_dict: dict with 'label' (entity type) and 'text' (entity text),
            as produced by extract_named_entities().

    Returns:
        An anchor tag linking the entity text to its Wikipedia page, with
        the label as CSS class. When no page is found, the plain entity
        text is returned instead.
    '''
    ent_type = ent_dict['label']
    text = ent_dict['text']
    page = get_wiki_page(text)
    if page is None:
        # get_wiki_page returns None on any failure; subscripting that
        # raised TypeError before.
        return text
    attrs = 'class="{ent_type}" title="{text}"'.format(ent_type=ent_type, text=text)
    return create_hyperlink(page['url'], text, attrs=attrs)

In [None]:
test_doc = "London is a city in the U.K."
london = extract_named_entities(test_doc)["London"]
print(london)
print(linkify_entity(london))

### Wiktionary

In [None]:
from wiktionaryparser import WiktionaryParser, WikiParse
parser = WiktionaryParser()
word = parser.fetch('test')

### Wikipedia XML Dump

#### Source: http://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html

In [None]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os

# Directory holding the Wikipedia dump and the generated CSV files.
# Override with the PATH_WIKI_XML environment variable; the default keeps
# the original author's local path so existing setups behave the same.
PATH_WIKI_XML = os.environ.get('PATH_WIKI_XML', '/Users/davideverling/Projects/Data')
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'  # input dump
FILENAME_ARTICLES = 'articles.csv'                  # output CSV
FILENAME_REDIRECT = 'articles_redirect.csv'         # output CSV
FILENAME_TEMPLATE = 'articles_template.csv'         # output CSV
ENCODING = "utf-8"

# Nicely formatted time string
def hms_string(sec_elapsed):
    '''Formats a duration given in seconds as "H:MM:SS.ss".'''
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)

def strip_tag_name(t):
    '''Returns the tag name t without its leading "{namespace}" part.

    Args:
        t: a tag string such as "{http://example.org/ns}page".

    Returns:
        Everything after the last "}", or t unchanged when it contains none.
    '''
    # Use the argument only. The original overwrote it with the global
    # `elem.tag`, so the function only worked inside the parsing loop.
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathArticles = os.path.join(PATH_WIKI_XML, FILENAME_ARTICLES)
pathArticlesRedirect = os.path.join(PATH_WIKI_XML, FILENAME_REDIRECT)
pathTemplateRedirect = os.path.join(PATH_WIKI_XML, FILENAME_TEMPLATE)

totalCount = 0
articleCount = 0
redirectCount = 0
templateCount = 0
title = None
start_time = time.time()

In [None]:
'''Begin streaming the XML file and 
write the headers for the 3 CSV files that will be built according to the data found in the XML.'''

with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
        codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH, \
        codecs.open(pathTemplateRedirect, "w", ENCODING) as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)
    articlesWriter.writerow(['id', 'title', 'redirect'])
    redirectWriter.writerow(['id', 'title', 'redirect'])
    templateWriter.writerow(['id', 'title'])

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)
        if event == 'start':
            if tname == 'page':
                # A new page begins: reset the per-page state.
                title = ''
                page_id = -1  # named page_id so the builtin id() is not shadowed
                redirect = ''
                inrevision = False
                ns = 0
            elif tname == 'revision':
                # Do not pick up on revision id's
                inrevision = True
        else:
            if tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision:
                page_id = int(elem.text)
            elif tname == 'redirect':
                redirect = elem.attrib['title']
            elif tname == 'ns':
                ns = int(elem.text)
            elif tname == 'page':
                totalCount += 1
                if ns == 10:
                    templateCount += 1
                    templateWriter.writerow([page_id, title])
                elif len(redirect) > 0:
                    # A non-empty redirect target means this page is a
                    # redirect. The original had these two branches swapped.
                    redirectCount += 1
                    redirectWriter.writerow([page_id, title, redirect])
                else:
                    articleCount += 1
                    articlesWriter.writerow([page_id, title, redirect])

                # Report progress once per 100,000 pages. At loop level this
                # printed on every parser event while the count was a multiple.
                if totalCount > 1 and (totalCount % 100000) == 0:
                    print("{:,}".format(totalCount))

            # Release each element once it is complete and has been read.
            # Clearing only after the loop kept the whole dump in memory.
            elem.clear()

    elapsed_time = time.time() - start_time
    print("Total pages: {:,}".format(totalCount))
    print("Template pages: {:,}".format(templateCount))
    print("Article pages: {:,}".format(articleCount))
    print("Redirect pages: {:,}".format(redirectCount))
    print("Elapsed time: {}".format(hms_string(elapsed_time)))


## dbpedia

In [None]:
import requests
from xml.etree import ElementTree
from collections import defaultdict

def dbpedia_keyword_search(keywords, api_host='http://localhost:1111', query_class=''):
    '''Queries the KeywordSearch endpoint of a DBpedia lookup service.

    Args:
        keywords: the search string.
        api_host: base URL of the lookup service.
        query_class: optional class name to restrict results; '' for none.

    Returns:
        The raw response body (bytes), which is expected to be XML.

    Raises:
        xml.etree.ElementTree.ParseError: if the body is not well-formed XML.
    '''
    # Let requests build the query string so spaces and special characters
    # in the keywords are URL-encoded.
    response = requests.get(api_host + "/api/search/KeywordSearch",
                            params={'QueryClass': query_class,
                                    'QueryString': keywords})
    # Parsed only to keep the original fail-fast behaviour on malformed XML;
    # the tree itself is not used.
    ElementTree.fromstring(response.content)
    return response.content

def dbpedia_prefix_search(query_string, api_host='http://localhost:1111', query_class=''):
    '''Returns list of dicts from dbpedia API search. Keys are: label, uri, description

    Args:
        query_string: the (possibly partial) search string.
        api_host: base URL of the lookup service.
        query_class: optional class name to restrict results; '' for none.

    Returns:
        A list with one dict per Result element, in response order, or
        None when the response has no results. A field is None when the
        result lacks that child element or the element has no text.
    '''
    # Let requests URL-encode the query (e.g. the space in "machine lea").
    response = requests.get(api_host + "/api/search/PrefixSearch",
                            params={'QueryClass': query_class,
                                    'QueryString': query_string})
    xmltree = ElementTree.fromstring(response.content)

    namespace = '{http://lookup.dbpedia.org/}'
    results = xmltree.findall('.//' + namespace + "Result")
    if not results:
        return None

    def child_text(result, tag):
        '''Text of result's direct child element `tag`, or None if absent.'''
        child = result.find(namespace + tag)
        return child.text if child is not None else None

    # Read each field from its own Result element. The original indexed a
    # document-wide search by result position, which pairs the wrong values
    # when the same tag also appears nested elsewhere in the response, and
    # re-scanned the whole tree for every field.
    return [{'label': child_text(result, "Label"),
             'uri': child_text(result, "URI"),
             'description': child_text(result, "Description")}
            for result in results]

print(dbpedia_keyword_search("banana"))
print(dbpedia_prefix_search("banana")[0])
print(dbpedia_prefix_search("machine lea")[0])

In [None]:
def get_dbpedia_results(queries):
    '''Runs a dbpedia prefix search per query and keeps the first hit of each.

    Queries whose search returns nothing are skipped, so the output may be
    shorter than the input.
    '''
    hits_per_query = (dbpedia_prefix_search(query) for query in queries)
    return [hits[0] for hits in hits_per_query if hits]

#print(get_dbpedia_results(expanded_keywords))

def get_dbpedia_result_text(queries):
    '''Given a list of query strings, returns a list of (label+description) strings.

    Results without a description are skipped. Duplicate strings are
    removed, keeping the first occurrence, so the order is deterministic.
    '''
    results = get_dbpedia_results(queries)

    results_strings = []
    seen = set()
    for r in results:
        if not r['description']:
            continue
        # A missing label would make the concatenation raise TypeError.
        text = str((r['label'] or '') + ": " + r['description'])
        if text not in seen:
            seen.add(text)
            results_strings.append(text)
    return results_strings

db_docs = get_dbpedia_result_text(expanded_keywords)
pprint(db_docs)

In [None]:
def textacy_corpus_dbpedia_results(queries):
    '''Given a list of query strings, returns a textacy corpus generated from dbpedia results.

    Each corpus text is "label: description" for one dbpedia result, and
    that same result dict is passed as its metadata. Results without a
    description, and duplicate texts, are left out.
    '''
    # Query once and build texts and metadatas together. The original
    # queried twice and filtered/de-duplicated only the texts, so the two
    # sequences could differ in length and order.
    texts, metadatas, seen = [], [], set()
    for r in get_dbpedia_results(queries):
        if not r['description']:
            continue
        text = str((r['label'] or '') + ": " + r['description'])
        if text in seen:
            continue
        seen.add(text)
        texts.append(text)
        metadatas.append(r)

    corpus = textacy.corpus.Corpus('en',
                                   texts,
                                   metadatas=metadatas)
    return corpus

#corpus = textacy_corpus_dbpedia_results(expanded_keywords)
#print(corpus)

In [None]:
#pos_tagged_blob = extract_pos_tagged_sents_from_corpus(corpus)
#print(pos_tagged_blob)

## Empath

In [None]:
from empath import Empath
lexicon = Empath()

### Lexical Category Analysis

In [None]:
from pprint import pprint

def get_semantic_categories(raw_text):
    '''Scores raw_text against the Empath lexicon categories.

    Returns a list of [category, score] pairs sorted by score, highest first.
    '''
    scores = lexicon.analyze(raw_text, normalize=True, tokenizer='default')
    ranked = sorted(([name, score] for name, score in scores.items()),
                    key=lambda pair: pair[1],
                    reverse=True)
    return ranked

input_doc = "I hated that comment from you. It was useless."

category_analysis = lexicon.analyze(input_doc, normalize=True, tokenizer='default')

top_cats = [[cat[0], cat[1]] for cat in category_analysis.items()]
top_cats.sort(key=lambda x: x[1], reverse=True)

pprint(top_cats)

### Create Category from terms

In [None]:
tokens = '''machine, word, language, information, human, style, little, thought, vocabulary, contextual'''
tokens = tokens.split(", ")
lexicon.create_category("category_name",tokens, model="nytimes")

In [None]:
import io
from contextlib import redirect_stdout
import nltk
words = set(nltk.corpus.words.words())

def capture_from_stdout(function):
    '''Calls function() with no arguments and returns everything it wrote to stdout, as a string.'''
    buffer = io.StringIO()
    with redirect_stdout(buffer):
        function()
    captured = buffer.getvalue()
    return captured

def category_from_keywords(keywords, model='all', clean=False):
    '''Call Empath's create category. Model options: "fiction","nytimes","reddit","all"
    Returns a list of strings.

    Args:
        keywords: a comma-separated string or a list of seed keywords.
        model: one Empath model name, or 'all' to query reddit, nytimes and
            fiction and concatenate their terms.
        clean: when true, drop tokens that are not known English words and
            keep only purely alphabetic terms.

    Returns:
        The generated category terms, with underscores replaced by spaces.
        Empty when Empath produced nothing.
    '''
    import json  # local import so the function does not rely on cell order

    def _parse_terms(printed):
        '''Turns the list text printed by create_category into a list of strings.'''
        printed = printed.strip()
        if not printed or printed == "[]":
            return []
        try:
            parsed = json.loads(printed)
            if isinstance(parsed, list):
                return [str(term) for term in parsed]
        except ValueError:
            pass
        # Fallback for non-JSON output: the original bracket/quote stripping.
        return printed[1:-1].replace('"', '').split(", ")

    def _expand(model_name, **create_kwargs):
        '''Runs create_category for one model and returns its printed terms.'''
        buffer = io.StringIO()
        with redirect_stdout(buffer):
            lexicon.create_category(category_name, keywords, model=model_name, **create_kwargs)
        # re-separate on underscores
        return [term.replace('_', ' ') for term in _parse_terms(buffer.getvalue())]

    if isinstance(keywords, str):
        keywords = [keyword.strip() for keyword in keywords.split(",")]  # split into individual items

    # replace spaces with underscores for Empath's lexicon format
    keywords = [keyword.replace(" ", "_") for keyword in keywords]

    # Name the category after the first two keywords. join() also copes with
    # a single keyword, where indexing keywords[1] raised IndexError.
    category_name = " ".join(keywords[:2])

    if model == 'all':
        category_terms = []
        # A separate loop name keeps the `model` argument from being overwritten.
        for model_name in ['reddit', 'nytimes', 'fiction']:
            category_terms.extend(_expand(model_name, write=False))
    else:
        # As in the original, `write` is left at Empath's default here.
        category_terms = _expand(model)

    ### Filter out non-words like urls, unusual characters
    if clean:
        clean_terms = [" ".join(w for w in nltk.wordpunct_tokenize(term)
                                if w.lower() in words or not w.isalpha())
                       for term in category_terms]
        category_terms = [term for term in clean_terms if term.isalpha()]

    return category_terms

keywords = '''machine, word, language, information, human, style, little, thought, vocabulary, contextual'''
expanded_keywords = category_from_keywords(keywords, model='fiction')
print(keywords)
print(expanded_keywords)

In [None]:
def get_wiki_text_for_keywords(keywords):
    '''Concatenates the full Wikipedia page text found for each keyword.

    Args:
        keywords: a comma-separated string or a list of keyword strings.

    Returns:
        One string with the content of every page that could be fetched,
        in keyword order. Keywords with no page contribute nothing.
    '''
    if isinstance(keywords, str):
        keywords = keywords.split(",")
    # Strip surrounding whitespace: splitting "a, b" on "," leaves " b".
    keywords = [keyword.strip() for keyword in keywords]

    print(keywords)
    page_texts = []
    for keyword in keywords:
        if not keyword:
            continue
        # Only the content is used, so the summary is no longer requested.
        wiki_data = get_wiki_page(keyword, content=True)
        if wiki_data is None:
            continue
        page_texts.append(wiki_data.get('content', ''))

    return "".join(page_texts)

wiki_blob = get_wiki_text_for_keywords(keywords)
print(wiki_blob)        

### Category from Key Terms

In [None]:
sample_abstract = '''This research looks at the work of Margaret C. Anderson, the editor of the Little Review.  The review published first works by Sherwood Anderson, James Joyce, Wyndham Lewis, and Ezra Pound.  This research draws upon mostly primary sources including memoirs, published letters, and a complete collection of the Little Review. Most prior research on Anderson focuses on her connection to the famous writers and personalities that she published and associated with.  This focus undermines her role as the dominant creative force behind one of the most influential little magazines published in the 20th Century. This case example shows how little magazine publishing is arguably a literary art.'''
# `get_textacy_key_terms` is not defined anywhere in this notebook; the
# key-term extractor that is defined is get_semantic_key_terms.
keyterms = get_semantic_key_terms(to_textacy_doc(sample_abstract))
print("Extracted Key Terms:\n", keyterms)
# get_semantic_key_terms already returns plain term strings, so they are
# passed on as-is (indexing each one with [0] would take its first letter).
sample_abstract_cat = category_from_keywords(keyterms)
print("Generated Category Words:\n", sample_abstract_cat)

## Markovify

In [None]:
import markovify
text = raw_abstracts[1]

# Build the model.
text_model = markovify.Text(text)

# Print five randomly-generated sentences
for i in range(5):
    print(text_model.make_sentence())

### Custom Tagger class using spaCy components for fast POS tagging

In [None]:
import markovify
import re
from unidecode import unidecode

# corpus_path = "/corpus_data/comp_ling.txt"
# corpus_name = "comp_ling"
#corpus = textacy.Corpus('en', [raw_abstracts[1], " "])
#corpus = textacy.Corpus.add_doc(textacy.Corpus('en'), to_textacy_doc(wiki_blob))
#corpus = wiki_blob_spacy
#print(corpus)

class TaggedText(markovify.Text):
    '''markovify.Text variant whose states are "word::POS" strings built from parsed tokens.

    The constructor input is expected to be an iterable of parsed documents
    that expose `.sents` (sentences whose tokens have `.orth_` and `.pos_`),
    not a raw string.
    '''

    # Compiled once for the class instead of on every sentence check.
    reject_pat = re.compile(r"(^')|('$)|\s'|'\s|[\"(\(\)\[\])]")

    def sentence_split(self, text):
        """
        Collects the sentences of every document in `text` into one list.

        Uses the argument, not a global `corpus`, which the notebook never
        defines; it is handed the same object that was passed to the
        constructor.
        """
        sentence_list = []
        for doc in text:
            sentence_list += list(doc.sents)

        return sentence_list

    def word_split(self, sentence):
        """
        Splits a sentence into a list of "word::POS" strings.
        """
        return ["::".join((word.orth_, word.pos_)) for word in sentence]

    def word_join(self, words):
        """
        Rebuilds a plain sentence by dropping the "::POS" suffix of each word.
        """
        sentence = " ".join(word.split("::")[0] for word in words)
        return sentence

    def test_sentence_input(self, sentence):
        """
        A basic sentence filter. This one rejects sentences that contain
        the type of punctuation that would look strange on its own
        in a randomly-generated sentence.

        `sentence` must expose `.text`; returns True when it is acceptable.
        """
        sentence = sentence.text
        # Only non-str text is passed through unidecode, as in the original,
        # so str input is matched as-is, fancy quotation marks included.
        if isinstance(sentence, str):
            decoded = sentence
        else:
            decoded = unidecode(sentence)
        # Sentence shouldn't contain problematic characters
        if re.search(self.reject_pat, decoded):
            return False
        return True

    def generate_corpus(self, text):
        """
        Given the model input, returns a list of lists; that is, a list of
        "sentences," each of which is a list of words. Before splitting into
        words, the sentences are filtered through `self.test_sentence_input`
        """
        sentences = self.sentence_split(text)
        passing = filter(self.test_sentence_input, sentences)
        runs = list(map(self.word_split, passing))
        return runs

# Generate the model
model = TaggedText(corpus, state_size=2)
# A sentence based on the model
print(model.make_sentence())
model.make_short_sentence(max_chars=100)

### Make Sentence Starting With...

In [None]:
### Using default Markovify model
text = raw_abstracts[1]
text_model = markovify.Text(text, state_size=1)
try:
    for i in range(5):
        print(text_model.make_sentence_with_start("importance"))
except KeyError:
    None

In [None]:
### Using spaCy-fied POS parsed model
model = TaggedText(corpus, state_size=1)
try:
    start_with_token = "importance"
    for i in range(5):
        print(model.make_sentence_with_start(nlp(start_with_token)))
except KeyError:
    None
    

In [None]:
def generate_markov_completions(partial_sentence, trained_markovify_model, n_completions=10):
    '''Generates sentences that start with the last word of partial_sentence.

    Args:
        partial_sentence: text whose final token seeds the generation.
        trained_markovify_model: object with make_sentence_with_start(word).
        n_completions: number of generation attempts.

    Returns:
        A list of distinct generated sentences, in no particular order.
        Attempts that return None are dropped, so it can be shorter than
        n_completions or empty.
    '''
    spacy_sentence = nlp(partial_sentence)  # convert to spaCy doc
    if len(spacy_sentence) == 0:
        # No tokens (e.g. empty input): indexing [-1] would raise IndexError.
        return []
    last_word = str(spacy_sentence[-1])

    completions = set()
    for _ in range(n_completions):
        completion = trained_markovify_model.make_sentence_with_start(last_word)
        if completion is not None:
            completions.add(completion)

    return list(completions)

try:
    generate_markov_completions("Overwatch", model)
except Exception as e:
    print(e)

## Synonym Generation

### Wordnet Synsets

In [None]:
from nltk.corpus import wordnet as wn

for i,j in enumerate(wn.synsets('java')):
    print("Meaning",i, "NLTK ID:", j.name())
    print("Definition:",j.definition())
    print("Synonyms:", ", ".join(j.lemma_names()))

In [None]:
for ss in wn.synsets('small'):
    print(ss, ss.examples())
    for sim in ss.similar_tos():
        print('    {}'.format(sim))
        


In [None]:
from itertools import chain

for i,j in enumerate(wn.synsets('computer')):
    print("Meaning",i, "NLTK ID:", j.name())
    hypernyms = list(chain(*[l.lemma_names() for l in j.hypernyms()]))
    hyponyms = list(chain(*[l.lemma_names() for l in j.hyponyms()]))
    print("Hypernyms:", ", ".join(hypernyms))
    print("Hyponyms:", ", ".join(hyponyms))

### PyDictionary

In [None]:
from PyDictionary import PyDictionary
from collections import defaultdict
from pprint import pprint

def dict_from_doc_tokens(unigram_tokens):
    '''Looks up each token with PyDictionary and returns a defaultdict of
    word -> {"meaning": ..., "synonyms": ...}.

    NOTE(review): per the original docstring the lookups reach out to
    thesaurus.com, i.e. they need network access.
    '''
    pydict = PyDictionary(unigram_tokens)

    dictionary = defaultdict(dict)
    for word in unigram_tokens:
        dictionary[word] = {"meaning": pydict.meaning(word),
                            "synonyms": pydict.synonym(word)}
    return dictionary

def get_meanings(word, pos="all"):
    '''Returns meaning definitions from the existing module-level `dictionary`, or None if no meanings found.

    Args:
        word: the word to look up in `dictionary`.
        pos: "all" for every meaning, or a tag whose prefix selects one
            part of speech: N -> Noun, V -> Verb, J -> Adjective, RB -> Adverb.
            Any other tag returns all meanings.

    Returns:
        The stored meanings, the entry for the selected part of speech, or
        None when the word has no stored meanings or that part of speech
        has no entry.
    '''
    # .get() keeps an unknown word from raising KeyError and from adding an
    # empty entry to a defaultdict.
    meanings = dictionary.get(word, {}).get('meaning')

    if meanings is None or pos.startswith("all"):
        return meanings

    pos_prefix_to_key = (("N", "Noun"),
                         ("V", "Verb"),
                         ("J", "Adjective"),
                         ("RB", "Adverb"))
    for prefix, key in pos_prefix_to_key:
        if pos.startswith(prefix):
            # Indexing raised KeyError when the word had no such entry.
            return meanings.get(key)

    return meanings

tokens = ['alabaster']
dictionary = dict_from_doc_tokens(tokens)
pprint(get_meanings(tokens[0]))


## Proselint

In [None]:
import proselint
from proselint.tools import errors_to_json

suggestions = proselint.tools.lint("this is a very unique sentence")
errors_to_json(suggestions)

# for suggestion in suggestions:
#         check = suggestion[0]
#         message = suggestion[1]
#         line = suggestion[2]
#         column = suggestion[3]
#         start = suggestion[4]
#         end = suggestion[5]
#         extent = suggestion[6]
#         severity = suggestion[7]
#         replacements = suggestion[8]

In [None]:
import proselint
from proselint.tools import errors_to_json
import json

def linter_suggestions(text):
    ''' Lints text with proselint and returns the suggestions as a list of dicts.
    Each dict is a suggestion with the following properties:
    (check, message, line, column, start, end, extent, severity, replacements)
    '''
    lint_results = proselint.tools.lint(text)
    report = json.loads(errors_to_json(lint_results))
    return report['data']['errors']

linter_suggestions("and then I said... there goes a very unique thing")

## TextGenRNN

In [None]:
from textgenrnn import textgenrnn

In [None]:
%%time
textgen = textgenrnn()

In [None]:
%%time
textgen.generate(5)

In [None]:
generated_texts = textgen.generate(n=5, prefix="Machine learning", temperature=0.2, return_as_list=True)
pprint(generated_texts)

In [None]:
texts = [input_doc]

textgen.train_on_texts(texts, verbose=1)

In [None]:
print("---\nNormal Output\n---")
textgen.generate(5)
print("---\nHigh Temperature Output\n---")
textgen.generate(5, temperature=1.0)
print("---\nPrefix Output\n---")
textgen.generate(5, prefix="N")

## Extractive Summarization

In [None]:
from gensim.summarization import summarize, keywords

In [None]:
def extract_summary(text, ratio=0.25):
    '''Summarize `text` by delegating to gensim's summarize().

    `ratio` is forwarded unchanged as summarize()'s second positional argument.
    '''
    summary = summarize(text, ratio)
    return summary
# Demo: summarize the sample document using the default ratio.
extract_summary(input_doc)

In [None]:
# Ask gensim for a list directly. By default keywords() returns one
# newline-joined string, and a bare .split() on it would also break
# multi-word key phrases apart at their spaces.
keywords(input_doc, split=True)

## Sentence completions

In [None]:
# Sentence Sources
#dbpedia
#PyDictionary (most flexible, but makes web call)

In [None]:
#def extract_keywords(text):
#    '''Wraps textacy keywords function, returns a list of keyword strings'''

In [None]:
#def category_from_keywords(keywords, model='all', clean=False):
#'''Call Empath's create category. Model options: "fiction","nytimes","reddit","all"
#    Returns a list of strings.'''

In [None]:
#def textacy_corpus_dbpedia_results(queries):
#    '''Given a list of query strings, returns a textacy corpus generated from dbpedia results.'''
#    corpus = textacy.corpus.Corpus('en', 
#                                 get_dbpedia_result_text(queries), 
#                                 metadatas=get_dbpedia_results(queries))
#    return corpus

In [None]:
# def get_sentences(doc):
#     '''Returns a list of spacy spans.'''
#     if not isinstance(doc, textacy.doc.Doc):
#         doc = to_textacy_doc(doc)
#     return list(doc.sents)

In [None]:
def get_dbpedia_result_text(queries):
    '''Given a list of query strings, returns a list of unique
    "label: description" strings built from get_dbpedia_results(queries).

    Results whose description is empty/falsy are skipped. Duplicates are
    dropped while keeping first-seen order, so the output order is the same on
    every run (round-tripping through a plain set() does not guarantee that).
    '''
    seen = set()
    unique_strings = []
    for result in get_dbpedia_results(queries):
        if not result['description']:
            continue
        entry = str(result['label'] + ": " + result['description'])
        if entry not in seen:
            seen.add(entry)
            unique_strings.append(entry)
    return unique_strings

In [None]:
text = '''Metis is a cool data science bootcamp immersive program where I learned a ton about machine learning, natural language processing, probability, and statistics. Natural Language Processing is cool. Data Science is a burgeoning field. President Obama is cool.'''

In [None]:
import re
# Rebinds the notebook-level `doc` to a textacy doc built from the sample `text`.
doc = to_textacy_doc(text)

# NOTE(review): this rebinds `keywords`, the name the "Extractive
# Summarization" section imports from gensim and calls as a function, so
# re-running that cell after this one would call a non-function.
keywords = get_semantic_key_terms(doc)
print(keywords)

# Named entities, stringified so they can be concatenated with the key terms.
entities = [str(ent) for ent in textacy.extract.named_entities(doc)]
print(entities)

# NOTE(review): in this section textacy_corpus_dbpedia_results only appears as
# commented-out code — confirm it is defined earlier in the notebook.
knowledge = textacy_corpus_dbpedia_results(keywords+entities)
print(knowledge)
def get_completions(doc):
    '''Return completion strings for a document.

    `doc` may be a raw string or a textacy doc; anything that is not already a
    textacy doc is converted with to_textacy_doc() first. The document's named
    entities are stringified and handed to get_dbpedia_result_text(), whose
    list of strings is returned as-is.
    '''
    textacy_doc = doc if isinstance(doc, textacy.doc.Doc) else to_textacy_doc(doc)

    entity_names = []
    for entity in textacy.extract.named_entities(textacy_doc):
        entity_names.append(str(entity))

    return get_dbpedia_result_text(entity_names)

# Demo: run the completion lookup on the raw sample string.
get_completions(text)

In [None]:
# Demo: look up two hand-picked query strings directly, without entity extraction.
get_dbpedia_result_text(["Natural Language Processing", "dog"])

In [None]:
# Wrap textacy's subject-verb-object extraction in list() so the triples for
# the current `doc` are materialized and displayed as the cell output.
list(textacy.extract.subject_verb_object_triples(doc))

## PyGal Charting

In [None]:
from empath import Empath
lexicon = Empath()

def empath_analyze(text):
    '''Takes in raw text, returns a list of lists: [[category_name, weight],...]
    sorted by weight, descending.

    The weights come from lexicon.analyze() called with normalize=True.
    Returns an empty list when that call yields nothing to rank.
    NOTE(review): Empath's analyze() appears to return None rather than a dict
    when normalize=True and the text has no tokens — confirm against the
    installed Empath version.
    '''
    empath_analysis = lexicon.analyze(text, normalize=True, tokenizer='default')
    if not empath_analysis:
        # Calling .items() on None would raise AttributeError.
        return []

    # sorted() is stable, so categories with equal weight keep Empath's order.
    return sorted(
        ([name, weight] for name, weight in empath_analysis.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )

#pprint(top_cats)
#top_eight = empath_analyze(input_doc)[:8]
#pprint(top_eight)
# NOTE(review): make_radar() and the pygal import it relies on sit in cells
# below this one, so on a fresh top-to-bottom run this call presumably raises
# NameError — confirm, and move it after the make_radar definition.
make_radar(empath_analyze(input_doc))

In [None]:
import pygal                                                       # First import pygal
#bar_chart = pygal.Bar()                                            # Then create a bar graph object
#bar_chart.add('Fibonacci', [0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55])  # Add some values
#bar_chart.render_to_file('static/bar_chart.svg')                          # Save the svg to a file

In [None]:
def make_radar(lex_cats):
    '''Takes a list of lists e.g. [['fruit',0.02],['science',0.01]]
       Creates an svg file called lexical_radar.svg.

    The eight highest-weighted categories are plotted. `lex_cats` itself is
    left untouched: ranking is done on a sorted copy instead of sorting the
    caller's list in place. Always returns True.
    '''
    radar_chart = pygal.Radar()
    radar_chart.title = 'Top 8 Lexical Categories by Word Count'

    ranked = sorted(lex_cats, key=lambda cat: cat[1], reverse=True)
    # Reverse the top eight so labels and values run from lowest to highest weight.
    top_eight = ranked[:8][::-1]

    radar_chart.x_labels = [cat[0] for cat in top_eight]
    radar_chart.add('doc', [cat[1] for cat in top_eight])
    radar_chart.render_to_file('lexical_radar.svg', fill=True)
    return True
    
# NOTE(review): `top_cats` is only mentioned in a commented-out line in this
# section — confirm it is assigned earlier in the notebook.
make_radar(top_cats)

In [None]:
def make_hist(lex_cats):
    '''Takes a list of lists e.g. [['fruit',0.02],['science',0.01]]
       Creates an svg file called lexical_histogram.svg.

    The eight highest-weighted categories each become their own series, drawn
    in descending order of weight. `lex_cats` itself is left untouched:
    ranking is done on a sorted copy instead of sorting the caller's list in
    place. Always returns True.
    '''
    ranked = sorted(lex_cats, key=lambda cat: cat[1], reverse=True)
    top_eight = ranked[:8]

    hist = pygal.Histogram()
    for position, cat in enumerate(top_eight, start=1):
        # Each bar is a (height, start, stop) tuple; consecutive unit-wide
        # slots place the bars side by side in rank order.
        hist.add(cat[0], [(cat[1], position, position + 1)])
    hist.render_to_file('lexical_histogram.svg')
    return True
# Demo: chart the Empath categories of a short sample phrase.
make_hist(empath_analyze('punch drunk love'))

In [None]:
def make_readability_gauge(doc):
    '''Render a pygal gauge for a document and save it as reada_gauge.svg.

    The needle labelled 'Your writing' shows the 'flesch_readability_ease'
    entry of get_readability_stats(doc); four fixed reference needles are
    added after it. Returns None.
    '''
    gauge = pygal.Gauge()
    gauge.title = 'Flesch Reading Ease Score'
    gauge.range = [0, 100]

    score = get_readability_stats(doc)['flesch_readability_ease']
    needles = (
        ('Your writing', score),
        ('Very easy to read', 100),
        ('Average readability', 65),
        ('A little hard to read', 35),
        ('Very hard to read', 0),
    )
    for label, value in needles:
        gauge.add(label, value)

    gauge.render_to_file('reada_gauge.svg', needle_width=1 / 10, human_readable=True)
# Demo: `doc` is whatever the notebook-level name currently holds; on a
# top-to-bottom run it was last rebound in the "Sentence completions" section
# to the textacy doc of the sample `text`.
make_readability_gauge(doc)