## spaCy and textacy

In [32]:
# Location of the plain-text corpus, relative to the notebook's working directory.
doc_path = 'corpus_data/comp_ling.txt'

# Decode explicitly as UTF-8 instead of relying on the platform default encoding:
# the entities extracted further down include non-ASCII text (e.g. "Pólya").
with open(doc_path, encoding='utf-8') as f:
    input_doc = f.read()

In [33]:
import spacy
nlp = spacy.load('en')

In [34]:
from textacy import text_stats

In [44]:
# input_doc = '''
# Oxygen is a chemical element with symbol O and atomic number 8. A member of the chalcogen group on the periodic table, it is a highly reactive nonmetal and oxidizing agent that forms oxides with most elements as well as other compounds.
# '''

def to_spacy_doc(raw_doc):
    '''Run the module-level spaCy pipeline `nlp` over `raw_doc` and return its result.'''
    spacy_doc = nlp(raw_doc)
    return spacy_doc

def to_textacy_doc(raw_doc):
    '''Wrap `raw_doc` in a textacy Doc.

    `raw_doc` may be an already-parsed spaCy Doc, which is wrapped directly,
    or anything else (normally a raw string), which is first parsed with
    `to_spacy_doc` so that the notebook's own `nlp` pipeline is used.

    Returns the resulting `textacy.Doc`.
    '''
    # Inspect the argument itself. The previous check tested the type of
    # to_spacy_doc("test"), which is always a spaCy Doc, so the parsing branch
    # could never run and a throwaway parse happened on every call.
    if isinstance(raw_doc, spacy.tokens.doc.Doc):
        return textacy.Doc(raw_doc)
    return textacy.Doc(to_spacy_doc(raw_doc))

In [45]:
def get_named_entities(doc):
    '''Return the results of `textacy.extract.named_entities(doc)` as a list.'''
    return list(textacy.extract.named_entities(doc))

# NOTE(review): `doc` is not assigned in any visible cell — presumably a textacy Doc
# built from input_doc (e.g. to_textacy_doc(input_doc)); confirm so the notebook
# survives Restart & Run All.
get_named_entities(doc)

[Today, 
 
 The Association for Computational Linguistics, 
 
 ..., 1, 2, 2.1, 2.2, 2.3, 2.3.1, 2.3.2, 
 2.4, 3, 4, 5, 6, 7	References, 8	External, United States, 1950s, Russian, 1960s, one, one, one, four, one, Crucially, Pólya, modern-day, English, One, One, English, IBM, over 4.5 million words, American, English, One, two, early in the lifetime of a field of study, English, Japanese, Japanese, Japanese, October 2015, only half, Alan Turing, Turing Test, 1950, Alan Turing, one day, two, one, Turing, Today, Turing, Joseph Weizenbaum, MIT, ELIZA, One, ELIZA, Joseph Weizenbaum, MIT, 1966, Rogerian, ELIZA, ELIZA, ELIZA, first, first, Markov, translation.[25] The, German, French, first, five, ELIZA, Siri, Bayesian, Bledsoe, Browing, 1959, one, Bayesian, Mosteller, Wallace, 1963, Federalist Papers, Madison, 1971, Terry Winograd, SHRDLU, SHRDLU, NASA, LUNAR, Apollo, 1960s and 1970s, Markov, Rabiner, 1989.[32, late 70s, IBM, Bayesian, Apple, Siri, Google Translate, Social, Twitter, four, fou

In [49]:
# Debug check: prints "ok" only when `doc` is NOT an instance of textacy.doc.Doc.
if not isinstance(doc, textacy.doc.Doc):
        print("ok")

In [47]:
def get_readability_stats(textacy_doc):
    '''Return the `readability_stats` of a `text_stats.TextStats` built from `textacy_doc`.'''
    return text_stats.TextStats(textacy_doc).readability_stats

get_readability_stats(doc)

{'automated_readability_index': 22.352141689526015,
 'coleman_liau_index': 16.682927688311693,
 'flesch_kincaid_grade_level': 19.052077766395673,
 'flesch_readability_ease': 19.26507900874003,
 'gulpease_index': 41.068861371186955,
 'gunning_fog_index': 22.472696136278014,
 'lix': 72.24939361765064,
 'smog_index': 18.80095838887095,
 'wiener_sachtextformel': 12.265282847241465}

In [48]:
textacy.keyterms.key_terms_from_semantic_network(doc)

[('language', 0.02416255077377969),
 ('linguistic', 0.02043304879128454),
 ('computational', 0.01746222784332213),
 ('human', 0.011689631474031183),
 ('computer', 0.011480717062340147),
 ('model', 0.011188761077018177),
 ('word', 0.011081683346972649),
 ('approach', 0.008327125663422037),
 ('speech', 0.007841891597153119),
 ('program', 0.007792902818308204)]

In [52]:
def get_textacy_key_terms(doc):
    '''Return the key terms of `doc` as a list, without their scores.

    `doc` is converted with `to_textacy_doc` unless it already is a
    `textacy.doc.Doc`. Terms come from
    `textacy.keyterms.key_terms_from_semantic_network`; only the first
    element of each returned pair is kept.
    '''
    textacy_doc = doc if isinstance(doc, textacy.doc.Doc) else to_textacy_doc(doc)
    scored_terms = textacy.keyterms.key_terms_from_semantic_network(textacy_doc)
    return [pair[0] for pair in scored_terms]
get_textacy_key_terms(doc)

['language',
 'linguistic',
 'computational',
 'human',
 'computer',
 'model',
 'word',
 'approach',
 'speech',
 'program']

In [85]:
import markovify
import re
from unidecode import unidecode

# NOTE(review): this is an absolute path, unlike the relative
# 'corpus_data/comp_ling.txt' used to load input_doc; it is only referenced by
# the commented-out loaders below.
corpus_path = "/corpus_data/comp_ling.txt"
corpus_name = "comp_ling"
# corpus = textacy.Corpus.load(corpus_path, 
#         name=corpus_name, compression='gzip')

#corpus = textacy.Corpus("en", texts=[as_textacy_doc(input_doc)])
# textacy.corpus.Corpus.load(corpus_path)

# Module-level corpus iterated by TaggedText.sentence_split.
# NOTE(review): the recorded run of this cell ends in
# "AttributeError: module 'textacy' has no attribute 'datasets'" — confirm how
# the installed textacy version exposes this dataset.
corpus = textacy.datasets.CapitolWords()

class TaggedText(markovify.Text):
    """
    markovify.Text variant whose chain states are "word::POS" strings.

    Sentences are taken from the module-level `corpus`, not from the text
    handed to the constructor (see `sentence_split`).
    """

    def sentence_split(self, text):
        """
        Returns a list with every sentence (`doc.sents`) of every document
        in the module-level `corpus`.

        NOTE(review): `text` is ignored; the parameter is kept only so the
        signature matches the method being overridden.
        """
        sentence_list = []
        for doc in corpus:
            sentence_list.extend(doc.sents)

        return sentence_list

    def word_split(self, sentence):
        """
        Splits a sentence into a list of "orth::pos" strings, one per token.
        """
        return ["::".join((word.orth_, word.pos_)) for word in sentence]

    def word_join(self, words):
        """
        Rebuilds a plain sentence from "orth::pos" strings by dropping the
        tag and joining the words with single spaces.
        """
        # Split on the LAST "::" only, so a token whose own text contains
        # "::" is not truncated.
        sentence = " ".join(word.rsplit("::", 1)[0] for word in words)
        return sentence

    def test_sentence_input(self, sentence):
        """
        A basic sentence filter. This one rejects sentences that contain
        the type of punctuation that would look strange on its own
        in a randomly-generated sentence.

        `sentence` must expose its raw string as `.text` (presumably a
        spaCy span). Returns True when the sentence is acceptable.
        """
        sentence = sentence.text
        reject_pat = re.compile(r"(^')|('$)|\s'|'\s|[\"(\(\)\[\])]")
        # Decode unicode, mainly to normalize fancy quotation marks
        if sentence.__class__.__name__ == "str":
            decoded = sentence
        else:
            decoded = unidecode(sentence)
        # Sentence shouldn't contain problematic characters
        if re.search(reject_pat, decoded):
            return False
        return True

    def generate_corpus(self, text):
        """
        Given a text string, returns a list of lists; that is, a list of
        "sentences," each of which is a list of words. Before splitting into
        words, the sentences are filtered through `self.test_sentence_input`
        """
        sentences = self.sentence_split(text)
        # Split only the sentences that pass the filter. The previous code
        # computed the filtered sequence but then mapped over the unfiltered
        # one, and indexed a lazy `map` object for a debug print, which
        # raises TypeError on Python 3.
        return [self.word_split(sentence)
                for sentence in sentences
                if self.test_sentence_input(sentence)]

# Generate the model. TaggedText.sentence_split ignores its argument and reads
# the module-level `corpus`, so input_doc is not the text the chain is built from.
model = TaggedText(input_doc)
# Print one sentence generated from the model
print(model.make_sentence())

AttributeError: module 'textacy' has no attribute 'datasets'

In [62]:
import pandas as pd

# One tuple per token of `doc`: surface form, the token's `prob` value and
# five boolean lexical flags.
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in doc]

# Column order must match the tuple order above.
df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])

# Replace the boolean flag columns with 'Yes' / '' so the rendered table is easier to scan.
df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
                                       .applymap(lambda x: u'Yes' if x else u''))
                                               
df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,Computational,-19.579313,,,,,
1,linguistics,-19.579313,,,,,
2,is,-4.329765,Yes,,,,
3,an,-5.953294,Yes,,,,
4,interdisciplinary,-19.579313,,,,,
5,field,-9.710699,,,,,
6,concerned,-9.861534,,,,,
7,with,-5.363765,Yes,,,,
8,the,-3.425446,Yes,,,,
9,statistical,-11.639928,,,,,


In [None]:
def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list

    Each line shows the term left-aligned in a 20-character column followed
    by its similarity rounded to 3 decimals. Nothing is returned.

    NOTE(review): `food2vec` is not defined in any visible cell — presumably
    a word-vector model exposing `most_similar`; confirm it is loaded first.
    """
    for word, similarity in food2vec.most_similar(positive=[token], topn=topn):
        # print() call instead of the Python 2 print statement, which is a
        # SyntaxError under Python 3.
        print(u'{:20} {}'.format(word, round(similarity, 3)))

## Wikipedia Data

In [144]:
import wikipedia
from wikipedia import DisambiguationError, PageError, RedirectError

def get_wiki_page(search_string, summary=True, content=True):
    '''Returns results from searching for search_string with wikipedia wrapper library.
       Note: Makes a web request

       search_string: title or query passed to `wikipedia.page`.
       summary: when true, include the page summary under "summary".
       content: when true, include the full page text under "content".

       Returns a dict that always has "url", or None when the lookup fails
       for any reason. On a disambiguation error the first suggested option
       is looked up instead, with the same summary/content flags.'''
    try:
        page = wikipedia.page(search_string)
        page_data = {"url": page.url}
        if content:
            page_data["content"] = page.content  # Full text content of page.
        if summary:
            page_data["summary"] = page.summary  # Summary section only.

    except DisambiguationError as e:
        options = e.options
        # No candidates, or the only candidate is the query itself: give up
        # instead of raising IndexError or recursing forever.
        if not options or options[0] == search_string:
            return None
        # Naively choose the first option, forwarding the caller's flags
        # (the retry previously fell back to the defaults).
        return get_wiki_page(options[0], summary=summary, content=content)
    except Exception:
        # Deliberate best-effort: any other failure is reported as "no page".
        return None

    return page_data

In [146]:
get_wiki_page("Lon")['url']



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


'https://en.wikipedia.org/wiki/Lon_(name)'

In [124]:
def get_named_entities(text):
    '''Parse `text` with `nlp` and return one tuple per entity in `doc.ents`.

    Each tuple is (label, entity text, wiki url) when `get_wiki_url` returns
    a truthy value for the entity, otherwise (label, entity text).

    NOTE(review): this redefines the `get_named_entities(doc)` helper from an
    earlier cell, and `get_wiki_url` is not defined in any visible cell —
    confirm both are intended.
    '''
    entities = []
    for ent in nlp(text).ents:
        wiki_url = get_wiki_url(str(ent))
        record = (ent.label_, ent.text, wiki_url) if wiki_url else (ent.label_, ent.text)
        entities.append(record)
    return entities

In [76]:
import bleach

def set_link_title(attrs, new=False):
    '''Set the un-namespaced `title` attribute on `attrs` and return the same mapping.

    `new` is accepted but not used.
    '''
    attrs.update({(None, u'title'): u'AI-provided Link'})
    return attrs

def linkify(string):
    '''Calls bleach.linkify.
    Converts urls in the input string into links, applying the
    `set_link_title` callback to each link.
    Returns a string of HTML.
    Raises TypeError when the input is not a string.'''
    if not isinstance(string, str):
        raise TypeError("input should be a string")

    # Hand the callback to bleach directly. The previous code built a
    # `Linker` (a name that was never imported, so it raised NameError) and
    # then ignored it, so the title callback was never applied.
    return bleach.linkify(string, callbacks=[set_link_title])

def create_hyperlink(url, display_text, attrs=""):
    '''Build an HTML anchor tag as a string.

       `url` becomes the href, `display_text` the link body, and the optional
       `attrs` string is inserted verbatim after the href. No escaping is
       applied to any of the three values.
       Example call:
       create_hyperlink('www.google.com', 'Google',
       ... attrs = 'class=link_class title="Custom Title"')'''
    fields = {'link': url, 'attrs': attrs, 'text': display_text}
    return '<a href="{link}" {attrs}>{text}</a>'.format(**fields)

create_hyperlink('www.google.com', 'Google', attrs='class=link_class title="Custom Title"')

'<a href="www.google.com" class=link_class title="Custom Title">Google</a>'

In [80]:
nes = get_named_entities("London is a cool city. It is where Winston Churchill once lived. Also Guy McMiggles.")

In [93]:
def linkify_entity(named_entity_tuple):
    '''Operates on tuples from get_named_entities(). Returns HTML string.

    Accepts (entity type, label, url) and (entity type, label) tuples.
    With a url, the label is wrapped in a hyperlink whose class and title
    are the entity type. Without one (2-tuple or empty url), the bare label
    is returned.'''
    ent_type, label = named_entity_tuple[0], named_entity_tuple[1]
    # get_named_entities() emits 2-tuples when no URL was found; unpacking
    # three values from those used to raise ValueError.
    url = named_entity_tuple[2] if len(named_entity_tuple) > 2 else None
    if not url:
        return label
    attrs = 'class="{ent_type}" title="{ent_type}"'.format(ent_type=ent_type)
    return create_hyperlink(url, label, attrs=attrs)

['<a href="https://en.wikipedia.org/wiki/London" class="GPE" title="GPE">London</a>',
 '<a href="https://en.wikipedia.org/wiki/Winston_Churchill" class="PERSON" title="PERSON">Winston Churchill</a>',
 '<a href="https://en.wikipedia.org/wiki/Shane_Filan" class="PERSON" title="PERSON">Guy McMiggles</a>']

## Empath

In [55]:
from empath import Empath
lexicon = Empath()

In [58]:
lexicon.analyze(input_doc, normalize=True)

{'achievement': 0.0012221203788573174,
 'affection': 0.0006110601894286587,
 'aggression': 0.00030553009471432935,
 'air_travel': 0.0,
 'alcohol': 0.0,
 'ancient': 0.00030553009471432935,
 'anger': 0.0006110601894286587,
 'animal': 0.003360831041857623,
 'anonymity': 0.0006110601894286587,
 'anticipation': 0.0,
 'appearance': 0.002749770852428964,
 'art': 0.002444240757714635,
 'attractive': 0.0,
 'banking': 0.0,
 'beach': 0.0,
 'beauty': 0.00030553009471432935,
 'blue_collar_job': 0.0,
 'body': 0.0,
 'breaking': 0.00030553009471432935,
 'business': 0.010388023220287198,
 'car': 0.0006110601894286587,
 'celebration': 0.0015276504735716467,
 'cheerfulness': 0.0,
 'childish': 0.00030553009471432935,
 'children': 0.002444240757714635,
 'cleaning': 0.00030553009471432935,
 'clothing': 0.00030553009471432935,
 'cold': 0.00030553009471432935,
 'college': 0.0058050717995722576,
 'communication': 0.011610143599144515,
 'competing': 0.0006110601894286587,
 'computer': 0.00977696303085854,
 'con

In [95]:
import chromelogger as console

In [100]:

def index():
    '''Log 'Hello console!' via `console.log`, then call `console.get_header()`.

    The value returned by `get_header()` is discarded; the function returns None.
    '''
    console.log('Hello console!')
    console.get_header()

index()

In [101]:
console.log('Hello console!')
console.get_header()

('X-ChromeLogger-Data',
 b'eyJ2ZXJzaW9uIjogIjAuNC4zIiwgImNvbHVtbnMiOiBbImxvZyIsICJiYWNrdHJhY2UiLCAidHlwZSJdLCAicm93cyI6IFtbWyJIZWxsbyBjb25zb2xlISJdLCAiPGlweXRob24taW5wdXQtMTAxLTgzNzA0ZTZjYWI5OT4gOiAxIiwgWyJsb2ciXV1dfQ==')