# More feature extraction: Textacy

The Natural Language Toolkit (NLTK) is the granddaddy of text mining libraries/modules for Python, but since its inception, other tools have made themselves available. One of those tools is called "Textacy". Textacy does many of the things done by the NLTK, and it does more. This extra functionality is both more complicated and more expressive than that of the Toolkit.

In [None]:
# pre-configure: the values most likely to change from one run to the next

# root directory of the library; the carrel is looked up beneath it
LIBRARY = "/Users/eric/Documents/reader-library"

# name of the carrel (a subdirectory of LIBRARY) to read
CARREL = "homer"

# word to search for in the keyword-in-context (concordance) query
KEYWORD = "love"


In [None]:
# configure: values that normally stay the same between runs

# subdirectory of the carrel, and the name of the plain-text file inside it
ETC = "etc"
TEXT = "reader.txt"

# name of the spaCy language model passed to spacy.load()
MODEL = "en_core_web_sm"


In [None]:
# require
import textacy
import os
import spacy
from  textacy.extract.keyterms.yake import yake
from pathlib import Path


In [None]:
# initialize: build the path to the carrel's plain-text file,
# i.e. LIBRARY/CARREL/ETC/TEXT
library = Path( LIBRARY )
text    = library.joinpath( CARREL, ETC, TEXT )


In [None]:
# slurp up and normalize the text; on entry `text` is the path to the file,
# on exit it is the file's (normalized) content as a string

# read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding
with open( text, encoding='utf-8' ) as handle : text = handle.read()

# collapse every run of whitespace (newlines, tabs, repeated spaces) into a
# single space and drop leading/trailing whitespace; a single pass of
# replace( '  ', ' ' ) leaves doubled spaces behind wherever three or more
# whitespace characters were adjacent (for example, around blank lines)
text = ' '.join( text.split() )


In [None]:
# concordance: run a keyword-in-context (KWIC) query for KEYWORD against the
# raw text, keep every result in `lines`, and print them one per line
lines = [ *textacy.extract.kwic.keyword_in_context( text, KEYWORD, window_width=24 ) ]
for line in lines :
    print( line )


In [None]:
# build a spaCy "doc object" from the text; for large inputs this step may
# take a few minutes

# load the language model named by MODEL
nlp = spacy.load( MODEL )

# set the pipeline's length limit to exactly the number of characters in the
# text, so the whole text is submitted in one call
size           = len( text )
nlp.max_length = size

# do the work
doc = nlp( text )


In [None]:
# print the doc's `preview` custom extension attribute as a quick sanity check
# (presumably registered by textacy on import -- confirm for the installed version)
print( doc._.preview )

In [None]:
# print the document's Flesch reading-ease score, as computed by textacy's TextStats
print( textacy.TextStats( doc ).flesch_reading_ease )

In [None]:
# list the document's two-word ngrams (bigrams); stop-word and punctuation
# filtering are switched on, number filtering is switched off
list(
    textacy.extract.ngrams(
        doc,
        2,
        filter_stops=True,
        filter_punct=True,
        filter_nums=False,
    )
)

In [None]:
# run textacy's YAKE keyterm extractor (imported above) over the document;
# as the last expression in the cell, its result is displayed
yake( doc )

In [None]:
# list the entities textacy extracts from the document
list(
    textacy.extract.entities( doc )
)

In [None]:
# list the subject-verb-object triples textacy extracts from the document
list(
    textacy.extract.subject_verb_object_triples( doc )
)

In [None]:
# list the noun chunks textacy extracts from the document
list(
    textacy.extract.noun_chunks( doc )
)

In [None]:
# list the semistructured statements textacy extracts for the entity "you"
# with the cue verb "be"
list(
    textacy.extract.semistructured_statements(
        doc,
        entity='you',
        cue='be',
    )
)