In [16]:
import nltk
import requests
from nltk import word_tokenize
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
from IPython.display import display, Markdown as md
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 
import regex as re
import numpy as np

[nltk_data] Downloading package punkt to /Users/skennedy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/skennedy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def lexical_diversity(text):
    """Return the type/token ratio of ``text``: unique items divided by total items.

    Args:
        text: A sized collection of hashable tokens (e.g. a list of words).

    Returns:
        The ratio as a float; 0.0 when ``text`` is empty, instead of raising
        ZeroDivisionError.
    """
    total = len(text)
    if total == 0:
        return 0.0
    return len(set(text)) / total

def scale(array, scale_type='standard'):
    """Rescale ``array`` element-wise.

    Args:
        array: Numeric data. Lists and tuples are converted to a NumPy array
            first; any other input is used as given.
        scale_type: ``'standard'`` subtracts the mean and divides by the
            standard deviation; ``'min_max'`` subtracts the minimum and
            divides by the range (max - min).

    Returns:
        The rescaled data. When the spread (std or range) is a scalar equal
        to 0, the divisor is treated as 1 so the result is all zeros rather
        than NaN.

    Raises:
        ValueError: If ``scale_type`` is not ``'standard'`` or ``'min_max'``.
    """
    # Plain Python sequences do not support `sequence - scalar`.
    if isinstance(array, (list, tuple)):
        array = np.asarray(array)

    if scale_type == 'standard':  # mean zero, std = 1
        center = np.mean(array)
        spread = np.std(array)
    elif scale_type == 'min_max':
        center = np.min(array)
        spread = np.max(array) - center
    else:
        raise ValueError(
            f"Unknown scale_type {scale_type!r}; expected 'standard' or 'min_max'")

    # Only guard the scalar case; a non-scalar spread is passed through untouched.
    if np.ndim(spread) == 0 and spread == 0:
        spread = 1
    return (array - center) / spread

def clean_text(text):
    """Normalise ``text``, tokenize it, and drop English stop words.

    The text is lower-cased and every run of non-alphanumeric characters is
    replaced by a single space. Three Markdown lines are displayed along the
    way: the first 100 characters of the normalised text, the raw token
    count, and the raw vocabulary size.

    Returns:
        An ``nltk.Text`` built from the tokens that are not in NLTK's English
        stop-word list.
    """
    normalised = re.sub('[^A-Za-z0-9]+', ' ', text.lower())
    display(md(f'{normalised[:100]}'))

    raw_tokens = word_tokenize(normalised)
    display(md(f'*Raw tokens found: {len(raw_tokens):,.0f}*'))
    display(md(f'*Raw Vocab Size: {len(set(raw_tokens)):,.0f}*'))

    english_stops = set(stopwords.words('english'))
    kept = []
    for token in nltk.Text(raw_tokens):
        if token in english_stops:
            continue
        kept.append(token)

    # The surviving tokens are space-joined and tokenized once more before
    # being wrapped in the returned nltk.Text.
    return nltk.Text(word_tokenize(' '.join(kept)))

In [18]:
from nltk.metrics import edit_distance
from nltk.metrics.distance import jaro_similarity, presence

- ***computes the Levenshtein distance (the minimum number of substitutions, insertions, and deletions needed to transform A --> B)***

    - 3 subs:
        - e --> l
        - a --> i
        - n --> m

- ***percent string match*** (I'm guessing this is the number of identical positions/letters as a fraction of the total length?). Since only one of the 4 positions matches (the first), the answer would be 25%; but if it's measured against the total number of characters in both strings, then the answer is 1/8, or 12.5%.

In [34]:
# The two sample strings compared in the cells below.
sean = 'sean'
slim = 'slim'

In [20]:
# Edit distance between the two sample strings (NLTK's edit_distance).
edit_distance(sean, slim)

3

In [21]:
# Jaro similarity between the two sample strings (NLTK's jaro_similarity).
jaro_similarity(sean, slim)

0.5

In [22]:
# (letters common to both strings, letters that appear only in `sean`)
set(sean) & set(slim), set(sean) - set(slim)

({'s'}, {'a', 'e', 'n'})

***My grandma loves the bible so I grabbed it from her room.....***

In [31]:
book = '''In the beginning God created the heaven and the earth. 
        And the earth was without form, and void; and darkness was upon the face of the deep.'''

In [32]:
# clean_text displays a preview plus raw token/vocab counts and returns the
# stop-word-filtered tokens as an nltk.Text.
clean = clean_text(book)

in the beginning god created the heaven and the earth and the earth was without form and void and da

*Raw tokens found: 27*

*Raw Vocab Size: 17*

In [33]:
# Tokens held by the nltk.Text that clean_text returned.
clean.tokens

['beginning',
 'god',
 'created',
 'heaven',
 'earth',
 'earth',
 'without',
 'form',
 'void',
 'darkness',
 'upon',
 'face',
 'deep']

***She still knows it's the Bible - this is an iconic line from a book that has been around for the better part of 2,000 years. Even if she wasn't devout, there are plenty of contextual clues that could lead someone to guess that this is a religious text (god/created/heaven), at the very least.***

### Stemmers

In [26]:
from nltk.stem import *

In [27]:
# split() with no separator splits on any run of whitespace, so the line break
# and indentation inside `book` do not produce empty strings or a '\n' token.
len(book.split())

27

In [28]:
stemmer = PorterStemmer()
# Whitespace split (no separator) so the line break and indentation inside
# `book` do not yield empty strings or a bare '\n' among the words to stem.
[stemmer.stem(x) for x in book.split()]

['In',
 'the',
 'begin',
 'god',
 'creat',
 'the',
 'heaven',
 'and',
 'the',
 'earth.',
 'and',
 'the',
 'earth',
 'wa',
 'without',
 'form,',
 'and',
 'void;',
 'and',
 'dark',
 'wa',
 'upon',
 'the',
 'face',
 'of',
 'the',
 'deep.']

***The PorterStemmer produces non-word stems for three of the 27 tokens ("created" --> "creat", and both occurrences of "was" --> "wa"). 88.9% of the tokens remained valid words after the transform.***

In [29]:
stemmer = SnowballStemmer('english')
# Whitespace split (no separator) so the line break and indentation inside
# `book` do not yield empty strings or a bare '\n' among the words to stem.
[stemmer.stem(x) for x in book.split()]

['in',
 'the',
 'begin',
 'god',
 'creat',
 'the',
 'heaven',
 'and',
 'the',
 'earth.',
 'and',
 'the',
 'earth',
 'was',
 'without',
 'form,',
 'and',
 'void;',
 'and',
 'dark',
 'was',
 'upon',
 'the',
 'face',
 'of',
 'the',
 'deep.']

***The SnowballStemmer incorrectly truncates one of the 27 words (created). 96.3 % of the words remained valid after transform.***

In [30]:
import ssl

# SECURITY NOTE: this replaces the process-wide default HTTPS context factory
# with the unverified one, i.e. certificate verification is turned off for
# code that relies on ssl._create_default_https_context.
# Nothing is changed when the ssl module lacks _create_unverified_context.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Left disabled:
#nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True