## There are a few different NLP libraries with python, uncomment the code below to install them

In [255]:
# ! pip install spacy
# ! python -m spacy download en
# import nltk
# nltk.download('punkt') ##this downloads the default word tokenizer
# nltk.download('stopwords') ##this downloads all stopwords
# nltk.download('popular') ##this downloads many different popular libraries 

### 1. Tokenization (stop words, stemming, lemmatizing)
### 2. Vectorization
### 3. Cosine Similarity

### Key terminology:

* Document: each individual body of text you are observing
* Corpus: the entire collection of texts
* Tokenization: the process by which you break up each document into smaller pieces. You can tokenize into sentences or words.

### Let's take a look at a collection of Reddit comments from early 2006

In [188]:
df = pd.read_json('./RC_2006-01.bz2',compression='bz2',lines=True)

In [190]:
df.head()

Unnamed: 0,author,author_flair_css_class,author_flair_text,body,controversiality,created_utc,distinguished,edited,gilded,id,link_id,parent_id,retrieved_on,score,stickied,subreddit,subreddit_id,ups
0,jh99,,,early 2006 a probable date,0,1136074029,,False,0,c2715,t3_22569,t3_22569,1473821517,0,False,reddit.com,t5_6,0
1,jpb,,,If you are going to post something that has a ...,0,1136076410,,False,0,c2717,t3_22542,t3_22542,1473821517,0,False,reddit.com,t5_6,0
2,Pichu0102,,,Microsoft hates it's own products?\r\nWho knew?,0,1136078623,,False,0,c2718,t3_22515,t3_22515,1473821517,2,False,reddit.com,t5_6,2
3,libertas,,,"this looks interesting, but it's already aired...",0,1136079346,,False,0,c2719,t3_22528,t3_22528,1473821517,2,False,reddit.com,t5_6,2
4,mdmurray,,,I have nothing but good things to say about De...,0,1136081389,,False,0,c2722,t3_22538,t3_22538,1473821517,0,False,reddit.com,t5_6,0


In [192]:
documents = df['body']

### 1. Tokenization
Let's tokenize each of the documents. Let's try the first.

In [279]:

from nltk import word_tokenize

word_tokenize(documents[1])

['If',
 'you',
 'are',
 'going',
 'to',
 'post',
 'something',
 'that',
 'has',
 'a',
 'link',
 'to',
 'the',
 'original',
 'author',
 ',',
 'why',
 'not',
 'just',
 'post',
 'the',
 'original',
 'instead',
 'of',
 'someone',
 "'s",
 'copy',
 '?']

We have capitalization, punctuation and words that are all too frequent, such as "a", "the", "two"

In [300]:
from nltk.corpus import stopwords
import string
my_stopwords = set(stopwords.words('english'))


def tokenize(document):
    tocs = word_tokenize(document.lower())
    stop_words = set(stopwords.words('english'))
    tokenized_list = [toc for toc in tocs if toc not in stop_words]
    
    
    
    punctuation = set(string.punctuation)
    no_punctuation = [word for word in tokenized_list if word not in punctuation]
    
   
    
    return no_punctuation
    
    

In [295]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [299]:
tokenize(documents[8])

['because',
 'this',
 'one',
 'is',
 'different',
 'and',
 'i',
 'like',
 'it',
 'better',
 'here',
 "'s",
 'your',
 'sign']

In [182]:
toks = nltk.word_tokenize('OMG this is #bigly fun and #bigly cool, lets go running!')
toks

['OMG',
 'this',
 'is',
 '#',
 'bigly',
 'fun',
 'and',
 '#',
 'bigly',
 'cool',
 ',',
 'lets',
 'go',
 'running',
 '!']

In [296]:
from nltk import TweetTokenizer
twt = TweetTokenizer()
one_tweet = twt.tokenize('OMG this is so #fun and #bigly cool. SAD!')

In [297]:
one_tweet

['OMG', 'this', 'is', 'so', '#fun', 'and', '#bigly', 'cool', '.', 'SAD', '!']

We can also tokenize by sentence

In [234]:
from nltk import sent_tokenize
test_text = """This is my first sentence. This is my second sentence. Oh wow now there is a third\
 sentence. This is getting out of control!"""
sent_tokenize(test_text)

['This is my first sentence.',
 'This is my second sentence.',
 'Oh wow now there is a third sentence.',
 'This is getting out of control!']

### Stemming/ Lemmatization

* Stemming: reduces words by removing suffixes (often reduces to strings that are not real words)
* Lemmatization: reduces words to some root form that is still in the the English dictionary

Longer Explanation: https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

Your use of stemming/lemmatization will wholly depend on the context of the problem you're trying to solve. Let's take a look at how a sample sentence might be treated:

In [214]:
music_prodigy = documents[48]

In [219]:
def stem_words(document,stemmer):
    toks = nltk.word_tokenize(document)
    wrd_list = []
    for word in toks:
        wrd_list.append(stemmer.stem(word))
    return " ".join(wrd_list)
    
    

In [21]:
snowball = nltk.stem.SnowballStemmer('english')

In [220]:
stem_words(music_prodigy,snowball)

"the reason the music industri is loos money is becaus they focus to much on dispos music . they find the front men ( or women ) to sing , get some music made by peopl who know onli how to make music that is the sound of the `` moment '' . everyon know their role , band member just do what their told to . then this shit get heavili market and peopl buy it , but buy audienc is small and this stuff onli get brought while it is in fashion . compar that to music that get to where it is through talent and passion . when i hear a market band singer sing about , say loss , it just ca n't reach me becaus it 's not true and is sung without mean . compar that to a real band where it 's the singer real experi . music like this can continu to sell for decad and by all generat . if the music industri want there sale to go up they should go to the club and do talent spot like they use to and not by peopl who think what will sell but who think what sound good and more to the point what *feels* good .

In [221]:
stem_words(music_prodigy,lancaster)

"the reason the mus industry is loos money is becaus they foc to much on dispos mus . they find the front men ( or wom ) to sing , get som mus mad by peopl who know on how to mak mus that is the sound of the `` mom '' . everyon know their rol , band memb just do what their told to . then thi shit get heavy market and peopl buy it , but buy audy is smal and thi stuff on get brought whil it is in fash . comp that to mus that get to wher it is through tal and pass . when i hear a market band sing sing about , say loss , it just ca n't reach me becaus it 's not tru and is sung without mean . comp that to a real band wher it 's the sing real expery . mus lik thi can continu to sel for decad and by al gen . if the mus industry want ther sal to go up they should go to the club and do tal spot lik they us to and not by peopl who think what wil sel but who think what sound good and mor to the point what *feels* good ."

In [15]:
stemmer = nltk.RegexpStemmer('ing$|s$|e$|able$', min=4)

In [222]:
def lem_words(document,lemmer):
    toks = nltk.word_tokenize(document)
    wrd_list = []
    for word in toks:
        wrd_list.append(lemmer.lemmatize(word))
    return " ".join(wrd_list)

In [226]:
lemmer = nltk.stem.WordNetLemmatizer()
lemmer.lemmatize('running')

'running'

In [223]:
lem_words(music_prodigy,lemmer)

"The reason the music industry is loosing money is because they focus to much on disposable music . They find the front men ( or woman ) to sing , get some music made by people who know only how to make music that is the sound of the `` moment '' . Everyone know their role , band member just do what their told to . Then this shit get heavily marketed and people buy it , but buying audience is small and this stuff only get brought while it is in fashion . Compare that to music that get to where it is through talent and passion . When I hear a marketed band singer singing about , say loss , It just ca n't reach me because it 's not true and is sung without meaning . Compared that to a real band where it 's the singer real experience . Music like this can continue to sell for decade and by all generation . If the music industry want there sale to go up they should go to the club and do talent spotting like they used to and not by people who think what will sell but who think what sound go

## Vectorization
#### Machine Learning models aren't able to operate on text because text means nothing to mathematical functions! We need to convert our text to a numerical form. SciKit Learn has a package that quickly vectorizes your text.

Let's look at CountVectorizer first, which creates a Bag of Words model.

In [56]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [235]:
## We are going to fit vectorizer with specifications to a specific corpus
## there are many different parameters you can enter into this class that will have an impact
##check them all out!!
# you can input a custom tokenizer as well
bow = CountVectorizer(stopword='english')
bow.fit(documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [237]:
## let's look at out our features
bow.get_feature_names()

['00',
 '000',
 '00000000000',
 '0000000000000',
 '00442070001201',
 '00442071938940',
 '0092647',
 '01',
 '01quackeryrelatedtopics',
 '02',
 '03',
 '0385720270',
 '04',
 '0452278015',
 '0471383562',
 '05',
 '051010crat_atlarge',
 '0521592712',
 '0525934189',
 '0590025',
 '06',
 '0612100',
 '07',
 '07mm',
 '09',
 '09_401',
 '0s',
 '10',
 '100',
 '1000',
 '10000103',
 '100gb',
 '100m',
 '100mhz',
 '102',
 '10260',
 '103',
 '103480',
 '104',
 '105',
 '10837',
 '10k',
 '10m',
 '10th',
 '10yo',
 '11',
 '113',
 '1136132459',
 '11519',
 '11698',
 '11th',
 '12',
 '124',
 '1286',
 '12oz',
 '12processo_eng',
 '13',
 '130',
 '130kw',
 '1318',
 '13299',
 '13328',
 '1367',
 '14',
 '14107',
 '144',
 '148',
 '14e2xpi',
 '14th',
 '15',
 '150',
 '150k',
 '1512',
 '15541',
 '1581345615',
 '15k',
 '15th',
 '16',
 '1600',
 '1605',
 '1628',
 '163',
 '165d',
 '165m',
 '1687',
 '16s',
 '17',
 '17259',
 '1751',
 '177',
 '17806',
 '17967',
 '18',
 '18157',
 '18528',
 '1880',
 '189cm',
 '19',
 '191',
 '1918944

In [239]:
## our vocabulary
bow.vocabulary_

{'early': 3977,
 '2006': 146,
 'probable': 9336,
 'date': 3221,
 'if': 6030,
 'you': 13310,
 'are': 1088,
 'going': 5355,
 'to': 12143,
 'post': 9140,
 'something': 11133,
 'that': 11983,
 'has': 5622,
 'link': 7125,
 'the': 11986,
 'original': 8486,
 'author': 1291,
 'why': 13060,
 'not': 8209,
 'just': 6758,
 'instead': 6373,
 'of': 8336,
 'someone': 11130,
 'copy': 2918,
 'microsoft': 7643,
 'hates': 5631,
 'it': 6603,
 'own': 8588,
 'products': 9369,
 'who': 13044,
 'knew': 6852,
 'this': 12055,
 'looks': 7222,
 'interesting': 6431,
 'but': 1974,
 'already': 840,
 'aired': 758,
 'and': 918,
 'like': 7101,
 'there': 12025,
 'streaming': 11482,
 'video': 12770,
 'so': 11078,
 'what': 13007,
 'point': 9037,
 'have': 5634,
 'nothing': 8220,
 'good': 5363,
 'things': 12044,
 'say': 10474,
 'about': 490,
 'dell': 3359,
 'tech': 11879,
 'support': 11678,
 'many': 7407,
 'time': 12112,
 've': 12709,
 'called': 2031,
 'in': 6142,
 'faulty': 4682,
 'part': 8676,
 'had': 5550,
 'replacement':

Now we can transform our text into our vector

In [242]:
training_data = bow.transform(documents)

In [258]:
type(training_data)

scipy.sparse.csr.csr_matrix

#### Our data is a sparse matrix, which is a matrix with far more 0 values than not 0 values

#### More about it herehttps://docs.scipy.org/doc/scipy/reference/sparse.html

#### To make better sense of it, we can put it back into a dataframe. 

#### Caution: moving from sparse matrix to array format will make the calculations very computationally inefficient, and it is not recommended

In [252]:
text_data = pd.DataFrame(training_data.toarray(),columns=bow.get_feature_names())

In [254]:
text_data.head()

Unnamed: 0,00,000,00000000000,0000000000000,00442070001201,00442071938940,0092647,01,01quackeryrelatedtopics,02,...,zogby,zombie,zone,zope,zsh,zur,zurich,zyklon,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,âµg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [256]:
bigrams = CountVectorizer(ngram_range=(2,2))
trigrams = CountVectorizer(ngram_range=(3,3))


In [257]:
bigrams.fit(documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [259]:
bigrams.get_feature_names()

['00 and',
 '00 are',
 '00 html',
 '00 so',
 '00 to',
 '00 us',
 '000 00',
 '000 000',
 '000 acres',
 '000 excepting',
 '000 for',
 '000 found',
 '000 homes',
 '000 investment',
 '000 it',
 '000 million',
 '000 month',
 '000 nanometers',
 '000 or',
 '000 people',
 '000 rpm',
 '000 sgli',
 '000 since',
 '000 sq',
 '000 square',
 '000 to',
 '000 year',
 '00000000000 factorial',
 '0000000000000 factorial',
 '00442070001201 domain',
 '00442070001201 technical',
 '00442071938940 00442070001201',
 '0092647 5fencoding',
 '01 09',
 '01 11',
 '01 14',
 '01 16',
 '01 are',
 '01 colbert',
 '01 dec',
 '01 hindsights',
 '01 like',
 '01 similar',
 '01 think_in_weeks',
 '01 update_on_tech',
 '01quackeryrelatedtopics candling',
 '02 and',
 '03 06',
 '0385720270 qid',
 '04 13',
 '04 expires',
 '05 administrative',
 '05 not',
 '051010crat_atlarge which',
 '0521592712 still',
 '0525934189 ref',
 '0590025 6475319',
 '06 last',
 '06 state',
 '06 yet',
 '0612100 507846',
 '07 dangers',
 '07mm mechanical',
 

In [261]:
trigrams.fit(documents)
trigrams.get_feature_names()

['00 and had',
 '00 are owned',
 '00 html tw',
 '00 so saying',
 '00 to the',
 '00 us now',
 '000 00 are',
 '000 000 00',
 '000 000 found',
 '000 000 million',
 '000 000 people',
 '000 000 since',
 '000 acres and',
 '000 excepting very',
 '000 for work',
 '000 found that',
 '000 homes 21',
 '000 homes if',
 '000 investment for',
 '000 it was',
 '000 million short',
 '000 nanometers thick',
 '000 or so',
 '000 people are',
 '000 rpm hard',
 '000 sgli life',
 '000 since the',
 '000 sq ft',
 '000 square feet',
 '000 to be',
 '000 year but',
 '000 year mckinsey',
 '000 year old',
 '00000000000 factorial acc',
 '0000000000000 factorial acc',
 '00442070001201 domain servers',
 '00442070001201 technical contact',
 '00442071938940 00442070001201 domain',
 '00442071938940 00442070001201 technical',
 '0092647 5fencoding utf8',
 '01 09 flame_someone_anonym',
 '01 11 html',
 '01 14 do',
 '01 16 al',
 '01 are there',
 '01 colbert reports',
 '01 dec 05',
 '01 hindsights html',
 '01 like madd',
 '01 

Now if we want to transform a new test document, we can use the transform method that we previously used


In [262]:
X_test = bow.transform(['this is a test document','look at me I am a test document'])

## We can also use the tf-idf vectorizer: More on all vectorizer

* The tf-idf vectorizer takes has the ability to detect words that might be more important for our specific corpus http://scikit-learn.org/stable/modules/feature_extraction.html


* Places a higher weight on words that appear in certain documents that are infrequent in the overall corpus
<img src="./resources/tfidf.png">

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVectorizer()

## Training a model with NLP

## Bonus

#### Spacy 

Spacy has features related to syntactic meaning of words

In [40]:
import spacy
nlp = spacy.load('en')

In [41]:
import pandas as pd

In [265]:
sample_sentence = 'the children were running towards the chickens when they fell down the hill'

In [266]:
tokenized = nlp(sample_sentence)

In [268]:
for word in tokenized:
    print(word, word.pos_)

the DET
children NOUN
were VERB
running VERB
towards ADP
the DET
chickens NOUN
when ADV
they PRON
fell VERB
down ADP
the DET
hill NOUN


It can also detect things such as "noun chunks" and many other parts of speech

In [51]:
for chunk in bleh.noun_chunks:
    print(chunk)

the children
the chickens
they
the hill


## Similarity of two documents

We can tell how similar two documents are to one another, normalizing for size, by taking the cosine similarity of the two. 

This number will range from [0,1], with 0 being not similar whatsoever, and 1 being the exact same.

<img src="./resources/better_cos_similarity.png">

In [270]:
sample = CountVectorizer()
sunday_afternoon = ['I ate a burger at burger queen and it was very good.',
           'I ate a hot dog at burger prince and it was bad',
          'I drove a racecar through your kitchen door',
          'I ate a hot dog at burger king and it was bad. I ate a burger at burger queen and it was very good']

trial.fit(sunday_afternoon)
text_data = trial.transform(sunday_afternoon)

In [271]:
from sklearn.metrics.pairwise import cosine_similarity
## the 0th and 2nd index lines are very different, a number close to 0
cosine_similarity(text_data[0],text_data[2])


array([[0.]])

In [272]:
## the 0th and 3rd index lines are very similar, despite different lengths
cosine_similarity(text_data[0],text_data[3])

array([[0.91413793]])