In [None]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
import sklearn

# special matplotlib command for global plot configuration
from matplotlib import rcParams

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'

# Section 5: Clustering Continued and NLP

### Last time
- GMM for Clustering
- EM algorithm


### Today
- silhouette_score for evaluating cluster models
- NLP with nltk and sklearn

**Before you begin, you should download the resources associated with the nltk library.**  Open a terminal, and install the nltk package if you haven't already with 

```
conda install nltk
```

Then open a python shell (preferably your python 3.4 shell) and install the resources with 

```
import nltk
nltk.download()
```

This should launch a download application that looks like the image below.  Select the 'book' material and click download. It could take a little bit of time to complete. 

<img src='nltk_download.png'>

## Clustering review

We'll start with an example used in the last section: the Old Faithful eruption data.

In [None]:
# Load the "faithful" dataset through statsmodels' R-dataset helper.
# NOTE(review): presumably fetched over the network on first use — confirm access.
faithful = sm.datasets.get_rdataset("faithful")

In [None]:
# Show the title stored on the loaded dataset object
faithful.title

In [None]:
# Keep the dataset's `.data` attribute under a shorter name and preview its first rows
old_faithful = faithful.data
old_faithful.head()

In [None]:
# Normalised histograms of the two Old Faithful variables.
# `density=True` replaces the `normed=True` keyword, which was deprecated in
# matplotlib 2.1 and later removed, so the old spelling fails on current releases.
plt.hist(old_faithful.eruptions, bins=np.arange(1, 6, .2), density=True)
plt.xlabel('Duration of Eruption (min)')
plt.ylabel('Density')
plt.show()
plt.hist(old_faithful.waiting, bins=np.arange(40, 100, 2), density=True)
plt.xlabel('Time between eruptions (min)')
plt.ylabel('Density')
plt.show()

In [None]:
# Pull each column out as its own NumPy array
x = np.array(old_faithful['eruptions'])
y = np.array(old_faithful['waiting'])
# Stack the two arrays and transpose: one row per observation, columns (x, y)
c = np.array([x, y]).T
plt.scatter(x, y)
plt.xlabel('Eruption duration (mins)')
plt.ylabel('Time between eruptions (mins)')
plt.show()

In [None]:
### Scikit-learn
#Initializes with zero means and identity covariances of components
#http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GMM.html
from sklearn import mixture

gm = mixture.GMM(n_components=2, n_iter=100, covariance_type='full')
print("initialized mixing weights of each component ")
print(gm.weights_)

gm.fit(c)
classes = gm.predict(c)
# print('')
# print classes
plt.scatter(c[:,0], c[:,1], c=classes, marker='+', s=100, linewidths=2)
plt.title('Scikit-Learn Solution', fontsize=14, fontweight='bold')

plt.show()

In [None]:
print("mixing weights after fit")
print(gm.weights_)
print("means for each component after fit")
print(gm.means_)
print("Covariances of components after fir")
print(gm.covars_)

## Silhouette Score

[Wikipedia](https://en.wikipedia.org/wiki/Silhouette_%28clustering%29) entry for the silhouette score.  

[Silhouette score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html) from Scikit-learn

In [None]:
from sklearn.metrics import silhouette_score

We can use the silhouette score to evaluate a particular cluster model.  Silhouette scores can be used to help evaluate the appropriate number of clusters that are truly in the data.  The value that is returned indicates how well the data sit within their assigned clusters.  Values close to 1 mean that the data are appropriately clustered.  If the returned value is close to -1, the data more likely belong in a neighboring cluster.  A value near 0 indicates that the data lie on the border between two natural clusters.  

In [None]:
# Fit a two-component mixture and score the resulting hard assignment.
# GaussianMixture / max_iter replace the GMM / n_iter API that was removed in
# scikit-learn 0.20; random_state pins the initialisation so the score is repeatable.
gm = mixture.GaussianMixture(n_components=2, max_iter=100,
                             covariance_type='full', random_state=0)
gm.fit(c)
classes = gm.predict(c)
silhouette_score(c, classes, metric='sqeuclidean')

In [None]:
# Candidate numbers of mixture components to compare
range_n_clusters = [2, 3, 4, 5, 6]

for n_clusters in range_n_clusters:
    # build the model and fit the data
    # (GaussianMixture / max_iter replace the GMM / n_iter API removed in
    # scikit-learn 0.20; the fixed seed makes the printed scores repeatable)
    gm = mixture.GaussianMixture(n_components=n_clusters, max_iter=100,
                                 covariance_type='full', random_state=0)
    gm.fit(c)
    classes = gm.predict(c)
    silhouette_avg = silhouette_score(c, classes, metric='sqeuclidean')
    print('For n_clusters =', n_clusters, 'the average silhouette_score is', silhouette_avg)

    # one scatter plot per model, coloured by predicted component
    plt.scatter(c[:,0], c[:,1], c=classes, marker='+', s=100, linewidths=2)
    plt.title('Solution with %d clusters' % (n_clusters))
    plt.show()
    

## Natural Language Processing (NLP)

A quick introduction to NLP with Python using [nltk](http://www.nltk.org) and Scikit-learn.

**Start with some basic string processing**

In [None]:
# Sample string used by the string-method examples in the next few cells
monty = "Monty Python's Flying Circus. " 
monty

In [None]:
# `*` repeats a string, `+` concatenates, and a negative slice takes the tail:
# monty[-8:] is the last 8 characters, "Circus. "
monty*2 + " Plus just the last word:" + monty[-8:]

In [None]:
monty.find('Python') # index at which the substring starts (6 for this string)

In [None]:
monty.upper() +' and '+ monty.lower() # upper-cased copy and lower-cased copy, joined by ' and '

In [None]:
monty.replace('y', 'x') # new string with every 'y' replaced by 'x'

**Regular Expressions** 

If you plan to work with text, [regular expressions](https://docs.python.org/3/library/re.html) are extremely useful tools to become familiar with.

In [None]:
import re

In [None]:
# Count lowercase vowels: findall returns one match per character in the class [aeiou]
word = 'onomatopoeia'
len(re.findall(r'[aeiou]', word))

### NLP using NLTK

In [None]:
import nltk
from nltk.book import text4

**List Comprehension**

What does the command below do?

In [None]:
# Number of distinct lower-cased tokens in text4 that are longer than 5 characters
len(set(token.lower() for token in text4 if len(token) > 5))

What does the set method do?

In [None]:
# Duplicates collapse: only the distinct elements 'a', 'b' and 'c' remain
set(['a', 'b', 'a', 'c', 'b', ])

In [None]:
# Upper-case each of the first five tokens of text4
[element.upper() for element in text4[0:5]]

**Words in context**

NLTK books are Text objects that have many built-in methods available like searching for results and also returning the context. 

In [None]:
# nltk.Text expects a sequence of tokens. Given a raw string it treats every
# single character as a token, so the sentence is split into word tokens first.
text = nltk.Text(nltk.word_tokenize('This is some text that could be a sentence.'))

In [None]:
# Search text4 for "America" and show each hit together with its surrounding context
text4.concordance("America")

In [None]:
# IPython help syntax: a trailing `?` shows the documentation for text4.similar
text4.similar?

In [None]:
# Ask text4 for words similar to 'citizen' (the help output above gives the exact definition)
text4.similar('citizen')

In [None]:
# IPython help syntax: a trailing `?` shows the documentation for text4.common_contexts
text4.common_contexts?

In [None]:
# Contexts that the two given words have in common within text4
text4.common_contexts(['America', 'freedom'])

In [None]:
from nltk.draw.dispersion import dispersion_plot

# Words whose positions within text4 are drawn on the dispersion plot
target_words = ["citizens", "democracy", "freedom", "war", "America", "vote"]
dispersion_plot(text4, target_words)

**Simple Statistics**

- Frequency distribution of words.  Find the counts for each word in the text

In [None]:
from nltk import FreqDist
# Count how often each token occurs in text4, then display the distribution
freq_dist = FreqDist(text4)
freq_dist

- Access the entire list of words in a text

In [None]:
# Iterating the distribution yields its keys, i.e. every distinct token type in the text
vocabulary = list(freq_dist)
vocabulary[:3]  # peek at the first three entries

- identify specific sets of words (e.g. long words) to help characterize the body of text

In [None]:
# Distinct tokens of text4
words = set(text4)
# Keep only the tokens longer than 15 characters
long_words = list(filter(lambda token: len(token) > 15, words))
sorted(long_words)

## Lexical Resources from nltk

NLTK provides several corpora (linguistic annotations, POS tags, named entities, syntactic structures, semantic roles, etc.) along with convenient methods to access these resources. The full list of corpora resources is available [here](http://www.nltk.org/book/ch02.html#tab-corpora)



### Counting words by genre

The [Brown Corpus](https://en.wikipedia.org/wiki/Brown_Corpus) is a text collection that contains over 500 samples of English text that have been labeled with part-of-speech (POS) and genre.  It has been one of the most widely used collections in computational linguistics (statistical modeling of natural language).

In [None]:
from nltk.corpus import brown

In [None]:
# Lazily produce one (genre, word) pair for every word of every Brown category ...
genre_word_pairs = (
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
# ... and count the words separately under each genre
cfd = nltk.ConditionalFreqDist(genre_word_pairs)

In [None]:
# List the conditions of the distribution — here the genres used as the first item of each pair
cfd.conditions()

Now instead of simply looking at the frequency distribution of words in a text, these methods allow inspection of word frequency distribution by genre. 

In [None]:
# Collect (genre, word) pairs for just the two genres being compared
genre_word = []
for genre in ['government', 'religion']:
    genre_word.extend((genre, word) for word in brown.words(categories=genre))

In [None]:
# First four (genre, word) pairs
genre_word[:4]

In [None]:
# Last four (genre, word) pairs
genre_word[-4:]

Create a new conditional frequency distribution

In [None]:
# Rebuild the conditional frequency distribution from the two-genre pairs only,
# then list its conditions
cfd = nltk.ConditionalFreqDist(genre_word)
cfd.conditions()

In [None]:
# Print the frequency distribution stored under the 'religion' condition
print(cfd['religion'])

In [None]:
# The ten most frequent tokens under the 'religion' condition
cfd['religion'].most_common(10)

### Stopwords

A corpus of stopwords is also included in the nltk resources.  These are high-frequency words like 'the', 'to', 'of', etc. that have little lexical content and contribute little to a text's distinguishing characteristics.

In [None]:
from nltk.corpus import stopwords
# Display the English stopword list
stopwords.words('english')

We can remove the stopwords from the text and then rerun the processing

In [None]:
# Build the stopword lookup once, outside the comprehension. Re-reading the
# stopword list and scanning it linearly for every token is needlessly slow;
# a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# Keep the (genre, word) pairs whose lower-cased word is not a stopword
filtered_genre_word = [pair
                       for pair in genre_word
                       if pair[1].lower() not in english_stopwords]

In [None]:
# Conditional frequency distribution over the stopword-free pairs, then its conditions
filtered_cfd = nltk.ConditionalFreqDist(filtered_genre_word)
filtered_cfd.conditions()

In [None]:
# Ten most frequent 'religion' tokens once stopwords are removed
filtered_cfd['religion'].most_common(10)

### Punctuation

We can also remove punctuation from text

In [None]:
import string

In [None]:
# `token in string.punctuation` is a substring test against one long string, so
# multi-character punctuation tokens such as '--', "''" or '``' are not matched
# and slip through. Testing character by character drops every token that
# consists solely of punctuation characters.
punctuation_chars = set(string.punctuation)
clean_genre_word = [pair
                    for pair in filtered_genre_word
                    if not all(ch in punctuation_chars for ch in pair[1])]

In [None]:
# Recount by genre using the punctuation-filtered pairs and show the ten most
# frequent 'religion' tokens
clean_cfd = nltk.ConditionalFreqDist(clean_genre_word)
clean_cfd['religion'].most_common(10)

### Stemming using NLTK

Stemming is the process of reducing inflected or derived words to their word stem, base or root form.  There are several algorithms for stemming available today.  Most stemming algorithms function through the use of a lookup table, which is simple and efficient for languages like English but could prove difficult for other languages where inflection plays a bigger role. 

In [None]:
# Space-separated sample words (several inflections each of cat, stem and fish)
# fed to the stemmers below
stem_text = 'cats catlike catty cat stemmer stemming stemmed stem fishing fished fisher fish'

In [None]:
# Display the sample string
stem_text

- The text must first be broken up into tokens.  We can use the word_tokenize method here.  There's also a method [sent_tokenize](http://www.nltk.org/api/nltk.tokenize.html) to break up larger pieces of text into sentences. 

In [None]:
# Break the sample string into tokens with nltk's word tokenizer
tokens = nltk.word_tokenize(stem_text)

In [None]:
# Display the resulting tokens
tokens

### Porter Stemmer

The [Porter Stemmer](http://tartarus.org/~martin/PorterStemmer/) is one of the more widely used stemming algorithms. 

In [None]:
# Run every token through the Porter stemmer
porter = nltk.PorterStemmer()
stemmed = list(map(porter.stem, tokens))
stemmed

### Snowball stemmer

The [snowball stemmer](http://snowball.tartarus.org/) is based on a language that was developed specifically for stemming algorithms.  

In [None]:
# Run every token through the English Snowball stemmer
snowball = nltk.SnowballStemmer('english')
snowball_stemmed = list(map(snowball.stem, tokens))
snowball_stemmed

### Lancaster stemmer

The [Lancaster stemmer](http://www.lancaster.ac.uk/scc/) was developed at the University of Lancaster from where it gets its name. 

In [None]:
# Run every token through the Lancaster stemmer
lancaster = nltk.LancasterStemmer()
lancaster_stemmed = list(map(lancaster.stem, tokens))
lancaster_stemmed

## Text Similarity with TF-IDF

Tf-idf (term frequency-inverse document frequency) weighs how important a word is to a document within a collection of documents. It is often used as a weighting factor in information retrieval and data mining. The tf-idf weight of a term is the product of its term frequency (tf) weight and its inverse document frequency (idf) weight.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

Load some simple text and calculate a comparison matrix

In [None]:
# Four short documents to compare with each other
documents = [
    "New Year's Eve in New York",
    "New Year's Eve in London",
    "York is closer to London than to New York",
    "London is closer to Bucharest than to New York",
]

# Learn the vocabulary and turn each document into a row of tf-idf weights
vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform(documents)

In [None]:
# Pairwise dot products between the document rows, converted to a dense array.
# TfidfVectorizer L2-normalises rows by default, so these dot products are the
# cosine similarities. `.dot(...)` and `.toarray()` are used instead of `*` and
# `.A`: those rely on legacy sparse-matrix behaviour (`*` as matrix product,
# the `.A` shortcut) that sparse arrays and newer SciPy releases do not provide.
cosine = tfidf.dot(tfidf.T).toarray()
print(cosine)

## NLP tools and beyond

This is fun, but what is next?

[Text clustering](https://gist.github.com/xim/1279283).  Try downloading this code and executing it on your own. 