In [99]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import scipy.cluster.hierarchy as hcluster

In [100]:
# topics = ["ml1", "ml2", "ml3", "ml4", "ml5", "ml6", "ml7", "ml8", "ml9", "ml10", "ml11", "ml12", "ml13", "ml14", "ml15", "ml16", "ml17", "ml18", "ml19"]
# print "The titles are ", topics

In [101]:
import glob
import numpy as np

filePath = './books'


def split_sentences(text):
    """Split raw book text into non-empty sentence fragments.

    Runs of whitespace (including line breaks) are collapsed to a single
    space before splitting on '.', so the words on either side of a line
    break are not fused into one token. Fragments that are empty or contain
    only whitespace are dropped.

    Returns a list of stripped strings, in document order.
    """
    normalized = ' '.join(text.split())
    fragments = (fragment.strip() for fragment in normalized.split('.'))
    return [fragment for fragment in fragments if fragment]


# Sorted so that the book index assigned below is the same on every run;
# glob.glob() makes no promise about ordering.
bookFiles = sorted(glob.glob(os.path.join(filePath, '*.txt')))
fileCounter = len(bookFiles)

paragraphs = []  # one entry per sentence fragment, across all books
topics = []      # label for the fragment at the same position in `paragraphs`
for index, filename in enumerate(bookFiles):
    print(filename)
    # `with` closes the handle even if reading fails.
    with open(filename, 'r') as bookFile:
        sentences = split_sentences(bookFile.read())
    print(len(sentences))
    for i, sentence in enumerate(sentences):
        topics.append('book:%d sentence:%d' % (index, i))
        paragraphs.append(sentence)

print(len(topics))
print(len(paragraphs))

In [102]:
# Load NLTK's English stop-word list and display it.
# Note: the vectorizer cells in this notebook pass stop_words='english' to
# scikit-learn instead of using this list.
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

In [103]:
# Reduce every word to its root form with a stemmer.
# The English Snowball stemmer from NLTK is used; the author chose it in
# preference to the Porter stemmer.

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [104]:
def tokenize(paragraph):
    """Split a paragraph into word tokens, keeping only tokens with a letter.

    The paragraph is split into sentences and each sentence into words using
    NLTK. Tokens that contain no ASCII letter (raw punctuation, purely
    numeric tokens, ...) are discarded.

    Returns the surviving tokens as a list, in order of appearance.
    """
    containsLetter = re.compile('[a-zA-Z]').search
    return [
        word
        for sentence in nltk.sent_tokenize(paragraph)
        for word in nltk.word_tokenize(sentence)
        if containsLetter(word)
    ]

def stem(filteredTokens):
    """Return the stem of each token, preserving order.

    Uses the notebook-level `stemmer` object; the input is not modified.
    """
    return list(map(stemmer.stem, filteredTokens))

def tokenizeAndStem(paragraph):
    """Tokenize a paragraph and return the stems of its word tokens.

    Delegates to `tokenize` (sentence/word splitting plus removal of tokens
    without a letter) and `stem`, so this function and the two-step path
    used elsewhere in the notebook always apply the same rules.

    Returns a list of stems, one per kept token, in order of appearance.
    """
    return stem(tokenize(paragraph))

In [105]:
import sys

# Python 2 only: make implicit str/unicode conversions use UTF-8 so that
# non-ASCII text in the books does not raise during tokenizing. reload(sys)
# is required before setdefaultencoding can be called. Neither name exists
# on Python 3, where this workaround is unnecessary, hence the guard.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf8')

# Flat, position-aligned lists over the whole corpus: the n-th stem in
# stemmedParagraphList is the stem of the n-th token in tokenizedParagraphList.
tokenizedParagraphList = []
stemmedParagraphList = []
for paragraph in paragraphs:
    tokenizedParagraph = tokenize(paragraph)
    tokenizedParagraphList.extend(tokenizedParagraph)
    stemmedParagraphList.extend(stem(tokenizedParagraph))
print(tokenizedParagraphList)
print(stemmedParagraphList)

In [106]:
# Build a stem -> token lookup table as a pandas DataFrame: the index holds
# the stems and the 'words' column holds the original tokens.
# Benefit: a stem can be looked up efficiently to recover a full token.
# Caveat: the mapping is one-to-many (the stem 'run' may correspond to
# 'ran', 'runs', 'running', ...), so a stem can match several rows.
vocabFrame = pd.DataFrame(data={'words': tokenizedParagraphList}, index=stemmedParagraphList)
print('there are ' + str(len(vocabFrame)) + ' items in vocab_frame')

In [107]:
#For TF-IDF there are two important vocabulary cut-offs, max_df and min_df:
#max_df - when building the vocabulary, ignore terms whose document frequency is strictly higher than the given threshold (corpus-specific stop words).
#min_df - when building the vocabulary, ignore terms whose document frequency is strictly lower than the given threshold. This value is also called the cut-off in the literature.
#Neither option is passed to the vectorizer below, so its default values apply.


from sklearn.feature_extraction.text import TfidfVectorizer
tfidfVectorizer = TfidfVectorizer(stop_words='english', use_idf=True, tokenizer=tokenizeAndStem, ngram_range=(1,3))
#Now fit the vectorizer to synopses
%time tfidf_matrix = tfidfVectorizer.fit_transform(paragraphs)
print tfidf_matrix
print tfidfVectorizer.get_feature_names()

In [108]:
#Now we calculate the cosine similarity between all pairs of sentences. Subtracting it from 1 gives a distance, which we will use for clustering in the next stage.

from sklearn.metrics.pairwise import cosine_similarity
# Pairwise distance between all rows of the TF-IDF matrix: 1 - similarity.
dist = np.subtract(1, cosine_similarity(tfidf_matrix))
print(dist.shape)

In [108]:
# Cut-off used to form flat clusters: 5% above the mean entry of `dist`.
# NOTE(review): scipy's fclusterdata documents its first argument as an
# observation matrix (one row per sample) and computes its own pairwise
# distances between rows. Here the rows of the cosine-distance matrix are
# passed as those observations, so `thresh` is compared against distances
# between rows of `dist`, not against the cosine distances themselves.
# Confirm this is intended before changing either the input or the threshold.
thresh = np.average(dist) * 1.05
clusters = hcluster.fclusterdata(dist, t=thresh, criterion="distance")

In [110]:
print(clusters)
# The largest label is taken as the number of clusters; the grouping cell
# indexes with `label - 1`, i.e. it assumes labels run from 1 upwards.
numClusters = clusters.max()

In [111]:
# Group the sentence fragments by cluster label. Labels are shifted by one
# to get a list position, so clusterStatements[0] holds cluster 1.
clusterStatements = [[] for _ in range(numClusters)]
for label, paragraph in zip(clusters, paragraphs):
    clusterStatements[label - 1].append(paragraph)
print(clusterStatements)

In [112]:
# For every cluster, re-fit TF-IDF on that cluster's sentences only and print
# the sentence whose summed cosine distance to the cluster's sentences is the
# largest (the first such sentence on ties).
#
# The previous hand-written loop had both of its conditional expressions
# inverted: starting from 0, the running "maximum" was only ever replaced by
# values that were not larger than it, so in practice the first sentence of
# each cluster was printed.
# NOTE(review): if the goal is a representative (most central) sentence
# rather than the farthest one, use np.argmin here instead - confirm intent.
for k in range(numClusters):
    tfidfVectorizerCluster = TfidfVectorizer(stop_words='english', use_idf=True, tokenizer=tokenizeAndStem, ngram_range=(1,3))
    tfidfMatrixCluster = tfidfVectorizerCluster.fit_transform(clusterStatements[k])
    distCluster = 1 - cosine_similarity(tfidfMatrixCluster)
    # Row i sums the distances from sentence i to every sentence in the cluster.
    totalDistances = distCluster.sum(axis=1)
    farthestSentence = int(np.argmax(totalDistances))
    maxClusterDistance = totalDistances[farthestSentence]
    print(clusterStatements[k][farthestSentence])