In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import nltk 
from time import time
from collections import Counter
from nltk.corpus import stopwords,wordnet
from nltk import FreqDist,ngrams,word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# import pdftotext


In [3]:
def remove_stopwords(data):
    """Return the tokens of ``data`` that are not English stop words.

    Parameters
    ----------
    data : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that do not appear in ``stopwords.words('english')``,
        in their original order (duplicates are kept).

    Notes
    -----
    Matching is an exact, case-sensitive comparison against the entries of
    the stop-word list; tokens are not normalised first.
    NOTE(review): if the NLTK list is all lower-case, capitalised forms such
    as "The" pass through unfiltered - confirm whether callers should
    lower-case tokens beforehand.
    """
    # A set gives constant-time membership tests. The previous
    # ``Counter(...).keys()`` lookup was a linear list scan per token on
    # Python 2, and the counts themselves were never used.
    stop_words = frozenset(stopwords.words('english'))
    return [token for token in data if token not in stop_words]

In [88]:
def lemmatizer(tokens):
    """Lemmatize ``tokens`` with WordNet, using each token's POS tag.

    The tokens are tagged with ``nltk.pos_tag``; the first letter of each tag
    selects the WordNet part of speech handed to
    ``WordNetLemmatizer.lemmatize``:

    * ``J...`` -> ``wordnet.ADJ``
    * ``V...`` -> ``wordnet.VERB``
    * ``N...`` -> ``wordnet.NOUN``
    * ``R...`` -> ``wordnet.ADV``
    * anything else -> ``wordnet.ADJ_SAT``

    Parameters
    ----------
    tokens : list of str
        Tokens to tag and lemmatize.

    Returns
    -------
    list
        One lemma per input token, in input order.
    """
    # (tag prefix, name of the wordnet constant); checked in this order.
    prefix_rules = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )

    tagged = nltk.pos_tag(tokens)
    wnl = WordNetLemmatizer()

    lemmas = []
    for word, tag in tagged:
        # Resolve the constant by name so ``wordnet`` is only touched when a
        # token is actually being processed.
        pos_name = next(
            (name for prefix, name in prefix_rules if tag.startswith(prefix)),
            'ADJ_SAT',
        )
        lemmas.append(wnl.lemmatize(word, pos = getattr(wordnet, pos_name)))
    return lemmas

In [84]:
def chapterwise(text, max_chars = 2000):
    """Count lemma frequencies in the leading part of one chapter.

    The first ``max_chars`` characters of ``text`` are tokenized with
    ``word_tokenize``, filtered with ``remove_stopwords`` and lemmatized with
    ``lemmatizer``; the resulting lemmas are then counted.

    Parameters
    ----------
    text : str
        Chapter text.
    max_chars : int or None, optional
        Length of the prefix of ``text`` that is analysed. Defaults to 2000,
        the cut-off that used to be hard-coded. ``None`` analyses the whole
        text.

    Returns
    -------
    collections.OrderedDict
        Mapping ``lemma -> count``, ordered by decreasing count. An
        ``OrderedDict`` (a ``dict`` subclass) is returned so the ordering is
        kept on Python 2 as well, where a plain ``dict`` would discard the
        sort.
    """
    # Local import: the notebook's import cell only pulls in ``Counter``.
    from collections import OrderedDict

    tokens = word_tokenize(text[:max_chars])
    tokens = remove_stopwords(tokens)
    tokens = lemmatizer(tokens)

    counts = FreqDist(tokens)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return OrderedDict(ranked)

In [85]:
def _as_text(value):
    """Return ``value`` as text, decoding it as UTF-8 when it is a byte string."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def _rank(scores, descending = True):
    """Return the ``(token, score)`` pairs of ``scores`` sorted by score."""
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=descending)


def _print_top(title, ranked, top_k):
    """Print ``title`` followed by at most ``top_k`` numbered ``ranked`` entries."""
    print(title)
    # Slicing (rather than indexing range(top_k)) keeps this safe when there
    # are fewer than top_k entries.
    for position, (word, score) in enumerate(ranked[:max(top_k, 0)], 1):
        print("%d. %s  :  %s" % (position, word, score))


def book(dataset, top_k = 20, num_chap = 10, ch = False):
    """Rank the words of a book stored one chapter per CSV row.

    Parameters
    ----------
    dataset : str
        Path (or anything ``pandas.read_csv`` accepts) of a CSV file with a
        ``Text`` column; each row is treated as one chapter.
    top_k : int, optional
        Number of entries printed for each ranking. Fewer are printed when
        the vocabulary is smaller.
    num_chap : int, optional
        Number of leading chapters used for the chapter-wise computations.
        Clamped to the number of rows in the file.
    ch : bool, optional
        Selects the mode, see *Returns*.

    Returns
    -------
    list of list of (token, count)
        When ``ch`` is true: one list per chapter (first ``num_chap``
        chapters), each sorted by decreasing count as computed by
        ``chapterwise``. Previously this branch raised ``NameError`` and
        would only have kept the last chapter's list.
    tuple of three lists
        When ``ch`` is false: ``(by_frequency, by_entropy, by_std)`` where
        each element is a list of ``(token, score)`` pairs.

        * ``by_frequency`` - token counts over the whole book, descending.
        * ``by_entropy`` - ``-p * log(p)`` with ``p = count / total tokens``,
          descending.
        * ``by_std`` - standard deviation of the token's per-chapter counts
          (from ``chapterwise``) over the first ``num_chap`` chapters,
          ascending.

    Side effects
    ------------
    When ``ch`` is false the top ``top_k`` entries of each ranking are
    printed.
    """
    data = pd.read_csv(dataset)

    # Decode once so corpus-level and chapter-level tokens are the same
    # string type; missing cells become empty chapters.
    texts = [_as_text(raw) for raw in data['Text'].fillna('')]
    n_chap = max(0, min(num_chap, len(texts)))

    if ch:
        return [_rank(chapterwise(texts[i])) for i in range(n_chap)]

    # Join with a separator so the last word of one chapter cannot fuse with
    # the first word of the next.
    text = "\n".join(texts)

    tokens = word_tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatizer(tokens)

    # Frequency
    dictionary_freq = dict(FreqDist(tokens))
    sorted_dict_freq = _rank(dictionary_freq)
    _print_top("Top "+str(top_k)+" Words according to frequency: ",
               sorted_dict_freq, top_k)

    # entropy
    total = float(len(tokens))
    dictionary_entropy = {}
    for token, freq in dictionary_freq.items():
        prob = float(freq) / total
        dictionary_entropy[token] = -1 * prob * np.log(prob)
    sorted_dict_entropy = _rank(dictionary_entropy)
    _print_top("\n\nTop "+str(top_k)+" Words according to entropy: ",
               sorted_dict_entropy, top_k)

    #own metric
    index_tokens = {token: row for row, token in enumerate(dictionary_freq)}
    freq_own = np.zeros((len(index_tokens), n_chap))

    for col in range(n_chap):
        for token, count in chapterwise(texts[col]).items():
            # A chapter processed on its own can yield a lemma that is not in
            # the whole-book vocabulary (tokens are tagged in a different
            # context); skip it instead of raising KeyError.
            row = index_tokens.get(token)
            if row is not None:
                freq_own[row, col] = count

    stand_dev = np.std(freq_own, axis = 1)
    std_dict = {token: stand_dev[row] for token, row in index_tokens.items()}
    # Ascending: smallest spread first.
    # NOTE(review): tokens that never occur in the analysed chapter prefixes
    # keep an all-zero row, so they tie at 0.0 at the top - confirm intended.
    sorted_dict_std = _rank(std_dict, descending = False)
    _print_top("\n\nTop "+str(top_k)+" Words according to our metric: ",
               sorted_dict_std, top_k)

    return list(sorted_dict_freq), list(sorted_dict_entropy), list(sorted_dict_std)
        

In [35]:
# Whole-book analysis of 'questions-data.csv': print the 200 top-ranked words
# for each ranking, using the first 10 chapters for the chapter-wise metric.
book(
    'questions-data.csv',
    top_k=200,
    num_chap=10,
)

KeyboardInterrupt: 