In [17]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import nltk 
from time import time
from collections import Counter
from nltk.corpus import stopwords,wordnet
from nltk import FreqDist,ngrams,word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [16]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package stopwords to /home/deepak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/deepak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/deepak/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/deepak/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [18]:
def remove_stopwords(data):
    stop_words = Counter(stopwords.words('english'))
    ans = []
    for each in data:
        if(each not in stop_words.keys()):
            ans.append(each)
    return ans

In [19]:
def lemmatizer(tokens):
    filtered_document = []
    wordnet_pos = ''
    pos_tokens = nltk.pos_tag(tokens)
    lmtzr = WordNetLemmatizer()
    for i in pos_tokens:
        if i[1].startswith('J'):
            wordnet_pos = wordnet.ADJ
        elif i[1].startswith('V'):
            wordnet_pos = wordnet.VERB
        elif i[1].startswith('N'):
            wordnet_pos = wordnet.NOUN
        elif i[1].startswith('R'):
            wordnet_pos = wordnet.ADV
        else:
            wordnet_pos = wordnet.ADJ_SAT
        filtered_document.append(lmtzr.lemmatize(i[0],pos = wordnet_pos))
    return filtered_document

In [20]:
def chapterwise(text):
    tokens = word_tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatizer(tokens)
        
    fDist = FreqDist(tokens)
    dictionary_freq = dict(zip(fDist.keys(),fDist.values()))
    sorted_dict_freq = sorted(dictionary_freq.items(), key=lambda x: x[1],reverse= True)
    
    return dict(sorted_dict_freq)

In [28]:
def book(dataset, top_k = 20, num_chap = 10, ch = False):
    data = pd.read_csv(dataset)
    
    if ch : 
        for i in range(num_chap):
            text = str(data['Text'][i])
        
            tokens = word_tokenize(text)
            tokens = remove_stopwords(tokens)
            tokens = lemmatizer(tokens)

            fDist = FreqDist(tokens)
            dictionary_freq = dict(zip(fDist.keys(),fDist.values()))
            sorted_dict_freq = sorted(dictionary.items(), key=lambda x: x[1],reverse= True)
            
        return list(sorted_dict_freq)
        
    else :
        text = ""
        for i in range(data.shape[0]):
            text += str(data['Text'][i])
#         text = text.decode('utf-8')

        tokens = word_tokenize(text)
        tokens = remove_stopwords(tokens)
        tokens = lemmatizer(tokens)

        # Frequency
        fDist = FreqDist(tokens)
        dictionary_freq = dict(zip(fDist.keys(),fDist.values()))
        sorted_dict_freq = sorted(dictionary_freq.items(), key=lambda x: x[1],reverse= True)
        
        print ("Top "+str(top_k)+" Words according to frequency: ")
        for i in range(top_k): 
            print (str(i+1)+". "+sorted_dict_freq[i][0]," : ",sorted_dict_freq[i][1])


        # entropy
        total = float(len(tokens))
        dictionary_entropy = {k: -1 * (float(freq) / total) * np.log(float(freq)/total) for k, freq in dictionary_freq.items()} 
        sorted_dict_entropy = sorted(dictionary_entropy.items(), key=lambda x: x[1],reverse= True)

        print ("\n\nTop "+str(top_k)+" Words according to entropy: ")
        for i in range(top_k): 
            print (str(i+1)+". "+sorted_dict_entropy[i][0]," : ",sorted_dict_entropy[i][1])
        
        #own metric
        freq_own = np.zeros(( len(dictionary_freq), num_chap))
        index_tokens = {k: i for i,k in enumerate(dictionary_freq)}
        
        for i, chap in data.iterrows():        
            if i>=num_chap : break
            freq_dict_chapterwise = chapterwise(chap['Text'])
            for k,v in freq_dict_chapterwise.items():
                freq_own[index_tokens[k]][i] = v

        stand_dev = np.std(freq_own, axis = 1)
        std_dict = {k: stand_dev[index_tokens[k]] for k in index_tokens}
        sorted_dict_std = sorted(std_dict.items(), key=lambda x: x[1],reverse= True)


        print ("\n\nTop "+str(top_k)+" Words according to our metric: ")
        for i in range(top_k): 
            print (str(i+1)+". "+sorted_dict_std[i][0]," : ",sorted_dict_std[i][1])
            
        return list(sorted_dict_freq), list(sorted_dict_entropy), list(sorted_dict_std)
        

In [29]:
a,b,c = book('chapters.csv',top_k = 20, num_chap= 10)

Top 20 Words according to frequency: 
1. ,  :  15870
2. .  :  11136
3. “  :  4244
4. ”  :  3796
5. ’  :  3381
6. Harry  :  3100
7. ''  :  2494
8. I  :  2373
9. ``  :  2369
10. say  :  2173
11. ?  :  2124
12. !  :  1540
13. Hermione  :  1206
14. He  :  1178
15. Ron  :  1154
16. 's  :  1031
17. The  :  935
18. look  :  863
19. know  :  787
20. –  :  759


Top 20 Words according to entropy: 
1. ,  :  0.22505097982466293
2. .  :  0.18179738388285233
3. “  :  0.09406600142853855
4. ”  :  0.0866996530290858
5. ’  :  0.07959057991330709
6. Harry  :  0.07460387729180665
7. ''  :  0.06330370390489885
8. I  :  0.06094679513821453
9. ``  :  0.060868253315909414
10. say  :  0.056968211731137745
11. ?  :  0.055976840374755726
12. !  :  0.04358296210309984
13. Hermione  :  0.03591521384082345
14. He  :  0.03524886493249961
15. Ron  :  0.034674505702341596
16. 's  :  0.031682057549819544
17. The  :  0.029285192001954164
18. look  :  0.027448669337695682
19. know  :  0.025470561072497504
20. –  :  0.0