In [1]:
#importing the required libraries
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.probability import FreqDist
import nltk.data
import operator
import pandas as pd


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
#loading the dataset
corpus = nltk.data.load('merged_file.txt')

#converting all the literals of the corpus to lower case
corpus_data = corpus.lower()

#(a)Performing tokenization for the whole corpus
#extracting tokens in the form of words from the corpus
tokens = word_tokenize(corpus_data)
#print(tokens)

#creating an 'output.txt' file which stores the extracted tokens(words)
output = open('output.txt','w')

for words in tokens:
    output.write(words + "\n")
output.close()

#(b)Total number of tokens found in the corpus
total_tokens = len(tokens)
print(f'Total No.of Tokens : {total_tokens}')


Total No.of Tokens : 4775318


In [4]:
#(b)Unique tokens found in the corpus
unique_tokens = set(tokens)
unique_tokens_len = len(unique_tokens)
print(f'Unique Tokens (CORPUS): {unique_tokens_len}')

print("\nA List of all the unique tokens in the dataset\n")
print(unique_tokens)

Unique Tokens (CORPUS): 47067

A List of all the unique tokens in the dataset



In [6]:
#(b)type/token ratio for the corpus
ty_token_ratio = (unique_tokens_len/total_tokens)
print(f'Type Token Ratio : {ty_token_ratio}')


Type Token Ratio : 0.009856306951704578


In [7]:
#(c)Calculating the frequency of a token
#Using the FreqDist module of nltk
token_freq = FreqDist(tokens) 
print(token_freq,"\n")


<FreqDist with 47067 samples and 4775318 outcomes> 



In [10]:
#Sorting in descending order of the frequency of tokens
token_dict = {}

for word, freq in token_freq.items():
    token_dict[word] = freq
token_dict_sorted = dict( sorted(token_dict.items(), key=operator.itemgetter(1),reverse=True))
print("Sorted Dictionary in Descending Order \n")

for i in token_dict_sorted.items():
    print(i)
#print(token_dict_sorted)

#creating a 'tokens.txt' file which stores the tokens with their frequency 
#from the most frequent to the least frequent
tokens_file = open('tokens.txt','w')
for word, freq in token_dict_sorted.items():
    write_data = f'{word} : {freq}\n'
    tokens_file.write(write_data)
tokens_file.close()
print("Data written to the file 'tokens.txt'")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('qx2', 1)
('torys', 1)
('wellington', 1)
('cheryl', 1)
('reicin', 1)
('iredell', 1)
('marlin', 1)
('nemechek', 1)
('wally', 1)
('dallenbach', 1)
('sabco/prolong-sponsored', 1)
('two-hours', 1)
('radio/commercial', 1)
('3/15/98', 1)
('8/15/98', 1)
('10/15/98', 1)
('6/15/99', 1)
('8/15/99', 1)
('10/15/99', 1)
('3/15/2000', 1)
('6/15/2000,8/15/2000', 1)
('10/15/2000', 1)
('spectator', 1)
('mooresville', 1)
('28115', 1)
('felix', 1)
('sabates', 1)
('1210', 1)
('barsten', 1)
('92806', 1)
('secretaryexhibit', 1)
('2406', 1)
('sanibel', 1)
('palmyra', 1)
('08065.', 1)
('39132041', 1)
('755073', 1)
('state-taxing', 1)
('w-2', 1)
('state-required', 1)
('1974.', 1)
('/s/jack', 1)
('rothkopf', 1)
('6/6/2017exhibit', 1)
('biomed', 1)
('yaii', 1)
('blocker', 1)
('beckman', 1)
('beckmanexhibit', 1)
('301373', 1)
('angelou', 1)
('vlachou', 1)
('6052', 1)
('larnaca', 1)
('193010', 1)
('5,593,041', 1)
('1037700111679', 1)
('groupundertak

In [11]:
#(d)Number of tokens found only once in the corpus
one_occ_tokens = {}
for word, freq in token_dict.items():
    if freq == 1:
        one_occ_tokens[word] = freq

print(f'Tokens with single occurance : {len(one_occ_tokens)}')        
print("\n",one_occ_tokens)


Tokens with single occurance : 20416



In [12]:
#(e)Extracting only words from the tokens 
#i.e. excluding punctuations and other symbols
words_from_corpus = []
for token in tokens:
    if token.isalpha(): #to identify words
        words_from_corpus.append(token)
        
print("No. of ""WORDS"" Extracted from Tokens:",len(words_from_corpus))

#print("\n\n",words_from_corpus)


No. of WORDS Extracted from Tokens: 3809034


In [15]:
#Listing the top 20 most frequent words
#Using the FreqDist module of nltk
freq_dist = FreqDist(words_from_corpus)
#Making 2 columns as 'Word', 'Freq'
freq_dist_table = pd.DataFrame(freq_dist.most_common(20), columns = ['Word','Freq'])
freq_dist_table.index += 1
print(freq_dist_table)


         Word    Freq
1         the  257132
2          of  156122
3          to  129875
4         and  129054
5          or  105155
6          in   79933
7         any   62236
8           a   50444
9       shall   48794
10         by   44310
11  agreement   43617
12       this   39986
13         be   39701
14        for   38724
15       such   36172
16       with   33883
17         as   32907
18      party   32826
19       that   27654
20      other   26395


In [16]:
# LEXICAL DIVERSITY - type/token ratio when we use only words

lexical_diversity = (len(set(words_from_corpus))/len(tokens))
#set() – removes duplicate strings and returns unique strings
print("Lexical Diveristy : ",lexical_diversity)


Lexical Diveristy :  0.0053160857559643145


In [17]:
#(f)Loading the 'stopwords.txt' file
#Storing the stop words in a list

file = open('stopwords.txt','r')
data = file.readlines()
stop_words = []
for word in data:
    word = word.rstrip('\n')
    stop_words.append(word)
    
# print(stop_words)
print("Total stopwords to be removed : ", len(stop_words))

#(f)Excluding Stop Words from the corpus

without_stop_words = []
for word in words_from_corpus:
    if word not in stop_words:
        without_stop_words.append(word)
print("\nNo.of tokens with stop words:\n",len(words_from_corpus))
print("\nNo.of tokens without stop words:\n",len(without_stop_words))

#print("\n",without_stop_words)


Total stopwords to be removed :  779

No.of tokens with stop words:
 3809034

No.of tokens without stop words:
 1816856


In [18]:
#Listing the top 20 most frequent words after removal of stop words
#Using the FreqDist module of nltk

freqdist_filtered_words = FreqDist(without_stop_words)
filtered_words_table = pd.DataFrame(freqdist_filtered_words.most_common(20), columns = ['Word', 'Freq'])
filtered_words_table.index += 1
print(filtered_words_table)


            Word   Freq
1      agreement  43617
2          party  32826
3        parties  13509
4        section  13292
5        company  12390
6    information  10920
7        product  10756
8           date  10116
9       products   8168
10        rights   8048
11      services   7861
12    applicable   7533
13      business   7254
14           set   6981
15  confidential   6866
16       written   6799
17         terms   6714
18         right   6676
19        notice   6655
20          term   6574


In [20]:
# LEXICAL DENSITY - type/token ratio when we use only word tokens without Stop Words

lexical_density = (len(set(without_stop_words))/len(tokens))
print("Lexical Density : ", lexical_density)


Lexical Density :  0.005186670290858117


In [21]:
#(g)Computing the pair of Bigrams excluding stop words and punctuations

#nltk.bigrams() – extracts lists of word pairs from a given text
bigrams = list(nltk.bigrams(without_stop_words))
#print(bigrams)

#Using the FreqDist module of nltk
bigrams_freq = FreqDist(bigrams)
most_freq_table = pd.DataFrame(list(bigrams_freq.most_common(20)), columns = ['bigram','freq'])
most_freq_table.index += 1
print("The list of top 20 most occouring bigrams")
print(most_freq_table)

The list of top 20 most occouring bigrams
                         bigram  freq
1   (confidential, information)  3603
2      (intellectual, property)  2920
3             (effective, date)  2838
4             (written, notice)  2387
5           (terms, conditions)  2087
6                (set, section)  1825
7              (prior, written)  1814
8             (term, agreement)  1708
9     (confidential, treatment)  1530
10     (termination, agreement)  1439
11             (parties, agree)  1417
12       (securities, exchange)  1410
13           (receiving, party)  1367
14               (party, party)  1363
15          (pursuant, section)  1353
16           (written, consent)  1330
17             (united, states)  1265
18            (applicable, law)  1249
19           (agreement, party)  1215
20           (terms, agreement)  1202
