In [1]:
import os
import nltk
import glob
import pandas as pd

from string import punctuation
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

from collections import Counter

## Reading Files, getting sentence tokens and converting them to word tokens

In [2]:
# Sort the matches: glob.glob returns paths in arbitrary, OS-dependent order,
# which would make the token order (and the tie-breaking among equally
# frequent tokens further down) vary between machines and runs.
all_files = sorted(glob.glob("CUAD_v1/full_contract_txt/*.txt"))
len(all_files)

510

In [3]:
def readFiles(all_files):
    """Read every file in ``all_files`` and return one flat list of word tokens.

    Each file is opened as UTF-8 and split into sentences with
    ``sent_tokenize``; every sentence is handed to ``getTokens`` and the
    per-sentence results are concatenated in file order, then sentence order.
    """
    collected = []
    for path in all_files:
        with open(path, encoding='utf-8') as handle:
            text = handle.read()
            for sentence in sent_tokenize(text):
                collected += getTokens(sentence)
    return collected

In [4]:
def getTokens(sent_token):
    """Return the lower-cased word tokens of one sentence.

    The sentence is split with ``word_tokenize``; token order is preserved.
    """
    return [word.lower() for word in word_tokenize(sent_token)]

In [5]:
# Tokenize the whole corpus into one flat list of lower-cased tokens
all_tokens = readFiles(all_files)

In [6]:
len(all_tokens)

# Number of tokens - 4789850 (tokens were converted to lower case first)

4789850

## Writing all tokens to output.txt

In [7]:
# Write one token per line. The contracts are read as UTF-8, so the output is
# written as UTF-8 too: relying on the platform default encoding can raise
# UnicodeEncodeError on non-ASCII tokens.
with open('output.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_tokens))

## Counting all unique tokens

In [8]:
# Frequency table: maps each distinct token (type) to its number of occurrences
count_tokens = Counter(all_tokens)
len(count_tokens)

# Number of token types (distinct tokens) - 45883

45883

## Creating a list of unique tokens, sorted by decreasing frequency

In [9]:
def sort_tokens(d, reverse = False):
  """Return a new dict with the items of ``d`` ordered by value.

  Values are sorted ascending by default, descending when ``reverse`` is
  true. Items with equal values keep their original relative order.
  """
  pairs = list(d.items())
  pairs.sort(key = lambda pair: pair[1], reverse = reverse)
  return dict(pairs)

# Frequency table ordered from most to least frequent token
sorted_tokens = sort_tokens(count_tokens, reverse = True)

# (token, count) pairs in that order; display the first 20
sorted_tokens_list = [pair for pair in sorted_tokens.items()]
sorted_tokens_list[:20]

[('the', 257132),
 (',', 240576),
 ('of', 156122),
 ('to', 129875),
 ('and', 129054),
 ('.', 117513),
 ('or', 105155),
 ('in', 79933),
 (')', 78092),
 ('(', 75436),
 ('*', 67765),
 ('any', 62236),
 ('--', 58711),
 ('a', 51002),
 ('shall', 48794),
 ('by', 44311),
 ('agreement', 43622),
 ('this', 39987),
 ('be', 39701),
 ('for', 38724)]

## Writing all unique tokens and their frequencies to tokens.txt, sorted by decreasing frequency

In [10]:
# One "(token, count)" tuple repr per line, most frequent first.
# The lines are joined in a single pass (repeated `temp = temp + ...` keeps
# re-copying the growing string), and the file is written as UTF-8 so that
# non-ASCII tokens cannot fail under a non-UTF-8 platform default encoding.
temp = ''.join(str(token_freq) + "\n" for token_freq in sorted_tokens_list)

with open('tokens.txt', 'w', encoding='utf-8') as f:
    f.write(temp)

In [11]:
# number of unique tokens - 45883

# Number of distinct token types: one entry per type in the sorted frequency table
unique_tokens = len(sorted_tokens)
unique_tokens

45883

## Calculating type-to-token ratio for corpus

In [12]:
# type/token ratio - 0.009579214380408572

# Distinct token types divided by total token occurrences.
# all_tokens still contains punctuation tokens at this point.
type_token_ratio = unique_tokens/len(all_tokens)

type_token_ratio

0.009579214380408572

## Counting unique tokens that appear only once

In [13]:
# tokens appeared only once - 19649

# Keep every token whose corpus frequency is exactly 1
tokens_only_once = [token for token, freq in sorted_tokens.items() if freq == 1]

len(tokens_only_once)

19649

## Extracting only words from the list of tokens, removing punctuation, symbols, etc.

In [14]:
# Trim leading/trailing punctuation from every token, then keep only the
# results that consist entirely of alphabetic characters.
all_word_tokens = [
    word
    for word in (raw.strip(punctuation) for raw in all_tokens)
    if word.isalpha()
]

len(all_word_tokens)

# number of words - 3841868

3841868

## Lexical Diversity - type-to-token ratio after removing punctuation, symbols, etc.

In [15]:
# Distinct words divided by total word occurrences
len(set(all_word_tokens)) / len(all_word_tokens)


0.006610065728442518

## Getting Top 20 most frequent words 

In [16]:
def sort_dict_by_value(d, reverse = False):
  """Return a new dict holding the items of ``d`` ordered by value.

  Ascending by default; descending when ``reverse`` is true. Items with equal
  values keep their original relative order.
  """
  # NOTE(review): behaves the same as sort_tokens defined earlier in this notebook.
  ordered = {}
  for key, value in sorted(d.items(), key = lambda item: item[1], reverse = reverse):
    ordered[key] = value
  return ordered


In [17]:
def getTopNMostFrequent(tokens_list, n):
    """Print the ``n`` most frequent items of ``tokens_list`` and return all counts.

    Each printed line has the form ``item | count `` (note the trailing space).
    The returned dict maps every distinct item to its count, ordered from most
    to least frequent.
    """
    frequencies = sort_dict_by_value(Counter(tokens_list), True)
    for item, count in list(frequencies.items())[0:n]:
        print ("{} | {} ".format(item, count))
    return frequencies

# Print the 20 most frequent words and keep the full frequency-ordered count dict
sorted_word_tokens = getTopNMostFrequent(all_word_tokens, 20)


the | 257141 
of | 156123 
to | 129875 
and | 129072 
or | 105168 
in | 79944 
any | 62236 
a | 51447 
shall | 48794 
by | 44311 
agreement | 43638 
this | 39989 
be | 39701 
for | 38724 
such | 36173 
with | 33884 
as | 32910 
party | 32831 
that | 27654 
other | 26395 


## Calculating Lexical Diversity - type-to-token ratio for words only

In [18]:
# Lexical Diversity - type-to-token ratio for words only - 0.00661

# sorted_word_tokens has one entry per distinct word, so its length is the type count
unique_words_count = len(sorted_word_tokens)

# Distinct words divided by total word occurrences
type_token_ratio_words = unique_words_count/len(all_word_tokens)
type_token_ratio_words

0.006610065728442518

## Excluding stopwords from list of words

In [19]:
def removeStopWords(all_word_tokens, stop_words_path='StopWords.txt'):
    """Return the tokens of ``all_word_tokens`` that are not stop words.

    The stop-word set is the union of NLTK's English stop-word list and the
    tokens found in the file at ``stop_words_path``. Matching is
    case-insensitive: both the file's entries and each candidate token are
    lower-cased before the lookup.

    Args:
        all_word_tokens: iterable of string tokens to filter.
        stop_words_path: path of a UTF-8 text file with additional stop words;
            its content is split with ``word_tokenize``.

    Returns:
        A new list with the surviving tokens, in their original order and
        original casing.
    """
    with open(stop_words_path, encoding='utf-8') as info:
        # Lower-case the file entries: candidates are lower-cased before the
        # lookup below, so a capitalised entry could otherwise never match.
        stop_words_file = [word.lower() for word in word_tokenize(info.read())]

    stop_words = set(stopwords.words('english'))
    stop_words.update(stop_words_file)

    return [w for w in all_word_tokens if w.lower() not in stop_words]

In [20]:
# Drop stop words (NLTK English list plus the entries of StopWords.txt)
words_without_stopwords = removeStopWords(all_word_tokens)
    
# Word counts before and after stop-word removal
len(all_word_tokens), len(words_without_stopwords)

# (3841868, 1818953) - with stopwords from both file and nltk

(3841868, 1818953)

## Getting top 20 most frequent words after removing stopwords

In [21]:
# Prints the top 20; the returned dict holds counts for ALL remaining words, not just 20
top20_frequent_words = getTopNMostFrequent(words_without_stopwords, 20)

agreement | 43638 
party | 32831 
parties | 13511 
section | 13292 
company | 12403 
information | 10923 
product | 10766 
date | 10127 
products | 8169 
rights | 8049 
services | 7866 
applicable | 7533 
business | 7255 
set | 6984 
confidential | 6881 
written | 6799 
terms | 6714 
right | 6676 
notice | 6655 
term | 6575 


## Calculating Lexical density - type-to-token ratio when using only word tokens without stopwords

In [22]:
# Lexical density - 0.0136199

# NOTE(review): the value computed here is a type-to-token ratio over the
# non-stopword words; "lexical density" usually names the share of content
# words among all words - confirm which measure is intended.
count_words_without_stopwords = Counter(words_without_stopwords)

# Distinct non-stopword words (types) and total non-stopword occurrences (tokens)
type_words_count = len(count_words_without_stopwords)
words_count = len(words_without_stopwords) 

lexical_density = type_words_count/words_count

lexical_density

0.013619923109613057

In [23]:
# Get all bigrams from the text after removing stopwords and punctuation

def getBiGrams(all_files):
    """Return the corpus bigrams built from alphabetic, non-stopword tokens.

    The files are read and tokenized again with ``readFiles``. Tokens that are
    not purely alphabetic are dropped, the remainder is filtered with
    ``removeStopWords``, and the result of ``nltk.bigrams`` over the surviving
    token sequence is returned as-is.
    """
    alphabetic_tokens = []
    for token in readFiles(all_files):
        if token.isalpha():
            alphabetic_tokens.append(token)
    content_tokens = removeStopWords(alphabetic_tokens)
    return nltk.bigrams(content_tokens)

In [24]:
# Materialise the bigrams into a list before counting them
biGrams = list(getBiGrams(all_files))
# Prints the top 20; the returned dict holds counts for ALL bigrams, not just 20
top20_frequent_bigrams = getTopNMostFrequent(biGrams, 20)

('confidential', 'information') | 3604 
('intellectual', 'property') | 2921 
('effective', 'date') | 2839 
('written', 'notice') | 2387 
('terms', 'conditions') | 2087 
('set', 'section') | 1825 
('prior', 'written') | 1814 
('term', 'agreement') | 1709 
('confidential', 'treatment') | 1534 
('termination', 'agreement') | 1440 
('parties', 'agree') | 1417 
('securities', 'exchange') | 1410 
('receiving', 'party') | 1367 
('party', 'party') | 1363 
('pursuant', 'section') | 1353 
('written', 'consent') | 1330 
('united', 'states') | 1265 
('applicable', 'law') | 1249 
('agreement', 'party') | 1215 
('terms', 'agreement') | 1202 
