# Import modules & Directory config
These are Python modules which are needed (most of them) for the cleansing to happen

In [1]:
# Import standard modules
from glob import glob
from collections import defaultdict
import re
import numpy as np
import pandas as pd
import os
import time
import seaborn as sns
import pickle
import scipy
from tqdm import tqdm
from multiprocessing.dummy import Pool as ThreadPool 
from multiprocessing import cpu_count
import threading
import gc

# Import NLP modules for pre-processing
import spacy
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Enable these if you have not downloaded nltk packages before
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# Set notation of values 
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Multithread processing
num_threads = threading.activeCount()
print("# of threads for multiprocessing:", cpu_count())

# Assign multithread processing
pool = ThreadPool(cpu_count()) 

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dangj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dangj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dangj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# of threads for multiprocessing: 8


## Set a working directory 

In [2]:
# Set working directory, i.e where the data is located
directory = "D:\Python\Thesis\data"

os.chdir(directory)

# Data parsing functions
Functions mainly to create an identifier and normalise some of the text data

In [3]:
def left(s, amount):
    """
    Inputs: s - a string, amount - an integer
    Output: returns characters of a string, starting from the left
    Example: 
    s = 'string'
    amount = 3
    print(left(s,amount))
    # 'str'
    """
    return s[:amount]

def right(s, amount):
    """
    Inputs: s - a string, amount - an integer
    Output: returns characters of a string, starting from the right
    Example: 
    s = 'string'
    amount = 3
    print(right(s,amount))
    # 'ing'
    """
    return s[-amount:]

def process_token(token):
    """lower cases tokens"""
    return token.lemma_.lower()

nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
regex = re.compile('[^a-zA-Z]')

def alignment_pipeline(clause):
    tokens = nlp(clause)
    
    return ' '.join(regex.sub('', process_token(token)) for token in tokens if not token.ent_type_)

# Creating the data in dict format and store locally

In [4]:
#def create_data(pickl, location):
    """
    Input: pickl: a boolean value, True = saves dictionary to a .pickle format
           location: a string value, directory of the data, accept wild cards and must include file itself
    Output: a dictionary that has preprocessed clauses.
    notes: example location: 'selected_contracts/warrant/*.clauses'
    """
    start = time.perf_counter()
    
    raw_clauses = defaultdict(list)
    count = 0
    files = glob(location, recursive=True)
    
    for file_path in tqdm(files, position = 0, desc = "Compiling contracts"):
        size = os.stat(f"{file_path}").st_size
        
        if size > 0:
            with open(file_path, encoding='utf-8') as f:
                for line in f:
                    # Obtain/create the contract ID rather than the full directory
                    contract = left(file_path, (len(file_path) - len(right(file_path, len('.fclauses')))))
                    contract = contract.split("\\" )[2]
                    
                    # Next iteration
                    count += 1
            
                    # append ID and the clauses
                    raw_clauses[contract].append(line.strip())
    
    
    # list to store clauses
    clauses = []
    
    for key in tqdm(raw_clauses.keys(), position = 0, desc = "Processing contracts"):
        for i in range(len(raw_clauses[f"{key}"])):
            
            # repeated alignment for extra white spaces
            raw_clauses[f"{key}"][i] = alignment_pipeline(raw_clauses[f"{key}"][i])
            #raw_clauses[f"{key}"][i] = alignment_pipeline(raw_clauses[f"{key}"][i])
            clauses.append(raw_clauses[f"{key}"][i])
            
    
    if pickl == True:
        pickle_out = open("full_raw_data_parse2.pickle", "wb")
        pickle.dump(dict(raw_clauses), pickle_out)
        print("Dictionary has been pickled to:", os.getcwd() +"/full_raw_data_parse2.pickle")
    else:
        print("Dictionary has not been pickled")
        pass
  
    print("Time elapsed in seconds: ", time.perf_counter() - start)
    
    return dict(raw_clauses), clauses

In [None]:
#raw_data, raw_clauses = create_data(pickl = True, location = 'D:/Python/Thesis/selected_contracts/*/*.fclauses')

# Data Cleansing

- Remove duplicate contracts
- Remove "empty" clauses
- Remove very short clauses
- Remove contracts that are considered as outliers/or been incorrectly parsed (all clauses in one list or too many clauses)

## Removing duplicate contracts
Approach here is to concatenate clauses together and compute a measure of similarity (cosine similarity, 0 = different, 1 = identical) apply a threshold and then remove duplicates

In [None]:
def simplify_contracts(dictionary):
    """
    Input: dictionary, A dictionary of the contract data
    Output: processed_data , A dictionary which has a key, and a concatenated string of clauses from dictionary. 
    
    Notes: Simplifies contracts, a contract is defined to be a dictionary with an ID for the contract as the key
           and multiple values in the form of lists containing strings. simplify_contracts() concatenates the lists
           into a single list.
    """
    start = time.perf_counter()
    processed_data = defaultdict(list)
    contract_list= []
    
    for key in tqdm(dictionary.keys(), position = 0, desc = 'Preparing for duplicate detection'):

        concat_clauses = ''
        for clause in range(len(dictionary[f"{key}"])):
            
            concat_clauses = concat_clauses + dictionary[f"{key}"][clause]
                       
        processed_data[f"{key}"].append(concat_clauses)
        #processed_data[clause] = processed_data.pop(f"{key}")
        # for some reason the order gets changed temporarily
        contract_list.append(concat_clauses)
            
    print("Time elapsed in seconds: ", time.perf_counter() - start)    
    
    return dict(processed_data), contract_list

In [None]:
process_data, process_contract_list = simplify_contracts(raw_data)

In [None]:
def dedupe(dictionary, process_data_dict, process_contract_list, thresh, verbose):
    """
    Inputs: dictionary, a dictionary object of contracts
            process_data_dict, a dictionary object of processed contracts from 1st object returned by simplify_contracts()
            process_contract_list, a list object of processed contracts from 2nd object returned by simplify_contracts()
            thresh, a positive real number between [0,1]; determines what contracts should be determined as a duplicate
            verbose, a boolean value to turn on/off progress
            
    Output: cleansed_data, a dictionary that has the duplicate contracts removed   
    """
    # Used for giving the run time of function
    start = time.perf_counter()

    # Create class of tfidf packages
    vectorizer = TfidfVectorizer()
    count_vectorizer = CountVectorizer(stop_words='english')

    # Apply count vectorizer to the contract list
    if verbose == True:
        print("Fitting and Transforming TF-IDF onto dataset...")
    else:
        pass
    
    count_vec = count_vectorizer.fit_transform(process_contract_list)
 
    # Create list of contract ids
    contract_ids = list(process_data.keys())
   
    # Cosine similarity between contracts
    
    sim_matrix = [] # placeholder to store similarities
    
    # Need to loop over each document row and compute similarity due to memory restrictions
    for i in tqdm(range(count_vec.shape[0]), position = 0, desc = 'Computing Cosine Similarity' ):
        sim = cosine_similarity(count_vec[0,:].toarray(), count_vec[i,:].toarray())[0][0]
        sim_matrix.append(sim)   
    
    sim_matrix = np.array(sim_matrix[1:]) # subset from index 1 and above because we'll get rid of the original if we index all rows
    
    # Thresholding the similarity
    # Change thresholding the data, initial value is 0.9
    
    if verbose == True:
        print("Thresholding...")
    else:
        pass

    sim_matrix[sim_matrix >= thresh] = 1

    # Check to see how many dupes there are
    count_dupe = np.sum(sim_matrix[sim_matrix == 1])
  
    if verbose == True:
        print("# of duplicate contracts: ", count_dupe)
    else:
        pass
     
    # Create list of duplicate contract ids
    dupe = np.where(sim_matrix >= 1)[0]
    dupe = dupe.tolist()
    dupe_contracts = [contract_ids[i] for i in dupe]
    
    if verbose == True:
        print("Deduping in-place...")
    else:
        pass
    
    for key in dupe_contracts:
        
        dictionary.pop(f"{key}")
       
    if verbose == True:
        print("dictionary has been de-duped in place, no need to store in variable")
    else:
        pass
    
    print("Time elapsed in mins: ", (time.perf_counter() - start)/60) 
    
    return 
 
# Start de-duping the data in-place
dedupe(raw_data, process_data, process_contract_list, 0.8, True) 

## Removing empty clauses
Approach here is that since this dictionary has keys, and values are lists, the list length must be 0 so we remove these values

In [None]:
new_dict = defaultdict(list)
for key in tqdm(raw_data.keys(), position = 0, desc = 'Removing Empty Clauses'):
    for i in range(len(raw_data[f"{key}"])):
        if len(raw_data[f"{key}"][i]) == 0:
            pass
        else:
            new_dict[f"{key}"].append(raw_data[f"{key}"][i])  

# THE DATA IS NOW HELD IN A VARIABLE CALLED new_dict
#pd.DataFrame.from_dict(new_dict, 'index').head(5)

## Removing very short/long clauses
Here we remove entries where there clauses don't meet an adequate length, in this case, clauses are removed if the are $\not\in [10,100]$ words

In [None]:
new_dict2 = defaultdict(list)
start = time.perf_counter()

# Define a threshold which contributes to a clause
word_count_min = 10
word_count_max = 100
# Define the tokenizer - using nltk since spacy is slower 
tokenizer = nltk.RegexpTokenizer('\s+', gaps=True)

for key in tqdm(new_dict.keys(), position = 0, desc = "Removing very short/long clauses"):
    for i in range(len(new_dict[f"{key}"])):
        if word_count_min < len(tokenizer.tokenize(new_dict[f"{key}"][i])) <= word_count_max:
            # Put clauses of word length between [10,100] in
            new_dict2[f"{key}"].append(new_dict[f"{key}"][i])

        else:
            # Do not put long ones in
            pass
print("Time elapsed in mins: ", (time.perf_counter() - start)/60)
#pd.DataFrame.from_dict(new_dict2, orient='index').head(5)

## Resolving contracts that are considered as outliers
We look at how many 'clauses' each contract has and then decide how resolve the outliers

In [None]:
df = pd.DataFrame(data = np.zeros(len(new_dict2)) , index = new_dict2.keys(), columns = ["Clauses"])

for key in new_dict2.keys():
    df.loc[f"{key}"] = len(new_dict2[f"{key}"])
    
df.head()

# Create list of outliers where we have < 3 clauses
clause_thresh = 3
outlier_list = list(df[df.Clauses <= clause_thresh].index)

df = df.drop(df[df.Clauses <= 3].index)

while True:
    outliers = list(df[df['Clauses'] > df['Clauses'].mean() + 3 * df['Clauses'].std()].index)
    #print(outliers)
    
    if len(outliers) > 0:
        # Keep a copy of a list of outliers to remove from dictionary
        outlier_list.append(outliers[0])
        # Remove from the data frame
        df = df.drop(labels = outliers)
    else:
        break
        
# Boxplot of the # of clauses        
sns.boxplot(x = df['Clauses'])
print(df.describe())

# List of outliers
print("# of Outliers detected:", len(outlier_list))

In [None]:
# Here we truncate the outliers to the median value

median = int(np.round(df.median()[0],0))

for outlier in outlier_list:
    new_dict2[f"{outlier}"] = new_dict2[f"{outlier}"][0:median]
    
#pd.DataFrame.from_dict(new_dict2, orient='index').head()

### Export cleansed data - NO SUMMARISATION APPLIED

In [None]:
# Save the cleansed dataset
#pickle_out = open("full_cleansed_raw_data_parse2.pickle", "wb")
#pickle.dump(new_dict2, pickle_out)

# Text summarisation

In [5]:
pickle_in = open("full_cleansed_raw_data_parse2.pickle", "rb")
new_dict2 = pickle.load(pickle_in)

In [6]:
clauses_list = []
cls = 0
for key in new_dict2.keys():
    for clause in range(len(new_dict2[f"{key}"])):
        clauses_list.append(new_dict2[f"{key}"][clause])
        cls+=1

## Use TFIDF to summarise clauses and reduce columns

In [7]:
def tfidf_summarisation(clause_list, new_dict, n, pickl):
    """
    Inputs: clause_list: a list object, which contains clauses at every index
            new_dict: a dictionary object, A dictionary of the contract data
            n: a natural number, used to apply thresholding for top n words
            pickle: a boolean, used to pickle dictionary object
            
    Output: new_dict3: a dictionary object where the clauses are summarised via tf-idf
    """
    summarised_clauses = []
    
    # Create class of tfidf packages
    print("Fitting and Transforming TF-IDF onto dataset...")
    vectorizer = TfidfVectorizer()
    count_vectorizer = CountVectorizer(stop_words='english')

    # Apply count vectorizer to the contract list
    count_vec = count_vectorizer.fit_transform(clause_list)

    # Create dataframe containing contracts x words - due to memory errors need to loop can't instantly create matrix
    
    for i in tqdm(range(len(clause_list)), position = 0, desc = "Summarising clauses"):
        # look at every row and obtain the full format i.e dense form, 
        # sort in descending order, lock index at 0 to bypass 'by' in sort_values()
        row_slice = count_vec[i,:].todense()
        row_slice = pd.DataFrame(row_slice, columns = count_vectorizer.get_feature_names())
        row_slice = row_slice.iloc[0,:].sort_values(ascending = False)
        
        # obtain top n tf-idf words as a list
        row_topn = ' '.join(row_slice[0:n].index.to_list())
        
        # append to summarised_clauses
        summarised_clauses.append(row_topn)
    
    
    # Build a new dict for the summarised clauses
    new_dict3 = defaultdict(list)
    
    idx = 0
    for key in tqdm(new_dict.keys(), position = 0, desc = "Creating summarised dict"):
        for clause in range(len(new_dict[f"{key}"])):
            
            new_dict3[key].append(summarised_clauses[idx])
            idx +=1
    
    if pickl == True:
        pickle_out = open("full_summarised_data_parse2.pickle","wb")
        pickle.dump(new_dict3, pickle_out)
        print("Dictionary has been pickled to:", os.getcwd() +'/full_summarised_data_parse2.pickle')
    else:
        print("Dictionary has not been pickled")
        pass

    return new_dict3

new_dict3 = tfidf_summarisation(clauses_list, new_dict2, 3, True)

Fitting and Transforming TF-IDF onto dataset...


Summarising clauses: 100%|██████████████████████████████████████████████████████████████| 565235/565235 [5:51:17<00:00, 26.82it/s]
Creating summarised dict: 100%|█████████████████████████████████████████████████████████| 50106/50106 [00:00<00:00, 235994.33it/s]


Dictionary has been pickled to: D:\Python\Thesis\data/full_summarised_data_parse2.pickle


# Load up the processed files

In [8]:
# Load full cleansed data
f_data = open("full_cleansed_raw_data_parse2.pickle",'rb')
full_data = pickle.load(f_data)

# Load summarised data
summarised_data = open("full_summarised_data_parse2.pickle",'rb')
summ_data = pickle.load(summarised_data)

# Memory saver
gc.collect()

0

# Create and save sampled version of full data

In [10]:
def Sampler(Data, p, seed, replacement, save, loc, data_type):
    """
    Inputs: Data = a dict object, containing a key (contract id) and it's values (clauses)
            p = a float object, a number between 0-1 to represent the % of data sampled
            seed = an int object, a number to represent the random state for reproducible results
            replacement = a boolean object, to sample with or without replacement
            save = a boolean object, determines to save the file or not
            loc = a string object, location of where to save file
            data_type = string object, whether the file to save is the full or summarised version
            
    Outputs: a dictionary version of the sampled data
    """
    print("Reading DataFrame from Dictionary...")
    df = pd.DataFrame.from_dict(data = Data, orient = 'index')
    print("Sampling DataFrame with parameters:", "% sampled:", p*100, "random state:", seed)
    df = df.sample(frac = p, replace = replacement, random_state = seed).T
    
    if save == True and type(loc) == str:
        print("Saving dictionary...")
        pickle_sample_out = open(f"{loc}" + f"{data_type}_" +"sample_" + f"{p*100}" + ".pickle","wb")
        pickle.dump(df.to_dict(orient = 'index'), pickle_sample_out)
        print("Save completed!")
    else:
        pass
    
    return

In [15]:
# Sampling summarised data
Sampler(Data = summ_data, 
        p = 0.01, 
        seed = 1234, 
        replacement = True,
        save = True,
        loc = "D:\Python\Thesis\data_samples\\",
        data_type = "summ"
       )
# Sampling full data

Sampler(Data = full_data, 
        p = 0.01, 
        seed = 1234, 
        replacement = True,
        save = True,
        loc = "D:\Python\Thesis\data_samples\\",
        data_type = "full"
       )

# Memory saver
gc.collect()

Reading DataFrame from Dictionary...
Sampling DataFrame with parameters: % sampled: 1.0 random state: 1234
Saving dictionary...




Save completed!
Reading DataFrame from Dictionary...
Sampling DataFrame with parameters: % sampled: 1.0 random state: 1234
Saving dictionary...
Save completed!


14

# Create and Export ratings matrix

In [3]:
def ratings_matrix(dictionary, to_df, transpose, fill_val):
    """
    Inputs: dictionary: dictionary object, which should be in the form of:
            key = contract ID, 
            value = clauses
            
            to_df: a boolean value, decides to output to pd.DataFrame object or numpy array
            tranpose: a boolean value, decides to transpose the matrix or not (for sklearn)
            fill_val: an int or np.nan object, decides what to fill matrix entries with

    Output: ratings_matrix: pd.DataFrame object, which is in the form of:
            rows = contracts, 
            columns = unique clauses,
            values = 1 if there is a clause in a contract, else 0
    
    Notes: ratings_matrix() transforms a dictionary to a pd.DataFrame object with the form:
           rows = contract ID
           columns = clause #
           cell value = clause (i.e the string)
           
           Then transforms into the ratings_matrix  
    """
    
    # Need to unpivot the df down to a very transactional form 
    print("Unpivoting...")
    df_to_melt = pd.DataFrame.from_dict(dictionary, orient = 'columns').T
    melt_df = pd.melt(df_to_melt)
    
    # Create a column in melt_df to use as a counter to aggregate on
    melt_df['count'] = 1
    gc.collect()
    print("Creating Ratings Matrix")
    ratings_matrix = pd.pivot_table(data = melt_df, 
                                    values = 'count', 
                                    columns = 'value',
                                    index = 'variable',
                                    fill_value = fill_val,
                                   )
    print("Matrix created of size:", ratings_matrix.shape)
    
    if to_df == False:
        print("Matrix is a numpy object")
        ratings_matrix = ratings_matrix.to_numpy()
    else:
        print("Matrix is a pandas object")
        #return ratings_matrix
    
    if transpose == True:
        ratings_matrix = ratings_matrix.T
        print("Matrix is transposed of size:", ratings_matrix.shape)
    
    else:
        print("Matrix not transposed")
    
    return ratings_matrix

In [17]:
# Load sampled summarised data
sampled_data = open("D:\Python\Thesis\cf_ready_data\samples\\summ_sample_5.0.pickle",'rb')
sampled_x = pickle.load(sampled_data)

# Load sampled full data
sampled_f_data = open("D:\Python\Thesis\cf_ready_data\samples\\full_sample_5.0.pickle",'rb')
sampled_f_x = pickle.load(sampled_f_data)

# Memory saver
gc.collect()

0

In [18]:
rating_matrix_df = ratings_matrix(sampled_x, to_df = True, transpose = False, fill_val = 0)
rating_matrix = ratings_matrix(sampled_x, to_df = False, transpose = False, fill_val = 0)
rating_matrix_orig = ratings_matrix(sampled_f_x, to_df = True, transpose = False, fill_val = 0)
gc.collect()

Unpivoting...
Creating Ratings Matrix
Matrix created of size: (2446, 23454)
Matrix is a pandas object
Matrix not transposed
Unpivoting...
Creating Ratings Matrix
Matrix created of size: (2446, 23454)
Matrix is a numpy object
Matrix not transposed
Unpivoting...
Creating Ratings Matrix
Matrix created of size: (2446, 25351)
Matrix is a pandas object
Matrix not transposed


35

## Export ratings matrix data to apply CF

In [19]:
# Export DF version
rating_matrix_df.to_pickle("D:\Python\Thesis\cf_ready_data\\" + "X" + "5_df.pickle")

# Export Matrix version
rating_matrix_loc = open("D:\Python\Thesis\cf_ready_data\\" + "X" + "5.pickle","wb")
pickle.dump(rating_matrix, rating_matrix_loc)

# Export orig DF version
rating_matrix_orig.to_pickle("D:\Python\Thesis\cf_ready_data\\" + "X" + "5_orig.pickle")