  # Omidyar Extractives Project 1
## Extract Contract Text (Notebook 7 of 8)
### Hash-based partitition function for segmenting documents prior to clustering

In [33]:
from lib.py_rabin import rabin_partition, example_rabin_partition
import re
import time
import subprocess
import cPickle as pickle
import pandas as pd
import community
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [171]:
# remove other characters
char_to_remove = set(['.',',',';',':','-','_','[',']','&','`','@','*','^','|','~',';',':','\'','\"',">","<"]) 
def longstr_clean(longstr):
    longstr = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', longstr) # remove all non-printable characters
    longstr = re.compile(r'<.*?>').sub('', longstr) # strip html markup, e.g. <br>,<div>, etc...
    longstr = longstr.replace("&nbsp",'').replace("&lt;",'').replace("&gt;",'').replace("\\","") 
    longstr = ''.join(i for i in longstr if ord(i)<128 and i not in char_to_remove)
    longstr = longstr.replace('\r','').replace('\n','').replace('\t','')
    longstr = " ".join(longstr.split()).lower() # remove whitespace
    return longstr

In [208]:
#df = pd.read_pickle('contract_data/openland_contracts_with_text.pkl') # ~200 
df = pd.read_pickle('contract_data/resource_contracts_with_text.pkl') # ~1500
print('\nTotal documents: %d' % len(df))

df.columns = [k.lower().replace(" ","_") for k in df.columns]
df["contract_text_clean"] = df.contract_text.apply(longstr_clean) 
df.columns


Total documents: 1496


Index([u'ocid', u'category', u'contract_name', u'contract_identifier',
       u'language', u'country_name', u'resource', u'contract_type',
       u'signature_date', u'document_type', u'government_entity',
       u'government_identifier', u'company_name', u'company_address',
       u'jurisdiction_of_incorporation', u'registration_agency',
       u'company_number', u'corporate_grouping', u'participation_share',
       u'open_corporates_link', u'incorporation_date', u'operator',
       u'project_title', u'project_identifier', u'license_name',
       u'license_identifier', u'source_url', u'disclosure_mode',
       u'retrieval_date', u'pdf_url', u'deal_number', u'contract_note',
       u'matrix_page', u'annotation_category', u'annotation_text',
       u'contract_text', u'contract_text_clean'],
      dtype='object')

In [209]:
start = time.time()
flag  = "characters" 
fpraw = '_docRabinChunks'
fpsorted  = '_docRabinChunks_sorted'

# ----------------------------------
# CREATE GRAPH NODES == CONTRACTS
# 1. scrub of non-ascii characters and remove all spaces
# 2. identify file markers via 'rabin fingerprint'
# 3. break up file marker-2-marker and sort by docFingerprint
G = nx.Graph() 
with open(fpraw,'w') as fp:
    for k in xrange(len(df.index)):
        a = df.contract_text_clean.iloc[k]  # clean up document 
        if len(a) < 100: continue
        b = rabin_partition(a)              # partition document via rabin fingerprint
        c = set(b)
        
        # add dresocument node to graph; 
        # store total doc length in char, total chunks a number of unique chunks; and docid
        G.add_node(k, 
                   ocid = df.ocid.iloc[k],
                   n_chunks = len(c),
                   n_chrs   = sum([len(ch) for ch in c]),
                       )
        d = [(i,k) for i in c]
        fp.writelines([str(i)+'|'+str(k)+'\n' for i in list(c)])
print('Time to import, clean, and fingerprint documents: %s seconds' % str(time.time()-start) )
        
# in-memory pre-sort for all document chunks for faster many-many comparison 
cmd = 'sort -k1 -S2G ' + fpraw + ' > ' + fpsorted
subprocess.call(cmd,shell=True)
print('Time to sort document fingerprints: %s seconds' % str(time.time()-start) )

# ----------------------------------
# UPDATE GRAPH WITH EDGE INFORMATION, i.e. contract similarity
with open(fpsorted,'r') as fp:
    init=True
    
    # read line by line
    for line in fp:
        chunk,k = line.strip('\n').split('|')
        
        # initialize on iteration
        if init: 
            tmp = []
            prev_chunk = chunk
            init = False

        # collect docs that share the same chunk (recall: chunks are sorted)
        if len(prev_chunk) == len(chunk) and prev_chunk == chunk:
            tmp.extend([int(k)])
        
        # write which files share the identified chunks    
        else:
            if len(tmp)>1:
                for m in sorted(tmp):
                    for n in sorted(tmp): 
                        if n>m: 
                            # first occurence of edge
                            if ~G.has_edge(m,n):    
                                if flag=="chunks":
                                    G.add_edge(m,n,
                                               n_chunks_matched = 1,
                                               n_chunks_min = float(min(G.node[m]['n_chunks'],G.node[n]['n_chunks'])))
                                    G[m][n]['weight'] = G[m][n]['n_chunks_matched'] / G[m][n]['n_chunks_min']
                                else:
                                    G.add_edge(m,n,
                                               n_chrs_matched = len(prev_chunk),
                                               n_chrs_min = float(min(G.node[m]['n_chrs'],G.node[n]['n_chrs'])))
                                    G[m][n]['weight'] = G[m][n]['n_chrs_matched'] / G[m][n]['n_chrs_matched']
                            # update existing edge
                            else:
                                if flag=="chunks":
                                    G[m][n]['n_chunks_matched'] += 1
                                    G[m][n]['weight'] = G[m][n]['n_chunks_matched'] / G[m][n]['n_chunks_min']
                                else:
                                    G[m][n]['n_chrs_matched']  += len(prev_chunk)
                                    G[m][n]['weight'] = G[m][n]['n_chrs_matched'] / G[m][n]['n_chrs_min']
      
            tmp=[]
        prev_chunk = chunk 
print('Time to create networkx: %s seconds'%str(time.time()-start))

# Identify 'oneoff' documents 
oneoffs  = identifyIsolates(G)        # identify isolated docs
if oneoffs:
    Goneoffs = nx.subgraph(G,oneoffs) # keep info in a separate graph
    G.remove_nodes_from(oneoffs)      # remove oneoffs from the main network
    print("\nNumber of 'oneoff' documents: %d"%len(oneoffs))

# Graph summary
print('\nGraph/network summary:')
print nx.info(G)
print('Average density: %f' % nx.density(G))

Time to import, clean, and fingerprint documents: 168.110702038 seconds
Time to sort document fingerprints: 368.844605923 seconds
Time to create networkx: 641.455252886 seconds

Number of 'oneoff' documents: 8

Graph/network summary:
Name: 
Type: Graph
Number of nodes: 1485
Number of edges: 508040
Average degree: 684.2290
Average density: 0.461071


In [210]:
# clustering based on connected components
# identify isolated nodes
def identifyIsolates(G):
    iso = nx.isolates(G)
    return iso if len(iso)>0 else None 

def find_cluster_cutoff(G, cutoff = 0.10, minCluster=0):
    print('\nSimilarity cutoff: %f' % cutoff )
    H = G.copy()
    H.remove_edges_from([(u,v) for (u,v,d) in H.edges(data=True) if d['weight'] < cutoff])
    clusters = [sorted(i) for i in sorted(nx.connected_components(H),key=len,reverse=True) if len(i)>minCluster]
    print_cluster_summary(G,clusters)
    return clusters

# summary of cluster results
def print_cluster_summary(G,clusters):
    print('\nNumber of clusters identified: %d' % len(clusters))
    print('Document coverage: %d%% (%d of %d)' % \
          (100 * sum([len(i) for i in clusters])/len(G.node), sum([len(i) for i in clusters]), len(G.node)))
    print('\nCluster sizes:')
    print([len(i) for i in clusters])

# clustering based on lovain method (winner of 2010 cluster challenge!)
def find_cluster_louvain(G,minCluster=0):
    partition = cm.best_partition(G)
    clusters = []
    for label in set(partition.values()):
        clusters.append([i for i in partition.keys() if partition[i] == label])
    clusters = [sorted(i) for i in sorted(clusters, key = len, reverse=True) if len(i)>minCluster]
    print_cluster_summary(G,clusters)
    return clusters

# collect output
def base_output(G,node,label):
    doc_id     = G.node[node]['ocid']
    doc_degree = G.degree(node) 
    return [doc_id,label,doc_degree]

In [211]:
### COLLECT RESULTS: cluster and subcluster
alphabet = list(map(chr,range(65,91)))
clusters = find_cluster_louvain(G,minCluster=0)

# build final output
# label every node by format category
OUT = []
for i,cluster in enumerate(clusters):
    if len(cluster) > 50:
        Gsub = G.subgraph(cluster)
        subclusters = find_cluster_louvain(Gsub,minCluster=0)
        for j,subcluster in enumerate(subclusters):
            label = str(i+1) + str(alphabet[j])
            for node in subcluster:
                OUT.append(base_output(G,node,label))
    else:
        label = str(i+1)  
        for node in cluster:
            OUT.append(base_output(G,node,label))

if oneoffs:        
    ilabel = 'oneoff'    
    for node in oneoffs:
        OUT.append(base_output(Goneoffs,node,ilabel))


Number of clusters identified: 3
Document coverage: 100% (1485 of 1485)

Cluster sizes:
[603, 485, 397]

Number of clusters identified: 3
Document coverage: 100% (603 of 603)

Cluster sizes:
[276, 184, 143]

Number of clusters identified: 3
Document coverage: 100% (485 of 485)

Cluster sizes:
[215, 155, 115]

Number of clusters identified: 2
Document coverage: 100% (397 of 397)

Cluster sizes:
[222, 175]


In [212]:
df['template'] = None
for k in OUT:
    ix = df[df.ocid==k[0]].index.values[0]
    df['template'].iloc[ix] = k[1]

In [214]:
df.tail(20)

Unnamed: 0,ocid,category,contract_name,contract_identifier,language,country_name,resource,contract_type,signature_date,document_type,...,retrieval_date,pdf_url,deal_number,contract_note,matrix_page,annotation_category,annotation_text,contract_text,contract_text_clean,template
1476,ocds-591adf-9292872653,rc,"DRC, Office des Mines de Kilo Moto, Mindev,Org...",,fr,"Congo, the Democratic Republic of the",Gold;Copper,Service Contract,,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,,,,,,﻿ ...,convention miniere entre la republique du zare...,2A
1477,ocds-591adf-0781697699,rc,"Soci�t� S�n�galaise des Phosphates de Thi�s, P...",,fr,Senegal,Attapulgite;Clay,Concession Agreement,1998-05-28,Contract,...,2016-11-04,https://resourcecontracts-nrgi.s3-us-west-2.am...,,,,,,﻿ CONVENTION MINIE...,convention miniere pour les argiles industriel...,2C
1478,ocds-591adf-5576431527,rc,Cahier des Charges Kodiat el Koucha,,fr,Tunisia,Barite,Cahier des Charges,2013-12-18,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,,,,,,﻿CAHIER DES CHARGES ...,cahier des charges relatif a la production et ...,2A
1479,ocds-591adf-6442841974,rc,Cahier des Charges Sabkhet El Melah de Zarzis,,fr,Tunisia,Salt,Cahier des Charges,1998-04-29,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,,,,,,﻿DGM/DREM ...,dgm/drem cahier des charges concession des sel...,2A
1480,ocds-591adf-8797519591,rc,Cahier des Charges Oued El Ghar,,fr,Tunisia,Gypsum,Cahier des Charges,2010-09-20,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,,,,,,﻿CHAIER DES CHARGES -TYPE ...,chaier des charges type relatif a la productio...,2A
1481,ocds-591adf-7492015340,rc,Cahier des Charges Merbeh Chtioua,,fr,Tunisia,Gypsum,Cahier des Charges,2010-12-14,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,,,,,,﻿CAHIER DES CHARGES-TYPE ...,cahier des chargestype relatif a la production...,2A
1482,ocds-591adf-4091289998,rc,Cahier des Charges Jebel Houfia,,fr,Tunisia,Gypsum,Cahier des Charges,2012-01-25,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,,,,,,﻿CAHIER DES CHARGES -TYPE ...,cahier des charges type relatif a la productio...,2A
1483,ocds-591adf-0143682044,rc,Cahier des Charges Hassi El Gypse-Mestaoua,,fr,Tunisia,Gypsum,Cahier des Charges,2012-06-13,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,,,,,,﻿CAHIER DES CHARGES ...,cahier des charges relatif a la production et ...,2A
1484,ocds-591adf-5128040886,rc,Cahier des Charges Oued El Gabel,,fr,Tunisia,Gypsum,Cahier des Charges,2010-02-10,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,,,,,,﻿CAHIER DES CHARGES ...,cahier des charges relatif a la production et ...,2A
1485,ocds-591adf-7013278443,rc,Cahier des Charges Bir el Afou,,fr,Tunisia,Gypsum,Cahier des Charges,2011-08-08,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,,,,,,﻿I ...,i cahier des charges type relatif a la product...,2A


In [215]:
df[["ocid","template"]].to_csv("prelim_template_1500docs")

In [205]:
from fuzzywuzzy import fuzz
longstr1 = df.iloc[5].contract_text_clean
longstr2 = df.iloc[14].contract_text_clean
fuzz.ratio(longstr1, longstr2)

86

In [207]:
minchunk,avgchunk,maxchunk=32,64,128
a = set(rabin_partition(longstr1,avgchunk,minchunk,maxchunk,32))
b = set(rabin_partition(longstr2,avgchunk,minchunk,maxchunk,32))

print
print "EXAMPLE USE CASE: DOCUMENT SIMILARITY"
print

print
print "Percent similarity between document 1 and 2: \n%0.2f%%"%( 100.*len(a&b)/float(min(len(a),len(b))) )

print
print "Common rabin chunks between documents 1 and 2:"
print [k for k in rabin_partition(longstr1) if k in set(rabin_partition(longstr2))]

print
print "Non-common rabin chunks between document 1 and 2:"
print [k for k in rabin_partition(longstr1) if k not in set(rabin_partition(longstr2))]
print


EXAMPLE USE CASE: DOCUMENT SIMILARITY


Percent similarity between document 1 and 2: 
24.71%

Common rabin chunks between documents 1 and 2:
['cratique du cong', 'onvertible suivan', 'e le conce', 'par le co', 'par le concessionna', 'de concessi', 'on le cahi', 'er des ch', 'artie integrante', 'ticle 3 la duree du contrat de', ' concession est ', 'de vingtcinq ans renouvelables dans ', 'e 8 cidessous article 4 letat gara', 'ntit au concessionnaire la jouissance pleine et entiere des', ' droits qui lui sont co', 'nferes par la lo', 'i et le present contrat de concession pendant toute la duree du ', 'contrat le co', 'ncessionnaire ne peut et', 're prive en tout ou parti', 'e de son droit dexploiter ', 'sa concession sauf en cas de', ' ou contractuelles ', 'ou pour cause du', 'tilite publique et da', ' droit commun article 5 sous reserve d', ' du present cont', 'rat le conc', 'essionnaire a ', 'un droit exclusif dexplo', 'ncession ar', 'ticle 6 le concess', 'ionnaire est tenu de re', 'sp

#### REFERENCE CODE

In [None]:
# necessary functions
char_to_remove = set([' ','*',',',';',':','-','_','[',']',']','&','`','@','*','^','|','~','\'','\"'])
def doc_clean(longstr):
    longstr= re.sub(r'[\x00-x08\x0b\x0c\x0e-\xlf\x7f-\xff]', '', longstr) #remove all non-printable characters
    longstr = ''.join(i for i in longstr if i not in char_to_remove and ord(i) < 128).replace('\r','').replace('\n','').replace('\t','')
    return longstr
def partision(longstr, chunksize, hashflag=True):
    if hashflag:
        return [hashlib.shal(longstr[i:j]).hexdigest() for i, j in zip(list(np.cumsum([0]+chunksize[:-1])),list(np.cumsum(chunksize)))]
    else:
        return [longstr[i:j] for i, j in zip(list(np.cumsum([0]+chunksize[:-1])), list (np.cumsum(chunksize)))]

#clustering based on connected components
def find_cluster_cutoff(G, cutoff=.9, minCluster=0):
    print '\nSimilarity cutoff: %f' % cutoff
    H = G.copy()
    H.revmove_edges_from([(u,v) for (u,v,s) in H.edges(data=True) if d['weight'] < cutoff])
    clusters = [sorted(i) for i in sorted(nx.connected_components(H), key=len,reverse=True) if len(i)>minCluster]
    return clusters

def find_cluster_louvan(G,minCluster=0):
    partition = cm.best_partition(G)
    clusters = []
    for label in set(partition.values()):
        clusters = []
        for label in set(partition.values()):
            clusters.append([i for i in partiion.keys() if partiion[i] == label])
        clusters = [sorted(i) for i in sorted(clusters, key = len, reverse=True) if len(i)>minCluster]
    return clusters

def print_cluster_summary(G,clusters):
    print '\nNumber of clusters identified: %d' % len(clusters)
    print 'Document coverage: %d%% (%d of %d)' 

In [None]:
start = time.time()
fpraw = 'docFingerprints'
fpsorted = 'docFingerprints_sorted'
# generate file fingerprints
# initialize graph nodes, one per document
# 1. scrub of non-ascii characters and remove all spaces
# 2. identify file markers via 'rabin fingerprint'
# 3. break up file marker-2-marker and sort by docFingerprint
G = nx.Graph()
with open(fpraw, 'w') as fp:
    for k, filepath in enumerate(filePaths):
        a = doc_clean(open(filepath).read()) # clean up document
        b = rabin_chunks(a) # identify chunks per rabin fingerprint algorithm
        c = set(partition(a,b,hashflag=False)) # partition document into its fingerprints and hash (optional)