  # Omidyar Extractives Project 1
## Extract Contract Text (Notebook 7 of 8)
### Hash-based partitition function for segmenting documents prior to clustering

In [33]:
from lib.py_rabin import rabin_partition, example_rabin_partition
import re
import time
import subprocess
import cPickle as pickle
import pandas as pd
import community
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [171]:
# remove other characters
char_to_remove = set(['.',',',';',':','-','_','[',']','&','`','@','*','^','|','~',';',':','\'','\"',">","<"]) 
def longstr_clean(longstr):
    longstr = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', longstr) # remove all non-printable characters
    longstr = re.compile(r'<.*?>').sub('', longstr) # strip html markup, e.g. <br>,<div>, etc...
    longstr = longstr.replace("&nbsp",'').replace("&lt;",'').replace("&gt;",'').replace("\\","") 
    longstr = ''.join(i for i in longstr if ord(i)<128 and i not in char_to_remove)
    longstr = longstr.replace('\r','').replace('\n','').replace('\t','')
    longstr = " ".join(longstr.split()).lower() # remove whitespace
    return longstr

In [208]:
#df = pd.read_pickle('contract_data/openland_contracts_with_text.pkl') # ~200 
df = pd.read_pickle('contract_data/resource_contracts_with_text.pkl') # ~1500
print('\nTotal documents: %d' % len(df))

df.columns = [k.lower().replace(" ","_") for k in df.columns]
df["contract_text_clean"] = df.contract_text.apply(longstr_clean) 
df.columns


Total documents: 1496


Index([u'ocid', u'category', u'contract_name', u'contract_identifier',
       u'language', u'country_name', u'resource', u'contract_type',
       u'signature_date', u'document_type', u'government_entity',
       u'government_identifier', u'company_name', u'company_address',
       u'jurisdiction_of_incorporation', u'registration_agency',
       u'company_number', u'corporate_grouping', u'participation_share',
       u'open_corporates_link', u'incorporation_date', u'operator',
       u'project_title', u'project_identifier', u'license_name',
       u'license_identifier', u'source_url', u'disclosure_mode',
       u'retrieval_date', u'pdf_url', u'deal_number', u'contract_note',
       u'matrix_page', u'annotation_category', u'annotation_text',
       u'contract_text', u'contract_text_clean'],
      dtype='object')

In [None]:
start = time.time()
flag  = "characters" 
fpraw = '_docRabinChunks'
fpsorted  = '_docRabinChunks_sorted'

# ----------------------------------
# CREATE GRAPH NODES == CONTRACTS
# 1. scrub of non-ascii characters and remove all spaces
# 2. identify file markers via 'rabin fingerprint'
# 3. break up file marker-2-marker and sort by docFingerprint
G = nx.Graph() 
with open(fpraw,'w') as fp:
    for k in xrange(len(df.index)):
        a = df.contract_text_clean.iloc[k]  # clean up document 
        if len(a) < 100: continue
        b = rabin_partition(a)              # partition document via rabin fingerprint
        c = set(b)
        
        # add dresocument node to graph; 
        # store total doc length in char, total chunks a number of unique chunks; and docid
        G.add_node(k, 
                   ocid = df.ocid.iloc[k],
                   n_chunks = len(c),
                   n_chrs   = sum([len(ch) for ch in c]),
                       )
        d = [(i,k) for i in c]
        fp.writelines([str(i)+'|'+str(k)+'\n' for i in list(c)])
print('Time to import, clean, and fingerprint documents: %s seconds' % str(time.time()-start) )
        
# in-memory pre-sort for all document chunks for faster many-many comparison 
cmd = 'sort -k1 -S2G ' + fpraw + ' > ' + fpsorted
subprocess.call(cmd,shell=True)
print('Time to sort document fingerprints: %s seconds' % str(time.time()-start) )

# ----------------------------------
# UPDATE GRAPH WITH EDGE INFORMATION, i.e. contract similarity
with open(fpsorted,'r') as fp:
    init=True
    
    # read line by line
    for line in fp:
        chunk,k = line.strip('\n').split('|')
        
        # initialize on iteration
        if init: 
            tmp = []
            prev_chunk = chunk
            init = False

        # collect docs that share the same chunk (recall: chunks are sorted)
        if len(prev_chunk) == len(chunk) and prev_chunk == chunk:
            tmp.extend([int(k)])
        
        # write which files share the identified chunks    
        else:
            if len(tmp)>1:
                for m in sorted(tmp):
                    for n in sorted(tmp): 
                        if n>m: 
                            # first occurence of edge
                            if ~G.has_edge(m,n):    
                                if flag=="chunks":
                                    G.add_edge(m,n,
                                               n_chunks_matched = 1,
                                               n_chunks_min = float(min(G.node[m]['n_chunks'],G.node[n]['n_chunks'])))
                                    G[m][n]['weight'] = G[m][n]['n_chunks_matched'] / G[m][n]['n_chunks_min']
                                else:
                                    G.add_edge(m,n,
                                               n_chrs_matched = len(prev_chunk),
                                               n_chrs_min = float(min(G.node[m]['n_chrs'],G.node[n]['n_chrs'])))
                                    G[m][n]['weight'] = G[m][n]['n_chrs_matched'] / G[m][n]['n_chrs_matched']
                            # update existing edge
                            else:
                                if flag=="chunks":
                                    G[m][n]['n_chunks_matched'] += 1
                                    G[m][n]['weight'] = G[m][n]['n_chunks_matched'] / G[m][n]['n_chunks_min']
                                else:
                                    G[m][n]['n_chrs_matched']  += len(prev_chunk)
                                    G[m][n]['weight'] = G[m][n]['n_chrs_matched'] / G[m][n]['n_chrs_min']

                            
                                
            tmp=[]
        prev_chunk = chunk 
print('Time to create networkx: %s seconds'%str(time.time()-start))

# Identify 'oneoff' documents 
oneoffs  = identifyIsolates(G)        # identify isolated docs
if oneoffs:
    Goneoffs = nx.subgraph(G,oneoffs) # keep info in a separate graph
    G.remove_nodes_from(oneoffs)      # remove oneoffs from the main network
    print("\nNumber of 'oneoff' documents: %d"%len(oneoffs))

# Graph summary
print('\nGraph/network summary:')
print nx.info(G)
print('Average density: %f' % nx.density(G))

In [182]:
for i in xrange(len(G.node)):
    for j in xrange(len(G.node)):
        if j>i: 
            try: 
                print i,j,G.get_edge_data(i,j)['weight']
            except: 
                pass

0 1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 9 0 10 0 11 0 12 0 13 0 14 0 15 0 16 0 17 0 18 0 19 0 20 0 21 0 22 0 23 0 24 0 25 0 26 0 27 0 28 0 29 0 30 0 31 0 32 0 33 0 34 0 35 0 36 0 37 0 38 0 39 0 40 0 41 0 42 0 43 0 44 0 45 0 46 0 47 0 48 0 49 0 50 0 51 0 52 0 53 0 54 0 55 0 56 0 57 0 58 0 59 0 60 0 61 0 62 0 63 0 64 0 65 0 66 0 67 0 68 0 69 0 70 0 71 0 72 0 73 0 74 0 75 0 76 0 77 0 78 0 79 0 80 0 81 0 82 0 83 0 84 0 85 0 86 0 87 0 88 0 89 0 90 0 91 0 92 0 93 0 94 0 95 0 96 0 97 0 98 0 99 0 100 0 101 0 102 0 103 0 104 0 105 0 106 0 107 0 108 0 109 0 110 0 111 0 112 0 113 0 114 0 115 0 116 0 117 0 118 0 119 0 120 0 121 0 122 0 123 0 124 0 125 0 126 0 127 0 128 0 129 0 130 0 131 0 132 0 133 0 134 0 135 0 136 0 137 0 138 0 139 0 140 0 141 0 142 0 143 0 144 0 145 0 146 0 147 0 148 0 149 0 150 0 151 0 152 0 153 0 154 0 155 0 156 0 157 0 158 0 159 0 160 0 161 0 162 0 163 0 164 0 165 0 166 0 167 0 168 0 169 0 170 0 171 0 172 1 2 1 3 1 4 1 5 1 6 1 7 1 8 1 9 1 10 1 11 1 12 1 13 1 14 1 15 1 16 1 17 1 18

In [177]:
# clustering based on connected components
# identify isolated nodes
def identifyIsolates(G):
    iso = nx.isolates(G)
    return iso if len(iso)>0 else None 

def find_cluster_cutoff(G, cutoff = 0.10, minCluster=0):
    print('\nSimilarity cutoff: %f' % cutoff )
    H = G.copy()
    H.remove_edges_from([(u,v) for (u,v,d) in H.edges(data=True) if d['weight'] < cutoff])
    clusters = [sorted(i) for i in sorted(nx.connected_components(H),key=len,reverse=True) if len(i)>minCluster]
    print_cluster_summary(G,clusters)
    return clusters

# summary of cluster results
def print_cluster_summary(G,clusters):
    print('\nNumber of clusters identified: %d' % len(clusters))
    print('Document coverage: %d%% (%d of %d)' % \
          (100 * sum([len(i) for i in clusters])/len(G.node), sum([len(i) for i in clusters]), len(G.node)))
    print('\nCluster sizes:')
    print([len(i) for i in clusters])

# clustering based on lovain method (winner of 2010 cluster challenge!)
def find_cluster_louvain(G,minCluster=0):
    partition = cm.best_partition(G)
    clusters = []
    for label in set(partition.values()):
        clusters.append([i for i in partition.keys() if partition[i] == label])
    clusters = [sorted(i) for i in sorted(clusters, key = len, reverse=True) if len(i)>minCluster]
    print_cluster_summary(G,clusters)
    return clusters

# collect output
def base_output(G,node,label):
    doc_id     = G.node[node]['ocid']
    doc_degree = G.degree(node) 
    return [doc_id,label,doc_degree]

In [180]:
### COLLECT RESULTS: cluster and subcluster
alphabet = list(map(chr,range(65,91)))
clusters = find_cluster_louvain(G,minCluster=0)

# build final output
# label every node by format category
OUT = []
for i,cluster in enumerate(clusters):
    if len(cluster) > 50:
        Gsub = G.subgraph(cluster)
        subclusters = find_cluster_louvain(Gsub,minCluster=0)
        for j,subcluster in enumerate(subclusters):
            label = str(i+1) + str(alphabet[j])
            for node in subcluster:
                OUT.append(base_output(G,node,label))
    else:
        label = str(i+1)  
        for node in cluster:
            OUT.append(base_output(G,node,label))

if oneoffs:        
    ilabel = 'oneoff'    
    for node in oneoffs:
        OUT.append(base_output(Goneoffs,node,ilabel))


Number of clusters identified: 3
Document coverage: 100% (173 of 173)

Cluster sizes:
[99, 71, 3]

Number of clusters identified: 2
Document coverage: 100% (99 of 99)

Cluster sizes:
[56, 43]

Number of clusters identified: 2
Document coverage: 100% (71 of 71)

Cluster sizes:
[41, 30]


In [187]:
df['template'] = None
for k in OUT:
    ix = df[df.ocid==k[0]].index.values[0]
    df['template'].iloc[ix] = k[1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [194]:
df.head(20)

Unnamed: 0,ocid,category,contract_name,contract_identifier,language,country_name,resource,contract_type,signature_date,document_type,...,retrieval_date,pdf_url,deal_number,contract_note,matrix_page,annotation_category,annotation_text,contract_text,contract_text_clean,template
0,ocds-591adf-6498452777,olc,"Megabois, Contrat de Concession Foresti�re, 2011",,fr,"Congo, the Democratic Republic of the",Timber (Wood),Contrat de Concession Foresti�re,2011-12-13,Contract,...,2015-09-13,https://resourcecontracts-nrgi.s3-us-west-2.am...,5019.0,,http://www.landmatrix.org/en/get-the-detail/by...,,,REPUBLIQUE DEMOCRATIQUE DU CONGO<br><br />\n<b...,republique democratique du congoministre de le...,oneoff
1,ocds-591adf-8891537745,olc,"Liberia, ADA Commercial Inc., Concession Contr...",,en,Liberia,Rice;Rice products,Concession Agreement,2008-04-05,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,1397.0,,http://www.landmatrix.org/en/get-the-detail/by...,,,"""AN ACT RATIFYING THE CONCESSION AGREEMENT<br>...",an act ratifying the concession agreementbetwe...,oneoff
2,ocds-591adf-7213946982,olc,"SIFORCO, Contrat de Concession Foresti�re N�02...",,fr,"Congo, the Democratic Republic of the",Timber (Wood),Contrat de Concession Foresti�re,2011-10-24,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,5223.0,,http://www.landmatrix.org/en/get-the-detail/al...,,,REPUBLIQUE DEMOCRATIQUE DU CONGO<br /><br />\r...,republique democratique du congoministre de le...,2A
3,ocds-591adf-1327744931,olc,"SIFORCO, Contrat de Concession Foresti�re N�02...",,fr,"Congo, the Democratic Republic of the",Timber (Wood),Contrat de Concession Foresti�re,2011-10-24,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,5226.0,,http://www.landmatrix.org/en/get-the-detail/al...,,,RÉPUBLIQUE DÉMOCRATIQUE DU CONGO...,rpublique dmocratique du congo ministre de len...,2A
4,ocds-591adf-9505117998,olc,"Socfinaf S.A., Plantations Socfinaf Ghana Limi...",,en,Ghana,Oil palm or palm oils;Rubber;Timber (Wood);Other,Asset Sale and Purchase Agreement,2015-02-04,Contract,...,2015-11-05,https://resourcecontracts-nrgi.s3-us-west-2.am...,5055.0,,http://www.landmatrix.org/en/get-the-detail/al...,,,ASSET SALE AND PURCHASE AGREEMENT/SIPL <br /><...,asset sale and purchase agreement/sipl asset s...,1A
5,ocds-591adf-6837447464,olc,"SODEFOR, Contrat de Concession Foresti�re N�45...",,fr,"Congo, the Democratic Republic of the",Timber (Wood),Contrat de Concession Foresti�re,2011-10-24,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,5225.0,,http://www.landmatrix.org/en/get-the-detail/al...,,,﻿ REP...,republique democratique du congo ministere de ...,2B
6,ocds-591adf-0186221161,olc,"GTLESTE BIOTECH, MOU, 2008",,en,Timor - Leste,Sugarcane,Memorandum of Understanding,2008-01-15,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,156.0,,http://www.landmatrix.org/en/get-the-detail/by...,,,MINISTÉRIO DA AGRICULTURA E PESCAS<br><br><div...,ministrio da agricultura e pescasmemorandum of...,1A
7,ocds-591adf-4730159366,olc,"SODEFOR, Contrat de Concession Foresti�re N�03...",,fr,"Congo, the Democratic Republic of the",Le bois d'�uvre,Contrat de Concession Foresti�re,2011-10-24,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,5240.0,,http://www.landmatrix.org/en/get-the-detail/al...,,,REPUBLIQUE DEMOCRATIQUE DU CONGO<br /><br />\r...,republique democratique du congoministre de le...,2A
8,ocds-591adf-8854028206,olc,"SEFOCO, Contrat de Concession Foresti�re N� 01...",,fr,"Congo, the Democratic Republic of the",Le bois d'�uvre,Contrat de Concession Foresti�re,2011-10-24,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,5228.0,,http://www.landmatrix.org/en/get-the-detail/al...,,,<br /><br />\r\n<br /><br />\r\n<br /><br />\r...,republique democratique du congo ministre de l...,2A
9,ocds-591adf-7269710792,olc,"TOZZI GREEN SARL, Bail Foncier, 2012",,fr,Madagascar,Not Specified,Bail Foncier,2012-08-17,Contract,...,,https://resourcecontracts-nrgi.s3-us-west-2.am...,1454.0,,http://www.landmatrix.org/en/get-the-detail/by...,,,VICE PRIMATURE EN CHARGE DU DEVELOPPEMENT<br /...,vice primature en charge du developpementet de...,2A


In [205]:
from fuzzywuzzy import fuzz
longstr1 = df.iloc[5].contract_text_clean
longstr2 = df.iloc[14].contract_text_clean
fuzz.ratio(longstr1, longstr2)

86

In [207]:
minchunk,avgchunk,maxchunk=32,64,128
a = set(rabin_partition(longstr1,avgchunk,minchunk,maxchunk,32))
b = set(rabin_partition(longstr2,avgchunk,minchunk,maxchunk,32))

print
print "EXAMPLE USE CASE: DOCUMENT SIMILARITY"
print

print
print "Percent similarity between document 1 and 2: \n%0.2f%%"%( 100.*len(a&b)/float(min(len(a),len(b))) )

print
print "Common rabin chunks between documents 1 and 2:"
print [k for k in rabin_partition(longstr1) if k in set(rabin_partition(longstr2))]

print
print "Non-common rabin chunks between document 1 and 2:"
print [k for k in rabin_partition(longstr1) if k not in set(rabin_partition(longstr2))]
print


EXAMPLE USE CASE: DOCUMENT SIMILARITY


Percent similarity between document 1 and 2: 
24.71%

Common rabin chunks between documents 1 and 2:
['cratique du cong', 'onvertible suivan', 'e le conce', 'par le co', 'par le concessionna', 'de concessi', 'on le cahi', 'er des ch', 'artie integrante', 'ticle 3 la duree du contrat de', ' concession est ', 'de vingtcinq ans renouvelables dans ', 'e 8 cidessous article 4 letat gara', 'ntit au concessionnaire la jouissance pleine et entiere des', ' droits qui lui sont co', 'nferes par la lo', 'i et le present contrat de concession pendant toute la duree du ', 'contrat le co', 'ncessionnaire ne peut et', 're prive en tout ou parti', 'e de son droit dexploiter ', 'sa concession sauf en cas de', ' ou contractuelles ', 'ou pour cause du', 'tilite publique et da', ' droit commun article 5 sous reserve d', ' du present cont', 'rat le conc', 'essionnaire a ', 'un droit exclusif dexplo', 'ncession ar', 'ticle 6 le concess', 'ionnaire est tenu de re', 'sp

#### REFERENCE CODE

In [None]:
# necessary functions
char_to_remove = set([' ','*',',',';',':','-','_','[',']',']','&','`','@','*','^','|','~','\'','\"'])
def doc_clean(longstr):
    longstr= re.sub(r'[\x00-x08\x0b\x0c\x0e-\xlf\x7f-\xff]', '', longstr) #remove all non-printable characters
    longstr = ''.join(i for i in longstr if i not in char_to_remove and ord(i) < 128).replace('\r','').replace('\n','').replace('\t','')
    return longstr
def partision(longstr, chunksize, hashflag=True):
    if hashflag:
        return [hashlib.shal(longstr[i:j]).hexdigest() for i, j in zip(list(np.cumsum([0]+chunksize[:-1])),list(np.cumsum(chunksize)))]
    else:
        return [longstr[i:j] for i, j in zip(list(np.cumsum([0]+chunksize[:-1])), list (np.cumsum(chunksize)))]

#clustering based on connected components
def find_cluster_cutoff(G, cutoff=.9, minCluster=0):
    print '\nSimilarity cutoff: %f' % cutoff
    H = G.copy()
    H.revmove_edges_from([(u,v) for (u,v,s) in H.edges(data=True) if d['weight'] < cutoff])
    clusters = [sorted(i) for i in sorted(nx.connected_components(H), key=len,reverse=True) if len(i)>minCluster]
    return clusters

def find_cluster_louvan(G,minCluster=0):
    partition = cm.best_partition(G)
    clusters = []
    for label in set(partition.values()):
        clusters = []
        for label in set(partition.values()):
            clusters.append([i for i in partiion.keys() if partiion[i] == label])
        clusters = [sorted(i) for i in sorted(clusters, key = len, reverse=True) if len(i)>minCluster]
    return clusters

def print_cluster_summary(G,clusters):
    print '\nNumber of clusters identified: %d' % len(clusters)
    print 'Document coverage: %d%% (%d of %d)' 

In [None]:
start = time.time()
fpraw = 'docFingerprints'
fpsorted = 'docFingerprints_sorted'
# generate file fingerprints
# initialize graph nodes, one per document
# 1. scrub of non-ascii characters and remove all spaces
# 2. identify file markers via 'rabin fingerprint'
# 3. break up file marker-2-marker and sort by docFingerprint
G = nx.Graph()
with open(fpraw, 'w') as fp:
    for k, filepath in enumerate(filePaths):
        a = doc_clean(open(filepath).read()) # clean up document
        b = rabin_chunks(a) # identify chunks per rabin fingerprint algorithm
        c = set(partition(a,b,hashflag=False)) # partition document into its fingerprints and hash (optional)