![alt text](DataKind_orange.png)

# NRGI Extractives Contracts
## Clean Contract Text
### 1. Reads in contract text
### 2. Parses by paragraph
### 3. Strips HTML from text
### 4. Outputs pickled dataframe ready for featurization and modeling

In [2]:
import re
import string
import pandas as pd
from HTMLParser import HTMLParser
import nltk
from nltk.corpus import stopwords
from langdetect import detect

import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

In [3]:
supported_languages = ['en','fr','es']

In [4]:
rc_contracts = pd.read_pickle('contract_data/resource_contracts_with_text.pkl')
rc_contracts['Source'] = 'rc'
ol_contracts = pd.read_pickle('contract_data/openland_contracts_with_text.pkl')
ol_contracts['Source'] = 'ol'
outfile = 'contract_data/cleaned_contracts.pkl'

In [5]:
contracts = pd.concat([rc_contracts,ol_contracts])
print "Number of contracts: " + str(len(contracts))

Number of contracts: 1689


### Parse by paragraph, strip HTML, carriage returns, newline characters

In [6]:
def get_paragraphs(text):
    text = re.sub(' +',' ',text)
    splitters = [r'<br \/><br \/>\r\n<br \/><br \/>\r\n',
                 r'<br \/><br \/>\n<br \/><br \/>\n',
                 r'\b\. <br \/>\r<br \/><br \/>\n',
                 r'</div><div><br></div><div>',
                 r'\b\.<br><br>',
                 r'<br \/>\n<br><br><br \/>\n',
                 r'<br><br \/>\n<br><br \/>\n',
                 r'\b\.\n\n',
                 r'; <br />\n<br><br />\n ']
    splitter_counts = {}
    for splitter in splitters:
        splitter_counts[splitter] = re.findall(splitter,text)
    maxsplitter = max(splitter_counts,key=splitter_counts.get)
    paras = re.split(maxsplitter, text)
    if maxsplitter in [r'\b. <br \/>\r<br \/><br \/>\n',r'\b.<br><br>',r'\b\.\n\n']:
        paras = [para.strip() + '.' for para in paras]
    return paras

In [7]:
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [8]:
contracts['Text_by_Paragraph'] = contracts['Contract_Text'].apply(get_paragraphs)
contracts['Num Paragraphs'] = contracts['Text_by_Paragraph'].apply(len)
contracts = contracts[contracts['Num Paragraphs'] > 10].copy() # remove short documents
print "Number of contracts: " + str(len(contracts))

Number of contracts: 1535


In [9]:
contracts_by_para = pd.DataFrame()
temp = []
for index, row in contracts.iterrows():
    i = 1
    for para in row['Text_by_Paragraph']:
        temp.append({'OCID':row['OCID'],
                'Source':row['Source'],
                'Language':row['Language'],
                'Country Name':row['Country Name'],
                'Resource':row['Resource'],
                'Contract Type':row['Contract Type'],
                'Document Type':row['Document Type'],
                'Paragraph_Num':i,
                'Paragraph_Text': para})
        i += 1
contracts_by_para = pd.DataFrame(temp)

In [10]:
contracts_by_para['Clean_Paragraph_Text'] = contracts_by_para['Paragraph_Text'].apply(strip_tags)

In [11]:
characters_to_replace = ['\xef','\xbb','\xbf','\r','\n']
for char in characters_to_replace:
    contracts_by_para['Clean_Paragraph_Text'] = contracts_by_para['Clean_Paragraph_Text'].str.replace(char," ")
contracts_by_para.drop('Paragraph_Text',axis=1,inplace=True)

In [12]:
# drop blank and integer paragraphs and contracts less than 4 words
contracts_by_para.dropna(subset=['Clean_Paragraph_Text'],inplace=True)
contracts_by_para = contracts_by_para[contracts_by_para['Clean_Paragraph_Text'].apply(lambda x: type(x)!=int)].copy()
contracts_by_para = contracts_by_para[contracts_by_para['Clean_Paragraph_Text'].apply(lambda x: len(x.split()) > 5)].copy()
print len(contracts_by_para)

397761


In [13]:
# contracts_by_para['TextLength'] = contracts_by_para['Clean_Paragraph_Text'].str.len()

In [16]:
contracts_by_para.head(10)

Unnamed: 0,Contract Type,Country Name,Document Type,Language,OCID,Paragraph_Num,Resource,Source,Clean_Paragraph_Text
1,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,2,Copper;Gold;Molybdenum;Silver,rc,BETWEEN THE GOVERNMENT OF MONGOLIA AND IVANHOE...
3,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,4,Copper;Gold;Molybdenum;Silver,rc,In accordance with Article 29 of the Minerals ...
5,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,6,Copper;Gold;Molybdenum;Silver,rc,"Parties during the period of exploring, minin..."
9,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,10,Copper;Gold;Molybdenum;Silver,rc,"Except as provided by Clause 15.26, this Agree..."
11,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,12,Copper;Gold;Molybdenum;Silver,rc,The Investor shall have a right to apply for a...
13,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,14,Copper;Gold;Molybdenum;Silver,rc,"Except as provided by Clause 2.24.2, Taxes pay..."
15,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,16,Copper;Gold;Molybdenum;Silver,rc,The Investor is hereby granted the rights to m...
17,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,18,Copper;Gold;Molybdenum;Silver,rc,This Agreement applies to the whole range of t...
19,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,20,Copper;Gold;Molybdenum;Silver,rc,The State shall own 34% (thirty four percent) ...
21,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,22,Copper;Gold;Molybdenum;Silver,rc,The Investor shall use its best endeavours in ...


In [17]:
# Remove unsupported languages
contracts_by_para = contracts_by_para[contracts_by_para['Language'].isin(supported_languages)].copy()

In [19]:
for lang in supported_languages:
    df = contracts_by_para[contracts_by_para['Language'] == lang]
    df.to_pickle('contract_data/cleaned_unannotated_contracts_by_paragraph_'+ lang + '.pkl')

In [21]:
print contracts_by_para['Language'].value_counts()

en    252816
fr     77142
es     58614
Name: Language, dtype: int64
