![alt text](DataKind_orange.png)

# NRGI Extractives Contracts
## Clean Contract Text
### 1. Reads in contract text
### 2. Parses by paragraph
### 3. Strips HTML from text
### 4. Outputs pickled dataframe ready for featurization and modeling

In [1]:
import re
import string
import pandas as pd
from HTMLParser import HTMLParser
import nltk
from nltk.corpus import stopwords
from langdetect import detect

import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

In [3]:
rc_contracts = pd.read_pickle('contract_data/resource_contracts_with_text.pkl')
rc_contracts['Source'] = 'rc'
ol_contracts = pd.read_pickle('contract_data/openland_contracts_with_text.pkl')
ol_contracts['Source'] = 'ol'
outfile = 'contract_data/cleaned_contracts.pkl'

In [4]:
contracts = pd.concat([rc_contracts,ol_contracts])
print "Number of contracts: " + str(len(contracts))

Number of contracts: 1689


### Parse by paragraph, strip HTML, carriage returns, newline characters

In [461]:
def get_paragraphs(text):
    text = re.sub(' +',' ',text)
    splitters = [r'<br \/><br \/>\r\n<br \/><br \/>\r\n',
                 r'<br \/><br \/>\n<br \/><br \/>\n',
                 r'\b\. <br \/>\r<br \/><br \/>\n',
                 r'</div><div><br></div><div>',
                 r'\b\.<br><br>',
                 r'<br \/>\n<br><br><br \/>\n',
                 r'<br><br \/>\n<br><br \/>\n',
                 r'\b\.\n\n',
                 r'; <br />\n<br><br />\n ']
    splitter_counts = {}
    for splitter in splitters:
        splitter_counts[splitter] = re.findall(splitter,text)
    maxsplitter = max(splitter_counts,key=splitter_counts.get)
    paras = re.split(maxsplitter, text)
    if maxsplitter in [r'\b. <br \/>\r<br \/><br \/>\n',r'\b.<br><br>',r'\b\.\n\n']:
        paras = [para.strip() + '.' for para in paras]
    return paras

In [233]:
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [510]:
contracts['Text_by_Paragraph'] = contracts['Contract_Text'].apply(get_paragraphs)
contracts['Num Paragraphs'] = contracts['Text_by_Paragraph'].apply(len)
contracts = contracts[contracts['Num Paragraphs'] > 10].copy() # remove short documents
print "Number of contracts: " + str(len(contracts))

Number of contracts: 1535


In [528]:
contracts_by_para = pd.DataFrame()
temp = []
for index, row in contracts.iterrows():
    i = 1
    for para in row['Text_by_Paragraph']:
        temp.append({'OCID':row['OCID'],
                'Source':row['Source'],
                'Language':row['Language'],
                'Country Name':row['Country Name'],
                'Resource':row['Resource'],
                'Contract Type':row['Contract Type'],
                'Document Type':row['Document Type'],
                'Paragraph_Num':i,
                'Paragraph_Text': para})
        i += 1
contracts_by_para = pd.DataFrame(temp)

In [529]:
contracts_by_para['Clean_Paragraph_Text'] = contracts_by_para['Paragraph_Text'].apply(strip_tags)

In [530]:
characters_to_replace = ['\xef','\xbb','\xbf','\r','\n']
for char in characters_to_replace:
    contracts_by_para['Clean_Paragraph_Text'] = contracts_by_para['Clean_Paragraph_Text'].str.replace(char," ")

In [531]:
contracts_by_para['TextLength'] = contracts_by_para['Clean_Paragraph_Text'].str.len()

In [532]:
contracts_by_para.head(10)

Unnamed: 0,Contract Type,Country Name,Document Type,Language,OCID,Paragraph_Num,Paragraph_Text,Resource,Source,Clean_Paragraph_Text,TextLength
0,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,1,INVESTMENT AGREEMENT,Copper;Gold;Molybdenum;Silver,rc,INVESTMENT AGREEMENT,20
1,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,2,BETWEEN<br /><br />\nTHE GOVERNMENT OF MONGOLIA<br /><br />\nAND<br /><br />\nIVANHOE MINES MONGOLIA INC LLC<br /><br />\nAND<br /><br />\nIVANHOE MINES LTD<br /><br />\nAND<br /><br />\nRIO TINTO...,Copper;Gold;Molybdenum;Silver,rc,BETWEEN THE GOVERNMENT OF MONGOLIA AND IVANHOE MINES MONGOLIA INC LLC AND IVANHOE MINES LTD AND RIO TINTO INTERNATIONAL HOLDINGS LIMITED,136
2,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,3,INVESTMENT AGREEMENT,Copper;Gold;Molybdenum;Silver,rc,INVESTMENT AGREEMENT,21
3,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,4,"In accordance with Article 29 of the Minerals Law of Mongolia (hereinafter referred to as the<br /><br />\n“Minerals Law”), Resolution Number 40 dated 4 December 2008 and Resolution Number 57<br /...",Copper;Gold;Molybdenum;Silver,rc,"In accordance with Article 29 of the Minerals Law of Mongolia (hereinafter referred to as the “Minerals Law”), Resolution Number 40 dated 4 December 2008 and Resolution Number 57 dated 16 July 200...",2059
4,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,5,Page 1,Copper;Gold;Molybdenum;Silver,rc,Page 1,6
5,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,6,"Parties during the period of exploring, mining and processing operations within the<br /><br />\nContract Area;<br /><br />\nResolution Number 57 of the State Great Khural dated 16 July 2009 has ...",Copper;Gold;Molybdenum;Silver,rc,"Parties during the period of exploring, mining and processing operations within the Contract Area; Resolution Number 57 of the State Great Khural dated 16 July 2009 has resolved to authorise the ...",373
6,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,7,1.,Copper;Gold;Molybdenum;Silver,rc,1.,2
7,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,8,CHAPTER ONE: GENERAL,Copper;Gold;Molybdenum;Silver,rc,CHAPTER ONE: GENERAL,20
8,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,9,1.1.,Copper;Gold;Molybdenum;Silver,rc,1.1.,4
9,Concession Agreement,Mongolia,Contract,en,ocds-591adf-3757991515,10,"Except as provided by Clause 15.26, this Agreement shall come into effect on and from<br /><br />\nthe Effective Date and remain in effect, in accordance with Article 29.3 of the Minerals<br /><br...",Copper;Gold;Molybdenum;Silver,rc,"Except as provided by Clause 15.26, this Agreement shall come into effect on and from the Effective Date and remain in effect, in accordance with Article 29.3 of the Minerals Law, for an initial p...",223


In [533]:
contracts_by_para.to_pickle('contract_data/cleaned_unannotated_contracts_by_paragraph.pkl')

In [534]:
print len(contracts_by_para)

653133
