![alt text](DataKind_orange.png)

# Omidyar Extractives Project 1
## Clean Contract Text (Notebook 4 of 8)
### 1. Reads in contract text
### 2. Strips HTML from text
### 3. Cleans text for NLP prep (removes non-ASCII characters, special characters, stopwords, etc.)
### 4. Outputs cleaned text appended to dataframe from Notebook 1

In [1]:
import re
import string
import pandas as pd
from HTMLParser import HTMLParser
import nltk
from nltk.corpus import stopwords
from langdetect import detect

import warnings
# A filter installed inside catch_warnings() is discarded when the `with` block
# exits, so the imports it is meant to silence have to run inside the block.
# Names bound inside a `with` block are still module-level names.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    import matplotlib.pylab as plt
    import seaborn as sns
%matplotlib inline

In [2]:
# Input: the pickled contracts DataFrame stored under contract_data/.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
contracts = pd.read_pickle('contract_data/1_contracts_with_text.pkl')
# Path this notebook's result is meant to be written to (presumably by a later cell -- confirm).
outfile = 'contract_data/2_contracts_with_text.pkl'

In [3]:
# Parenthesised single-argument form: prints the same line under Python 2
# and is also valid syntax under Python 3.
print("Number of contracts: " + str(len(contracts)))

Number of contracts: 1496


### Split text into paragraphs; strip HTML, carriage returns, and newline characters

In [4]:
def get_paragraphs(text):
    """
    Splits text into paragraphs

    A run of four or more consecutive whitespace characters is treated as a
    paragraph break. Returns the list of pieces between those breaks; text
    without such a run comes back as a one-element list.
    """
    # Raw string so that the backslash reaches the regex engine as written;
    # '\s' is not a valid escape in an ordinary string literal.
    return re.split(r'\s{4,}', text)

In [5]:
class MLStripper(HTMLParser):
    """
    HTML parser that keeps only the text content of what it is fed

    Every chunk passed to handle_data is collected in self.fed; get_data
    returns the chunks concatenated in the order they were seen. Tags
    themselves are not recorded.
    """
    def __init__(self):
        # Run the base-class initialiser (which calls reset() itself) instead of
        # calling reset() directly, so that any parser state created in
        # HTMLParser.__init__ exists before feed() is used.
        # The explicit call form is used rather than super() because the
        # Python 2 HTMLParser may not be a new-style class.
        HTMLParser.__init__(self)
        self.fed = []
    def handle_data(self, d):
        # Called by the parser with each run of text found between tags.
        self.fed.append(d)
    def get_data(self):
        # Concatenate the collected text chunks with no separator.
        return ''.join(self.fed)

def strip_tags(html):
    """
    Returns the text content of html, with markup removed by a fresh MLStripper
    """
    stripper = MLStripper()
    stripper.feed(html)
    text_only = stripper.get_data()
    return text_only

In [6]:
# Adds a Text_by_Paragraph column: each contract's raw text split by get_paragraphs into a list of pieces.
contracts['Text_by_Paragraph'] = contracts['Contract_Text'].apply(get_paragraphs)

In [20]:
# Widen the displayed column width to 200 characters so more of each text cell is visible.
pd.set_option('max_colwidth',200)
# Show the contracts whose text split into exactly two pieces.
contracts[contracts['Text_by_Paragraph'].map(len) ==2]

Unnamed: 0,OCID,Category,Contract Name,Contract Identifier,Language,Country Name,Resource,Contract Type,Signature Date,Document Type,...,Disclosure Mode,Retrieval Date,Pdf Url,Deal Number,Contract Note,Matrix Page,Annotation Category,Annotation Text,Contract_Text,Text_by_Paragraph
215,ocds-591adf-7104921871,rc,"Kosmos Energy Offshore Morocco HC, Cap Boujdour Offshore, PSA, 2011",,en,Morocco,Hydrocarbons,Production or Profit Sharing Agreement,2011-07-11,Contract,...,Company,2016-08-09,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/2148/2148-kosmos-energy-offshore-morocco-hc-cap-boujdour-offshore-psa-2011.pdf,,,,,,EX-10.27 28 a13-19720_1ex10d27.htm EX-10.27<br /><br />\nExhibit 10.27<br /><br />\nPETROLEUM AGREEMENT<br /><br />\n<br /><br />\nREGARDING<br /><br />\n<br /><br />\nTHE EXPLORATION FOR AND EXPL...,[EX-10.27 28 a13-19720_1ex10d27.htm EX-10.27<br /><br />\nExhibit 10.27<br /><br />\nPETROLEUM AGREEMENT<br /><br />\n<br /><br />\nREGARDING<br /><br />\n<br /><br />\nTHE EXPLORATION FOR AND EXP...
283,ocds-591adf-8080364797,rc,"Minera Yanacocha S.R.L., Maqui Maqui, Investment Promotion Agreement, 1994",,es,Peru,Gold,Investment Promotion Agreement,1994-05-20,Contract,...,Government,2016-02-13,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/1684/1684-peru-minera-yanacocha-exploration-investment-contract-1994.pdf,,,,,,CONTRATO DE GARANTIAS Y MEDIDAS DE PROMOCION A LA INVERSION<br /><br />\r\n<br /><br />\r\nDECRETO SUPREMO N° 0 4-94-EM<br /><br />\r\n<br /><br />\r\nSeñor Notario:<br /><br />\r\n<br /><br />\r\...,[CONTRATO DE GARANTIAS Y MEDIDAS DE PROMOCION A LA INVERSION<br /><br />\r\n<br /><br />\r\nDECRETO SUPREMO N° 0 4-94-EM<br /><br />\r\n<br /><br />\r\nSeñor Notario:<br /><br />\r\n<br /><br />\r...
303,ocds-591adf-4212712774,rc,"Compa��a Minera Oro Candente S.A., Exploration License, Investment Promotion Agreement, 2003",,es,Peru,other minerals,Exploration Permit/License;Investment Promotion Agreement,2003-09-15,Contract,...,Government,2016-01-20,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/1609/1609-peru-compania-minera-oro-exploration-investment-contract-2003.pdf,,,,,,",<br /><br />\nMINISTERIO<br /><br />\nDEENERGIA<br /><br />\ny MINAS<br /><br />\nDPDM-DGM<br /><br />\n<br /><br />\nFOLIO:<br /><br />\n<br /><br />\n211<br /><br />\nNómeros<br /><br />\n<br /...","[,<br /><br />\nMINISTERIO<br /><br />\nDEENERGIA<br /><br />\ny MINAS<br /><br />\nDPDM-DGM<br /><br />\n<br /><br />\nFOLIO:<br /><br />\n<br /><br />\n211<br /><br />\nNómeros<br /><br />\n<br ..."
342,ocds-591adf-1623268944,rc,"Eramen Minerals, Incorporated - Annex of MPSA No. 209-2005-III, 2005",ph_Sta-Cruz-Nickel-Project,en,Philippines,Nickel,Production or Profit Sharing Agreement,2005-04-19,Contract Annexe,...,Government,,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/1098/1098-annexes-mpsa-no-209-2005-iii.pdf,,,,,,︶<br /><br />\n <br /><br />\n <br /><br />\n一<br /><br />\n<br /><br />\nANNEX― A<br /><br />\n<br /><br />\nSECRETARY'S<br /><br />\nCER丁 lFiCATE<br /><br />\n<br /><br />\nν<br /><br />\n<br />...,[︶<br /><br />\n <br /><br />\n <br /><br />\n一<br /><br />\n<br /><br />\nANNEX― A<br /><br />\n<br /><br />\nSECRETARY'S<br /><br />\nCER丁 lFiCATE<br /><br />\n<br /><br />\nν<br /><br />\n<br /...
469,ocds-591adf-3068184322,rc,"Empresa Mo�ambicana de Explora��o Mineira, Minas Moatize, Limitada, Moatize, Concession, 2013",mz_dd20130413_Moatize-No-1163C_Concession_Emem_Minas,pt,Mozambique,Coal,Concession Agreement,2013-04-03,Contract,...,Government,2016-01-02,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/1498/1498-mozambique-emem-minas-tm-no-1163c-concession-2013.pdf,,,,,,[image]<br /><br />\r\nREPÚBLICA DE MOÇAMBIQUE MINISTÉRIO DOS RECURSOS MINERAIS<br /><br />\r\n<br /><br />\r\nCONTRATO MINEIRO ENTRE<br /><br />\r\n<br /><br />\r\nO ESTADO REPRESENTADO PELO MINI...,[[image]<br /><br />\r\nREPÚBLICA DE MOÇAMBIQUE MINISTÉRIO DOS RECURSOS MINERAIS<br /><br />\r\n<br /><br />\r\nCONTRATO MINEIRO ENTRE<br /><br />\r\n<br /><br />\r\nO ESTADO REPRESENTADO PELO MIN...
488,ocds-591adf-5374338853,rc,"La Societe Agip Mineraria, Concession El borma - Tunisia, 1960",,fr,Tunisia,Hydrocarbons,Concession Agreement,1960-06-10,Contract,...,Government,2016-07-16,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/2046/2046-la-societe-agip-mineraria-concession-el-borma-tunisia-1960.pdf,,,,,,CONVENTION<br /><br />\r\n<br /><br />\r\nentre<br /><br />\r\n<br /><br />\r\nL’Etat Tunisien et la Société Agip Mineraria<br /><br />\r\n<br /><br />\r\n<br /><br />\r\nEDITION JANVIER 1982<br /...,[CONVENTION<br /><br />\r\n<br /><br />\r\nentre<br /><br />\r\n<br /><br />\r\nL’Etat Tunisien et la Société Agip Mineraria<br /><br />\r\n<br /><br />\r\n<br /><br />\r\nEDITION JANVIER 1982<br ...
515,ocds-591adf-3538681265,rc,"Philex Mining Corporation - Annex of MPSA No. 276-2009-CAR, 2009",ph_Padcal-Copper-Gold-Operation,en,Philippines,Copper;Gold,Production or Profit Sharing Agreement,2009-01-19,Contract Annexe,...,Government,,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/1088/1088-annexes-mpsa-no-276-2009-car.pdf,,,,,,"ANNEX“ A""<br /><br />\nV<br /><br />\n<br /><br />\nCORPORATE SECRETARY'S<br /><br />\nCERttIFiCAT10N<br /><br />\n1<br /><br />\n<br /><br />\n▼<br /><br />\n<br /><br />\n/<br /><br />\n<br /><...","[ANNEX“ A""<br /><br />\nV<br /><br />\n<br /><br />\nCORPORATE SECRETARY'S<br /><br />\nCERttIFiCAT10N<br /><br />\n1<br /><br />\n<br /><br />\n▼<br /><br />\n<br /><br />\n/<br /><br />\n<br />..."
520,ocds-591adf-3156014237,rc,"Shuley Mine, Incorporated - Annex of MPSA No. 072-97-XIII (Surigao Mineral Reservation), 1997",ph_Nonoc-Nickel-Project,en,Philippines,Nickel,Production or Profit Sharing Agreement,1997-08-07,Contract Annexe,...,Government,,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/1103/1103-annexes-mpsa-no-072-97-xiii.pdf,,,,,,"Annex B<br /><br />\n<br /><br />\nPNNTX<br /><br />\n<br /><br />\nG<br /><br />\n<br /><br />\nOrry,tor..<br /><br />\n<br /><br />\nO<br /><br />\n<br /><br />\nNonoc lsland<br /><br />\n<br />...","[Annex B<br /><br />\n<br /><br />\nPNNTX<br /><br />\n<br /><br />\nG<br /><br />\n<br /><br />\nOrry,tor..<br /><br />\n<br /><br />\nO<br /><br />\n<br /><br />\nNonoc lsland<br /><br />\n<br /..."
583,ocds-591adf-2742639589,rc,"Ivanhoe Energy Ecuador Inc., Empresa Estatal de Petr�leos del Ecuador, Empresa Estatal de Exploraci�n y Producci�n de Petr�leos del Ecuador, Block 20 Pungarayacu, Service Contract, 2008",,en,Ecuador,Hydrocarbons,Service Contract,2008-10-08,Contract,...,,,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/779/779-empresa-petroecuador-ecuador-2007.pdf,,,,,,"ENGLISH TRANSLATION OF CONTRACT AS SIGNED<br /><br />\nOCTOBER 8, 2008<br /><br />\nSPECIFIC SERVICES CONTRACT FOR DEVELOMENT, PRODUCTION AND<br /><br />\nUPGRADING OF CRUDE OIL<br /><br />\nIN B...","[ENGLISH TRANSLATION OF CONTRACT AS SIGNED<br /><br />\nOCTOBER 8, 2008<br /><br />\nSPECIFIC SERVICES CONTRACT FOR DEVELOMENT, PRODUCTION AND<br /><br />\nUPGRADING OF CRUDE OIL<br /><br />\nIN ..."
652,ocds-591adf-6064437287,rc,"Aral Petroleum Capital CJSC, North Block, Exploration License, 2002",kz_North-Block_dd20021229_Exploration-Contract_Aral-CJSC,en,Kazakhstan,Hydrocarbons,Exploration Permit/License,2002-12-29,Contract,...,Other,2015-08-31,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/1060/1060-kazakhstan-aral-petroleum-north-block-exploration-contract.pdf,,,,,,UNOFFICIAL TRANSLATION<br /><br />\nPREPARED BY DENTON WILDE SAPTE<br /><br />\nANNEX A<br /><br />\n<br /><br />\n(The State Emblem of the Republic of Kazakhstan)<br /><br />\n<br /><br />\nCONTR...,[UNOFFICIAL TRANSLATION<br /><br />\nPREPARED BY DENTON WILDE SAPTE<br /><br />\nANNEX A<br /><br />\n<br /><br />\n(The State Emblem of the Republic of Kazakhstan)<br /><br />\n<br /><br />\nCONT...


In [9]:
# Inspect the paragraph list stored under entry 0 of the Text_by_Paragraph column.
contracts['Text_by_Paragraph'][0]



In [None]:
# Adds an HTML_Stripped_Text column: the raw contract text passed through strip_tags.
contracts['HTML_Stripped_Text'] = contracts['Contract_Text'].apply(strip_tags)

In [15]:
# '\xef\xbb\xbf' is the UTF-8 byte-order mark. It is replaced as one sequence:
# replacing its three bytes one by one would also hit unrelated text that
# merely contains one of them (e.g. the inverted question mark is '\xc2\xbf'
# in UTF-8, and '\xbf' on its own is that same character in Latin-1).
# Carriage returns and newlines are each replaced by a space as well.
characters_to_replace = ['\xef\xbb\xbf','\r','\n']
for char in characters_to_replace:
    contracts['HTML_Stripped_Text'] = contracts['HTML_Stripped_Text'].str.replace(char," ")

In [16]:
# Adds a TextLength column: the length of each contract's HTML-stripped text.
contracts['TextLength'] = contracts['HTML_Stripped_Text'].str.len()

In [22]:
## Remove short documents
# Keeps only rows whose TextLength is greater than 2000; .copy() makes the result an
# independent frame so later column assignments don't write into a slice of the original.
contracts = contracts[contracts['TextLength'] > 2000].copy()

### Clean Text

In [None]:
# Pattern matching one or more consecutive characters from three groups:
#   * \ud83c followed by a code unit in \udf00-\udfff,
#   * \ud83d followed by a code unit in \udc00-\ude4f or \ude80-\udeff,
#   * a single character in U+2600-U+26FF or U+2700-U+27BF.
# The first two are written as surrogate pairs.
# NOTE(review): surrogate-pair patterns presumably only match on an interpreter that
# stores astral characters as two code units (a "narrow" build) -- confirm.
unicode_re = nltk.regexp.re.compile(u'('u'\ud83c[\udf00-\udfff]|'
                        u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
                        u'[\u2600-\u26FF\u2700-\u27BF])+',
                        nltk.regexp.re.UNICODE)
# Pattern for a URL-like token: 'www', 'http://' or 'https://' followed by one or more non-whitespace characters.
url_re = r'(?:www|https?\://)\S+'

In [None]:
def unicode_remove(text):
    """
    Best-effort reduction of text to ASCII

    Tries, in order, and returns the first result that does not raise:
      1. decode with 'unicode_escape', then encode to ASCII dropping
         anything that cannot be encoded
      2. the same with the 'string-escape' codec
      3. repr(text) with every backslash and every 'x' character deleted

    NOTE(review): .decode() on the input and the two-argument translate(None, ...)
    are Python 2 str APIs. Both bare excepts swallow every error, including
    ones unrelated to encoding. Step 3 deletes ordinary 'x' letters too and
    keeps whatever quoting repr() adds -- confirm that this is intended.
    """
    try:
        # Escape sequences in the text are interpreted by the codec before the ASCII encode.
        return text.decode('unicode_escape').encode('ascii','ignore')
    except:
        try:
            return text.decode('string-escape').encode('ascii','ignore')
        except:
            # r'\\x' is the three characters backslash, backslash, x: the set of characters to delete.
            return repr(text).translate(None,r'\\x')

def perform_lowercase(text):
    """
    Returns a lowercased copy of text
    """
    return text.lower()

def urlpath_replace(text, pattern=None):
    """
    Replaces each URL in text with its network location (host), or with its
    path when the parsed URL has no network location

    text:    string to process
    pattern: regex used to find URLs; defaults to the module-level url_re

    Returns the new string. A URL that cannot be parsed is left unchanged.
    """
    # urlparse is not among the notebook's top-level imports, so bring it into
    # scope here; the function lives in a different module on Python 2 and 3.
    try:
        from urlparse import urlparse as parse_url
    except ImportError:
        from urllib.parse import urlparse as parse_url

    if pattern is None:
        pattern = url_re

    new_text = text
    # URLs are located in the original text; replacements accumulate in new_text.
    for url in re.findall(pattern, text):
        try:
            parsed = parse_url(url)
        except ValueError:
            # urlparse rejects some malformed URLs; keep those as they are.
            continue
        replacement = parsed.netloc if parsed.netloc != '' else parsed.path
        new_text = new_text.replace(url, replacement)
    return new_text

def emoji_replace_word(text):
    """
    Substitutes ' emoji ' for every match of unicode_re in a unicode string
    Input that is not a unicode instance is handed back untouched
    http://stackoverflow.com/questions/26568722/remove-unicode-emoji-using-re-in-python
    """
    if not isinstance(text, unicode):
        return text
    return nltk.regexp.re.sub(unicode_re, ' emoji ', text)

def remove_underscore(text):
    """
    Returns text with every underscore swapped for a space
    """
    return ' '.join(text.split('_'))

def punctuation_remove(text):
    """
    Returns text with every character of string.punctuation swapped for a space
    """
    # Character class built from the escaped punctuation set.
    punct_class = r'[' + re.escape(string.punctuation) + ']'
    return re.compile(punct_class).sub(' ', text)

def doublespace_remove(text):
    """
    Returns text with each run of one or more spaces collapsed to a single space
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

In [None]:
def cleaning(text):
    """
    Runs text through the cleaning steps, in order, and returns the result:
    unicode_remove, perform_lowercase, urlpath_replace, emoji_replace_word,
    punctuation_remove, remove_underscore, doublespace_remove
    """
    pipeline = (
        unicode_remove,
        perform_lowercase,
        urlpath_replace,
        emoji_replace_word,
        punctuation_remove,
        remove_underscore,
        doublespace_remove,
    )
    cleaned = text
    # Each step receives the previous step's output.
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [None]:
def _clean_row(row):
    """
    Returns the row's HTML-stripped text unchanged when its Language is 'ar',
    otherwise that text passed through cleaning
    """
    stripped = row['HTML_Stripped_Text']
    if row['Language'] == 'ar':
        return stripped
    return cleaning(stripped)

contracts['CleanText'] = contracts.apply(_clean_row, axis=1)

### Optimizing for NLP

In [None]:
# `stopwords` is rebound below from the corpus module to a plain set. Reading the
# word list through a separate alias keeps this cell re-runnable after that rebinding.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('english'))
# NOTE(review): stopword_list is not defined in any visible cell; it has to exist
# before this line runs -- confirm where it comes from.
stopwords.update(stopword_list)
def remove_stopwords(text):
    """
    Lowercases text, splits it on whitespace, drops every token found in the
    stopwords set and returns the remaining tokens joined by single spaces
    """
    kept = []
    for token in text.lower().split():
        if token not in stopwords:
            kept.append(token)
    return ' '.join(kept)