![DataKind logo](DataKind_orange.png)

# Omidyar Extractives Project 1
## Clean Contract Text (Notebook 2 of 5)
### 1. Reads in contract text
### 2. Strips HTML from text
### 3. Cleans text for NLP prep (remove unicode, special characters, stopwords, etc.)
### 4. Outputs cleaned text appended to dataframe from Notebook 1

In [23]:
import re
import string
import pandas as pd
from HTMLParser import HTMLParser
import nltk
from nltk.corpus import stopwords
from langdetect import detect

import warnings
with warnings.catch_warnings():
    # catch_warnings() restores the previous filters when the block exits, so
    # the imports whose DeprecationWarnings should be silenced must run inside it.
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    import matplotlib.pylab as plt
    import seaborn as sns
%matplotlib inline

In [25]:
# Load the contracts DataFrame produced by Notebook 1.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
contracts = pd.read_pickle('contract_data/1_contracts_with_text.pkl')
# Path intended for this notebook's output (the write itself is not in the cells shown here).
outfile = 'contract_data/2_contracts_with_text.pkl'

In [26]:
# Parenthesised single-argument form prints the same text on Python 2 and 3.
print("Number of contracts: " + str(len(contracts)))

Number of contracts: 1496


### Strip HTML, carriage returns, newline characters

In [27]:
class MLStripper(HTMLParser):
    """HTML parser that discards markup and accumulates only the text nodes."""
    def __init__(self):
        # Run the base initializer rather than calling reset() directly: on
        # Python 3 it also sets parser state (convert_charrefs) that feed()
        # reads, and on Python 2 it simply calls reset().
        HTMLParser.__init__(self)
        self.fed = []
    def handle_data(self, d):
        """Collect one chunk of text found outside of tags."""
        self.fed.append(d)
    def get_data(self):
        """Return every collected text chunk, concatenated in the order seen."""
        return ''.join(self.fed)

def strip_tags(html):
    """
    Returns the text of ``html`` with the markup removed.

    A fresh MLStripper is fed the whole string and the text chunks it
    collected are returned joined together.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text_only = stripper.get_data()
    return text_only

In [28]:
# Run strip_tags over every Contract_Text value and keep the result in a new
# HTML_Stripped_Text column (the original Contract_Text column is left as is).
contracts['HTML_Stripped_Text'] = contracts['Contract_Text'].apply(strip_tags)

In [29]:
# Replace the UTF-8 byte-order mark as one three-byte unit. Replacing '\xef',
# '\xbb' and '\xbf' one byte at a time also hits those bytes where they occur
# inside other multi-byte UTF-8 characters and corrupts them.
# Carriage returns and newlines are still replaced individually.
characters_to_replace = ['\xef\xbb\xbf','\r','\n']
for char in characters_to_replace:
    contracts['HTML_Stripped_Text'] = contracts['HTML_Stripped_Text'].str.replace(char," ")

In [30]:
# Length of each HTML-stripped text, as reported by the pandas .str.len() accessor.
contracts['TextLength'] = contracts['HTML_Stripped_Text'].str.len()

Unnamed: 0,OCID,Contract Type,Pdf Url,Contract_Text
13,ocds-591adf-8368260751,Production or Profit Sharing Agreement,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/1106/1106-annex-mpsa-no-011-92-x-smr.pdf,Loading page 1
32,ocds-591adf-1344520384,,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/112/112-journal-officiel-de-la-republique-du-congo-marine-iv.pdf,"Du jeudi 19 avril 2012<br /><br />\n<br /><br />\nJournal officiel de la République du Congo<br /><br />\n<br /><br />\nDécret n° 2012 - 330 du 12 avril 2012 portant prorogation de la seconde période de validité du<br /><br />\npermis de recherche “Marine IV”<br /><br />\nLe Président de la République,<br /><br />\nVu la Constitution ;<br /><br />\nVu la loi n° 24-94 du 23 août 1994 portant co..."
117,ocds-591adf-0855371527,Concession Agreement;Contract Annex,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/862/862-cbg-gac-convention-ratification-guinee-2005.pdf,﻿ REPUBLIQUE DE GUINEE <br />\r<br /><br />\n <br />\r<br /><br />\n Travail-Justice-Solidarite <br />\r<br /><br />\n ...
124,ocds-591adf-1432977950,,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/892/892-helalin-annulation-permis-guinee-2010.pdf,"MINISTERE DES MINES ET DE LA GEOLOGIE REPUBLIQUE DE GUINEE Travail - Justice - Solidarité<br /><br />\r\nARRETE N°A2010/2399/MMG/SGG<br /><br />\r\nNOTIFIANT LE RETRAIT DES PERMIS DE RECHERCHES MINIERES, ACCORDES AUX SOCIETES CI-DESSOUS<br /><br />\r\nLE MINISTRE<br /><br />\r\nSur Recommandation du Centre de Promotion et de Développement miniers<br /><br />\r\nVu Le Communiqué N°001/CNDD du..."
129,ocds-591adf-8307320141,Concession Agreement,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/900/900-bellzone-holdings-ordonnance-guinee-2010.pdf,REPUBLIQUE DE GUINEE Travail —Justice - Solidarité <br /><br />\r\nPRESIDENCE DE LA REPUBLIQUE<br /><br />\r\nSECRETARIAT GENERAL DU GOUVERNEMENT <br /><br />\r\n<br /><br />\r\nORDONNANCE N° 011/PRG/CNDD/SGG/2010<br /><br />\r\nPORTANT RATIFICATION DE LA CONVENTION DE BASE CONCLUE LE 26 JUILLET 2010 ENTRE LA REPUBLIQUE DE GUINEE ET ET LES SOCIETES BELLZONE MINING P/c ET BELLZONE HOLDINGS S.A...
134,ocds-591adf-0621960018,Concession Agreement;Contract Annex,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/908/908-cbk-annex-a-guinee-2000.pdf,﻿ RUSSIAN ALUMINIUM <br />\r<br /><br />\n <br />\r<br /><br />\n JOINT STOCK COMPANY <br />\r<br /><br />\n ...
135,ocds-591adf-2096071243,Concession Agreement;Contract Annex,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/907/907-cbk-accord-annexe-c-guinee-2000.pdf,"ACCORD<br /><br />\r\n<br /><br />\r\nEntre les soussignés<br /><br />\r\n<br /><br />\r\nLa compagnie « RUSSKY ALUMINY LTD », enregistrée aux Îles Vierges Britanniques, sise à : bureau de Commonwealth Trust Limited, P.O. Box 3321, Drake Chambers, Road Town, Tortola, British Virgin Islands, ci-après dénommée la « Compagnie », représentée par M. Anatoli PANTCHENKO, Chef de la Représentation de ..."
164,ocds-591adf-5265824533,Exploitation Permit/License,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/1757/1757-egypt-egyptian-general-petroleum-corporation-alliance-egyptian-national-exploration-company-lease-2010.pdf,"Cairo :<br /><br />\r\nEng. Sameh Fahmy <br /><br />\r\nMinister of Petroleum <br /><br />\r\nMinistry of Petroleum <br /><br />\r\nNasr City<br /><br />\r\nSubject :<br /><br />\r\n<br /><br />\r\nRequest for the approval of the Minister of Petroleum to convert the oil discoveries of Lagia 6 and Lagia 7 wells to a Development Lease "" Lagia Development Lease"" in Central Sinai Conces..."
324,ocds-591adf-0725247752,Concession Agreement,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/914/914-euronimba-ratification-guinee-2003.pdf,REPUBLIQUE DE GUINEE<br /><br />\r\nTravail – Justice - Solidarité<br /><br />\r\nASSEMBLEE NATIONALE DE LA REPUBLIQUE DE GUINEE<br /><br />\r\nLOI<br /><br />\r\nL/2003/009/AN<br /><br />\r\nRATIFIANT ET PROMULGUANT LA CONVENTION DE CONCESSION MINIERE ENTRE LA REPUBLIQUE DE GUINEE ET EURONIMBA POUR L’EXPLOITATION DES GISEMENTS DE FER DES MONTS NIMBA<br /><br />\r\nL’ASSEMBLEE NATIONALE DE LA ...
326,ocds-591adf-4278971256,Concession Agreement,https://resourcecontracts-nrgi.s3-us-west-2.amazonaws.com/918/918-cbg-guinee-1964.pdf,N°252 / PRC<br /><br />\r\nREPUBLIQUE DE GUINEE<br /><br />\r\n<br /><br />\r\nTRAVAIL-JUSTICE-SOLIDARITE<br /><br />\r\n<br /><br />\r\nPRESIDENCE DE LA REPUBLIQUE<br /><br />\r\n<br /><br />\r\nSECRETARIAT GENERAL DU GOUVERNEMENT<br /><br />\r\n<br /><br />\r\n<br /><br />\r\nDECRET<br /><br />\r\n<br /><br />\r\nLE PRESIDENT DE LA REPUBLIQUE<br /><br />\r\n<br /><br />...


### Clean Text

In [15]:
# Emoji matcher: one or more of
#   - \ud83c + \udf00-\udfff             (surrogate pairs for U+1F300-U+1F3FF)
#   - \ud83d + \udc00-\ude4f / \ude80-\udeff (U+1F400-U+1F64F, U+1F680-U+1F6FF)
#   - U+2600-U+26FF and U+2700-U+27BF    (misc symbols and dingbats)
# The surrogate-pair alternatives only match text stored as UTF-16 code units.
# Uses the standard-library ``re`` imported at the top of the notebook instead
# of reaching it through nltk's namespace.
unicode_re = re.compile(u'('u'\ud83c[\udf00-\udfff]|'
                        u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
                        u'[\u2600-\u26FF\u2700-\u27BF])+',
                        re.UNICODE)
# Anything starting with "www", "http://" or "https://" up to the next whitespace.
url_re = r'(?:www|https?\://)\S+'

In [16]:
def unicode_remove(text):
    """
    Returns text with every non-ASCII character dropped.

    Byte strings are first decoded with the 'unicode_escape' codec, so
    backslash escape sequences in the text are interpreted. If the text holds
    a malformed escape, the bytes are decoded as latin-1 instead and
    backslashes are kept literally. Text that is already decoded skips the
    escape step.

    The result is a native ``str`` on both Python 2 and Python 3.
    """
    if isinstance(text, bytes):
        try:
            decoded = text.decode('unicode_escape')
        except UnicodeDecodeError:
            # Malformed escape (e.g. "\x" not followed by two hex digits).
            # latin-1 maps every byte to a code point, so this cannot fail and
            # leaves ordinary letters and backslashes intact.
            decoded = text.decode('latin-1')
    else:
        decoded = text
    ascii_only = decoded.encode('ascii', 'ignore')
    # Python 2: ``str`` is ``bytes``, so the encoded value is already a str.
    # Python 3: decode back so callers keep working with text, not bytes.
    return ascii_only if str is bytes else ascii_only.decode('ascii')

def perform_lowercase(text):
    """
    Returns a lowercased copy of text (the argument itself is not modified)
    """
    return text.lower()

def urlpath_replace(text):
    """
    Replaces each URL in text with its network location (host).

    URLs are located with the module-level ``url_re`` pattern. A match that
    parses without a network location (for example a bare "www..." address
    with no scheme) is replaced by its parsed path instead.

    Returns the text with every occurrence of each matched URL replaced.
    """
    # Imported here because the notebook's import cell does not bring urlparse
    # into scope; without it every lookup fails and no URL is ever replaced.
    try:
        from urlparse import urlparse  # Python 2
    except ImportError:
        from urllib.parse import urlparse  # Python 3

    new_text = text
    for url in re.findall(url_re, text):
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects some malformed URLs; leave those untouched.
            continue
        replacement = parsed.netloc if parsed.netloc != '' else parsed.path
        new_text = new_text.replace(url, replacement)
    return new_text

def emoji_replace_word(text):
    """
    Substitutes ' emoji ' for each run of characters matched by unicode_re.
    Only unicode objects are processed; any other input is returned unchanged.
    http://stackoverflow.com/questions/26568722/remove-unicode-emoji-using-re-in-python
    """
    if not isinstance(text, unicode):
        return text
    return unicode_re.sub(' emoji ', text)

def remove_underscore(text):
    """
    Returns text with every underscore turned into a single space
    """
    return ' '.join(text.split('_'))

def punctuation_remove(text):
    """
    Returns a copy of text in which each character from string.punctuation
    is replaced by a single space
    """
    punctuation_class = '[{0}]'.format(re.escape(string.punctuation))
    return re.compile(punctuation_class).sub(' ', text)

def doublespace_remove(text):
    """
    Returns text with each run of consecutive spaces collapsed to one space
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

In [17]:
def cleaning(text):
    """
    Runs text through the cleaning steps in a fixed order and returns the result.

    Order: unicode_remove, perform_lowercase, urlpath_replace,
    emoji_replace_word, punctuation_remove, remove_underscore,
    doublespace_remove. Each step receives the previous step's output.
    """
    pipeline = (
        unicode_remove,
        perform_lowercase,
        urlpath_replace,
        emoji_replace_word,
        punctuation_remove,
        remove_underscore,
        doublespace_remove,
    )
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [18]:
# Build CleanText row by row: rows whose Language is 'ar' keep their
# HTML-stripped text unchanged; every other row is passed through cleaning().
# NOTE(review): the 'Language' column is not created in the cells shown here —
# presumably it comes with the input pickle; confirm.
contracts['CleanText'] = contracts.apply(lambda row: row['HTML_Stripped_Text'] if row['Language'] == 'ar' else cleaning(row['HTML_Stripped_Text']),axis=1)

### Optimizing for NLP

In [None]:
# Import the corpus reader under an alias: rebinding ``stopwords`` to a set
# would otherwise shadow the reader and make this cell fail when run a second time.
from nltk.corpus import stopwords as stopwords_corpus

stopwords = set(stopwords_corpus.words('english'))
# NOTE(review): ``stopword_list`` is not defined in the cells shown here —
# confirm it is created before this cell runs.
stopwords.update(stopword_list)

def remove_stopwords(text):
    """
    Returns text lowercased, split on whitespace, with every word found in
    the ``stopwords`` set removed and the remaining words joined by single spaces
    """
    return ' '.join(word for word in text.lower().split() if word not in stopwords)