![DataKind logo](DataKind_orange.png)

# Omidyar Extractives Project 1
## Clean Contract Text (Notebook 2 of 5)
### 1. Reads in contract text
### 2. Strips HTML from text
### 3. Cleans text for NLP prep (remove unicode, special characters, stopwords, etc.)
### 4. Outputs cleaned text appended to dataframe from Notebook 1

In [9]:
import re
import string
import pandas as pd
from HTMLParser import HTMLParser
import nltk
from nltk.corpus import stopwords
from langdetect import detect

import warnings
# Silence DeprecationWarnings raised while the plotting libraries import.
# catch_warnings() restores the previous filters when the block exits, so the
# imports have to happen *inside* the block for the filter to apply to them.
# A `with` block does not create a new scope: `plt` and `sns` are still bound
# at module level.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    import matplotlib.pylab as plt
    import seaborn as sns
%matplotlib inline

In [10]:
# Path that this notebook's result is meant to be written to.
outfile = 'contract_data/2_contracts_with_text.pkl'

# Load the pickled dataframe produced by Notebook 1.
# NOTE: unpickling executes arbitrary code; only load pickle files you trust.
contracts = pd.read_pickle('contract_data/1_contracts_with_text.pkl')

In [11]:
# Report how many contracts were loaded.
print("Number of contracts: %d" % len(contracts))

Number of contracts: 1496


### Strip HTML, carriage returns, newline characters

In [13]:
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and accumulates only text nodes."""

    def __init__(self):
        # Run the real base-class initializer instead of calling reset()
        # directly: on Python 3 it also sets parser state (convert_charrefs)
        # that feed() requires. The explicit HTMLParser.__init__ form is used
        # rather than super() so it also works when the base class is an
        # old-style class (Python 2).
        HTMLParser.__init__(self)
        # Text fragments collected so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Parser callback: remember a run of text found outside any tag."""
        self.fed.append(d)

    def get_data(self):
        """Return every collected text fragment joined into one string."""
        return ''.join(self.fed)

def strip_tags(html):
    """
    Returns the text content of an HTML string with all tags removed
    """
    stripper = MLStripper()
    # NOTE(review): the parser is fed but never close()d, so an unterminated
    # trailing construct may stay buffered and not reach the output - confirm
    # whether that matters for this corpus.
    stripper.feed(html)
    text_only = stripper.get_data()
    return text_only

In [14]:
# Run strip_tags over every value of the Contract_Text column and store the
# result in a new HTML_Stripped_Text column.
contracts['HTML_Stripped_Text'] = contracts['Contract_Text'].apply(strip_tags)

In [15]:
# Replace each listed character with a single space, one pass per character:
# carriage returns, newlines, and the bytes 0xEF, 0xBB, 0xBF.
# NOTE(review): 0xEF 0xBB 0xBF in sequence is the UTF-8 byte-order mark, but
# here each byte is replaced wherever it occurs on its own. If the text is
# UTF-8 encoded bytes this could also hit bytes inside other multi-byte
# characters - confirm against the actual data.
characters_to_replace = ['\xef','\xbb','\xbf','\r','\n']
for char in characters_to_replace:
    contracts['HTML_Stripped_Text'] = contracts['HTML_Stripped_Text'].str.replace(char," ")

In [16]:
# Length of each stripped document, as reported by pandas' .str.len().
contracts['TextLength'] = contracts['HTML_Stripped_Text'].str.len()

In [22]:
# Keep only documents whose stripped text is longer than 2000; the copy
# detaches the result from the original frame.
contracts = contracts.loc[contracts['TextLength'] > 2000].copy()

### Clean Text

In [None]:
# Matches one or more consecutive emoji / pictograph characters:
# U+1F300-U+1F64F, U+1F680-U+1F6FF, U+2600-U+26FF and U+2700-U+27BF.
# The stdlib `re` module is used directly instead of reaching it through nltk.
try:
    # Wide (UCS-4) Python 2 builds and Python 3: astral characters are single
    # code points, so plain ranges work.
    unicode_re = re.compile(
        u'['
        u'\U0001F300-\U0001F64F'
        u'\U0001F680-\U0001F6FF'
        u'\u2600-\u26FF\u2700-\u27BF'
        u']+',
        re.UNICODE)
except re.error:
    # Narrow (UCS-2) Python 2 builds: the wide ranges above become invalid
    # surrogate ranges, so match the surrogate pairs explicitly instead.
    unicode_re = re.compile(
        u'('
        u'\ud83c[\udf00-\udfff]|'
        u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
        u'[\u2600-\u26FF\u2700-\u27BF])+',
        re.UNICODE)

# Matches a whitespace-delimited token starting with "www", "http://" or
# "https://".
url_re = r'(?:www|https?\://)\S+'

In [None]:
def unicode_remove(text):
    """
    Returns text reduced to plain ASCII

    Backslash escape sequences written literally in the text (for example
    the six characters ``\\u2019``) are decoded first, then every non-ASCII
    character is dropped. If the text contains a malformed escape sequence
    the escapes are left as they are and only non-ASCII characters are
    dropped.

    Accepts byte strings or unicode strings; returns the native ``str`` type.
    """
    # The escape codec works on bytes, so normalise to bytes first. For
    # unicode input, non-ASCII characters are dropped here already.
    if isinstance(text, bytes):
        raw = text
    else:
        raw = text.encode('ascii', 'ignore')

    try:
        decoded = raw.decode('unicode_escape')
    except UnicodeDecodeError:
        # Malformed escape (e.g. a truncated "\x"): keep the text literally
        # instead of mangling it.
        decoded = raw.decode('ascii', 'ignore')

    cleaned = decoded.encode('ascii', 'ignore')
    # On Python 2 the encoded bytes already are `str`; on Python 3 convert the
    # ASCII bytes back to `str` so later string functions can use them.
    if isinstance(cleaned, str):
        return cleaned
    return cleaned.decode('ascii')

def perform_lowercase(text):
    """
    Returns a lowercased copy of text
    """
    return text.lower()

def urlpath_replace(text, pattern=None):
    """
    Replaces each URL in text with its network location (host)

    A URL without a scheme (e.g. one starting with "www") has no network
    location once parsed; it is replaced with its parsed path instead.

    text: the string to process
    pattern: regular expression used to find URLs; defaults to the
             module-level url_re
    Returns the text with every parsable URL replaced. URLs that cannot be
    parsed are left untouched.
    """
    # urlparse was never imported by this notebook, so the original lookup
    # failed on every call. Import it locally, for either Python version.
    try:
        from urlparse import urlparse  # Python 2
    except ImportError:
        from urllib.parse import urlparse  # Python 3

    if pattern is None:
        pattern = url_re

    for url in re.findall(pattern, text):
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. a broken IPv6
            # literal); skip those rather than abort the whole document.
            continue
        # Prefer the host; fall back to the path when there is no host.
        text = text.replace(url, parsed.netloc or parsed.path)
    return text

def emoji_replace_word(text, pattern=None):
    """
    Replaces each run of emoji characters with ' emoji '

    Only unicode text is processed; any other input (e.g. a byte string) is
    returned unchanged.
    http://stackoverflow.com/questions/26568722/remove-unicode-emoji-using-re-in-python

    text: the string to process
    pattern: regular expression matching emoji; defaults to the module-level
             unicode_re
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    if not isinstance(text, type(u'')):
        return text
    if pattern is None:
        pattern = unicode_re
    return re.sub(pattern, ' emoji ', text)

def remove_underscore(text):
    """
    Returns text with every underscore turned into a space
    """
    # Splitting on '_' and re-joining with ' ' swaps each underscore one-for-one.
    pieces = text.split('_')
    return ' '.join(pieces)

def punctuation_remove(text):
    """
    Returns text with each ASCII punctuation character replaced by a space
    """
    # Character class built from string.punctuation, escaped so that
    # characters like ']' and '\' are taken literally.
    punct_class = '[{0}]'.format(re.escape(string.punctuation))
    return re.compile(punct_class).sub(' ', text)

def doublespace_remove(text):
    """
    Returns text with every run of spaces collapsed to a single space
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

In [None]:
def cleaning(text):
    """
    Runs text through every cleaning step, in order, and returns the result
    """
    # Order matters: each step receives the output of the one before it.
    pipeline = (
        unicode_remove,
        perform_lowercase,
        urlpath_replace,
        emoji_replace_word,
        punctuation_remove,
        remove_underscore,
        doublespace_remove,
    )
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [None]:
# Rows whose Language is 'ar' keep their stripped text as-is; every other row
# is run through cleaning().
contracts['CleanText'] = contracts.apply(
    lambda row: cleaning(row['HTML_Stripped_Text'])
    if row['Language'] != 'ar'
    else row['HTML_Stripped_Text'],
    axis=1)

### Optimizing for NLP

In [None]:
# Build the stopword set: NLTK's English stopwords plus the entries of
# stopword_list.
# NOTE(review): this rebinds the name `stopwords`, which until now referred to
# the nltk.corpus import, to a plain set - running this cell a second time
# would fail because a set has no .words(). Consider a separate name.
# NOTE(review): stopword_list is not defined in the part of the notebook
# visible here - confirm it is defined before this cell runs.
stopwords = set(stopwords.words('english'))
stopwords.update(stopword_list)
def remove_stopwords(text):
    """
    Returns text lowercased, with stopwords dropped and the remaining
    whitespace-separated tokens joined by single spaces
    """
    kept = []
    for token in text.lower().split():
        if token in stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)