![alt text](DataKind_orange.png)

# NRGI Extractives Contracts
## Featurize and Predict
### 1. Reads in text (already parsed by paragraph and HTML stripped)
### 2. Cleans text
### 3. Featurizes
### 4. Predicts using previously pickled model
### 5. Outputs results to csv

In [2]:
import os, io
from collections import Counter
from itertools import product
import re, string
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from textblob import TextBlob

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context('notebook', rc={'figure.figsize': (10, 6)}, font_scale=1.5)

In [3]:
data_input_file = 'contract_data/cleaned_unannotated_contracts_by_paragraph_en.pkl'
pickled_model = 'models/model_3_classes.pkl'
pickled_tfidf = 'models/tfidf_vectorizer.pkl'
output_results = 'results_classification/unannotated_corpus_predictions_3_classes_en.pkl'

In [4]:
# spacy is used for Part of Speech tagging and Named Entity Recognition
# spacy is a non-standard python library which can be installed using 'pip install spacy' from the command line
# language models can be downloaded by running 'python -m spacy download <language>' from the command line
import spacy
supported_languages = ['en','fr','es']
language_dict = {'en':'english','fr':'french','es':'spanish'}
nlp_langs = {}
for language in supported_languages:
    nlp_langs[language]  = spacy.load(language)

### Load Data and model

In [5]:
# contracts_by_para = pd.read_pickle(data_input_file)
model = joblib.load(pickled_model)
tfidf_vectorizer = joblib.load(pickled_tfidf)

In [None]:
len(contracts_by_para)

In [None]:
def punctuation_remove(text):
    """
    Mutates and returns text where all punctuation besides underscores,
    are replaced
    """
    punctuation_re = r'[^\w\s_]'
    new_text = nltk.regexp.re.sub(punctuation_re, ' ', text)
    return new_text

def replace_numbers(text):
    ''' 
    Removes all characters but periods, commas and alpha-numeric and 
    returns all numeric values replace with the word numeric_value
    '''
#     allowed = {",", "."," ","%"}.union(string.ascii_letters).union([str(num) for num in range(0,10)])
#     filtered = ''.join([character for character in text if character in allowed])
    wordlist = text.split()
    for i in range(len(wordlist)):
        if '$' in wordlist[i]:
            try:
                int(wordlist[i].split('$')[-1].replace(',','').replace('.','').replace('-','').replace(')','').replace('(','').replace('\'','').replace(';','').replace(':',''))
                wordlist[i] = ' '.join(wordlist[i].split('$')[:-1]) + ' dollarvalue'
            except:
                pass
        elif '%' in wordlist[i]:
            try: 
                int(wordlist[i].replace(',','').replace('.','').replace('-','').replace('/','').replace('%','').replace(')','').replace('(','').replace('\'','').replace(';','').replace(':',''))
                wordlist[i] = 'percentvalue'
            except:
                pass
        else:
            try: 
                int(wordlist[i].replace(',','').replace('.','').replace('-','').replace('/','').replace(')','').replace('(','').replace('\'','').replace(';','').replace(':',''))
                wordlist[i] = 'numericvalue'
            except:
                pass
    return ' '.join(wordlist)

def perform_lowercase(text):
    """
    Mutates and returns text where all characters are lowercased
    """
    try:
        new_text = text.lower()
    except:
        new_text = str(text).lower()
    return new_text

def underscore_remove(text):
    '''
    replaces multiple underscores with text fillintheblank
    and single underscore with space
    '''
    double_underscore_re = r'(__[a-zA-Z0-9_]*(__)?)'
    text = nltk.regexp.re.sub(double_underscore_re,'fillintheblank',text)
    return text.replace('_',' ')

def doublespace_remove(text):
    return re.sub(' +',' ',text)

def textblobsent(text):
    '''
    returns the TextBlob polarity and subjectivity
    '''
    text = text.encode('ascii','replace')
    sent = TextBlob(text).sentiment
    return pd.Series([sent.polarity,sent.subjectivity])

def get_avg_wordlength(document):
    wordlengths = [len(word) for word in document.split()]
    if len(wordlengths) == 0:
        return 0
    else:
        return np.mean(wordlengths)

def get_multilingual_pos(text,language):
    if language in nlp_langs.keys():
        try:
            tokens = nlp_langs[language](text)
            tags = [token.pos_ for token in tokens]
            counts = Counter(tags).items()
            countdict = {}
            for key, value in counts:
                countdict[key] = value
            return countdict
        except:
            return {'pos_error':1}
    else:
        return {'unsupported_language':1}

def get_multilingual_entities(row):
    
    text = row['CleanText']
    language = row['Language']
    if language in nlp_langs.keys():
        try:
            doc = nlp_langs[language](text)
            labels = [ent.label_ for ent in doc.ents]
            texts = [ent.text for ent in doc.ents]
            starts = [ent.start_char for ent in doc.ents]
            ends = [ent.end_char for ent in doc.ents]
            textlens = [len(word) for word in texts]
            labellens = [len(label) for label in labels]
            diffs = [textlens[i] - labellens[i] for i in range(len(textlens))]
            diffsum = [sum(diffs[0:i]) for i in range(len(diffs))]

            for i in range(len(labels)):
                text = text[0:starts[i] - diffsum[i]] + 'entity' + labels[i] + text[ends[i] - diffsum[i]:]
            return text
        except:
            return text
    else:
        return text
    
def remove_stopwords(row):
    '''
    Multilingual stopwords removal
    '''
    try:
        language = language_dict[row['Language']]
        text = ' '.join([word for word in row['CleanText'].split(' ') if word not in stopwords.words(language)])
        return text
    except:
        return row['CleanText']
    
def stem_words(row):
    ''' 
    Multilingual word stemmer
    '''
    language = language_dict[row['Language']]
    try:
        stemmer = SnowballStemmer(language)
        text = ' '.join([stemmer.stem(word) for word in row['CleanText_NoStop'].split(' ')])
        return text
    except:
        return row['CleanText_NoStop']
    
def clean_metadata(text):
    if type(text) in [float,int]:
        return text
    elif type(text) == str:
        return text.lower().split(';')[0]
    else:
        text = text.encode('ascii','replace')
        return text.lower().split(';')[0]

In [None]:
def clean_text(df):
    df['Clean_Paragraph_Text'].fillna('',inplace=True)
    df['CleanText'] = df['Clean_Paragraph_Text']
    df['CleanText'] = df.apply(get_multilingual_entities,axis=1)
    func_list = [perform_lowercase,replace_numbers,punctuation_remove,underscore_remove, doublespace_remove]
    for func in func_list:
        df['CleanText'] = df['CleanText'].apply(func)
        print func
    return df

def featurize(df):
    
    df['AvgWordLength'] = df['CleanText'].apply(get_avg_wordlength)
    print 'AvgWordLength complete'
    df['CleanText_NoStop'] = df.apply(remove_stopwords,axis=1)
    df['CleanText_NoStop_Stemmed'] = df.apply(stem_words,axis=1)
    print 'stemming complete'
    postagcounts = []
    entitycounts = []
    for index, row in df.iterrows():
        postagcounts.append(get_multilingual_pos(row['Clean_Paragraph_Text'],row['Language']))    
    postagdf = pd.DataFrame(postagcounts).fillna(0)
    postagdf.index = df.index
    postagdf.columns = ['postag_' + col for col in postagdf.columns]
    print 'postagging complete'
    
    # create dummy variables for categoricals
    df['Resource'] = df['Resource'].apply(clean_metadata)
    df['Contract Type'] = df['Contract Type'].apply(clean_metadata)
    df['Document Type'] = df['Document Type'].apply(lambda x: x.lower().split(';')[0])
    dummy_cols = ['Language','Country Name','Resource','Contract Type','Document Type']
    dummies = pd.get_dummies(df[dummy_cols],prefix = dummy_cols)
    print 'dummies complete'
    # drop lowest least frequent dummy columns for each
    for dummy_col in dummy_cols:
        cols = [col for col in dummies.columns if col.startswith(dummy_col)]
        dummies.drop([col for col, val in dummies[cols].sum().iteritems() if val == dummies[cols].sum().min()],axis=1,inplace=True)
    df.drop(dummy_cols, axis=1,inplace=True)
        
    textblobsentdf = df['CleanText'].apply(textblobsent)
    textblobsentdf.columns = ['TextblobPolarity','TextblobSubjectivity']
    print 'textblob complete'
    df = pd.concat([df,textblobsentdf,postagdf,dummies],axis=1)
    
    return df

In [None]:
contracts_by_para = clean_text(contracts_by_para)
contracts_by_para = featurize(contracts_by_para)

In [71]:
# contracts_by_para.to_pickle('contracts_by_para_featurized.pkl')
contracts_by_para = pd.read_pickle('contracts_by_para_featurized.pkl')

In [72]:
exclude = ['Source','Category','Topic','Annotation Text','CleanText','CleanText_NoStop','CleanText_NoStop_Stemmed',
           'OCID','PDF Page Number','Article Reference','MD','VBP','VBZ','VBG','VBD','VBN','other',"''",'label','sort_key','Corrected']
features = [str(col) for col in contracts_by_para.columns.tolist() if not col in exclude]
print len(features)
fitted_features = pickle.load(open('models/fitted_feature_list.pkl','rb'))
print len(fitted_features)

192
164


In [75]:
for col in fitted_features:
    if not col in features:
        print col
        contracts_by_para[col] = 0
        features.append(col)
print len(features)
for col in features:
    if not col in fitted_features:
        features.remove(col)
print len(features)
for col in features:
    if not col in fitted_features:
        features.remove(col)
print len(features)

166
165
164


In [76]:
chunks = [i for i in xrange(0,len(contracts_by_para),30000)]
chunks.append(len(contracts_by_para))
chunks

[0, 30000, 60000, 90000, 120000, 150000, 180000, 210000, 240000, 252816]

In [82]:
print len(chunks)

10


In [77]:
for i in range(len(chunks)-1):
    df = contracts_by_para[chunks[i]:chunks[i+1]].copy()
    df.to_pickle('contract_data/en_featurized_chunk_' + str(i) + '.pkl')

In [78]:
del contracts_by_para

In [85]:
for i in range(len(chunks)-1):
    print i
    df = pd.read_pickle('contract_data/en_featurized_chunk_' + str(i) + '.pkl')
    output_results = 'results_classification/unannotated_corpus_predictions_3_classes_en_' + str(i) + '.pkl'
    tfidf_matrix = tfidf_vectorizer.transform(df['CleanText_NoStop_Stemmed'].values.astype('U'))
    terms = tfidf_vectorizer.get_feature_names()
    tfidf_matrix = tfidf_matrix.todense()
    tfidf = pd.DataFrame(tfidf_matrix)
    tfidf.index = df.index
    tfidf.columns = terms
    print tfidf.shape
    X = pd.concat([tfidf,df[features]],axis=1)
    X.fillna(0,inplace=True)
    X = X.rename(columns = {'fit':'fit_feature'})
    print 'predicting'
    predictions = model.predict(X)
    df['Predicted_Clause'] = predictions
    df[['OCID','Paragraph_Num','Source','Clean_Paragraph_Text','CleanText','Predicted_Clause']].to_pickle(output_results)

0
(30000, 8449)
predicting
1
(30000, 8449)
predicting
2
(30000, 8449)
predicting
3
(30000, 8449)
predicting
4
(30000, 8449)
predicting
5
(30000, 8449)
predicting
6
(30000, 8449)
predicting
7
(30000, 8449)
predicting
8
(12816, 8449)
predicting
