In [1]:
import os
import gc
from collections import Counter
from itertools import product
import re
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize, pos_tag
from textblob import TextBlob
from textstat.textstat import textstat
from langdetect import detect

from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context('notebook', rc={'figure.figsize': (10, 6)}, font_scale=1.5)

In [2]:
resource_folder = 'contract_data/Contracts_Annotations/resource_contracts/'
land_folder = 'contract_data/Contracts_Annotations/openland_contracts/'
folders = [resource_folder,land_folder]

In [3]:
target_labels = ['stabilization','royalties']
n_folds = 5 

In [4]:
# Languages must be supported by NLTK Snowball Stemmer and stopwords
languages = ['english','french','spanish','italian','portuguese','dutch','swedish']
language_codes = pd.read_csv('http://data.okfn.org/data/core/language-codes/r/language-codes.csv')
language_codes.columns = ['language_code','Language']

In [5]:
annotations = pd.DataFrame()
for folder in folders:
    xls_files = [f for f in os.listdir(folder) if f.lower().endswith('.xls')]
    for xls in xls_files:
        temp = pd.read_excel(folder + xls)
        if len(temp) > 0:
            temp['OCID'] = xls[:-4]
            temp['Source'] = folder.split('/')[-2]
            annotations = annotations.append(temp)
print "Number of annotations: " + str(len(annotations))

Number of annotations: 20609


In [6]:
labels = {}
for label in target_labels:
    labels[label] = [1 if x.lower() == label.lower() else 0 for x in annotations['Category']]
    print str("{0:.2f}%".format(100*np.sum(labels[label]) / float(len(labels[label])))) + ' ' + label

1.54% stabilization
2.37% royalties


In [7]:
annotations.head(10)

Unnamed: 0,Category,Topic,Annotation Text,PDF Page Number,Article Reference,OCID,Source
0,Project title,General,Production Sharing Contract between Sociedade ...,1,Preamble,ocds-591adf-0014595575,resource_contracts
1,"Name of field, block, deposit or site",General,Block 20/11,1,Preamble,ocds-591adf-0014595575,resource_contracts
2,Name of company executing document,General,CIE Angola Block 20 LTD (referred to as Cobalt...,3,Preamble,ocds-591adf-0014595575,resource_contracts
3,"State agency, national company or ministry exe...",General,"Sociedade Nacional de Combustiveis de Angola, ...",3,Preamble,ocds-591adf-0014595575,resource_contracts
4,Type of contract,General,Production Sharing Contract,3,Preamble,ocds-591adf-0014595575,resource_contracts
5,Country,General,Angola,3,Preamble,ocds-591adf-0014595575,resource_contracts
6,Parent company or affiliates outside of country,General,"Cobalt, Cayman Islands; BP Exploration Angola ...",3,Preamble,ocds-591adf-0014595575,resource_contracts
7,Resource(s),General,Hydrocarbons,4,Preamble,ocds-591adf-0014595575,resource_contracts
8,Date of issue of title/permit,General,40909,5,Art. 1.15,ocds-591adf-0014595575,resource_contracts
9,Infrastructure,Operations,Contractor Group may construct and install pip...,5,"Arts. 1.18, 19, 20.3, 29.3",ocds-591adf-0014595575,resource_contracts


In [8]:
punctuation_re = r'[^\w\s#]'

def punctuation_remove(text):
    """
    Mutates and returns text where all punctuation besides hashtags,
    are replaced
    """
    new_text = nltk.regexp.re.sub(punctuation_re, ' ', text)
    return new_text

def perform_lowercase(text):
    """
    Mutates and returns text where all characters are lowercased
    """
    try:
        new_text = text.lower()
    except:
        new_text = str(text).lower()
    return new_text

def doublespace_remove(text):
    return re.sub(' +',' ',text)

def textblobsent(text):
    '''
    returns the TextBlob polarity and subjectivity
    '''
    text = text.encode('ascii','ignore')
    sent = TextBlob(text).sentiment
    return pd.Series([sent.polarity,sent.subjectivity])

def get_length(document):
    return len(document)

def get_num_words(document):
    return len(document.split())

def get_avg_wordlength(document):
    wordlengths = [len(word) for word in document.split()]
    if len(wordlengths) == 0:
        return 0
    else:
        return np.mean(wordlengths)

def get_num_syllables(text):
    '''
    returns the number of syllables
    '''
    return textstat.syllable_count(text)

def determine_tense(essay):
    '''
    Returns the number of past, present and future tense verbs in a given text
    '''
    text = word_tokenize(essay)
    tagged = pos_tag(text)

    numfuture = len([word for word in tagged if word[1] == "MD"])
    numpresent = len([word for word in tagged if word[1] in ["VBP", "VBZ","VBG"]])
    numpast = len([word for word in tagged if word[1] in ["VBD", "VBN"]]) 
    
    return pd.Series([numpast,numpresent,numfuture],index=['NumPast','NumPresent','NumFuture'])

def get_pos_tags(text):
    '''
    Returns part of speech tag counts
    '''
    text = word_tokenize(text)
    tagged = pos_tag(text)

    counts = Counter([word[1] for word in tagged]).items()
    countdict = {}
    for key, value in counts:
        countdict[key] = value  
    
    return countdict

def detect_lang(text):
    '''
    Returns detected language
    '''
    text = doublespace_remove(text)
    try:
        return detect(text)
    except:
        return 'None'
    
def remove_stopwords(row):
    '''
    Multilingual stopwords removal
    '''
    language = row['Language']
    if language in languages:
        text = ' '.join([word for word in row['CleanText'].split(' ') if word not in stopwords.words(language)])
        return text
    else:
        return row['CleanText']
    
def stem_words(row):
    ''' 
    Multilingual word stemmer
    '''
    language = row['Language']    
    if language in languages:
        stemmer = SnowballStemmer(language)
        text = ' '.join([stemmer.stem(word) for word in row['CleanText_NoStop'].split(' ')])
        return text
    else:
        return row['CleanText_NoStop']

In [9]:
def clean_text(df):
    df['Annotation Text'].fillna('',inplace=True)
    df['CleanText'] = df['Annotation Text']
    func_list = [perform_lowercase,punctuation_remove,doublespace_remove]
    for func in func_list:
        df['CleanText'] = df['CleanText'].apply(func)

    return df

def featurize(df):
    
    df['NumWords'] = df['CleanText'].apply(get_num_words)
    df['NumCharacters'] = df['CleanText'].apply(get_length)
    df['AvgWordLength'] = df['CleanText'].apply(get_avg_wordlength)
    df['NumSyllables'] = df['CleanText'].apply(get_num_syllables)
    df['language_code'] = df['CleanText'].apply(detect_lang)
    df = df.merge(language_codes,how='left',on='language_code')
    df['Language'] = df['Language'].astype('str')
    df['Language'] = df['Language'].apply(lambda x: x.lower().split(';')[0])
    df['Language'] = df['Language'].apply(lambda x: x if x in languages else 'other')
    df['Language'].fillna('None')
    langdummies = pd.get_dummies(df['Language'],prefix='language_')
    
    df['CleanText_NoStop'] = df.apply(remove_stopwords,axis=1)
    df['CleanText_NoStop_Stemmed'] = df.apply(stem_words,axis=1)
    df.drop(['language_code','Language'],axis=1,inplace=True)
    
    tenses = df['CleanText'].apply(determine_tense)
    tenses.columns = ['tense_' + col for col in tenses.columns]
    
    postagcounts = []
    for index, row in df.iterrows():
        postagcounts.append(get_pos_tags(row['CleanText']))    
    postagdf = pd.DataFrame(postagcounts).fillna(0)
    postagdf.index = df.index
    postagdf.columns = ['postag_' + col for col in postagdf.columns]
    
    textblobsentdf = df['CleanText'].apply(textblobsent)
    textblobsentdf.columns = ['TextblobPolarity','TextblobSubjectivity']
    df = pd.concat([df,textblobsentdf,tenses,postagdf,langdummies],axis=1)
    
    return df

In [10]:
annotations = clean_text(annotations)
annotations = featurize(annotations)

In [11]:
tfidf_vectorizer = TfidfVectorizer(max_df= .6,
                                 min_df= .002, 
                                 stop_words=None,  
                                 use_idf=True, 
                                 ngram_range=(1,4))
tfidf_matrix = tfidf_vectorizer.fit_transform(annotations['CleanText_NoStop_Stemmed'].values.astype('U'))
terms = tfidf_vectorizer.get_feature_names()
tfidf_matrix = tfidf_matrix.todense()
tfidf = pd.DataFrame(tfidf_matrix)
tfidf.index = annotations.index
tfidf.columns = terms
print tfidf.shape

(20609, 12468)


In [12]:
exclude = ['Source','Category','Topic','Annotation Text','CleanText','CleanText_NoStop','CleanText_NoStop_Stemmed',
           'OCID','PDF Page Number','Article Reference','MD','VBP','VBZ','VBG','VBD','VBN','other',"''"]
features = [col for col in annotations.columns.tolist() if not col in exclude]

In [13]:
X = pd.concat([tfidf,annotations[features]],axis=1)

In [14]:
X.fillna(0,inplace=True)
X = X.rename(columns = {'fit':'fit_feature'})

In [15]:
classifiers = [
            LogisticRegression(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            ExtraTreesClassifier()
             ]

In [18]:
parameters = [
        {'penalty': ['l1','l2']      # LogisticRegression
              ,'C': [.001,.01,.1,1,10,100]
              ,'class_weight': [None,'balanced']
              ,'n_jobs': [-1]}
        ,{'max_depth': [8,10,12,14]}       #DecisionTree
        ,{'n_estimators': [100]              # RandomForest
              ,'max_features': [.2,.33,.5] 
              ,'max_depth': [10,12,14,16] 
              ,'class_weight': ['balanced','balanced_subsample']
              ,'n_jobs': [-1]}
        ,{'max_features':['auto',0.2,0.33,0.5]       #ExtraTrees
              ,'n_estimators': [100]
              ,'n_jobs': [-1]
              ,'max_depth': [10,12,14,16]}
]

In [19]:
def grid_search(X_all, y_all, classifiers, parameters):
    
    # Runs grid search on given list of classifiers and parameters dictionary
    scores = ['accuracy','precision_macro','recall_macro','roc_auc']
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.33)
    
    # Dummy Classifier results
    dc = DummyClassifier(strategy='most_frequent')
    dc.fit(X_train, y_train)
    dummy_acc_train = accuracy_score(y_train, dc.predict(X_train))
    dummy_acc_test = accuracy_score(y_test, dc.predict(X_test))
    print '     *** Dummy Model Accuracy ***'
    print '     Train: ' + str("{0:.2f}%".format(100*dummy_acc_train  ))
    print '     Test: ' + str("{0:.2f}%".format(100*dummy_acc_test))
    results = []
    for i in range(len(classifiers)):
        model = str(classifiers[i]).split('(')[:1][0]
        print '     ******************************************'
        print '     *** ' + model + ' ***'        
        print '     Tuning hyper-parameters for:' 
        for score in scores:
            print '     ' + score.title()
            clf = GridSearchCV(classifiers[i], parameters[i],cv=n_folds,scoring=score ,n_jobs=-1)            
            clf.fit(X_train,y_train)
            y_true, y_pred = y_test, clf.predict(X_test)
            
            acc = accuracy_score(y_true, y_pred)
            prec = precision_score(y_true, y_pred, average='macro')
            rec = recall_score(y_true, y_pred, average='macro')
            roc_auc = roc_auc_score(y_true, y_pred, average='macro')

            results.append([model, score, clf.best_params_, acc, prec, rec, roc_auc, dummy_acc_test])
    return results

In [None]:
label_results = {}
for label in target_labels:
    print '******************************************************************'
    print '********************** Label: ' + label.title() + ' **********************'
    y = labels[label]
    label_results[label] = grid_search(X, y, classifiers, parameters)

*******************************************************************
********************** Label: Stabilization **********************
     *** Dummy Model Accuracy ***
     Train: 98.49%
     Test: 98.40%
     ******************************************
     *** LogisticRegression ***
     Tuning hyper-parameters for:
     Accuracy
     Precision_Macro


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


     Recall_Macro
     Roc_Auc
     ******************************************
     *** DecisionTreeClassifier ***
     Tuning hyper-parameters for:
     Accuracy
     Precision_Macro
     Recall_Macro
     Roc_Auc
     ******************************************
     *** RandomForestClassifier ***
     Tuning hyper-parameters for:
     Accuracy
     Precision_Macro
     Recall_Macro


In [None]:
for label in labels:
    resultsdf = pd.DataFrame(label_results[label])
    resultsdf.columns = ['Model','Scoring Method','Best Params','Accuracy','PrecisionMacro','RecallMacro','AUCMacro','Dummy Accuracy']
    print label.title()
    display(resultsdf)