![alt text](DataKind_orange.png)

### Performs grid search for binary classification of Stabilization and Royalties clauses

In [1]:
import os, io
from collections import Counter
from itertools import product
import requests
import re, string
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize, pos_tag
from textblob import TextBlob
from textstat.textstat import textstat
from langdetect import detect

from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context('notebook', rc={'figure.figsize': (10, 6)}, font_scale=1.5)

In [2]:
resource_folder = 'contract_data/Contracts_Annotations/resource_contracts/'
land_folder = 'contract_data/Contracts_Annotations/openland_contracts/'
folders = [resource_folder,land_folder]

In [3]:
labels = ['stabilization','royalties']
n_folds = 3

In [4]:
# Languages must be supported by NLTK Snowball Stemmer and stopwords
languages = ['english','french','spanish','italian','portuguese','dutch','swedish']
s = requests.get('http://data.okfn.org/data/core/language-codes/r/language-codes.csv').content
language_codes = pd.read_csv(io.StringIO(s.decode('utf-8')))
language_codes.columns = ['language_code','Language']

In [5]:
annotations = pd.DataFrame()
for folder in folders:
    xls_files = [f for f in os.listdir(folder) if f.lower().endswith('.xls')]
    for xls in xls_files:
        temp = pd.read_excel(folder + xls)
        if len(temp) > 0:
            temp['OCID'] = xls[:-4]
            temp['Source'] = folder.split('/')[-2]
            annotations = annotations.append(temp)
print "Number of annotations: " + str(len(annotations))

Number of annotations: 20609


In [6]:
# drop blank annotations
annotations.dropna(subset=['Annotation Text'],inplace=True)
print len(annotations)
# If duplicate text appears within the same contract, drop it
annotations.drop_duplicates(['Annotation Text','OCID','Category'],inplace=True)
print len(annotations)

20561
16364


In [7]:
labels = [label.lower() for label in labels]
annotations['label'] = [x.lower() if x.lower() in labels else 'other' for x in annotations['Category']]

In [8]:
# Given some duplicate text with different Category labels, keep the labels that are in our target label list when dropping
sort_num = range(len(labels) + 1)
sort_key = dict(zip(labels,sort_num))
sort_key['other'] = sort_num[-1]
annotations['sort_key'] = [sort_key[x] for x in annotations['label']]
annotations.sort_values(by='sort_key',inplace=True,ascending=True)
annotations.drop_duplicates(['Annotation Text','OCID'],keep='first',inplace=True)
print len(annotations)

16181


In [9]:
y = list(annotations['label'])
yunique = list(np.unique(y))
for item in yunique:
    print str("{0:.2f}%".format(100*y.count(item) / float(len(y)))) + " " + item

96.40% other
2.07% royalties
1.53% stabilization


In [10]:
annotations.head(2)

Unnamed: 0,Category,Topic,Annotation Text,PDF Page Number,Article Reference,OCID,Source,label,sort_key
23,Stabilization,Legal Rules,"If, as a result of any change in the laws, rul...",30,Art. 18. 3,ocds-591adf-1990915353,resource_contracts,stabilization,0
29,Stabilization,Legal Rules,If a material change occurs to Phillips China ...,40,Art. 27.2,ocds-591adf-1706855956,resource_contracts,stabilization,0


In [11]:
def punctuation_remove(text):
    """
    Mutates and returns text where all punctuation besides underscores,
    are replaced
    """
    punctuation_re = r'[^\w\s_]'
    new_text = nltk.regexp.re.sub(punctuation_re, ' ', text)
    return new_text

def replace_numbers(text):
    ''' 
    Removes all characters but periods, commas and alpha-numeric and 
    returns all numeric values replace with the word numeric_value
    '''
    allowed = {",", "."," "}.union(string.ascii_letters).union([str(num) for num in range(0,10)])
    filtered = ''.join([character for character in text if character in allowed])
    for word in text.split():
        try:
            int(word.replace(',','').replace('.',''))
            text = text.replace(word,'numericvalue')
        except:
            pass
    return text

def perform_lowercase(text):
    """
    Mutates and returns text where all characters are lowercased
    """
    try:
        new_text = text.lower()
    except:
        new_text = str(text).lower()
    return new_text

def underscore_remove(text):
    '''
    replaces multiple underscores with text fillintheblank
    and single underscore with space
    '''
    double_underscore_re = r'(__[a-zA-Z0-9_]*(__)?)'
    text = nltk.regexp.re.sub(double_underscore_re,'fillintheblank',text)
    return text.replace('_',' ')

def doublespace_remove(text):
    return re.sub(' +',' ',text)

def textblobsent(text):
    '''
    returns the TextBlob polarity and subjectivity
    '''
    text = text.encode('ascii','replace')
    sent = TextBlob(text).sentiment
    return pd.Series([sent.polarity,sent.subjectivity])

def get_avg_wordlength(document):
    wordlengths = [len(word) for word in document.split()]
    if len(wordlengths) == 0:
        return 0
    else:
        return np.mean(wordlengths)

def determine_tense(essay):
    '''
    Returns the number of past, present and future tense verbs in a given text
    '''
    text = word_tokenize(essay)
    tagged = pos_tag(text)

    numfuture = len([word for word in tagged if word[1] == "MD"])
    numpresent = len([word for word in tagged if word[1] in ["VBP", "VBZ","VBG"]])
    numpast = len([word for word in tagged if word[1] in ["VBD", "VBN"]]) 
    
    return pd.Series([numpast,numpresent,numfuture],index=['NumPast','NumPresent','NumFuture'])

def get_pos_tags(text):
    '''
    Returns part of speech tag counts
    '''
    text = word_tokenize(text)
    tagged = pos_tag(text)

    counts = Counter([word[1] for word in tagged]).items()
    countdict = {}
    for key, value in counts:
        countdict[key] = value  
    
    return countdict

def detect_lang(text):
    '''
    Returns detected language
    '''
    text = doublespace_remove(text)
    try:
        return detect(text)
    except:
        return 'None'
    
def remove_stopwords(row):
    '''
    Multilingual stopwords removal
    '''
    language = row['Language']
    if language in languages:
        text = ' '.join([word for word in row['CleanText'].split(' ') if word not in stopwords.words(language)])
        return text
    else:
        return row['CleanText']
    
def stem_words(row):
    ''' 
    Multilingual word stemmer
    '''
    language = row['Language']    
    if language in languages:
        stemmer = SnowballStemmer(language)
        text = ' '.join([stemmer.stem(word) for word in row['CleanText_NoStop'].split(' ')])
        return text
    else:
        return row['CleanText_NoStop']

In [12]:
def clean_text(df):
    df['Annotation Text'].fillna('',inplace=True)
    df['CleanText'] = df['Annotation Text']
    func_list = [perform_lowercase,replace_numbers,punctuation_remove,underscore_remove, doublespace_remove]
    for func in func_list:
        df['CleanText'] = df['CleanText'].apply(func)

    return df

def featurize(df):
    
    df['AvgWordLength'] = df['CleanText'].apply(get_avg_wordlength)
    df['language_code'] = df['CleanText'].apply(detect_lang)
    df = df.merge(language_codes,how='left',on='language_code')
    df['Language'] = df['Language'].astype('str')
    df['Language'] = df['Language'].apply(lambda x: x.lower().split(';')[0])
    df['Language'] = df['Language'].apply(lambda x: x if x in languages else 'other')
    df['Language'].fillna('None')
    langdummies = pd.get_dummies(df['Language'],prefix='language')
    
    df['CleanText_NoStop'] = df.apply(remove_stopwords,axis=1)
    df['CleanText_NoStop_Stemmed'] = df.apply(stem_words,axis=1)
    df.drop(['language_code','Language'],axis=1,inplace=True)
    
    tenses = df['CleanText'].apply(determine_tense)
    tenses.columns = ['tense_' + col for col in tenses.columns]
    
    postagcounts = []
    for index, row in df.iterrows():
        postagcounts.append(get_pos_tags(row['CleanText']))    
    postagdf = pd.DataFrame(postagcounts).fillna(0)
    postagdf.index = df.index
    postagdf.columns = ['postag_' + col for col in postagdf.columns]
    
    textblobsentdf = df['CleanText'].apply(textblobsent)
    textblobsentdf.columns = ['TextblobPolarity','TextblobSubjectivity']
    df = pd.concat([df,textblobsentdf,tenses,postagdf,langdummies],axis=1)
    
    return df

In [13]:
annotations = clean_text(annotations)
annotations = featurize(annotations)

In [14]:
tfidf_vectorizer = TfidfVectorizer(max_df= .6,
                                 min_df= .002, 
                                 stop_words=None,  
                                 use_idf=True, 
                                 ngram_range=(1,4))
tfidf_matrix = tfidf_vectorizer.fit_transform(annotations['CleanText_NoStop_Stemmed'].values.astype('U'))
terms = tfidf_vectorizer.get_feature_names()
tfidf_matrix = tfidf_matrix.todense()
tfidf = pd.DataFrame(tfidf_matrix)
tfidf.index = annotations.index
tfidf.columns = terms
print tfidf.shape

(16181, 6560)


In [15]:
exclude = ['Source','Category','Topic','Annotation Text','CleanText','CleanText_NoStop','CleanText_NoStop_Stemmed',
           'OCID','PDF Page Number','Article Reference','MD','VBP','VBZ','VBG','VBD','VBN','other',"''",'label','sort_key']
features = [col for col in annotations.columns.tolist() if not col in exclude]

In [16]:
X = pd.concat([tfidf,annotations[features]],axis=1)

In [17]:
X.fillna(0,inplace=True)
X = X.rename(columns = {'fit':'fit_feature'})

In [18]:
classifiers = [
#             LogisticRegression(),
#             DecisionTreeClassifier(),
            RandomForestClassifier()
#             ExtraTreesClassifier()
             ]

In [26]:
parameters = [
#         {'penalty': ['l1','l2']      # LogisticRegression
#               ,'C': [.001,.01,.1,1,10,100]
#               ,'class_weight': [None,'balanced']
#               ,'n_jobs': [-1]}
#         ,{'max_depth': [8,10,12,14]}       #DecisionTree
         {'estimator__n_estimators': [200],          # RandomForest
          'estimator__n_jobs': [-1],
            "estimator__max_depth": [None,10,12],
              "estimator__max_features": [.2,.33],
              "estimator__min_samples_split": [2, 4, 10],
              "estimator__min_samples_leaf": [50],
              "estimator__bootstrap": [True, False],
              "estimator__criterion": ['gini'],
              "estimator__class_weight": ['balanced','balanced_subsample']}
#         ,{'max_features':[0.2,0.33,0.5]       #ExtraTrees
#               ,'n_estimators': [50]
#               ,'n_jobs': [-1]
#               ,'max_depth': [3,6,9]}
]

In [38]:
parameters = [
#         {'penalty': ['l1','l2']      # LogisticRegression
#               ,'C': [.001,.01,.1,1,10,100]
#               ,'class_weight': [None,'balanced']
#               ,'n_jobs': [-1]}
#         ,{'max_depth': [8,10,12,14]}       #DecisionTree
         {'estimator__n_estimators': [500],          # RandomForest
          'estimator__n_jobs': [-1],
            "estimator__max_depth": [None,12,14],
              "estimator__max_features": [.2,.33],
              "estimator__min_samples_split": [2,10],
              "estimator__min_samples_leaf": [50],
              "estimator__bootstrap": [False,True],
              "estimator__criterion": ['gini'],
              "estimator__class_weight": ['balanced','balanced_subsample']}
#         ,{'max_features':[0.2,0.33,0.5]       #ExtraTrees
#               ,'n_estimators': [50]
#               ,'n_jobs': [-1]
#               ,'max_depth': [3,6,9]}
]

In [39]:
def grid_search(X_all, y_all, classifiers, parameters):
    
    # Runs grid search on given list of classifiers and parameters dictionary
    scores = ['recall_macro']
#     X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.33)
    
    results = []
    for i in range(len(classifiers)):
        model = str(classifiers[i]).split('(')[:1][0]
        print '     ******************************************'
        print '     *** ' + model + ' ***'        
        print '     Tuning hyper-parameters for:' 
        for score in scores:
            print '     ' + score.title()
#             clf = GridSearchCV(classifiers[i], parameters[i],cv=n_folds,scoring=score)            
            clf = GridSearchCV(OneVsRestClassifier(classifiers[i]), parameters[i],cv=n_folds,scoring=score, verbose=10)            
            clf.fit(X_all,y_all)
            y_true, y_pred = y_all, clf.predict(X_all)
            
#             acc = accuracy_score(y_true, y_pred)
#             prec = precision_score(y_true, y_pred, average='macro')
#             rec = recall_score(y_true, y_pred, average='macro')
#             roc_auc = roc_auc_score(y_true, y_pred, average='macro')
            print "Accuracy Score: " + str(accuracy_score(y_true, y_pred))
            print classification_report(y_true, y_pred, target_names=yunique)
            results.append([model, score, clf.best_params_, clf.best_score_])
    return results

In [40]:
label_results = grid_search(X, y, classifiers, parameters)

     ******************************************
     *** RandomForestClassifier ***
     Tuning hyper-parameters for:
     Recall_Macro
Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 
[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.888524, total= 4.0min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.1min remaining:    0.0s


[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.892574, total= 4.2min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.3min remaining:    0.0s


[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.923433, total= 4.2min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 12.6min remaining:    0.0s


[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.888139, total= 3.9min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 16.6min remaining:    0.0s


[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.892767, total= 4.2min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 20.9min remaining:    0.0s


[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.918983, total= 4.2min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 25.2min remaining:    0.0s


[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.889691, total= 6.1min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 31.3min remaining:    0.0s


[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.870608, total= 6.6min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 38.0min remaining:    0.0s


[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.917263, total= 6.5min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 44.6min remaining:    0.0s


[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.883290, total= 6.1min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 
[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.873776, total= 6.6min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estima

[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=14, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.881019, total= 3.6min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=14, estimator__n_estimators=500, estimator__n_jobs=-1 
[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=14, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.879423, total= 3.8min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_feat

[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.887819, total= 3.9min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 
[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=False, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.889342, total= 4.2min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, es

[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=12, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.876411, total= 5.3min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=12, estimator__n_estimators=500, estimator__n_jobs=-1 
[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=12, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.871968, total= 5.5min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimato

[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=10, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=14, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.876091, total= 5.6min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=10, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=14, estimator__n_estimators=500, estimator__n_jobs=-1 
[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=10, estimator__max_features=0.33, estimator__bootstrap=False, estimator__max_depth=14, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.873008, total= 5.9min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estim

[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=True, estimator__max_depth=12, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.881004, total= 2.2min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=True, estimator__max_depth=12, estimator__n_estimators=500, estimator__n_jobs=-1 
[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_features=0.2, estimator__bootstrap=True, estimator__max_depth=12, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.863168, total= 2.3min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=2, estimator__max_feature

[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=True, estimator__max_depth=14, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.888780, total= 2.3min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=True, estimator__max_depth=14, estimator__n_estimators=500, estimator__n_jobs=-1 
[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_features=0.2, estimator__bootstrap=True, estimator__max_depth=14, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.866080, total= 2.4min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced, estimator__min_samples_split=10, estimator__max_fea

[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=True, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.882700, total= 3.7min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=True, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1 
[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=2, estimator__max_features=0.33, estimator__bootstrap=True, estimator__max_depth=None, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.859615, total= 3.9min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estim

[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=10, estimator__max_features=0.33, estimator__bootstrap=True, estimator__max_depth=12, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.880570, total= 3.4min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=10, estimator__max_features=0.33, estimator__bootstrap=True, estimator__max_depth=12, estimator__n_estimators=500, estimator__n_jobs=-1 
[CV]  estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimator__min_samples_split=10, estimator__max_features=0.33, estimator__bootstrap=True, estimator__max_depth=12, estimator__n_estimators=500, estimator__n_jobs=-1, score=0.850159, total= 3.5min
[CV] estimator__min_samples_leaf=50, estimator__criterion=gini, estimator__class_weight=balanced_subsample, estimato

[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 575.5min finished


Accuracy Score: 0.954576354984
               precision    recall  f1-score   support

        other       1.00      0.95      0.98     15598
    royalties       0.43      0.99      0.60       335
stabilization       0.46      0.99      0.63       248

  avg / total       0.98      0.95      0.96     16181



In [42]:
pd.set_option('max_colwidth',500)
resultsdf = pd.DataFrame(label_results)
resultsdf.columns = ['Model','Scoring Method','Best Params','Recall']
resultsdf.to_csv('Multiclass_grid_search_results.csv')
display(resultsdf.sort_values(by='Recall',ascending=False))

Unnamed: 0,Model,Scoring Method,Best Params,Recall
0,RandomForestClassifier,recall_macro,"{u'estimator__min_samples_leaf': 50, u'estimator__criterion': u'gini', u'estimator__class_weight': u'balanced', u'estimator__min_samples_split': 2, u'estimator__max_features': 0.2, u'estimator__bootstrap': False, u'estimator__max_depth': None, u'estimator__n_estimators': 500, u'estimator__n_jobs': -1}",0.901507
