# Model Selection

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [3]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import sparse
import re

In [4]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [5]:
sparse_train = sp.sparse.load_npz('transitional_data/X5_train.npz')
sparse_dev = sp.sparse.load_npz('transitional_data/X5_dev.npz')

In [6]:
y_train = pd.read_csv("transitional_data/y_train.csv")
y_dev = pd.read_csv("transitional_data/y_dev.csv")

In [7]:
y_train = y_train.groupby(by = 'SentenceNR', group_keys=True).apply(lambda x: x)
y_train = y_train.rename(columns={"SentenceNR": "Sent", "Unnamed: 1": "TokenNr"})

In [8]:
y_dev = y_dev.groupby(by = 'SentenceNR', group_keys=True).apply(lambda x: x)
y_dev = y_dev.rename(columns={"SentenceNR": "Sent", "Unnamed: 1": "TokenNr"})

In [9]:
train_true = y_train["Label"]
dev_true = y_dev["Label"]

In [23]:
param_grid = {
    "class_weight": ["balanced", None],
}

In [24]:
svc = LinearSVC()

In [25]:
%%time
svc_gs = GridSearchCV(svc, param_grid, scoring = "f1_weighted", njobs = -1, refit='accuracy', cv=5, verbose=3 )
svc_gs.fit(sparse_train, train_true)
svc_gs.best_score_

Fitting 5 folds for each of 2 candidates, totalling 10 fits




[CV 1/5] END .............class_weight=balanced;, score=0.930 total time=  50.7s




[CV 2/5] END .............class_weight=balanced;, score=0.934 total time=  51.8s




[CV 3/5] END .............class_weight=balanced;, score=0.936 total time=  57.0s




[CV 4/5] END .............class_weight=balanced;, score=0.932 total time=  53.4s




[CV 5/5] END .............class_weight=balanced;, score=0.932 total time=  53.5s
[CV 1/5] END .................class_weight=None;, score=0.939 total time=  40.7s
[CV 2/5] END .................class_weight=None;, score=0.944 total time=  41.1s
[CV 3/5] END .................class_weight=None;, score=0.947 total time=  46.4s
[CV 4/5] END .................class_weight=None;, score=0.944 total time=  41.5s
[CV 5/5] END .................class_weight=None;, score=0.943 total time=  41.4s
CPU times: user 8min 50s, sys: 698 ms, total: 8min 51s
Wall time: 8min 53s


0.9433821299774723

In [26]:
svc_gs.best_params_

{'class_weight': None}

In [27]:
svc_gs.cv_results_

{'mean_fit_time': array([53.13943229, 42.08863144]),
 'std_fit_time': array([2.13173502, 2.12985227]),
 'mean_score_time': array([0.1227911 , 0.12884245]),
 'std_score_time': array([0.01336614, 0.01843171]),
 'param_class_weight': masked_array(data=['balanced', None],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'class_weight': 'balanced'}, {'class_weight': None}],
 'split0_test_score': array([0.92981552, 0.93885356]),
 'split1_test_score': array([0.93432737, 0.94412456]),
 'split2_test_score': array([0.9361312 , 0.94667335]),
 'split3_test_score': array([0.93222087, 0.94436726]),
 'split4_test_score': array([0.93173387, 0.94289193]),
 'mean_test_score': array([0.93284577, 0.94338213]),
 'std_test_score': array([0.00218167, 0.00257302]),
 'rank_test_score': array([2, 1], dtype=int32)}

In [4]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [94]:
df_train = pd.read_csv("transitional_data/tagged_train_filled.csv", keep_default_na=False)
df_dev = pd.read_csv("transitional_data/tagged_dev_filled.csv", keep_default_na=False)

In [95]:
train = df_train[["Token", "standard_tagger", "Label"]]
dev = df_dev[["Token", "standard_tagger", "Label"]]

In [96]:
train = train.values.tolist()
dev = dev.values.tolist()

In [97]:
sent2features(train_sents[0])[0]

{'bias': 1.0,
 'word.lower()': '(',
 'word[-3:]': '(',
 'word[-2:]': '(',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'postag': '(',
 'postag[:2]': '(',
 'BOS': True,
 '+1:word.lower()': '7',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'CD',
 '+1:postag[:2]': 'CD'}

In [27]:
def enumerate_tokens(sentence):
    c = 1
    for index, row in sentence.iterrows():
        sentence.at[index, 'TokenNR'] = c
        c += 1
    return sentence

In [28]:
%time df_dev = df_dev.groupby(by = 'SentenceNR', group_keys=True).apply(enumerate_tokens)
%time df_train = df_train.groupby(by = 'SentenceNR', group_keys=True).apply(enumerate_tokens)

CPU times: user 2.97 s, sys: 14.2 ms, total: 2.98 s
Wall time: 2.99 s
CPU times: user 27.1 s, sys: 109 ms, total: 27.2 s
Wall time: 27.3 s


In [114]:
train_sents0 = df_train[df_train["SentenceNR"]==0][["Token", "standard_tagger", "Label"]]

In [115]:
train_sents0 = train_sents0.values.tolist()

In [117]:
sent2features(train_sents0)[2]

{'bias': 1.0,
 'word.lower()': ')',
 'word[-3:]': ')',
 'word[-2:]': ')',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'postag': ')',
 'postag[:2]': ')',
 '-1:word.lower()': '7',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:postag': 'CD',
 '-1:postag[:2]': 'CD',
 '+1:word.lower()': 'on',
 '+1:word.istitle()': True,
 '+1:word.isupper()': False,
 '+1:postag': 'IN',
 '+1:postag[:2]': 'IN'}

In [53]:
train_SentenceNR = df_train.SentenceNR.unique()
dev_SentenceNR = df_dev.SentenceNR.unique()

In [54]:
train_sents = [df_train[ df_train["SentenceNR"]==nr][["Token", "standard_tagger", "Label"]].values.tolist() for nr in train_SentenceNR ]
dev_sents = [df_dev[ df_dev["SentenceNR"]==nr][["Token", "standard_tagger", "Label"]].values.tolist() for nr in dev_SentenceNR ]

In [106]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_dev = [sent2features(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]

CPU times: user 901 ms, sys: 76.5 ms, total: 978 ms
Wall time: 982 ms


In [58]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)


CPU times: user 2min 2s, sys: 141 ms, total: 2min 2s
Wall time: 2min 2s


AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

In [111]:
labels = ['B-ORG', 'I-ORG', 'B-OTHER_PERSON', 'I-OTHER_PERSON', 'B-WITNESS', 'I-WITNESS', 'B-GPE', 'B-STATUTE', 'B-DATE', 'I-DATE', 'B-PROVISION', 'I-PROVISION', 'I-STATUTE', 'B-COURT', 'I-COURT', 'B-PRECEDENT', 'I-PRECEDENT', 'B-CASE_NUMBER', 'I-CASE_NUMBER', 'I-GPE', 'B-PETITIONER', 'I-PETITIONER', 'B-JUDGE', 'I-JUDGE', 'B-RESPONDENT', 'I-RESPONDENT']
labels

['B-ORG',
 'I-ORG',
 'B-OTHER_PERSON',
 'I-OTHER_PERSON',
 'B-WITNESS',
 'I-WITNESS',
 'B-GPE',
 'B-STATUTE',
 'B-DATE',
 'I-DATE',
 'B-PROVISION',
 'I-PROVISION',
 'I-STATUTE',
 'B-COURT',
 'I-COURT',
 'B-PRECEDENT',
 'I-PRECEDENT',
 'B-CASE_NUMBER',
 'I-CASE_NUMBER',
 'I-GPE',
 'B-PETITIONER',
 'I-PETITIONER',
 'B-JUDGE',
 'I-JUDGE',
 'B-RESPONDENT',
 'I-RESPONDENT']

In [100]:
y_pred = crf.predict(X_dev)

In [105]:
from evaluation import get_recognition_report, get_all_labels

In [102]:
from functools import reduce
y_dev = reduce(lambda a,b:a+b, y_dev)
y_pred = reduce(lambda a,b:a+b, y_pred)

In [103]:
y_pred = get_all_labels(y_pred)
y_true = get_all_labels(y_dev)

In [104]:
all_labels_compare = get_recognition_report(y_true, y_pred, report=True)

              precision    recall  f1-score   support

 CASE_NUMBER       0.82      0.64      0.72       121
       COURT       0.87      0.82      0.85       178
        DATE       0.95      0.89      0.92       222
         GPE       0.67      0.52      0.58       182
       JUDGE       0.67      1.00      0.80         8
         ORG       0.65      0.47      0.54       159
OTHER_PERSON       0.78      0.68      0.73       276
  PETITIONER       0.27      0.33      0.30         9
   PRECEDENT       0.68      0.62      0.65       177
   PROVISION       0.91      0.87      0.89       258
  RESPONDENT       0.00      0.00      0.00         5
     STATUTE       0.89      0.84      0.86       222
     WITNESS       0.56      0.43      0.49        58

   micro avg       0.81      0.71      0.76      1875
   macro avg       0.67      0.62      0.64      1875
weighted avg       0.80      0.71      0.75      1875



In [112]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

In [131]:
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

In [133]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' - ' +str(ent.start_char) +' - '+ str(ent.end_char) +' - '+ent.label_+ ' - '+str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')

In [134]:
doc1 = nlp("Apple is looking at buying U.K. startup for $1 billion")
show_ents(doc1)

Apple - 0 - 5 - ORG - Companies, agencies, institutions, etc.
U.K. - 27 - 31 - GPE - Countries, cities, states
$1 billion - 44 - 54 - MONEY - Monetary values, including unit


In [136]:
import json
with open("NER_DEV/NER_DEV/NER_DEV_JUDGEMENT.json") as json_file_dev:
    json_object_dev = json.load(json_file_dev)