# Feature Matrix
Purpose of notebook is to enlarge the dataframe and provide the maschine learning model with more <b>context information</b>. <br>
e. g. <i>Tokens on the left and right sides and its pos tags, lemmas.</i> <br>
Because the tagger in last notebook already provides the pos tags for every token, the prefix, suffix and other features of the token are regarded as surplus and won't be included in the feature matrix.

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import re

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [3]:
df_train = pd.read_csv("transitional_data/tagged_train_filled.csv", keep_default_na=False)
df_dev = pd.read_csv("transitional_data/tagged_dev_filled.csv", keep_default_na=False)

<br> to check whether there are still empty cells in the dataframe

In [4]:
df_train["Lemma"].isnull().tolist().count(True)

0

In [5]:
df_train.dtypes

SentenceNR          int64
Token              object
Label              object
standard_tagger    object
TreeTagger         object
Lemma              object
dtype: object

In [6]:
df_train.Token = df_train.Token.astype("string")
df_train.Label = df_train.Label.astype("string")
df_train.standard_tagger = df_train.standard_tagger.astype("string")
df_train.TreeTagger = df_train.TreeTagger.astype("string")
df_train.Lemma = df_train.Lemma.astype("string")
df_train.dtypes

SentenceNR          int64
Token              string
Label              string
standard_tagger    string
TreeTagger         string
Lemma              string
dtype: object

In [7]:
df_dev.Token = df_dev.Token.astype("string")
df_dev.Label = df_dev.Label.astype("string")
df_dev.standard_tagger = df_dev.standard_tagger.astype("string")
df_dev.TreeTagger = df_dev.TreeTagger.astype("string")
df_dev.Lemma = df_dev.Lemma.astype("string")
df_dev.dtypes

SentenceNR          int64
Token              string
Label              string
standard_tagger    string
TreeTagger         string
Lemma              string
dtype: object

<br> number of sentences should start from 1

In [8]:
df_train["SentenceNR"] = df_train["SentenceNR"].apply(lambda x: x+1)
df_dev["SentenceNR"] = df_dev["SentenceNR"].apply(lambda x: x+1)

<br> rearrange the sequence of columns in the dataframe

In [9]:
df_train["TokenNR"] = np.nan
df_train = df_train.rename(columns={"standard_tagger": "StandardTagger"})
df_train = df_train[['SentenceNR', 'TokenNR', 'Token', 'StandardTagger', 'TreeTagger', 'Lemma', 'Label']]
df_dev["TokenNR"] = np.nan
df_dev = df_dev.rename(columns={"standard_tagger": "StandardTagger"})
df_dev = df_dev[['SentenceNR', 'TokenNR', 'Token', 'StandardTagger', 'TreeTagger', 'Lemma', 'Label']]

<br> with df.groupby(by = 'SentenceNR') the dataframe will be grouped according to the number of sentences.<br>
And with the function enumerate_tokens the order of tokens in a sentence will also be supplemented to the dataframe.

In [10]:
def enumerate_tokens(sentence):
    c = 1
    for index, row in sentence.iterrows():
        sentence.at[index, 'TokenNR'] = c
        c += 1
    return sentence

In [11]:
%time df_dev = df_dev.groupby(by = 'SentenceNR', group_keys=True).apply(enumerate_tokens)

CPU times: user 2.56 s, sys: 10 ms, total: 2.58 s
Wall time: 2.58 s


In [12]:
%time df_train = df_train.groupby(by = 'SentenceNR', group_keys=True).apply(enumerate_tokens)

CPU times: user 24.8 s, sys: 124 ms, total: 24.9 s
Wall time: 24.9 s


In [13]:
df_train.TokenNR = df_train.TokenNR.astype("int64")
df_dev.TokenNR = df_dev.TokenNR.astype("int64")

In [14]:
df_train = df_train.rename(columns={"standard_tagger": "StandardTagger"})
df_train = df_train[['SentenceNR', 'TokenNR', 'Token', 'StandardTagger', 'TreeTagger', 'Lemma', 'Label']]
df_dev = df_dev.rename(columns={"standard_tagger": "StandardTagger"})
df_dev = df_dev[['SentenceNR', 'TokenNR', 'Token', 'StandardTagger', 'TreeTagger', 'Lemma', 'Label']]

<br>make copies of the train and dev dataframe so that the originals won't be changed in processing afterwards.

In [15]:
train = df_train.copy()
dev = df_dev.copy()

In [16]:
train = train.rename(columns={"SentenceNR": "Sent"})

In [17]:
dev = dev.rename(columns={"SentenceNR": "Sent"})

## Initialize three CountVectorizers with  train.Token,  train.StandardTagger  and  train.Lemma
The <b>wf-, tf-, lf_vectorizer</b> convert three columns "Token", "StandardTagger" and "Lemma" to sparse matrix as foundations of the bigger feature (sparse) matrixes in the following steps. <br>
In the step of context information these three vectorizers will also be applied to the context tokens around the original token.

In [18]:
# Token to spare Matrix
wf_vectorizer = CountVectorizer(tokenizer=lambda x: (x,), lowercase=False, min_df=3)
%time train_X_wf = wf_vectorizer.fit_transform(train.Token)
%time dev_X_wf = wf_vectorizer.transform(dev.Token)
print(train_X_wf.shape, dev_X_wf.shape)

CPU times: user 928 ms, sys: 1 µs, total: 928 ms
Wall time: 935 ms
CPU times: user 77 ms, sys: 7 µs, total: 77 ms
Wall time: 77.1 ms
(349077, 7782) (37455, 7782)


In [19]:
# Tag to spare Matrix
tf_vectorizer = CountVectorizer(tokenizer=lambda x: (x,), lowercase=False, min_df=3)
%time train_X_tf = tf_vectorizer.fit_transform(train.StandardTagger)
%time dev_X_tf = tf_vectorizer.transform(dev.StandardTagger)
print(train_X_tf.shape, dev_X_tf.shape)

CPU times: user 720 ms, sys: 3.25 ms, total: 724 ms
Wall time: 724 ms
CPU times: user 82.1 ms, sys: 0 ns, total: 82.1 ms
Wall time: 82.3 ms
(349077, 44) (37455, 44)


In [20]:
# Lemma to spare Matrix
lf_vectorizer = CountVectorizer(tokenizer=lambda x: (x,), lowercase=False, min_df=3)
%time train_X_lf = lf_vectorizer.fit_transform(train.Lemma)
%time dev_X_lf = lf_vectorizer.transform(dev.Lemma)
print(train_X_lf.shape, dev_X_lf.shape)

CPU times: user 774 ms, sys: 169 µs, total: 775 ms
Wall time: 776 ms
CPU times: user 92 ms, sys: 0 ns, total: 92 ms
Wall time: 92.1 ms
(349077, 5708) (37455, 5708)


## Step 0
## Basis Matrix: only with Token, Tag and Lemma

to compare with the classification results of "wider" matrixes (matrixes with more columns) in the following steps. <br>
All steps will use the same model: the <b>default LinearSVC</b> by sklearn. <br><br>
<i>Special Attention:</i> <br>
The "outsider" ("o") tokens, with makes up around 84% percent of all tokens, will be <b>excluded</b> from the classfication report, so that the result can concentrate on the named entity labels in the dataset.

### Result: 
### weighted average for f1-score: 39% (dev), 44% (train)

In [21]:
X_train = sp.sparse.hstack([train_X_wf, train_X_tf, train_X_lf])
X_dev = sp.sparse.hstack([dev_X_wf, dev_X_tf, dev_X_lf])
y_train = train["Label"]
y_dev = dev["Label"]

In [22]:
classes = train["Label"].unique().tolist()
classes.remove("o")
print(classes)

['B-ORG', 'I-ORG', 'B-OTHER_PERSON', 'I-OTHER_PERSON', 'B-WITNESS', 'I-WITNESS', 'B-GPE', 'B-STATUTE', 'B-DATE', 'I-DATE', 'B-PROVISION', 'I-PROVISION', 'I-STATUTE', 'B-COURT', 'I-COURT', 'B-PRECEDENT', 'I-PRECEDENT', 'B-CASE_NUMBER', 'I-CASE_NUMBER', 'I-GPE', 'B-PETITIONER', 'I-PETITIONER', 'B-JUDGE', 'I-JUDGE', 'B-RESPONDENT', 'I-RESPONDENT']


In [66]:
svc = LinearSVC()
%time svc.fit(X_train, y_train)

CPU times: user 1min 27s, sys: 26.4 ms, total: 1min 27s
Wall time: 1min 27s


In [67]:
%%time 
y_dev_pred = svc.predict(X_dev)
print(classification_report(y_pred = y_dev_pred, y_true = y_dev, labels = classes))

  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

         B-ORG       0.53      0.17      0.26       159
         I-ORG       0.34      0.10      0.15       342
B-OTHER_PERSON       0.35      0.15      0.21       276
I-OTHER_PERSON       0.37      0.36      0.36       195
     B-WITNESS       0.00      0.00      0.00        58
     I-WITNESS       0.20      0.02      0.03        54
         B-GPE       0.32      0.30      0.31       182
     B-STATUTE       0.68      0.45      0.55       222
        B-DATE       0.41      0.77      0.54       222
        I-DATE       0.42      0.33      0.37       132
   B-PROVISION       0.85      0.89      0.87       258
   I-PROVISION       0.60      0.23      0.33       772
     I-STATUTE       0.56      0.48      0.52       458
       B-COURT       0.80      0.64      0.71       178
       I-COURT       0.50      0.48      0.49       354
   B-PRECEDENT       0.00      0.00      0.00       177
   I-PRECEDENT       0.64      0.30      0.41  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
%%time 
y_train_pred = svc.predict(X_train)
print(classification_report(y_pred = y_train_pred, y_true = y_train, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.63      0.26      0.36      1441
         I-ORG       0.55      0.21      0.30      2897
B-OTHER_PERSON       0.49      0.33      0.40      2653
I-OTHER_PERSON       0.40      0.54      0.46      2089
     B-WITNESS       0.56      0.11      0.19       881
     I-WITNESS       0.50      0.05      0.09       759
         B-GPE       0.40      0.44      0.42      1395
     B-STATUTE       0.80      0.62      0.70      1803
        B-DATE       0.51      0.67      0.58      1885
        I-DATE       0.48      0.32      0.39      1926
   B-PROVISION       0.83      0.91      0.87      2384
   I-PROVISION       0.65      0.29      0.40      6576
     I-STATUTE       0.59      0.51      0.55      3802
       B-COURT       0.79      0.64      0.70      1293
       I-COURT       0.53      0.50      0.51      2804
   B-PRECEDENT       0.45      0.06      0.11      1351
   I-PRECEDENT       0.57      0.38      0.46  

## Step 1
## Will the prefixes and suffixes contribute to the model? --A small increase

### Result: 
### weighted average for f1-score: 42% (dev), 50% (train), comparing to the basic model +3%, +6%

### get_prefix_suffix:
return all prefixes and suffixes in a token from the length of 2 to 5.

In [23]:
def get_prefix_suffix(word):
    l = len(word)
    res = []
    for k in range(2, 5):
        if l > k:
            res.append("-" + word[-k:])
    for k in range(2, 5):
        if l > k:
            res.append(word[:k] + "-")
    return(res)

In [24]:
print(train.Token.tolist()[5], get_prefix_suffix(train.Token.tolist()[5]))
print(dev.Token.tolist()[6], get_prefix_suffix(dev.Token.tolist()[6]))

query ['-ry', '-ery', '-uery', 'qu-', 'que-', 'quer-']
'due ['-ue', '-due', "'d-", "'du-"]


<br> have a look at all affixes appearing more than 5000 times in the training dataset.

In [25]:
affix_vectorizer = CountVectorizer(tokenizer=get_prefix_suffix, min_df=5000)
affix_vectorizer.fit(train.Token.tolist())
print(" ".join(affix_vectorizer.get_feature_names_out()))



-al -as -at -ed -er -he -ing -ion -nd -ng -nt -on -tion an- co- no- re- th-


<br> For real usage we will set the min_df to a much lower number, to provide the model with more affix information. 

In [26]:
affix_vectorizer = CountVectorizer(tokenizer=get_prefix_suffix, min_df=20)
train_X_affix = affix_vectorizer.fit_transform(train.Token.tolist())
dev_X_affix = affix_vectorizer.transform(dev.Token.tolist())

In [28]:
train_X_affix.shape

(349077, 3976)

In [29]:
dev_X_affix.shape

(37455, 3976)

<br> with sp.sparse.hstack combine the X_train from the last step with the new affixes.

In [75]:
X1_train = sp.sparse.hstack([X_train, train_X_affix])
X1_dev = sp.sparse.hstack([X_dev, dev_X_affix])

In [77]:
clf = LinearSVC()
%time clf.fit(X1_train, y_train)

CPU times: user 2min 30s, sys: 106 ms, total: 2min 30s
Wall time: 2min 31s


In [78]:
%%time 
y1_dev_pred = clf.predict(X1_dev)
print(classification_report(y_pred = y1_dev_pred, y_true = y_dev, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.41      0.25      0.31       159
         I-ORG       0.27      0.09      0.14       342
B-OTHER_PERSON       0.30      0.38      0.34       276
I-OTHER_PERSON       0.32      0.39      0.35       195
     B-WITNESS       0.11      0.10      0.11        58
     I-WITNESS       0.06      0.02      0.03        54
         B-GPE       0.33      0.45      0.38       182
     B-STATUTE       0.66      0.47      0.55       222
        B-DATE       0.74      0.69      0.72       222
        I-DATE       0.40      0.30      0.34       132
   B-PROVISION       0.84      0.91      0.88       258
   I-PROVISION       0.60      0.24      0.34       772
     I-STATUTE       0.55      0.48      0.51       458
       B-COURT       0.81      0.65      0.72       178
       I-COURT       0.49      0.48      0.49       354
   B-PRECEDENT       0.08      0.02      0.03       177
   I-PRECEDENT       0.60      0.33      0.42  

In [80]:
%%time 
y1_train_pred = clf.predict(X1_train)
print(classification_report(y_pred = y1_train_pred, y_true = y_train, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.62      0.40      0.49      1441
         I-ORG       0.57      0.26      0.36      2897
B-OTHER_PERSON       0.49      0.63      0.55      2653
I-OTHER_PERSON       0.41      0.65      0.51      2089
     B-WITNESS       0.50      0.34      0.41       881
     I-WITNESS       0.50      0.14      0.22       759
         B-GPE       0.44      0.65      0.52      1395
     B-STATUTE       0.80      0.65      0.72      1803
        B-DATE       0.90      0.80      0.85      1885
        I-DATE       0.48      0.33      0.39      1926
   B-PROVISION       0.83      0.93      0.88      2384
   I-PROVISION       0.67      0.30      0.41      6576
     I-STATUTE       0.60      0.53      0.56      3802
       B-COURT       0.79      0.65      0.71      1293
       I-COURT       0.53      0.51      0.52      2804
   B-PRECEDENT       0.49      0.17      0.25      1351
   I-PRECEDENT       0.58      0.43      0.50  

## Step 2
## Will other features of the tokens contribute to the model? --Nothing changes

### Result: 
### weighted average for f1-score: 42% (dev), 51% (train), comparing to the basic model +3%, +7%

Supossedly because the matrix already have pos tags and lemmas, other features of the tokens cannot help the model to learn better. <br>
Since they don't make a difference, the other features won't be used in future steps.

In [84]:
def get_other_features(df, test=False):
    res = pd.DataFrame({
        'upper': df.Token.str.match(r'[A-Z]'),
        'allcaps': df.Token.str.fullmatch(r'[A-Z]+'),
        'digits': df.Token.str.match(r'[0-9]'),
        'alldigits': df.Token.str.fullmatch(r'-?[0-9][0-9.,]*'),
        'noalpha': ~df.Token.str.contains(r'[a-z]', flags=re.IGNORECASE),
        'noalnum': ~df.Token.str.contains(r'[0-9a-zäöü]', flags=re.IGNORECASE),
        'atstart': df.TokenNR == 1,
        'trunc': df.Token.str.endswith('-'),
        'long': df.Token.str.len() >= 15,
    })
    if test:
        return res
    else:
        return res.iloc[:, 1:].to_numpy(dtype=np.float64)

In [85]:
%time train_X_other = get_other_features(train)
%time dev_X_other = get_other_features(dev)

CPU times: user 873 ms, sys: 3.24 ms, total: 876 ms
Wall time: 877 ms
CPU times: user 82.6 ms, sys: 0 ns, total: 82.6 ms
Wall time: 82.7 ms


In [86]:
X2_train = sp.sparse.hstack([X_train, train_X_affix, train_X_other])
X2_dev = sp.sparse.hstack([X_dev, dev_X_affix, dev_X_other])

In [88]:
svc = LinearSVC()
%time svc.fit(X2_train, y_train)

CPU times: user 2min 29s, sys: 248 ms, total: 2min 29s
Wall time: 2min 29s


In [89]:
%%time 
y2_dev_pred = svc.predict(X2_dev)
print(classification_report(y_pred = y2_dev_pred, y_true = y_dev, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.43      0.26      0.32       159
         I-ORG       0.27      0.09      0.14       342
B-OTHER_PERSON       0.30      0.36      0.33       276
I-OTHER_PERSON       0.33      0.41      0.36       195
     B-WITNESS       0.11      0.10      0.11        58
     I-WITNESS       0.05      0.02      0.03        54
         B-GPE       0.33      0.47      0.39       182
     B-STATUTE       0.68      0.51      0.59       222
        B-DATE       0.77      0.69      0.73       222
        I-DATE       0.41      0.33      0.37       132
   B-PROVISION       0.84      0.91      0.88       258
   I-PROVISION       0.59      0.24      0.35       772
     I-STATUTE       0.55      0.48      0.51       458
       B-COURT       0.80      0.65      0.72       178
       I-COURT       0.49      0.48      0.49       354
   B-PRECEDENT       0.08      0.02      0.03       177
   I-PRECEDENT       0.60      0.33      0.43  

In [112]:
%%time 
y2_train_pred = svc.predict(X2_train)
print(classification_report(y_pred = y2_train_pred, y_true = y_train, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.62      0.40      0.49      1441
         I-ORG       0.57      0.26      0.36      2897
B-OTHER_PERSON       0.49      0.63      0.55      2653
I-OTHER_PERSON       0.42      0.66      0.51      2089
     B-WITNESS       0.52      0.37      0.43       881
     I-WITNESS       0.50      0.16      0.24       759
         B-GPE       0.44      0.65      0.53      1395
     B-STATUTE       0.81      0.65      0.72      1803
        B-DATE       0.91      0.80      0.86      1885
        I-DATE       0.48      0.34      0.40      1926
   B-PROVISION       0.83      0.93      0.88      2384
   I-PROVISION       0.67      0.31      0.42      6576
     I-STATUTE       0.60      0.53      0.56      3802
       B-COURT       0.79      0.65      0.71      1293
       I-COURT       0.53      0.52      0.52      2804
   B-PRECEDENT       0.49      0.17      0.25      1351
   I-PRECEDENT       0.58      0.44      0.50  

## Step 3
## Context left and right: A great difference!

### Result: 
### weighted average for f1-score: 77% (dev), 94% (train), comparing to the basic model +38%, +50%

## add_context
At first we will add new columns to the both dataframes. <br>
The new columns show the tokens, tags and lemmas in the rows before and after.<br>
<i>(2 words on the left and 2 words on the right in the original text)</i><br> <br>
This process will be executed at the level of <b>each sentence</b> because the sentences are disjunctive in the dataframe. <br>
In other words, "neighbour" sentences don't belong to the same judgement. They are randomly mixed. <br>
Beginnings and ends of all sentences will be padded. 

In [30]:
def add_context(satz):
    
    satz["L1"] = satz.Token.shift(1, fill_value="")  
    satz["L2"] = satz.Token.shift(2, fill_value="")  
    satz["R1"] = satz.Token.shift(-1, fill_value="") 
    satz["R2"] = satz.Token.shift(-2, fill_value="") 
    
    satz["posL1"] = satz.StandardTagger.shift(1, fill_value="*")
    satz["posL2"] = satz.StandardTagger.shift(2, fill_value="*")
    satz["posR1"] = satz.StandardTagger.shift(-1, fill_value="*")
    satz["posR2"] = satz.StandardTagger.shift(-2, fill_value="*")
    
    satz["lemmaL1"] = satz.Lemma.shift(1, fill_value="*")
    satz["lemmaL2"] = satz.Lemma.shift(2, fill_value="*")
    satz["lemmaR1"] = satz.Lemma.shift(-1, fill_value="*")
    satz["lemmaR2"] = satz.Lemma.shift(-2, fill_value="*")
    
    # Labels of two tokens before are just preparation for the trigramme model 
    satz["labelL1"] = satz.Label.shift(1, fill_value="*")
    satz["labelL2"] = satz.Label.shift(2, fill_value="*")
    
    return satz

In [31]:
%time train = train.groupby('Sent', group_keys=False).apply(add_context)
%time dev = dev.groupby('Sent', group_keys=False).apply(add_context)

CPU times: user 53.9 s, sys: 319 ms, total: 54.2 s
Wall time: 54.4 s
CPU times: user 5.03 s, sys: 6.6 ms, total: 5.04 s
Wall time: 5.05 s


<br>Of course the last two columns ("LabelL1", "LabelL2")of dev are NOT allowed to be included in the feature matrix.<br>
Otherwise they would lead to data leak.

In [95]:
dev

Unnamed: 0_level_0,Unnamed: 1_level_0,Sent,TokenNR,Token,StandardTagger,TreeTagger,Lemma,Label,L1,L2,R1,...,posL1,posL2,posR1,posR2,lemmaL1,lemmaL2,lemmaR1,lemmaR2,labelL1,labelL2
SentenceNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,0,1,1,True,NN,UH,true,o,,,",",...,*,*,",",PRP$,*,*,",",our,*,*
1,1,1,2,",",",",",",",",o,True,,our,...,NN,*,PRP$,NNP,true,*,our,Constitution,o,*
1,2,1,3,our,PRP$,PP$,our,o,",",True,Constitution,...,",",NN,NNP,VBZ,",",true,Constitution,have,o,o
1,3,1,4,Constitution,NNP,NP,Constitution,B-STATUTE,our,",",has,...,PRP$,",",VBZ,DT,our,",",have,no,o,o
1,4,1,5,has,VBZ,VHZ,have,o,Constitution,our,no,...,NNP,PRP$,DT,JJ,Constitution,our,no,0,B-STATUTE,o
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
949,37450,949,10,of,IN,IN,of,o,root,behind,right,...,NN,IN,JJ,NN,root,behind,right,ear,o,o
949,37451,949,11,right,JJ,JJ,right,o,of,root,ear,...,IN,NN,NN,NN,of,root,ear,lobule,o,o
949,37452,949,12,ear,NN,NN,ear,o,right,of,lobule,...,JJ,IN,NN,.,right,of,lobule,.,o,o
949,37453,949,13,lobule,NN,NN,lobule,o,ear,right,.,...,NN,JJ,.,*,ear,right,.,*,o,o


<br> Transform the tokens in the context with wf_vectorizer.

In [32]:
%%time
train_token_context = sp.sparse.hstack([wf_vectorizer.transform(train.L1), 
                                        wf_vectorizer.transform(train.L2), 
                                        wf_vectorizer.transform(train.R1),
                                        wf_vectorizer.transform(train.R2)])

CPU times: user 3.24 s, sys: 3.28 ms, total: 3.24 s
Wall time: 3.25 s


In [33]:
train_token_context.shape

(349077, 31128)

<br> Transform the POS tags in the context with tf_vectorizer.

In [34]:
%%time
train_tag_context = sp.sparse.hstack([tf_vectorizer.transform(train.posL1), 
                                        tf_vectorizer.transform(train.posL2), 
                                        tf_vectorizer.transform(train.posR1),
                                        tf_vectorizer.transform(train.posR2)])

CPU times: user 2.96 s, sys: 3.28 ms, total: 2.96 s
Wall time: 2.97 s


In [35]:
train_tag_context.shape

(349077, 176)

<br> Transform the lemmas in the context with lf_vectorizer.

In [36]:
%%time
train_lemma_context = sp.sparse.hstack([lf_vectorizer.transform(train.lemmaL1), 
                                        lf_vectorizer.transform(train.lemmaL2), 
                                        lf_vectorizer.transform(train.lemmaR1),
                                        lf_vectorizer.transform(train.lemmaR2)])

CPU times: user 3.03 s, sys: 0 ns, total: 3.03 s
Wall time: 3.04 s


In [37]:
train_lemma_context.shape

(349077, 22832)

<br> The same way for dev

In [38]:
%%time

dev_token_context = sp.sparse.hstack([wf_vectorizer.transform(dev.L1), 
                                        wf_vectorizer.transform(dev.L2), 
                                        wf_vectorizer.transform(dev.R1),
                                        wf_vectorizer.transform(dev.R2)])

dev_tag_context = sp.sparse.hstack([tf_vectorizer.transform(dev.posL1), 
                                        tf_vectorizer.transform(dev.posL2), 
                                        tf_vectorizer.transform(dev.posR1),
                                        tf_vectorizer.transform(dev.posR2)])

dev_lemma_context = sp.sparse.hstack([lf_vectorizer.transform(dev.lemmaL1), 
                                        lf_vectorizer.transform(dev.lemmaL2), 
                                        lf_vectorizer.transform(dev.lemmaR1),
                                        lf_vectorizer.transform(dev.lemmaR2)])

CPU times: user 981 ms, sys: 0 ns, total: 981 ms
Wall time: 983 ms


In [109]:
X3_train = sp.sparse.hstack([X_train, train_token_context, train_tag_context, train_lemma_context])
X3_train.shape

(349077, 67670)

In [111]:
X3_dev = sp.sparse.hstack([X_dev, dev_token_context, dev_tag_context, dev_lemma_context])
X3_dev.shape

(37455, 67670)

<br> At first we will simply provide the model with the +-2 context and their tags and lemmas to see its effect alone.

In [113]:
svc = LinearSVC()
%time svc.fit(X3_train, y_train)

CPU times: user 43.5 s, sys: 36.6 ms, total: 43.6 s
Wall time: 43.7 s


In [114]:
y3_dev_pred = svc.predict(X3_dev)
print(classification_report(y_pred = y3_dev_pred, y_true = y_dev, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.56      0.40      0.47       159
         I-ORG       0.51      0.40      0.45       342
B-OTHER_PERSON       0.76      0.57      0.65       276
I-OTHER_PERSON       0.63      0.66      0.64       195
     B-WITNESS       0.48      0.53      0.50        58
     I-WITNESS       0.50      0.44      0.47        54
         B-GPE       0.64      0.55      0.59       182
     B-STATUTE       0.88      0.86      0.87       222
        B-DATE       0.89      0.91      0.90       222
        I-DATE       0.93      0.93      0.93       132
   B-PROVISION       0.91      0.91      0.91       258
   I-PROVISION       0.89      0.89      0.89       772
     I-STATUTE       0.82      0.84      0.83       458
       B-COURT       0.91      0.84      0.87       178
       I-COURT       0.79      0.76      0.78       354
   B-PRECEDENT       0.68      0.58      0.63       177
   I-PRECEDENT       0.88      0.78      0.83  

In [115]:
y3_train_pred = svc.predict(X3_train)
print(classification_report(y_pred = y3_train_pred, y_true = y_train, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.93      0.88      0.90      1441
         I-ORG       0.90      0.86      0.88      2897
B-OTHER_PERSON       0.94      0.93      0.93      2653
I-OTHER_PERSON       0.92      0.95      0.94      2089
     B-WITNESS       0.96      0.89      0.92       881
     I-WITNESS       0.95      0.91      0.93       759
         B-GPE       0.94      0.91      0.93      1395
     B-STATUTE       0.98      0.98      0.98      1803
        B-DATE       0.98      0.98      0.98      1885
        I-DATE       0.97      0.99      0.98      1926
   B-PROVISION       0.98      0.99      0.98      2384
   I-PROVISION       0.95      0.95      0.95      6576
     I-STATUTE       0.97      0.97      0.97      3802
       B-COURT       0.97      0.95      0.96      1293
       I-COURT       0.95      0.92      0.93      2804
   B-PRECEDENT       0.94      0.93      0.94      1351
   I-PRECEDENT       0.92      0.93      0.93  

## Step 4
## Context and Affix of context: Over training!

### Result: 
### weighted average for f1-score: 77% (dev), 97% (train), comparing to the basic model +38%, +53%

Comparing to the feature matrix with context but without affix of the contextes (last step, X3), <br>
providing the model also the affix of contextes just leads to <b>an increased over training</b>, <br>
but it brings nothing in predicting the dev dataset. 

In [120]:
%%time
train_affix_context = sp.sparse.hstack([affix_vectorizer.transform(train.L1.tolist()),
                                       affix_vectorizer.transform(train.L2.tolist()),
                                       affix_vectorizer.transform(train.R1.tolist()),
                                       affix_vectorizer.transform(train.R2.tolist())])

CPU times: user 5.58 s, sys: 9.93 ms, total: 5.59 s
Wall time: 5.59 s


In [121]:
%%time
dev_affix_context = sp.sparse.hstack([affix_vectorizer.transform(dev.L1.tolist()),
                                       affix_vectorizer.transform(dev.L2.tolist()),
                                       affix_vectorizer.transform(dev.R1.tolist()),
                                       affix_vectorizer.transform(dev.R2.tolist())])

CPU times: user 689 ms, sys: 2 µs, total: 689 ms
Wall time: 689 ms


In [122]:
X4_train = sp.sparse.hstack([X_train, train_X_affix, train_token_context, train_tag_context, train_lemma_context, train_affix_context])
X4_dev = sp.sparse.hstack([X_dev, dev_X_affix, dev_token_context, dev_tag_context, dev_lemma_context, dev_affix_context])

In [123]:
X4_train.shape

(349077, 87550)

In [124]:
X4_dev.shape

(37455, 87550)

In [125]:
svc = LinearSVC()
%time svc.fit(X4_train, y_train)

CPU times: user 1min 24s, sys: 440 ms, total: 1min 24s
Wall time: 1min 24s




In [126]:
%%time
y4_dev_pred = svc.predict(X4_dev)
print(classification_report(y_pred = y4_dev_pred, y_true = y_dev, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.52      0.45      0.48       159
         I-ORG       0.49      0.39      0.43       342
B-OTHER_PERSON       0.73      0.61      0.66       276
I-OTHER_PERSON       0.66      0.61      0.63       195
     B-WITNESS       0.58      0.60      0.59        58
     I-WITNESS       0.45      0.46      0.46        54
         B-GPE       0.65      0.59      0.62       182
     B-STATUTE       0.86      0.85      0.85       222
        B-DATE       0.93      0.94      0.93       222
        I-DATE       0.89      0.93      0.91       132
   B-PROVISION       0.95      0.93      0.94       258
   I-PROVISION       0.87      0.91      0.89       772
     I-STATUTE       0.81      0.84      0.82       458
       B-COURT       0.89      0.83      0.85       178
       I-COURT       0.79      0.72      0.76       354
   B-PRECEDENT       0.65      0.58      0.61       177
   I-PRECEDENT       0.88      0.78      0.82  

In [127]:
%%time
y4_train_pred = svc.predict(X4_train)
print(classification_report(y_pred = y4_train_pred, y_true = y_train, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.96      0.96      0.96      1441
         I-ORG       0.95      0.94      0.95      2897
B-OTHER_PERSON       1.00      1.00      1.00      2653
I-OTHER_PERSON       0.99      0.99      0.99      2089
     B-WITNESS       1.00      1.00      1.00       881
     I-WITNESS       1.00      0.99      1.00       759
         B-GPE       0.98      0.98      0.98      1395
     B-STATUTE       0.98      0.99      0.99      1803
        B-DATE       0.99      1.00      1.00      1885
        I-DATE       0.98      1.00      0.99      1926
   B-PROVISION       0.99      0.99      0.99      2384
   I-PROVISION       0.97      0.97      0.97      6576
     I-STATUTE       0.98      0.99      0.99      3802
       B-COURT       0.97      0.96      0.96      1293
       I-COURT       0.97      0.96      0.96      2804
   B-PRECEDENT       0.98      0.98      0.98      1351
   I-PRECEDENT       0.96      0.97      0.96  

## Step 5
## Context and only Affix of token itself: No improve, but over training reduced
### Result: 
### weighted average for f1-score: 77% (dev), 95% (train), comparing to the basic model +38%, +51%

This time we remove the affix of contextes from the feature matrix. <br>
Although it doesn't bring a better score for the dev, <br>
but it reduced a little bit the over training as last time. 

In [39]:
X5_train = sp.sparse.hstack([X_train, train_X_affix, train_token_context, train_tag_context, train_lemma_context])
X5_dev = sp.sparse.hstack([X_dev, dev_X_affix, dev_token_context, dev_tag_context, dev_lemma_context])

In [129]:
svc = LinearSVC()
%time svc.fit(X5_train, y_train)

CPU times: user 56.7 s, sys: 46.5 ms, total: 56.7 s
Wall time: 56.9 s


In [131]:
%%time
y5_dev_pred = svc.predict(X5_dev)
print(classification_report(y_pred = y5_dev_pred, y_true = y_dev, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.61      0.47      0.53       159
         I-ORG       0.52      0.40      0.45       342
B-OTHER_PERSON       0.72      0.61      0.66       276
I-OTHER_PERSON       0.64      0.64      0.64       195
     B-WITNESS       0.48      0.50      0.49        58
     I-WITNESS       0.50      0.39      0.44        54
         B-GPE       0.68      0.61      0.64       182
     B-STATUTE       0.86      0.84      0.85       222
        B-DATE       0.93      0.95      0.94       222
        I-DATE       0.93      0.94      0.94       132
   B-PROVISION       0.95      0.93      0.94       258
   I-PROVISION       0.88      0.89      0.88       772
     I-STATUTE       0.82      0.84      0.83       458
       B-COURT       0.91      0.84      0.87       178
       I-COURT       0.77      0.74      0.75       354
   B-PRECEDENT       0.70      0.58      0.63       177
   I-PRECEDENT       0.87      0.78      0.82  

In [132]:
%%time
y5_train_pred = svc.predict(X5_train)
print(classification_report(y_pred = y5_train_pred, y_true = y_train, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.95      0.94      0.94      1441
         I-ORG       0.92      0.89      0.90      2897
B-OTHER_PERSON       0.98      0.98      0.98      2653
I-OTHER_PERSON       0.95      0.98      0.96      2089
     B-WITNESS       0.98      0.96      0.97       881
     I-WITNESS       0.98      0.94      0.96       759
         B-GPE       0.97      0.96      0.97      1395
     B-STATUTE       0.98      0.99      0.99      1803
        B-DATE       0.99      1.00      0.99      1885
        I-DATE       0.98      0.99      0.98      1926
   B-PROVISION       0.98      0.99      0.99      2384
   I-PROVISION       0.96      0.96      0.96      6576
     I-STATUTE       0.98      0.97      0.98      3802
       B-COURT       0.97      0.96      0.96      1293
       I-COURT       0.95      0.93      0.94      2804
   B-PRECEDENT       0.97      0.96      0.96      1351
   I-PRECEDENT       0.94      0.94      0.94  

## Trigrame Processing: Even much worse than the basis matrix!
### Result: 
### weighted average for f1-score: 25% (dev), comparing to the basic model -14%

In [133]:
label_vectorizer = OneHotEncoder(handle_unknown = 'infrequent_if_exist', min_frequency=5)
tmp_train = np.vstack([train.labelL1, train.labelL2, train.labelL2 + " " + train.labelL1]).T
X_train_label = label_vectorizer.fit_transform(tmp_train)
X_train_label.shape

(349077, 150)

In [134]:
X6_train = sp.sparse.hstack([X_train, X_train_label])

In [135]:
%%time
clf = LinearSVC()
clf.fit(X6_train, y_train)

CPU times: user 36.6 s, sys: 26.5 ms, total: 36.6 s
Wall time: 36.7 s


In [136]:
X6_train.shape

(349077, 13684)

In [137]:
X_train_label.shape

(349077, 150)

In [139]:
def get_features(satz):
    return sp.sparse.hstack([
        
        sp.sparse.hstack([wf_vectorizer.transform(satz.Token)]),
        tf_vectorizer.transform(satz.TreeTagger),
        lf_vectorizer.transform(satz.Lemma),
        
        #wf_vectorizer.transform(satz.L1),
        #wf_vectorizer.transform(satz.L2),
        #wf_vectorizer.transform(satz.R1),
        #wf_vectorizer.transform(satz.R2),
        
        #tf_vectorizer.transform(satz.posL1),
        #tf_vectorizer.transform(satz.posL2),
        #tf_vectorizer.transform(satz.posR1),
        #tf_vectorizer.transform(satz.posR2),
        
        #lf_vectorizer.transform(satz.lemmaL1),
        #lf_vectorizer.transform(satz.lemmaL2),
        #lf_vectorizer.transform(satz.lemmaR1),
        #lf_vectorizer.transform(satz.lemmaR2),
    ], format='csr')

In [141]:
def tag_sentence(satz):
    n = satz.shape[0]
    X = get_features(satz) # Matrix der Oberflächenmerkmale
    tags = []
    p1 = p2 = "*"          # vorhergehende Labels
    for i in range(n):
        x1 = X[i, :]
        x2 = label_vectorizer.transform(np.array([[p1, p2, p2 + " " + p1]]))
        x = sp.sparse.hstack([x1, x2])
        tag = clf.predict(x)[0] # liefert NumPy-Array zurück
        tags.append(tag)
        p2, p1 = p1, tag
    return pd.Series(tags, index=satz.index, dtype='string')

In [142]:
%%time
predicted = dev.groupby('Sent').apply(tag_sentence)

CPU times: user 32 s, sys: 29.9 ms, total: 32 s
Wall time: 32.1 s


In [143]:
print(classes)

['B-ORG', 'I-ORG', 'B-OTHER_PERSON', 'I-OTHER_PERSON', 'B-WITNESS', 'I-WITNESS', 'B-GPE', 'B-STATUTE', 'B-DATE', 'I-DATE', 'B-PROVISION', 'I-PROVISION', 'I-STATUTE', 'B-COURT', 'I-COURT', 'B-PRECEDENT', 'I-PRECEDENT', 'B-CASE_NUMBER', 'I-CASE_NUMBER', 'I-GPE', 'B-PETITIONER', 'I-PETITIONER', 'B-JUDGE', 'I-JUDGE', 'B-RESPONDENT', 'I-RESPONDENT']


In [144]:
%time print(classification_report(y_pred=predicted, y_true=y_dev, labels = classes))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

         B-ORG       0.70      0.04      0.08       159
         I-ORG       0.00      0.00      0.00       342
B-OTHER_PERSON       0.40      0.01      0.01       276
I-OTHER_PERSON       0.00      0.00      0.00       195
     B-WITNESS       0.00      0.00      0.00        58
     I-WITNESS       0.00      0.00      0.00        54
         B-GPE       0.00      0.00      0.00       182
     B-STATUTE       0.77      0.19      0.31       222
        B-DATE       0.37      0.74      0.50       222
        I-DATE       0.75      0.34      0.47       132
   B-PROVISION       0.87      0.83      0.85       258
   I-PROVISION       0.94      0.70      0.80       772
     I-STATUTE       0.56      0.05      0.09       458
       B-COURT       0.92      0.60      0.72       178
       I-COURT       0.86      0.32      0.47       354
   B-PRECEDENT       0.17      0.01      0.01       177
   I-PRECEDENT       0.83      0.02      0.03  

  _warn_prf(average, modifier, msg_start, len(result))


## with data leak

If we provide the dev matrix with basis matrix and the two "gold" Labels before, it will bring a surprisingly good result (84%). <br>
Of course this couln'd be allowed in the tast. 

In [149]:
tmp_dev = np.vstack([dev.labelL1, dev.labelL2, dev.labelL2 + " " + dev.labelL1]).T
X_dev_label = label_vectorizer.transform(tmp_dev)
X_dev_label.shape

(37455, 150)

In [153]:
X7_dev = sp.sparse.hstack([X_dev, X_dev_label])

In [154]:
X7_dev

<37455x13684 sparse matrix of type '<class 'numpy.float64'>'
	with 219074 stored elements in Compressed Sparse Row format>

In [155]:
%time print(classification_report(y_pred=clf.predict(X7_dev), y_true=y_dev, labels = classes))

                precision    recall  f1-score   support

         B-ORG       0.66      0.18      0.29       159
         I-ORG       0.96      0.90      0.93       342
B-OTHER_PERSON       0.58      0.27      0.37       276
I-OTHER_PERSON       0.97      1.00      0.98       195
     B-WITNESS       0.25      0.05      0.09        58
     I-WITNESS       1.00      1.00      1.00        54
         B-GPE       0.65      0.51      0.57       182
     B-STATUTE       0.74      0.55      0.63       222
        B-DATE       0.60      0.83      0.70       222
        I-DATE       0.91      0.95      0.93       132
   B-PROVISION       0.97      0.88      0.92       258
   I-PROVISION       0.97      0.95      0.96       772
     I-STATUTE       0.94      0.85      0.89       458
       B-COURT       0.89      0.65      0.75       178
       I-COURT       0.95      0.98      0.97       354
   B-PRECEDENT       0.43      0.13      0.20       177
   I-PRECEDENT       0.96      0.98      0.97  

## Save the best Feature Matrix X5
Since the X5 sparse matrix with context and only affix of token itself provides the best result up to right now, we will save it for future uses.

In [41]:
import scipy.sparse

In [45]:
%%time
scipy.sparse.save_npz('transitional_data/X5_train.npz', X5_train)
X5_train = scipy.sparse.load_npz('transitional_data/X5_train.npz')

CPU times: user 2.73 s, sys: 19.8 ms, total: 2.75 s
Wall time: 2.75 s


In [46]:
%%time
scipy.sparse.save_npz('transitional_data/X5_dev.npz', X5_dev)
X5_dev = scipy.sparse.load_npz('transitional_data/X5_dev.npz')

CPU times: user 300 ms, sys: 8 µs, total: 300 ms
Wall time: 300 ms


<br>
Also save the gold standards y_train, y_dev

In [49]:
y_train = pd.DataFrame(y_train)
y_dev = pd.DataFrame(y_dev)

In [52]:
%%time
y_train.to_csv("transitional_data/y_train.csv", index=True)
y_dev.to_csv("transitional_data/y_dev.csv", index=True)

CPU times: user 1.54 s, sys: 6.66 ms, total: 1.55 s
Wall time: 1.56 s
