# III. CNN classification 3


### Goals

* Test performance of the spaCy CNN


### Comments



# Feature extraction at the sentence level

In [1]:
import nltk,gensim, spacy

#nltk.data.path=[]
#nltk.data.path.append("C:\\Users\\rittchr\\nltk_data")
#nltk.data.path.append("\\esdfiles\INTERNAL\SpecialProjects\EconomicEventDetection\Analytics\nltk_data")

import re
import numpy as np
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV

In [130]:
def _tfidf_branch(**tfidf_kwargs):
    """Build one FeatureUnion branch: a TfidfVectorizer followed by a Debug() step.

    Debug is defined elsewhere; later cells read the `.shape` attribute of
    these 'debug' steps to report per-branch feature dimensions.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(**tfidf_kwargs)),
        ('debug', Debug()),
    ])


# Feature branches combined side by side by the FeatureUnion below.
feature_branches = [
    # Syntactic features from the custom extractor.
    ('syntactic_features', Pipeline([
        ('extract_syntactic_features', extract_syntactic_features()),
    ])),

    # Other lexical features from the custom extractor.
    ('other_lexical_features', Pipeline([
        ('extract_other_lexical_features', extract_other_lexical_features()),
    ])),

    # Word-token 1- to 3-grams.
    ('word_ngrams', _tfidf_branch(ngram_range=(1, 3), analyzer='word')),

    # Character 3- and 4-grams.
    ('char_ngrams', _tfidf_branch(ngram_range=(3, 4), analyzer='char')),

    # Lemma n-grams. MODIFIED: the custom tokenizer (tokenize_lemmatize) is
    # switched off, so this branch is currently configured exactly like
    # 'word_ngrams'.
    ('lemma_ngrams', _tfidf_branch(ngram_range=(1, 3), analyzer='word')),

    # Lemma + POS tags. MODIFIED: the custom tokenizer (tokenize_lemma_pos) is
    # switched off, so this is a plain word-level TF-IDF with default n-grams.
    ('lemma_pos', _tfidf_branch(analyzer='word')),
]

pipeline = Pipeline([
    # Concatenate all feature branches (no per-branch transformer weights).
    ('union', FeatureUnion(transformer_list=feature_branches)),

    # Debug step placed on the combined feature matrix.
    ('debug_final', Debug()),

    # One linear SVC per label column.
    ('svc', OneVsRestClassifier(SVC(kernel='linear'))),
])

In [131]:
# Location of the corpus CSV. The path is relative, so it is resolved against
# the kernel's current working directory.
path_corpus = './Data/jacobs_corpus.csv'

In [132]:
# Load the corpus CSV into a DataFrame.
# NOTE(review): the preview in the next cell shows an 'Unnamed: 0' column, which
# usually means the CSV was written with its index -- consider index_col=0 if
# that column is not needed.
df=pd.read_csv(path_corpus)

In [133]:
# Preview the first two rows to check the column layout.
df.head(2)

Unnamed: 0,sentence,label,datatype,title,publication_date,file_id,-1,Profit,Dividend,MergerAcquisition,SalesVolume,BuyRating,QuarterlyResults,TargetPrice,ShareRepurchase,Turnover,Debt
0,It will not say what it has spent on the proje...,-1,holdin,tesco,25-09-2013,833,1,0,0,0,0,0,0,0,0,0,0
1,"Sir John Bond , chairman , told the bank 's an...",-1,holdin,FT other HSBC,28-05-2005,393,1,0,0,0,0,0,0,0,0,0,0


In [134]:
# Label columns used as classification targets, in the order they are passed
# to the classifier and to the classification report.
multi_labels = [
    '-1',
    'Profit',
    'Dividend',
    'MergerAcquisition',
    'SalesVolume',
    'BuyRating',
    'QuarterlyResults',
    'TargetPrice',
    'ShareRepurchase',
    'Turnover',
    'Debt',
]

In [135]:
# Targets: one column per entry in multi_labels; features: the raw sentence text.
y = df[multi_labels]
X = df['sentence']

# Show both shapes as a sanity check that rows line up.
(X.shape, y.shape)

((9937,), (9937, 11))

In [136]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [137]:
# Scratch data: a toy nested list used by the next cell to try out
# flattening + de-duplication. It is not used by the pipeline cells shown here.
a = [[231,233],[123],[2,2,3]]

In [138]:
# Flatten the nested list `a` and keep each value once (set order is arbitrary).
list({item for sublist in a for item in sublist})

[2, 3, 231, 233, 123]

In [None]:
%%time
# Fit the whole pipeline (feature extraction + one-vs-rest SVC) on the training
# split. The %%time cell magic must stay on the first line of the cell.
pipeline.fit(X_train,y_train)

unique tags:  ['ADP', 'SCONJ', 'PUNCT', 'PRON', 'SYM', 'VERB', 'AUX', 'NOUN', 'PROPN', 'X', 'ADJ', 'CCONJ', 'NUM', 'INTJ', 'SPACE', 'DET', 'PART', 'ADV']
uniquener:  ['GPE', 'DATE', 'ORG', 'TIME', 'ORDINAL', 'LOC', 'FAC', 'PERSON', 'WORK_OF_ART', 'NORP', 'EVENT', 'MONEY', 'CARDINAL', 'LAW', 'PRODUCT', 'PERCENT', 'QUANTITY', 'LANGUAGE']


In [105]:
# Inspect the (name, estimator) pairs that make up the pipeline.
pipeline.steps

[('union',
  FeatureUnion(n_jobs=None,
               transformer_list=[('syntactic_features',
                                  Pipeline(memory=None,
                                           steps=[('extract_syntactic_features',
                                                   extract_syntactic_features())],
                                           verbose=False)),
                                 ('other_lexical_features',
                                  Pipeline(memory=None,
                                           steps=[('extract_other_lexical_features',
                                                   extract_other_lexical_features())],
                                           verbose=False)),
                                 ('word_ngrams',
                                  Pipeline(memory=None,
                                           steps...
                                                                   encoding='utf-8',
                                   

In [35]:
# Look up the classifier step by name to inspect its hyper-parameters.
pipeline.named_steps['svc']

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

#### Feature dimensions

In [36]:
# Collect the `.shape` exposed by the 'debug' step of each TF-IDF branch,
# in the order: lemma_pos, word_ngrams, char_ngrams, lemma_ngrams.
union_step = pipeline.named_steps['union']
branch_by_name = dict(union_step.transformer_list)
branch_names = ('lemma_pos', 'word_ngrams', 'char_ngrams', 'lemma_ngrams')

tuple(branch_by_name[name].named_steps['debug'].shape for name in branch_names)



((6657, 10264), (6657, 176571), (6657, 43589), (6657, 176571))

###### 400k features per data point!

In [37]:
# Shape exposed by the Debug step that sits on the combined feature matrix.
pipeline.named_steps['debug_final'].shape

(6657, 407133)

In [38]:
# Predict the label columns for the held-out sentences with the fitted pipeline.
y_pred = pipeline.predict(X_test)

In [39]:
# Per-label precision / recall / F1 on the held-out split. The report is a
# pre-formatted string, so it is printed rather than displayed as a repr.
report_text = classification_report(
    y_test,
    y_pred,
    target_names=multi_labels,
)
print(report_text)

                   precision    recall  f1-score   support

               -1       0.92      0.95      0.94      2593
           Profit       0.81      0.76      0.78       218
         Dividend       0.68      0.64      0.66        50
MergerAcquisition       0.65      0.21      0.32        81
      SalesVolume       0.80      0.73      0.76       143
        BuyRating       0.94      0.69      0.80        72
 QuarterlyResults       0.68      0.74      0.71        88
      TargetPrice       0.86      0.89      0.88        28
  ShareRepurchase       0.79      0.42      0.55        26
         Turnover       0.88      0.66      0.76        77
             Debt       0.62      0.40      0.48        20

      avg / total       0.89      0.88      0.88      3396



## Save the trained model

In [42]:
# Persist the fitted pipeline so it can be reloaded without retraining.
#
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and only fall back to the
# vendored copy on old installations.
import os

try:
    import joblib
except ImportError:  # old environments that only ship the vendored copy
    from sklearn.externals import joblib

# NOTE(review): the pipeline contains custom steps (Debug, extract_*); pickling
# stores classes by reference, so they must be importable wherever the model is
# loaded.
model_path = '../Models/TrainingJacobs/model.joblib'

# Create the target directory first so a long fit is not lost to a missing folder.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(pipeline, model_path, compress=True)

['../Models/TrainingJacobs/model.joblib']