# III. SVM classification with CV 2


### Goals

* Improve setup from III.SVM_Sentencetraining
* Do not do hyperparameter tuning?


### Comments



## Lexical features

* token n-gram features: unigrams,bigrams, trigrams
* character n-gram fatures: trigrams,fourgrams
* lemma n-gram features: uni,bi,trigrams
* disambiguated lemmas: Lemma + POS tag
* numerals: yes,no
* symbols: yes,no
* time indicators: yes, not
* future: add semantic knowledge from structured resources:
    * takeover=acquire, acquisition,
    * are word embedding sufficient to capture semantic knowledge?

## Syntactic features
* PoS categories: 
    * for each binary (yes,no)
    * 0,1,more; 
    * total number of occurances
* named entity types: person, organization, location, product, event, 


    NE Type 	Examples
    ORGANIZATION 	Georgia-Pacific Corp., WHO
    PERSON 	Eddy Bonte, President Obama
    LOCATION 	Murray River, Mount Everest
    DATE 	June, 2008-06-29
    TIME 	two fifty a m, 1:30 p.m.
    MONEY 	175 million Canadian Dollars, GBP 10.40
    PERCENT 	twenty pct, 18.75 %
    FACILITY 	Washington Monument, Stonehenge
    GPE 	South East Asia, Midlothian

## SVM classification
* linear SVM
* kernel SVM

# Feature extraction on sentence level

In [1]:
import nltk,gensim,spacy,re
import numpy as np
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix,classification_report
from sklearn.grid_search import GridSearchCV



In [3]:
 # load spacy's English language models
en_nlp = spacy.load('en')

In [33]:
#uncomment to download data
#nltk.download('averaged_perceptron_tagger')
#nltk.download('tagsets')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
#nltk.download('punkt')

In [15]:
from nltk.data import load
all_pos_tags = list(load('/Users/christian/nltk_data/help/tagsets/upenn_tagset.pickle').keys())
#all_pos_tags

In [16]:
class extract_other_lexical_features(BaseEstimator, TransformerMixin):
    '''
    other lexical features such as time, special chars
    '''
    
    def fit(self, x, y=None):
        return self    

    def transform(self, sentences):
    
        def extract_other_lexical_features(sentence):
        
            tokentext = nltk.word_tokenize(sentence)

            ## Check if it is digit, could also use POS tag 'NUM'
            digits = np.any([token.isdigit() for token in tokentext])
            #digits = [any(char.isdigit() for char in token) for token in tokentext] #any char contains digit

            ## contains symbols (true), other characters
            symbols = np.any([not token.isalnum() for token in tokentext])

            ## contains time indicators ('yesterday','today')
            time_indicator_list = ['yesterday','today','tomorrow']
            times = np.any([True if token in time_indicator_list else False for token in tokentext])
            
            return [digits,symbols,times] #{'digits':digits,'symbols':symbols,'times':times}
        
        return [extract_other_lexical_features(sentence) for sentence in sentences]

In [17]:
np.any([False,False,True])

True

In [18]:
NER_types = ['ORGANIZATION','PERSON','LOCATION','DATE','TIME','MONEY','PERCENT','FACILITY','FACILITY']

In [19]:
class extract_syntactic_features(BaseEstimator, TransformerMixin):
    '''
    each sub-feature vector is of length all_pos_tags, fixed vector lengths!
    '''
    
    def fit(self, x, y=None):
        return self    

    def transform(self, sentences):
    
        def extract_syntactic_features(sentence):
            tokentext = nltk.word_tokenize(sentence)
            tags = [token[1] for token in nltk.pos_tag(tokentext)]

            # binary occurance of tags
            tag_occurance = [apt in tags for apt in all_pos_tags]

            count_dict = Counter(tags)

            # number of occurances
            tag_counts = [count_dict[apt] if apt in count_dict.keys() else 0 for apt in all_pos_tags]

            # occurance, 0, 1 or more
            tag_three_classes = [2 if tc>1 else tc for tc in tag_counts]

            # named entity recognition: person, organization, location, product, event,
            ner_found=[]
            for chunk in nltk.ne_chunk(nltk.pos_tag(tokentext)):
                if hasattr(chunk, 'label'):
                    ner_found.append(chunk.label())
            ners = [1 if ner in ner_found else 0 for ner in NER_types]

            return tag_occurance+tag_three_classes+tag_three_classes #{'tag_occurance':tag_occurance,'tag_three_classes':tag_three_classes,'ners':ners}

        return [extract_syntactic_features(sentence) for sentence in sentences]

In [20]:
def tokenize_lemmatize(sentence):
    
    #tokentext = nltk.word_tokenize(sentence)
    return [token.lemma_ for token in en_nlp(sentence)]

def tokenize_lemma_pos(sentence):
    '''
    Combine token name and pos label
    '''
    tokentext = nltk.word_tokenize(sentence)
    return [en_nlp(token[0])[0].lemma_+token[1] for token in nltk.pos_tag(tokentext)]

In [21]:
class Debug(BaseEstimator, TransformerMixin):

    def transform(self, X):
        self.shape = X.shape
        # what other output you want
        return X

    def fit(self, X, y=None, **fit_params):
        return self

In [22]:
#test_sentence = 'The New York Times posted about people running marathons'

### Combining feature extraction

In [23]:
pipeline = Pipeline([
    
   # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[
            
            # Pipeline for getting syntactic features
            ('syntactic_features', Pipeline([
                ('extract_syntactic_features', extract_syntactic_features())
                #('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),    
    

            # Pipeline for getting other lexical features
            ('other_lexical_features', Pipeline([
                ('extract_other_lexical_features', extract_other_lexical_features())
                #('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),               
    
            # word token ngrams
            ('word_ngrams', Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(1,3),analyzer='word')),
                ("debug", Debug())
            ])),
    
    
            # character token ngrams
            ('char_ngrams', Pipeline([
                ('tfidf', TfidfVectorizer(ngram_range=(3,4),analyzer='char')),
                ("debug", Debug())
            ])),     
    
            
    
            ## lemma n-gram features
            ('lemma_ngrams', Pipeline([
                ('tfidf', TfidfVectorizer(tokenizer=tokenize_lemmatize,ngram_range=(1,3),analyzer='word')),
                ("debug", Debug())
            ])),              
    
            
            
            ## Get lemma + POS tags
            ('lemma_pos', Pipeline([
                ('tfidf', TfidfVectorizer(tokenizer=tokenize_lemma_pos,analyzer='word')),
                ("debug", Debug()),
            ]))     
            

        ]
        

        # weight components in FeatureUnion
        #transformer_weights={
        #    'subject': 1.0,
        #    'body_bow': 1.0,
        #    'body_stats': 1.0,
        #},
    )),
    # Use a SVC classifier on the combined features
    #('svc', SVC(kernel='linear')),
    
    ("debug_final", Debug()),

    ('svc',OneVsRestClassifier(SVC(kernel='linear'))),
    ])

In [24]:
df=pd.read_csv('jacobs_corpus.csv')

In [25]:
df.head(2)

Unnamed: 0,label,sentences,title,publication_date,file_id,-1,Profit,Dividend,MergerAcquisition,SalesVolume,BuyRating,QuarterlyResults,TargetPrice,ShareRepurchase,Turnover,Debt,-1.1
0,-1,It will not say what it has spent on the proje...,tesco,25-09-2013,569,1,0,0,0,0,0,0,0,0,0,0,0
1,-1,"Sir John Bond , chairman , told the bank 's an...",FT other HSBC,28-05-2005,425,1,0,0,0,0,0,0,0,0,0,0,0


In [26]:
multi_labels = ['-1',
       'Profit', 'Dividend', 'MergerAcquisition', 'SalesVolume', 'BuyRating',
       'QuarterlyResults', 'TargetPrice', 'ShareRepurchase', 'Turnover',
       'Debt']

In [27]:
X = df['sentences']
y = df[multi_labels]
X.shape,y.shape

((9937,), (9937, 11))

In [45]:
X_train, X_test, y_train, y_test,file_id_train,file_id_test = train_test_split(X, y, df['file_id'],test_size=0.33, random_state=42)

In [53]:
file_id_test.nunique(),len(file_id_test)

(490, 3280)

In [54]:
file_id_train.nunique(),len(file_id_train)

(496, 6657)

In [None]:
# which documents do not appear in 

In [32]:
%%time
pipeline.fit(X_train,y_train)

CPU times: user 46min 23s, sys: 38.2 s, total: 47min 1s
Wall time: 27min 33s


Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('syntactic_features', Pipeline(memory=None,
     steps=[('extract_syntactic_features', extract_syntactic_features())])), ('other_lexical_features', Pipeline(memory=None,
     steps=[('extract_other_lexical_features', extract_other_lex...ability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1))])

##### svae model in json?

In [37]:
from sklearn.externals import joblib
joblib.dump(pipeline, 'pipeline.joblib') 

['pipeline.joblib']

In [35]:
import jsonpickle.ext.numpy as jsonpickle_numpy
jsonpickle_numpy.register_handlers()
frozen = jsonpickle.encode(pipeline)
#thawed = jsonpickle.decode(frozen)

AttributeError: module 'jsonpickle.ext.numpy' has no attribute 'encode'

In [38]:
pipeline.steps

[('union', FeatureUnion(n_jobs=1,
         transformer_list=[('syntactic_features', Pipeline(memory=None,
       steps=[('extract_syntactic_features', extract_syntactic_features())])), ('other_lexical_features', Pipeline(memory=None,
       steps=[('extract_other_lexical_features', extract_other_lexical_features())])), ('word_ngrams', Pipeline(m... tokenize_lemma_pos at 0x135d26b70>,
          use_idf=True, vocabulary=None)), ('debug', Debug())]))],
         transformer_weights=None)),
 ('debug_final', Debug()),
 ('svc',
  OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False),
            n_jobs=1))]

In [39]:
pipeline.get_params()['svc']

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

#### Feature dimensions

In [40]:
pipeline.get_params()['union'].get_params()['lemma_pos'].get_params()['debug'].shape, \
pipeline.get_params()['union'].get_params()['word_ngrams'].get_params()['debug'].shape,\
pipeline.get_params()['union'].get_params()['char_ngrams'].get_params()['debug'].shape,\
pipeline.get_params()['union'].get_params()['lemma_ngrams'].get_params()['debug'].shape



((6657, 13745), (6657, 176571), (6657, 43589), (6657, 183262))

###### 400k features per data point!

In [41]:
pipeline.get_params()['debug_final'].shape

(6657, 417305)

In [42]:
y_pred = pipeline.predict(X_test)

In [43]:
#confusion_matrix(y_test,y_pred)

In [44]:
print(classification_report(y_test,y_pred,target_names=multi_labels))

                   precision    recall  f1-score   support

               -1       0.92      0.95      0.94      2593
           Profit       0.83      0.77      0.80       218
         Dividend       0.70      0.70      0.70        50
MergerAcquisition       0.71      0.21      0.32        81
      SalesVolume       0.79      0.73      0.76       143
        BuyRating       0.95      0.72      0.82        72
 QuarterlyResults       0.68      0.74      0.71        88
      TargetPrice       0.87      0.96      0.92        28
  ShareRepurchase       0.83      0.38      0.53        26
         Turnover       0.86      0.64      0.73        77
             Debt       0.55      0.30      0.39        20

      avg / total       0.89      0.88      0.88      3396

