In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime

from ngram import tune, roc_scorer
from baselines import load_comments_and_labels, assemble_data
from deep_learning import make_MLP, DenseTransformer

from sklearn.pipeline import Pipeline
from sklearn.grid_search import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from keras.wrappers.scikit_learn import KerasClassifier
from serialization import save_pipeline
import joblib

Using TensorFlow backend.


In [2]:
# Experiment configuration shared by every cell below.
task = 'recipient'          # passed to load_comments_and_labels()
path = '../../models/cv/'   # output directory handed to save_best_estimator()
n_max = 10 ** 7             # upper bound on rows kept from each split
n_iter = 24                 # forwarded to tune() for every model

time: 2.12 ms


### Load Annotated Data

In [3]:
# Load the annotated data for the configured task. This is the slow step of
# the notebook (about 1min 51s in the recorded run), so it is done once and
# the result is reused by both assemble_data() sections below.
data = load_comments_and_labels(task)

time: 1min 51s


In [4]:
# Build the train and dev sets for the scikit-learn models from the
# 'comments' field with 'plurality' labels.
X_train, y_train = assemble_data(
    data, 'comments', 'plurality', splits=['train'])
X_dev, y_dev = assemble_data(
    data, 'comments', 'plurality', splits=['dev'])

time: 25.1 ms


In [5]:
# Keep at most n_max leading rows of each split; features and labels are
# sliced with the same bound so they stay aligned.
X_train, y_train = X_train[:n_max], y_train[:n_max]
X_dev, y_dev = X_dev[:n_max], y_dev[:n_max]

time: 1.54 ms


In [6]:
def save_best_estimator(cv, directory, name):
    """Save a copy of the searched estimator configured with the best parameters.

    Parameters
    ----------
    cv : object
        Finished hyper-parameter search. Only two attributes are read:
        ``best_params_`` (mapping of parameter name to value) and
        ``estimator`` (an object exposing ``set_params``).
    directory : str
        Forwarded unchanged to ``save_pipeline`` (a directory path in this
        notebook).
    name : str
        Forwarded unchanged to ``save_pipeline`` (the model name in this
        notebook).

    Returns
    -------
    None

    Notes
    -----
    ``set_params`` updates its receiver in place and returns it, so calling it
    directly on ``cv.estimator`` would silently reconfigure the search
    object's own estimator. The parameters are therefore applied to a deep
    copy and ``cv`` is left exactly as it was passed in.

    This function never calls ``fit``; it only applies parameters and hands
    the result to ``save_pipeline``.
    """
    # Function-scope import so the cell stands on its own.
    import copy

    # Work on a private copy so the caller's search object is not mutated.
    model = copy.deepcopy(cv.estimator)
    model = model.set_params(**cv.best_params_)
    save_pipeline(model, directory, name)

time: 1.86 ms


# Sklearn

### Linear Model

In [7]:
# Raw character n-gram counts fed directly into logistic regression.
alg = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])

# Search space: only the vocabulary cap and the regularisation strength C
# vary; the analyzer and n-gram range are pinned to character 1- to 5-grams.
param_grid = dict(
    vect__max_features=(500, 5000, 10000, 50000, None),
    vect__ngram_range=((1, 5),),
    vect__analyzer=('char',),
    clf__C=(0.0001, 0.001, 0.01, 0.1, 1, 10),
)

# tune() is a project helper (ngram module); the recorded output shows it
# printing the best parameter set and per-configuration scores.
linear_counts = tune(
    X_train, y_train, X_dev, y_dev,
    alg, param_grid, n_iter, roc_scorer,
    n_jobs=6, verbose=True)
save_best_estimator(linear_counts, path, 'linear_counts')


Best parameters set found:
{'clf__C': 10, 'vect__analyzer': 'char', 'vect__ngram_range': (1, 5), 'vect__max_features': None} 0.883319425654


Grid scores:
0.82341 (+/-0.00000) for {'clf__C': 1, 'vect__analyzer': 'char', 'vect__ngram_range': (1, 5), 'vect__max_features': 50000}
0.81954 (+/-0.00000) for {'clf__C': 0.01, 'vect__analyzer': 'char', 'vect__ngram_range': (1, 5), 'vect__max_features': 5000}
0.80201 (+/-0.00000) for {'clf__C': 1, 'vect__analyzer': 'char', 'vect__ngram_range': (1, 5), 'vect__max_features': 500}
0.82659 (+/-0.00000) for {'clf__C': 0.001, 'vect__analyzer': 'char', 'vect__ngram_range': (1, 5), 'vect__max_features': None}
0.82959 (+/-0.00000) for {'clf__C': 10, 'vect__analyzer': 'char', 'vect__ngram_range': (1, 5), 'vect__max_features': 50000}
0.88332 (+/-0.00000) for {'clf__C': 10, 'vect__analyzer': 'char', 'vect__ngram_range': (1, 5), 'vect__max_features': None}
0.81832 (+/-0.00000) for {'clf__C': 0.01, 'vect__analyzer': 'char', 'vect__ngram_range': (1, 5), 'vect

In [8]:
# Same linear model as the previous cell, with a TF-IDF re-weighting step
# inserted between the count vectorizer and the classifier.
alg = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

# Search space: vocabulary cap, TF-IDF scaling/normalisation switches and the
# regularisation strength C; features remain character 1- to 5-grams.
param_grid = dict(
    vect__max_features=(500, 5000, 10000, 50000, None),
    vect__ngram_range=((1, 5),),
    vect__analyzer=('char',),
    tfidf__sublinear_tf=(True, False),
    tfidf__norm=(None, 'l2'),
    clf__C=(0.0001, 0.001, 0.01, 0.1, 1, 10),
)

linear_tfidf = tune(
    X_train, y_train, X_dev, y_dev,
    alg, param_grid, n_iter, roc_scorer,
    n_jobs=6, verbose=True)
save_best_estimator(linear_tfidf, path, 'linear_tfidf')


Best parameters set found:
{'vect__analyzer': 'char', 'vect__max_features': 50000, 'clf__C': 10, 'tfidf__sublinear_tf': False, 'vect__ngram_range': (1, 5), 'tfidf__norm': 'l2'} 0.969655522426


Grid scores:
0.72582 (+/-0.00000) for {'vect__analyzer': 'char', 'vect__max_features': 500, 'clf__C': 0.0001, 'tfidf__sublinear_tf': False, 'vect__ngram_range': (1, 5), 'tfidf__norm': 'l2'}
0.96308 (+/-0.00000) for {'vect__analyzer': 'char', 'vect__max_features': 5000, 'clf__C': 1, 'tfidf__sublinear_tf': False, 'vect__ngram_range': (1, 5), 'tfidf__norm': 'l2'}
0.93920 (+/-0.00000) for {'vect__analyzer': 'char', 'vect__max_features': 10000, 'clf__C': 0.1, 'tfidf__sublinear_tf': True, 'vect__ngram_range': (1, 5), 'tfidf__norm': None}
0.95593 (+/-0.00000) for {'vect__analyzer': 'char', 'vect__max_features': 5000, 'clf__C': 0.001, 'tfidf__sublinear_tf': True, 'vect__ngram_range': (1, 5), 'tfidf__norm': None}
0.69622 (+/-0.00000) for {'vect__analyzer': 'char', 'vect__max_features': 50000, 'clf__C': 

# TensorFlow/Keras

### EDP (models trained on `empirical_dist` labels)

In [9]:
# Rebuild the train and dev sets with 'empirical_dist' labels for the Keras
# models. This rebinds X_train/y_train/X_dev/y_dev, so the 'plurality'
# versions used by the scikit-learn cells above are no longer available
# unless those cells are re-run.
X_train, y_train = assemble_data(
    data, 'comments', 'empirical_dist', splits=['train'])
X_dev, y_dev = assemble_data(
    data, 'comments', 'empirical_dist', splits=['dev'])

time: 257 ms


In [10]:
# Keep at most n_max leading rows of each split; features and labels are
# sliced with the same bound so they stay aligned.
X_train, y_train = X_train[:n_max], y_train[:n_max]
X_dev, y_dev = X_dev[:n_max], y_dev[:n_max]

time: 6.09 ms


In [11]:
# One constant drives both the vectorizer's vocabulary cap and the network's
# input_dim so the two cannot drift apart.
n_edp_features = 10000

# TF-IDF character n-grams fed into a Keras MLP built by make_MLP.
# NOTE(review): 'to_dense' presumably converts the sparse TF-IDF matrix into
# a dense array for Keras -- confirm in deep_learning.DenseTransformer.
alg = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('to_dense', DenseTransformer()),
    ('clf', KerasClassifier(build_fn=make_MLP, output_dim=2, input_dim=n_edp_features)),
])

# Feature extraction is pinned to a single setting; only the clf__alpha
# value, the hidden-layer layout and the number of epochs are searched.
param_grid = dict(
    vect__max_features=(n_edp_features,),
    vect__ngram_range=((1, 5),),
    vect__analyzer=('char',),
    tfidf__sublinear_tf=(True,),
    tfidf__norm=('l2',),
    clf__alpha=(1e-9, 1e-7, 1e-5, 0.001, 0.01),
    clf__hidden_layer_sizes=((), (50,), (50, 50), (50, 50, 10)),
    clf__nb_epoch=(2, 4, 8, 16),
    clf__batch_size=(200,),
)

# Unlike the linear models (n_jobs=6), the Keras search runs with n_jobs=1.
edp_tfidf = tune(
    X_train, y_train, X_dev, y_dev,
    alg, param_grid, n_iter, roc_scorer,
    n_jobs=1, verbose=True)
save_best_estimator(edp_tfidf, path, 'edp_tfidf')

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Ep

In [12]:
# Number of LSA components; it doubles as the network's input_dim, so a
# single constant drives both.
n_edp_features = 200

# TF-IDF character n-grams are reduced with TruncatedSVD before reaching the
# Keras MLP. No 'to_dense' step is used in this pipeline.
alg = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lsa', TruncatedSVD()),
    ('clf', KerasClassifier(build_fn=make_MLP, output_dim=2, input_dim=n_edp_features)),
])

# Feature extraction and the SVD width are pinned; only the clf__alpha value,
# the hidden-layer layout and the number of epochs are searched.
param_grid = dict(
    vect__max_features=(50000,),
    vect__ngram_range=((1, 5),),
    vect__analyzer=('char',),
    tfidf__sublinear_tf=(True,),
    tfidf__norm=('l2',),
    lsa__n_components=(n_edp_features,),
    clf__alpha=(1e-9, 1e-7, 1e-5, 0.001, 0.01),
    clf__hidden_layer_sizes=((), (50,), (50, 50), (50, 50, 10)),
    clf__nb_epoch=(2, 4, 8, 16),
    clf__batch_size=(200,),
)

edp_lsa = tune(
    X_train, y_train, X_dev, y_dev,
    alg, param_grid, n_iter, roc_scorer,
    n_jobs=1, verbose=True)
save_best_estimator(edp_lsa, path, 'edp_lsa')

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Epoch 2/2
Epoch 2/2
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Epoch 1/2
Epoch 2/2
Epoch 2/2
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 