In [1]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition.pca import PCA
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [4]:
%run common.ipynb

In [11]:
%run ds_tools/dstools/ml/ensemble.py

In [2]:
def cv_test(est, train_path='training_set.tsv', target='correctAnswer', cv=5, n_jobs=1):
    """Cross-validate ``est`` on the labelled training file and print the scores.

    The training file is read as tab-separated text with its ``id`` column as
    the index. Every column except ``target`` is passed as ``X`` and the
    ``target`` column as ``y``.

    Parameters
    ----------
    est
        Estimator handed to ``cross_val_score`` unchanged.
    train_path
        Path of the tab-separated training file. Defaults to the file the
        notebook has always used.
    target
        Name of the label column inside the training file.
    cv
        Forwarded to ``cross_val_score`` as ``cv``.
    n_jobs
        Forwarded to ``cross_val_score`` as ``n_jobs``.

    Returns
    -------
    None
        Results are only printed: the estimator, the value returned by
        ``cross_val_score``, then its mean and standard deviation.

    Notes
    -----
    ``cross_val_score`` is not imported in the visible cells of this
    notebook; it must already be in the namespace when this function is
    called (presumably provided by ``%run common.ipynb`` -- TODO confirm).
    """
    df = pd.read_csv(train_path, index_col='id', sep='\t')

    scores = cross_val_score(
        estimator=est,
        X=df.drop(target, axis=1),
        y=df[target],
        cv=cv,
        n_jobs=n_jobs,
        verbose=1)
    print(est)
    print(scores)
    print('mean: {mean}, std: {std}'.format(mean=scores.mean(), std=scores.std()))

In [3]:
def submission(est, name='results', train_path='training_set.tsv',
               test_path='test_set.tsv.gz', target='correctAnswer'):
    """Fit ``est`` on the training file and write test predictions to CSV.

    Both input files are read as tab-separated text with their ``id`` column
    as the index. The estimator is fitted on every training column except
    ``target`` and then asked to predict on the whole test frame.

    Parameters
    ----------
    est
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    name
        Output file stem; predictions are written to ``name + '.csv'``.
    train_path
        Path of the labelled training file.
    test_path
        Path of the unlabelled test file.
    target
        Label column in the training file; also used as the name of the
        prediction column in the output.

    Returns
    -------
    None
        The only result is the CSV file, with an ``id`` index column and one
        prediction column.
    """
    df_train = pd.read_csv(train_path, index_col='id', sep='\t')
    model = est.fit(df_train.drop(target, axis=1), df_train[target])
    if model is None:
        # fit() that mutates in place without returning the estimator would
        # otherwise fail on the predict call below.
        model = est

    df = pd.read_csv(test_path, index_col='id', sep='\t')
    preds = model.predict(df)
    res = pd.Series(preds, index=df.index, name=target)
    res.to_csv(name + '.csv', index_label='id', header=True)

In [12]:
# Baseline: a single IrEstimator built with 'ck12-concepts'.
# IrEstimator is not defined in the visible cells (presumably comes from
# `%run common.ipynb` -- TODO confirm).
# Recorded cross-validation result, in the format cv_test prints:
#   mean: 0.383613928769, std: 0.0224772900287
# The timing below is not printed by cv_test as defined in this notebook:
#   cv execution time: 34.7804338932 sec
est1 = IrEstimator('ck12-concepts')  # Concepts_b_v8_vdt_html.zip, stopwords

In [14]:
# Variant: IrEstimatorRescoreSum built with the same 'ck12-concepts' argument.
# The class is not defined in the visible cells (presumably comes from
# `%run common.ipynb` -- TODO confirm).
# Recorded cross-validation result, in the format cv_test prints:
#   mean: 0.392806695044, std: 0.00974997673719
#   cv execution time: 49.3968729973 sec
est1_1 = IrEstimatorRescoreSum('ck12-concepts')  # Concepts_b_v8_vdt_html.zip, stopwords

In [15]:
# Variant: IrEstimatorSum built with the same 'ck12-concepts' argument.
# The class is not defined in the visible cells (presumably comes from
# `%run common.ipynb` -- TODO confirm).
# Recorded cross-validation result, in the format cv_test prints:
#   mean: 0.367610590866, std: 0.00736769904979
#   cv execution time: 49.5336530209 sec
est1_2 = IrEstimatorSum('ck12-concepts')  # Concepts_b_v8_vdt_html.zip, stopwords

In [16]:
# GloveEstimator built with the file name 'glove.6B.300d-ai2.txt.bz2'.
# The class is not defined in the visible cells (presumably comes from
# `%run common.ipynb` -- TODO confirm).
# Recorded cross-validation result, in the format cv_test prints:
#   mean: 0.305605320779, std: 0.00359779255663
#   cv execution time: 22.8514099121 sec
est3 = GloveEstimator('glove.6B.300d-ai2.txt.bz2')

In [17]:
# Ensemble of GloveEstimator and IrEstimator, combined by a plain
# LogisticRegression(C=1) passed as the assembly estimator.
# ModelEnsemble is not defined in the visible cells (presumably comes from
# `%run ds_tools/dstools/ml/ensemble.py` -- TODO confirm, along with the
# meaning of ensemble_train_size=1).
# Recorded cross-validation result, in the format cv_test prints:
#   mean: 0.378805972527, std: 0.00684518647143
#   cv execution time: 134.214504004 sec
est4 = ModelEnsemble(
    intermediate_estimators=[
        GloveEstimator('glove.6B.300d-ai2.txt.bz2'),
        IrEstimator('ck12-concepts'),
    ],
    assembly_estimator=LogisticRegression(C=1),
    ensemble_train_size=1
)

In [18]:
# Ensemble of GloveEstimator and IrEstimator whose assembly estimator is a
# Pipeline: PCA(n_components=4) followed by LogisticRegression(C=1).
# ModelEnsemble is not defined in the visible cells (presumably comes from
# `%run ds_tools/dstools/ml/ensemble.py` -- TODO confirm, along with the
# meaning of ensemble_train_size=1).
# Recorded cross-validation result, in the format cv_test prints:
#   mean: 0.378411475843, std: 0.0157684051548
#   cv execution time: 219.582041979 sec
est5 = ModelEnsemble(
    intermediate_estimators=[
        GloveEstimator('glove.6B.300d-ai2.txt.bz2'),
        IrEstimator('ck12-concepts'),
    ],
    assembly_estimator=Pipeline([
        ('pca', PCA(n_components=4)),
        ('lr', LogisticRegression(C=1)),
    ]),
    ensemble_train_size=1
)

In [13]:
# Ensemble of GloveEstimator and IrEstimatorRescoreSum, combined by a plain
# LogisticRegression(C=1) passed as the assembly estimator.
# ModelEnsemble is not defined in the visible cells (presumably comes from
# `%run ds_tools/dstools/ml/ensemble.py` -- TODO confirm, along with the
# meaning of ensemble_train_size=1).
# Recorded cross-validation result, in the format cv_test prints:
#   mean: 0.387199480557, std: 0.00767928030034
#   cv execution time: 305.623694897 sec
est6 = ModelEnsemble(
    intermediate_estimators=[
        GloveEstimator('glove.6B.300d-ai2.txt.bz2'),
        IrEstimatorRescoreSum('ck12-concepts'),
    ],
    assembly_estimator=LogisticRegression(C=1),
    ensemble_train_size=1
)