# Metaclassification

In [1]:
import pandas as pd

tags = pd.read_csv('../features/tags.csv', index_col=0)['tag_name']

## Datasets

### 1. Metadata TF

In [4]:
tf_metadata = pd.concat((
        pd.read_csv('../features/metadata/tf_descriptions.csv', index_col=0),
        pd.read_csv('../features/metadata/tf_keywords.csv', index_col=0)
    ), axis=1
)
tf_metadata = tf_metadata.groupby(tf_metadata.columns, axis=1).sum() # Drop duplicate columns

### 2. Metadata TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_metadata = pd.DataFrame(
    data=TfidfTransformer().fit_transform(tf_metadata).todense(),
    index=tf_metadata.index,
    columns=tf_metadata.columns
)

### 3. Transcription TF

In [19]:
tf_trans = pd.read_csv('../features/trans/tf_trans.csv', index_col=0)

### 4. Transcriptions TF-IDF

In [20]:
tfidf_trans = pd.DataFrame(
    data=TfidfTransformer().fit_transform(tf_trans).todense(),
    index=tf_trans.index,
    columns=tf_trans.columns
)

## Classifiers

In [23]:
from collections import namedtuple

Classifier = namedtuple(
    'classifier',
    ['name', 'dataframe', 'model']
)

classifiers = []

### 1. Naïve Bayes on metadata TF

In [24]:
from sklearn import ensemble
from sklearn import naive_bayes

classifiers.append(
    Classifier(
        name='Naïve Bayes on metadata TF',
        dataframe=tf_metadata,
        model=ensemble.BaggingClassifier(
            naive_bayes.MultinomialNB(alpha=0.3),
            n_estimators=20,
            max_samples=0.8,
            max_features=0.5
        )
    )
)

### 2. Top terms on metadata TF

In [25]:
import top_terms

classifiers.append(
    Classifier(
        name='Top terms on metadata TF',
        dataframe=tf_metadata,
        model=top_terms.TopTermsClassifier(n_terms=20)
    )
)

### 3. Linear SVM on metadata TF-IDF

In [26]:
from sklearn import svm

classifiers.append(
    Classifier(
        name='Linear SVM on metadata TF-IDF',
        dataframe=tfidf_metadata,
        model=svm.LinearSVC(C=6.0, multi_class='crammer_singer')
    )
)

### 4. Naïve Bayes on transcription TF

In [27]:
classifiers.append(
    Classifier(
        name='Naïve Bayes on transcription TF',
        dataframe=tf_trans,
        model=ensemble.BaggingClassifier(
            naive_bayes.MultinomialNB(alpha=0.2),
            n_estimators=10,
            max_samples=0.8,
            max_features=0.8
        )
    )
)

### 5. Top terms on transcription TF

In [28]:
classifiers.append(
    Classifier(
        name='Top terms on transcription TF',
        dataframe=tf_trans,
        model=top_terms.TopTermsClassifier(n_terms=20)
    )
)

### 6. Linear SVM on transcription TF-IDF

In [29]:
classifiers.append(
    Classifier(
        name='Linear SVM on trans TF-IDF',
        dataframe=tfidf_trans,
        model=svm.LinearSVC(C=6.0, multi_class='crammer_singer')
    )
)

## Run the classifiers individually

In [31]:
import time
import warnings

import numpy as np
from sklearn import model_selection

warnings.filterwarnings('ignore') # Naïve Bayes might divide by 0 but it's okay

splitter = model_selection.LeaveOneOut()

predictions = pd.DataFrame(
    index=tags.index,
    columns=(clf.name for clf in classifiers)
)

for clf in classifiers:
    X = clf.dataframe.values # n-d array
    y = tags.reindex(clf.dataframe.index).values # 1-d array
    y_pred = []
    y_true = []
    times = []
    for train, test in splitter.split(X):
        t0 = time.time()
        # Split into train/test (the test only has one observation in leave-one-out)
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        # Fit on the training set and predicted the remaining test row
        clf.model.fit(X_train, y_train)
        y_pred.append(clf.model.predict(X_test)[0])
        y_true.append(y_test[0])
        times.append(time.time() - t0)
    # Store the predictions for performing meta-classification further on
    predictions[clf.name] = pd.Series(data=y_pred, index=clf.dataframe.index)
    # Calculate the precision of the classifier
    matches = np.array(y_true) == np.array(y_pred)
    total_time = sum(times)
    print('Classifier:', clf.name)
    print('Precision 95 %% CI: %0.3f (+/- %0.3f)' % (matches.mean(), matches.std() * 1.96))
    print('Total time to evaluate:, %d seconds (%0.2f per loop)' % (total_time, total_time / len(times)))
    print('-' * 42)

  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.cla

Classifier: Naïve Bayes on metadata TF
Precision 95 % CI: 0.839 (+/- 0.720)
Total time to evaluate:, 470 seconds (0.82 per loop)
------------------------------------------
Classifier: Top terms on metadata TF
Precision 95 % CI: 0.681 (+/- 0.914)
Total time to evaluate:, 33 seconds (0.06 per loop)
------------------------------------------
Classifier: Linear SVM on metadata TF-IDF
Precision 95 % CI: 0.881 (+/- 0.634)
Total time to evaluate:, 175 seconds (0.31 per loop)
------------------------------------------


  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -
  self.class_log_prior_ = (np.log(self.class_count_) -


Classifier: Naïve Bayes on transcription TF
Precision 95 % CI: 0.553 (+/- 0.974)
Total time to evaluate:, 459 seconds (0.80 per loop)
------------------------------------------
Classifier: Top terms on transcription TF
Precision 95 % CI: 0.384 (+/- 0.953)
Total time to evaluate:, 50 seconds (0.09 per loop)
------------------------------------------
Classifier: Linear SVM on trans TF-IDF
Precision 95 % CI: 0.665 (+/- 0.925)
Total time to evaluate:, 63 seconds (0.11 per loop)
------------------------------------------


## Run a metaclassification

In [99]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder().fit(tags)

scores = model_selection.cross_val_score(
    estimator=ensemble.RandomForestClassifier(
        n_estimators=30,
        max_depth=6,
        min_samples_leaf=2
    ),
    X=predictions.apply(label_encoder.transform, axis=1).values,
    y=tags.reindex(predictions.index),
    scoring='accuracy',
    cv=5
)

print('Accuracy: %0.3f (+/- %0.3f)' % (scores.mean(), scores.std() * 1.96))

Accuracy: 0.885 (+/- 0.100)
