In [1]:
import json
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

In [2]:
def get_df_splits(df, n_splits=5, random_state=1731):
    """Split a DataFrame into shuffled K-fold (train, test) pairs.

    Args:
        df: DataFrame to partition; rows are selected positionally via ``iloc``.
        n_splits: Number of folds handed to ``KFold``. Defaults to 5, the
            value this function previously hard-coded.
        random_state: Seed handed to ``KFold`` for shuffling. Defaults to
            1731, the previously hard-coded seed, so existing callers keep
            getting the same folds.

    Returns:
        A list with one ``(df_train, df_test)`` tuple per fold.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    return [
        (df.iloc[train_indices, :], df.iloc[test_indices, :])
        for train_indices, test_indices in kf.split(df)
    ]

In [3]:
def to_X_y(vectorizer, df, label_name):
    """Fit the vectorizer on the whole frame and return features and labels.

    Args:
        vectorizer: Object providing ``fit_transform`` and either
            ``get_feature_names_out`` (scikit-learn >= 1.0) or the legacy
            ``get_feature_names``.
        df: DataFrame with a ``"result_full_description"`` text column and a
            column named ``label_name``.
        label_name: Name of the label column in ``df``.

    Returns:
        ``(X, y, feature_names)`` where ``X`` is whatever
        ``vectorizer.fit_transform`` returns, ``y`` is ``df[label_name]`` and
        ``feature_names`` lists the vectorizer's feature names.
    """
    corpus = df["result_full_description"]

    X = vectorizer.fit_transform(corpus)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back for older versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        # The replacement returns an ndarray; convert to keep returning a list.
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    y = df[label_name]

    return X, y, feature_names

In [4]:
def to_X_y_tt(vectorizer, df_train, df_test, label_name):
    """Vectorize a train/test pair, fitting the vectorizer on train only.

    Args:
        vectorizer: Object providing ``fit_transform``, ``transform`` and
            either ``get_feature_names_out`` (scikit-learn >= 1.0) or the
            legacy ``get_feature_names``.
        df_train: Training DataFrame with a ``"result_full_description"``
            text column and a column named ``label_name``.
        df_test: Test DataFrame with the same columns.
        label_name: Name of the label column.

    Returns:
        ``(X_train, X_test, y_train, y_test, feature_names)``. ``X_test`` is
        produced with ``transform`` only, so the test text does not influence
        the fitted vectorizer.
    """
    corpus_train = df_train["result_full_description"]
    corpus_test = df_test["result_full_description"]

    X_train = vectorizer.fit_transform(corpus_train)
    X_test = vectorizer.transform(corpus_test)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back for older versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        # The replacement returns an ndarray; convert to keep returning a list.
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    y_train = df_train[label_name]
    y_test = df_test[label_name]

    return X_train, X_test, y_train, y_test, feature_names

In [5]:
def run_classifier(classifier, X_train, X_test, y_train, y_test, df_test, classes):
    """Fit a classifier on the training data and evaluate it on the test data.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: True test labels.
        df_test: Test DataFrame; passed to ``get_error`` to build the table
            of misclassified rows.
        classes: Class labels fixing the row/column order of the confusion
            matrix.

    Returns:
        ``(accuracy, confusion, precision, recall, error)`` where
        ``confusion`` is an integer confusion matrix, ``precision`` and
        ``recall`` come from ``get_precision`` / ``get_recall`` applied to
        it, and ``error`` is the result of ``get_error``.
    """
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # `labels` must be passed by keyword: scikit-learn >= 1.0 makes it
    # keyword-only, so the positional form raises TypeError there.
    confusion = confusion_matrix(y_test, y_pred, labels=classes).astype(int)
    precision = get_precision(confusion)
    recall = get_recall(confusion)
    error = get_error(df_test, y_test, y_pred)

    return accuracy, confusion, precision, recall, error

In [6]:
def run_classifier_cv(classifier_factory, X_y_splits, classes):
    """Evaluate a freshly built classifier on every fold and aggregate.

    Args:
        classifier_factory: Zero-argument callable returning a new classifier;
            called once per fold.
        X_y_splits: Sized collection of
            ``(X_train, X_test, y_train, y_test, df_test)`` tuples, one per fold.
        classes: Class labels; their count sets the confusion-matrix size.

    Returns:
        ``(accuracy, confusion, precision, recall, error)``: the mean of the
        per-fold accuracies, the element-wise sum of the per-fold confusion
        matrices, precision and recall computed from that summed matrix, and
        the per-fold error frames concatenated with ``pd.concat``.
    """
    n_classes = len(classes)
    total_confusion = np.zeros((n_classes, n_classes), dtype=int)
    accuracy_sum = 0
    fold_errors = []

    for fold in X_y_splits:
        X_train, X_test, y_train, y_test, df_test = fold
        fold_accuracy, fold_confusion, _, _, fold_error = run_classifier(
            classifier_factory(), X_train, X_test, y_train, y_test, df_test, classes
        )

        accuracy_sum = accuracy_sum + fold_accuracy
        total_confusion += fold_confusion
        fold_errors.append(fold_error)

    mean_accuracy = accuracy_sum / len(X_y_splits)

    # Precision/recall are derived from the pooled matrix, not averaged per fold.
    return (
        mean_accuracy,
        total_confusion,
        get_precision(total_confusion),
        get_recall(total_confusion),
        pd.concat(fold_errors),
    )

In [7]:
def get_precision(confusion):
    """Return the diagonal of ``confusion`` divided by its column sums.

    With rows as true classes and columns as predicted classes (presumably
    the layout produced by ``confusion_matrix`` in ``run_classifier``), this
    is the per-class precision.
    """
    correct_per_class = np.diag(confusion)
    column_totals = np.sum(confusion, axis=0)
    return correct_per_class / column_totals

In [8]:
def get_recall(confusion):
    """Return the diagonal of ``confusion`` divided by its row sums.

    With rows as true classes and columns as predicted classes (presumably
    the layout produced by ``confusion_matrix`` in ``run_classifier``), this
    is the per-class recall.
    """
    correct_per_class = np.diag(confusion)
    row_totals = np.sum(confusion, axis=1)
    return correct_per_class / row_totals

In [9]:
def get_error(df_test, y_test, y_pred):
    """Return the misclassified test rows with true and predicted labels.

    The ``"result_full_description"`` column of ``df_test`` is placed next to
    ``y_test`` (as ``"y_true"``) and ``y_pred`` (as ``"y_pred"``). Every piece
    is re-indexed from zero first so the rows line up by position; only rows
    where the two labels differ are returned.
    """
    descriptions = df_test["result_full_description"].reset_index(drop=True)
    truth = pd.DataFrame({"y_true": y_test}).reset_index(drop=True)
    predicted = pd.DataFrame({"y_pred": y_pred}).reset_index(drop=True)

    combined = pd.concat([descriptions, truth, predicted], axis=1)
    is_wrong = combined["y_true"] != combined["y_pred"]

    return combined[is_wrong]

In [10]:
def get_feature_weights(classifier, feature_names, top_n=10):
    """Return the features with the lowest and highest weights.

    Args:
        classifier: Fitted model exposing ``coef_``; only its first row,
            ``coef_[0]``, is inspected.
        feature_names: Sequence mapping a coefficient index to its name.
        top_n: How many features to return at each extreme. Defaults to 10,
            the value this function previously hard-coded.

    Returns:
        ``(min_weights, max_weights)``: two lists of ``(name, weight)`` tuples
        sorted by ascending weight, holding the ``top_n`` lowest and the
        ``top_n`` highest weights. With fewer than ``top_n`` features both
        lists contain every feature.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    feature_weights = sorted(
        (
            (feature_names[index], weight)
            for index, weight in enumerate(classifier.coef_[0])
        ),
        key=lambda pair: pair[1],
    )

    min_weights = feature_weights[:top_n]
    # An explicit start index avoids the `[-0:]` pitfall, which would return
    # the whole list when top_n is 0.
    max_weights = feature_weights[max(len(feature_weights) - top_n, 0):]

    return min_weights, max_weights

In [11]:
def get_feature_importances(classifier, feature_names, top_n=10):
    """Return the most important features, highest importance first.

    Args:
        classifier: Fitted model exposing ``feature_importances_``.
        feature_names: Sequence mapping an importance index to its name.
        top_n: Maximum number of features to return. Defaults to 10, the
            value this function previously hard-coded.

    Returns:
        A list of up to ``top_n`` ``(name, importance)`` tuples sorted by
        descending importance; ties keep their original relative order.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    ranked = sorted(
        (
            (feature_names[index], importance)
            for index, importance in enumerate(classifier.feature_importances_)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:top_n]

In [12]:
def load_candidates(candidates_str):
    """Parse a JSON object and return its keys as a lower-cased set.

    The keys ``"Bacteria"`` and ``"Virus"`` are dropped before lower-casing;
    the match is case-sensitive, so a differently cased variant is kept.
    """
    excluded = {"Bacteria", "Virus"}
    parsed = json.loads(candidates_str)

    return {
        name.lower()
        for name in parsed.keys()
        if name not in excluded
    }

In [13]:
def get_one(_set):
    """Return the first element that iterating over ``_set`` yields.

    The collection must be non-empty (truthy); an empty one trips the
    assertion.
    """
    assert _set  # callers must pass a non-empty collection
    iterator = iter(_set)
    return next(iterator)