In [8]:
import os
import plotly.graph_objs as go

import argparse
import numpy as np
import plotly.plotly as py

from scipy.stats import ttest_ind


class TestSuit:
    def __init__(self):
        self.feature_group = ''
        self.accuracies = []


class ClassiferTestSuits:
    def __init__(self, classifier, dimensionality, num_instance):
        self.classifier = classifier
        self.dimensionality = dimensionality
        self.num_instance = num_instance
        self.suits = []


def read_csv(filepath, classifier, dimensionality, num_instance):
    classifier_test_suits = ClassiferTestSuits(classifier, dimensionality, num_instance)
    new_section = True
    with open(filepath, 'r') as f:
        line = f.readline()
        while line != '':
            parts = line.strip().split('\t')
            if new_section:
                feature_group = parts[0]
                suit = TestSuit()
                new_section = False
                suit.feature_group = feature_group
                classifier_test_suits.suits.append(suit)
            elif parts[0] == 'Accuracy:':
                line = f.readline()
                parts = line.strip().split('\t')
                suit.accuracies = np.array(list(map(float, parts[1:])))
            elif parts[0] == '':
                new_section = True
            line = f.readline()
    return classifier_test_suits


def ttest_compare_feature_groups(classifier_test_suitss, classifier, dimensionality, num_instance):
    t_values = {}
    p_values = {}

    for classifier_test_suits in classifier_test_suitss:
        if classifier_test_suits.dimensionality != dimensionality:
            continue
        if classifier_test_suits.num_instance != num_instance:
            continue
        if classifier_test_suits.classifier != classifier:
            continue

        accuracy_rates_populations = {}
        for suit in classifier_test_suits.suits:
            accuracy_rates_populations[suit.feature_group] = suit.accuracies

        accuracy_rates_all = accuracy_rates_populations['all']
        for feature_group, accuracy_rates in accuracy_rates_populations.items():
            if feature_group != 'all':
                t, p = ttest_ind(accuracy_rates, accuracy_rates_all)
                t_values[feature_group] = t
                p_values[feature_group] = p

        return t_values, p_values


def ttest_compare_num_instances(classifier_test_suitss, classifier, dimensionality, base_ninstances, deriv_ninstances):
    t_values = {}
    p_values = {}

    base = {}
    deriv = {}

    for classifier_test_suits in classifier_test_suitss:
        if classifier_test_suits.dimensionality != dimensionality:
            continue
        if classifier_test_suits.classifier != classifier:
            continue
        for suit in classifier_test_suits.suits:
            if suit.feature_group == 'all':
                if classifier_test_suits.num_instance == base_ninstances:
                    base[classifier_test_suits.classifier] = suit
                else:
                    deriv[classifier_test_suits.classifier] = suit
                break
    x = 0


def extract_accuracies_by_ninstances(classifier_test_suitss, classifier, dimensionality, ninstances):
    stdevss = {x: {} for x in ninstances}
    averagess = {x: {} for x in ninstances}

    for classifier_test_suits in classifier_test_suitss:
        if classifier_test_suits.dimensionality != dimensionality:
            continue
        if classifier_test_suits.classifier != classifier:
            continue
        ninstance = classifier_test_suits.num_instance
        stdevs = stdevss[ninstance]
        averages = averagess[ninstance]
        for suit in classifier_test_suits.suits:
            mean = np.mean(suit.accuracies)
            stdev = np.std(suit.accuracies)
            stdevs[suit.feature_group] = stdev
            averages[suit.feature_group] = mean

    return averagess, stdevss


def main(csv_dir):
    classifiers = ['nnet', 'svm_linear', 'rf', 'svm_rbf']
    dimensionalities = ['full', 'pca']
    num_instancees = ['150', '150-20']
    prefix = 'kfold_bestparam_tmi'

    if not os.path.isdir(csv_dir):
        raise Exception('{} is not a folder'.format(csv_dir))

    classifier_test_suitss = []

    for classifier in classifiers:
        for dimensionality in dimensionalities:
            for num_instance in num_instancees:
                filename = '{}_{}_{}.{}.tsv'.format(prefix, classifier, dimensionality, num_instance)
                filepath = os.path.join(csv_dir, filename)
                classifier_test_suits = read_csv(filepath, classifier, dimensionality, num_instance)
                classifier_test_suitss.append(classifier_test_suits)

    ninstances = ['150'] + ['150-{}'.format(x) for x in range(20, 150, 10)]
    averagess, stdevss = extract_accuracies_by_ninstances(classifier_test_suitss, 'nnet', 'full', ninstances)

    ninstances_axis = list(averagess.keys())
    stdev_axis = np.array(list(stdevss.values()))
    accuracy_axis = np.array(list(averagess.values()))

    upper_bound = go.Scatter(
        name='Upper Bound',
        x=ninstances_axis,
        y=accuracy_axis + stdev_axis,
        mode='lines',
        marker=dict(color="#444"),
        line=dict(width=0),
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty'
    )

    lower_bound = go.Scatter(
        name='Lower Bound',
        x=ninstances_axis,
        y=accuracy_axis - stdev_axis,
        marker=dict(color="#444"),
        line=dict(width=0),
        mode='lines'
    )

    trace = go.Scatter(
        name='Accuracies',
        x=ninstances_axis,
        y=accuracy_axis,
        mode='lines',
        line=dict(color='rgb(31, 119, 180)'),
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty'
    )

    data = [lower_bound, trace, upper_bound]

    layout = go.Layout(
        yaxis=dict(title='Accuracy'),
        xaxis=dict(title='Number of instances'),
        title='Continuous, variable value error bars.<br>Notice the hover text!',
        showlegend=False)

    fig = go.Figure(data=data, layout=layout)
    plot = py.iplot(fig, filename='Accuracies by instances using nnet')
    print('{}'.format(plot.resource))


In [9]:
main(csv_dir='/Users/yfukuzaw/workspace/koe/')

TypeError: unsupported operand type(s) for +: 'dict' and 'dict'