In [5]:
from matplotlib import pyplot as plt
import csv
import pandas as pd
import numpy as np

# Forest sizes (values of the 'n_trees' column) that are aggregated and plotted.
n_trees = [
    1, 5, 10, 20, 25,
    50, 100, 200, 500, 1000,
]

# Alternative inputs, currently disabled:
#   csv_files = ['generated_csvs/dataset_191_wine-metrics.csv', 'generated_csvs/dataset_31_credit-g-metrics_ate500.csv', 'generated_csvs/phpOkU53r-metrics.csv']
#   dataset_names = ['Wine dataset(dataset_191_wine)','German Credit Dataset(dataset_31_credit_g)', 'Vertebral column dataset (phpOkU53r)']

# Metrics files to plot; dataset_names[i] is the label used for csv_files[i].
csv_files = [
    'generated_csvs/phpOkU53r-accuracy-greater-50-metrics.csv',
    'generated_csvs/dataset_191_wine-accuracy-greater-50-metrics.csv',
]
dataset_names = [
    'Vertebral column dataset (phpOkU53r)-Greater 50',
    'Wine dataset(dataset_191_wine-Greater 50)',
]

def read_csv(csv_path, attributes_to_obtain=None):
    """Load a metrics CSV file into a DataFrame of numeric columns.

    The first row of the file supplies the column names and every following
    row becomes one record.

    Args:
        csv_path: Path of the CSV file to read.
        attributes_to_obtain: Optional list of column names to keep. When
            None, every column found in the file is kept.

    Returns:
        A pandas DataFrame in which every kept column has been converted
        with ``pd.to_numeric``.

    Raises:
        IndexError: If the file contains no rows at all (no header).
        KeyError: If a requested column is not present in the header.
        ValueError: If a kept value cannot be parsed as a number.
    """
    # newline='' is what the csv module asks for: it disables the text
    # layer's newline translation so line endings inside quoted fields reach
    # the parser unchanged.
    with open(csv_path, newline='') as csv_file:
        rows = list(csv.reader(csv_file))

    header, records = rows[0], rows[1:]
    df = pd.DataFrame.from_records(records, columns=header)
    if attributes_to_obtain is not None:
        df = df[attributes_to_obtain]

    # csv.reader yields strings only, so convert column by column.
    return df.apply(pd.to_numeric)

def getMeanMetrics(df, tree_counts=None):
    """Compute the mean and standard deviation of each metric per forest size.

    Args:
        df: DataFrame with the columns 'n_trees', 'accuracy', 'mean_recall',
            'mean_precision' and 'mean_F1'.
        tree_counts: Iterable of 'n_trees' values to aggregate, in output
            order. Defaults to the module-level ``n_trees`` list.

    Returns:
        A ``(mean_metrics, std_metrics)`` tuple. Both are lists with one row
        per forest size, laid out as
        ``[n_tree, accuracy, recall, precision, F1]``. The standard deviation
        is NumPy's default (population, ``ddof=0``). A forest size with no
        matching rows produces NaN entries.
    """
    if tree_counts is None:
        tree_counts = n_trees

    metric_columns = ('accuracy', 'mean_recall', 'mean_precision', 'mean_F1')
    mean_metrics = []
    std_metrics = []

    for n_tree in tree_counts:
        n_tree_dataset = df[df['n_trees'] == n_tree]
        # Pull each column once so mean and std work on the same array.
        column_values = [n_tree_dataset[name].values for name in metric_columns]

        mean_metrics.append([n_tree] + [np.mean(values) for values in column_values])
        std_metrics.append([n_tree] + [np.std(values) for values in column_values])

    return mean_metrics, std_metrics

def getMeanMetricsPerCV(df, tree_counts=None, cv_folds=None):
    """Compute metric means and standard deviations per forest size and CV id.

    Args:
        df: DataFrame with the columns 'n_trees', 'cross_val', 'accuracy',
            'mean_recall', 'mean_precision' and 'mean_F1'.
        tree_counts: Iterable of 'n_trees' values to aggregate, in output
            order. Defaults to the module-level ``n_trees`` list.
        cv_folds: Iterable of 'cross_val' values to aggregate for every
            forest size. Defaults to ``range(1, 11)``.

    Returns:
        A ``(mean_metrics, std_metrics)`` tuple. Each is a list with one
        entry per forest size; every entry is itself a list with one row per
        'cross_val' value, laid out as
        ``[cv, n_tree, accuracy, recall, precision, F1]``. The standard
        deviation is NumPy's default (population, ``ddof=0``). Combinations
        with no matching rows produce NaN entries.
    """
    if tree_counts is None:
        tree_counts = n_trees
    if cv_folds is None:
        cv_folds = range(1, 11)
    # Materialize once: the folds are iterated again for every forest size.
    cv_folds = list(cv_folds)

    metric_columns = ('accuracy', 'mean_recall', 'mean_precision', 'mean_F1')
    mean_metrics = []
    std_metrics = []

    for n_tree in tree_counts:
        n_tree_dataset = df[df['n_trees'] == n_tree]
        mean_metrics_tree = []
        std_metrics_tree = []

        for cv in cv_folds:
            n_tree_cv_dataset = n_tree_dataset[n_tree_dataset['cross_val'] == cv]
            column_values = [n_tree_cv_dataset[name].values for name in metric_columns]

            mean_metrics_tree.append([cv, n_tree] + [np.mean(values) for values in column_values])
            std_metrics_tree.append([cv, n_tree] + [np.std(values) for values in column_values])

        mean_metrics.append(mean_metrics_tree)
        std_metrics.append(std_metrics_tree)

    return mean_metrics, std_metrics

def getPrecisionRecallPerClass(df, number_of_classes, tree_counts=None):
    """Average the per-class recall, precision and F1 for every forest size.

    Args:
        df: DataFrame with an 'n_trees' column plus, for every class,
            '<prefix>_recall', '<prefix>_precision' and '<prefix>_F1'
            columns. The prefixes are 'class_good' and 'class_bad' when
            ``number_of_classes`` is 2, and 'class_1', 'class_2', 'class_3'
            when it is 3.
        number_of_classes: 2 or 3; selects which column prefixes are read.
        tree_counts: Iterable of 'n_trees' values to aggregate, in output
            order. Defaults to the module-level ``n_trees`` list.

    Returns:
        A list with one row per forest size, laid out as
        ``[n_tree, recall_1, precision_1, F1_1, recall_2, precision_2, F1_2, ...]``.
        An empty list is returned for any other ``number_of_classes``.
    """
    if tree_counts is None:
        tree_counts = n_trees

    class_prefixes = {
        2: ('class_good', 'class_bad'),
        3: ('class_1', 'class_2', 'class_3'),
    }.get(number_of_classes)
    if class_prefixes is None:
        # Unsupported class counts have always yielded no rows.
        return []

    # Flattened in the order the rows are laid out: class by class, and
    # recall -> precision -> F1 inside each class.
    column_names = [
        prefix + suffix
        for prefix in class_prefixes
        for suffix in ('_recall', '_precision', '_F1')
    ]

    prec_f1_recalls = []
    for n_tree in tree_counts:
        n_tree_dataset = df[df['n_trees'] == n_tree]
        prec_f1_recalls.append(
            [n_tree] + [np.mean(n_tree_dataset[name].values) for name in column_names]
        )

    return prec_f1_recalls
            
        

def plot_mean_std(x_axis, y_axis, std, title, x_label, y_label):
    """Save an error-bar plot of ``y_axis`` against ``x_axis``.

    Points are drawn as triangle markers with no connecting line, and ``std``
    is used as the vertical error of each point. The figure is written to
    ``title + '.png'`` and then closed; nothing is shown on screen.

    Args:
        x_axis: X coordinates of the points.
        y_axis: Y coordinates of the points.
        std: Vertical error bar size for each point.
        title: Figure title, also used as the output file name stem.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
    """
    output_path = title + '.png'

    # Draw the data first, then decorate the same axes.
    plt.errorbar(x_axis, y_axis, yerr=std, marker='^', linestyle='None')
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    plt.savefig(output_path)
    plt.close()

def _metric_column(rows, position):
    """Return the value at ``position`` of every row, as a NumPy array."""
    return np.array([row[position] for row in rows])


def _save_line_plot(title, x_values, series, legend_labels, x_label, y_label,
                    x_ticks=None):
    """Plot each series as a line and save the figure as ``title + '.png'``.

    Series are styled in order with circle, triangle, star and square
    markers, so at most four series are drawn.
    """
    line_formats = ('-o', '-^', '-*', '-s')

    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    if x_ticks is not None:
        plt.xticks(x_ticks)
    for values, line_format in zip(series, line_formats):
        plt.plot(x_values, values, line_format)
    plt.gca().legend(legend_labels)
    plt.savefig(title + '.png')
    plt.close()


def _plot_per_class_metrics(tree_labels, per_class_rows, number_of_classes,
                            dataset_name):
    """Save the recall, precision and F1 per-class plots for one dataset.

    ``per_class_rows`` uses the getPrecisionRecallPerClass layout:
    ``[n_tree, recall_1, precision_1, F1_1, recall_2, precision_2, F1_2, ...]``.
    """
    # Legend texts are spelled out because they are not uniform between the
    # two- and three-class figures.
    legends = {
        2: (
            ('Recall class 1', 'Recall class 2'),
            ('Precision class 1', 'Precision class 2'),
            ('F1-Measure class 1', 'F1-measure class 2'),
        ),
        3: (
            ('Recall class 1', 'Recall class 2', 'Recall class 3'),
            ('Precision class 1', 'Precision class 2', 'Precision class 3'),
            ('F1-Measure class 1', 'F1-Measure  class 2', 'F1-Measure  class 3'),
        ),
    }[number_of_classes]
    title_prefixes = (
        'Recall Per Class - ',
        'Precision Per Class - ',
        'F1-measure Per Class - ',
    )

    for metric_offset, (title_prefix, legend) in enumerate(zip(title_prefixes, legends)):
        # Each class owns three consecutive columns after the leading n_tree;
        # metric_offset picks recall (0), precision (1) or F1 (2) within them.
        series = [
            _metric_column(per_class_rows, 1 + 3 * class_index + metric_offset)
            for class_index in range(number_of_classes)
        ]
        _save_line_plot(title_prefix + dataset_name, tree_labels, series, legend,
                        'Numero de árvores', 'Metric value')


def plot_graphs(csv_files):
    """Generate and save every figure for each metrics CSV file.

    For the file at position ``i`` (labelled ``dataset_names[i]``) this saves,
    as PNG files named after their titles:

    * one mean +/- std error-bar plot per metric (accuracy, recall,
      precision, F1) against the number of trees;
    * one plot comparing the four metric means against the number of trees;
    * one plot per forest size showing the four metrics for 'cross_val'
      values 1 to 10;
    * per-class recall, precision and F1 plots, when the file has 13 columns
      (treated as two classes) or 16 columns (treated as three classes).

    Args:
        csv_files: Paths of the metrics CSV files, in the same order as the
            module-level ``dataset_names`` list.
    """
    summary_columns = ['cross_val', 'kfold', 'n_trees', 'accuracy',
                       'mean_recall', 'mean_precision', 'mean_F1']
    # (title prefix, y-axis label) for the metrics stored at row positions 1..4.
    mean_std_plots = (
        ('Accuracy', 'Acurácia'),
        ('Recall', 'Recall'),
        ('Precision', 'Precision'),
        ('F1-Measure', 'F1-Measure'),
    )
    metric_legend = ('accuracy', 'recall', 'precision', 'F1-Measure')
    # The number of classes is inferred from the total column count of the file.
    classes_by_column_count = {13: 2, 16: 3}
    # Forest sizes as strings so they are plotted as evenly spaced categories.
    tree_labels = [str(n_tree) for n_tree in n_trees]
    cv_axis = np.arange(1, 11)

    for index, csv_file in enumerate(csv_files):
        # Read the file once; the summary plots only need a subset of columns.
        full_df = read_csv(csv_file)
        df = full_df[summary_columns]
        dataset_name = dataset_names[index]

        ##################### Mean metrics vs number of decision trees #####################
        mean_metrics, std_metrics = getMeanMetrics(df)

        metric_means = []
        for position, (title_prefix, y_label) in enumerate(mean_std_plots, start=1):
            means = _metric_column(mean_metrics, position)
            stds = _metric_column(std_metrics, position)
            plot_mean_std(tree_labels, means, stds,
                          title_prefix + ' vs n_tree - ' + dataset_name,
                          'Numero de árvores', y_label)
            metric_means.append(means)

        ##################### Comparing all metrics vs number of decision trees #####################
        _save_line_plot('All metrics vs n_tree - ' + dataset_name, tree_labels,
                        metric_means, metric_legend,
                        'Numero de árvores', 'Metric value')

        ##################### Metrics vs cross validation #####################
        metrics_cv, _ = getMeanMetricsPerCV(df)

        # metrics_cv holds one entry per forest size, in n_trees order.
        for n_tree, metric_cv in zip(n_trees, metrics_cv):
            # Rows are [cv, n_tree, accuracy, recall, precision, F1].
            series = [_metric_column(metric_cv, position) for position in range(2, 6)]
            _save_line_plot('Cross Val. - ' + str(n_tree) + ' Arvores-' + dataset_name,
                            cv_axis, series, metric_legend,
                            'Cross validation', 'Metrics', x_ticks=cv_axis)

        ##################### Precision, recall, F1 per class #####################
        number_of_classes = classes_by_column_count.get(full_df.shape[1])
        if number_of_classes is None:
            # Unknown layout: no per-class figures for this file.
            continue

        per_class_rows = getPrecisionRecallPerClass(full_df, number_of_classes)
        _plot_per_class_metrics(tree_labels, per_class_rows, number_of_classes,
                                dataset_name)
        

In [6]:
# Generate and save every figure for the CSV files configured in csv_files.
plot_graphs(csv_files)